// SPDX-FileCopyrightText: © 2025 Nøkken.io // SPDX-License-Identifier: AGPL-3.0 // // clustering.cpp // Implementation of clustering and pattern recognition functions // #include "health_analytics_engine.h" #include "utils.h" /** * @brief Perform cluster analysis on multivariate health data * * @param data 2D array of data points [n_samples x n_features] * @param factorCount Number of features per data point * @param dataLength Number of data points * @param maxClusters Maximum number of clusters to identify * @return ClusterResult* Array of cluster results */ ClusterResult* perform_cluster_analysis(const double** data, int factorCount, int dataLength, int maxClusters) { if (factorCount < 2 || dataLength < 5 || maxClusters <= 0) { ClusterResult* dummy = new ClusterResult[1]; memset(dummy, 0, sizeof(ClusterResult)); dummy[0].clusterId = -1; // Mark as invalid return dummy; } // Normalize data for better clustering std::vector> normalizedData(dataLength, std::vector(factorCount)); std::vector means(factorCount, 0); std::vector stdDevs(factorCount, 0); // Calculate means for (int j = 0; j < factorCount; j++) { for (int i = 0; i < dataLength; i++) { means[j] += data[i][j]; } means[j] /= dataLength; } // Calculate standard deviations for (int j = 0; j < factorCount; j++) { for (int i = 0; i < dataLength; i++) { double diff = data[i][j] - means[j]; stdDevs[j] += diff * diff; } stdDevs[j] = sqrt(stdDevs[j] / dataLength); if (stdDevs[j] < 1e-10) stdDevs[j] = 1.0; // Avoid division by zero } // Normalize data for (int i = 0; i < dataLength; i++) { for (int j = 0; j < factorCount; j++) { normalizedData[i][j] = (data[i][j] - means[j]) / stdDevs[j]; } } // Find optimal number of clusters (between 2 and maxClusters) int optimalClusters = 2; double bestSilhouette = -1; // Arrays for k-means algorithm std::vector assignments(dataLength); std::vector> centroids(maxClusters, std::vector(factorCount)); std::vector normalizedDataPtrs(dataLength); for (int i = 0; i < dataLength; i++) { normalizedDataPtrs[i] = normalizedData[i].data(); } for (int k = 2; k <= maxClusters; k++) { // Run k-means clustering std::vector tempAssignments(dataLength, 0); std::vector> tempCentroids(k, std::vector(factorCount, 0)); std::vector centroidPtrs(k); for (int i = 0; i < k; i++) { centroidPtrs[i] = tempCentroids[i].data(); } kMeansClustering(normalizedDataPtrs.data(), dataLength, factorCount, k, 100, centroidPtrs.data(), tempAssignments.data()); // Calculate silhouette coefficient std::vector dataPtrs(dataLength); for (int i = 0; i < dataLength; i++) { dataPtrs[i] = data[i]; // Use original data for silhouette } double silhouette = calculateSilhouetteCoefficient( dataPtrs.data(), dataLength, factorCount, tempAssignments.data(), k); // Update if better silhouette found if (silhouette > bestSilhouette) { bestSilhouette = silhouette; optimalClusters = k; assignments = tempAssignments; centroids = tempCentroids; } } // Allocate cluster results (plus one for terminator) ClusterResult* results = new ClusterResult[optimalClusters + 1]; // Count points in each cluster std::vector clusterSizes(optimalClusters, 0); for (int i = 0; i < dataLength; i++) { clusterSizes[assignments[i]]++; } // Calculate cluster statistics for (int c = 0; c < optimalClusters; c++) { ClusterResult& cluster = results[c]; cluster.clusterId = c; cluster.dataPointCount = clusterSizes[c]; // Calculate cluster significance based on size and compactness double avgDistance = 0; int count = 0; for (int i = 0; i < dataLength; i++) { if (assignments[i] == c) { double dist = 0; for (int j = 0; j < factorCount; j++) { double diff = normalizedData[i][j] - centroids[c][j]; dist += diff * diff; } avgDistance += sqrt(dist); count++; } } if (count > 0) { avgDistance /= count; } // Higher significance for larger and more compact clusters double sizeFactor = static_cast(count) / dataLength; double compactnessFactor = 1.0 - std::min(1.0, avgDistance / 3.0); cluster.significance = sizeFactor * compactnessFactor; // Identify important factors for this cluster std::vector> factorImportance; for (int j = 0; j < factorCount; j++) { // Calculate how different this centroid is from global mean for this factor double differenceFromMean = std::abs(centroids[c][j]); factorImportance.push_back(std::make_pair(j, differenceFromMean)); } // Sort factors by importance std::sort(factorImportance.begin(), factorImportance.end(), [](const std::pair& a, const std::pair& b) { return a.second > b.second; }); // Store top factors (up to 3) int numFactors = std::min(3, factorCount); for (int j = 0; j < numFactors; j++) { cluster.factorIndices[j] = factorImportance[j].first; cluster.factorWeights[j] = std::min(1.0, factorImportance[j].second); // Normalize weights to sum to 1 double totalWeight = 0; for (int k = 0; k < numFactors; k++) { totalWeight += cluster.factorWeights[k]; } if (totalWeight > 0) { for (int k = 0; k < numFactors; k++) { cluster.factorWeights[k] /= totalWeight; } } } // Generate descriptive cluster name based on key factors // In a real implementation, this would use domain knowledge snprintf(cluster.clusterName, MAX_STRING_SIZE, "Cluster %d - %s", c + 1, (cluster.significance > 0.7) ? "High Significance" : (cluster.significance > 0.4) ? "Medium Significance" : "Low Significance"); // Generate detailed description snprintf(cluster.description, MAX_STRING_SIZE, "Cluster of %d data points characterized by %s factors. Silhouette: %.2f", cluster.dataPointCount, (numFactors > 0) ? "multiple interrelated" : "non-specific", bestSilhouette); } // Mark the end of valid results results[optimalClusters].clusterId = -1; return results; } /** * @brief Free memory for cluster analysis results * * @param results Pointer to cluster results array */ void free_cluster_results(ClusterResult* results) { delete[] results; }