203 lines
No EOL
6.8 KiB
C++
203 lines
No EOL
6.8 KiB
C++
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
|
// SPDX-License-Identifier: AGPL-3.0
|
|
//
|
|
// clustering.cpp
|
|
// Implementation of clustering and pattern recognition functions
|
|
//
|
|
#include "health_analytics_engine.h"
|
|
#include "utils.h"
|
|
/**
|
|
* @brief Perform cluster analysis on multivariate health data
|
|
*
|
|
* @param data 2D array of data points [n_samples x n_features]
|
|
* @param factorCount Number of features per data point
|
|
* @param dataLength Number of data points
|
|
* @param maxClusters Maximum number of clusters to identify
|
|
* @return ClusterResult* Array of cluster results
|
|
*/
|
|
ClusterResult* perform_cluster_analysis(const double** data,
|
|
int factorCount,
|
|
int dataLength,
|
|
int maxClusters) {
|
|
if (factorCount < 2 || dataLength < 5 || maxClusters <= 0) {
|
|
ClusterResult* dummy = new ClusterResult[1];
|
|
memset(dummy, 0, sizeof(ClusterResult));
|
|
dummy[0].clusterId = -1; // Mark as invalid
|
|
return dummy;
|
|
}
|
|
|
|
// Normalize data for better clustering
|
|
std::vector<std::vector<double>> normalizedData(dataLength, std::vector<double>(factorCount));
|
|
std::vector<double> means(factorCount, 0);
|
|
std::vector<double> stdDevs(factorCount, 0);
|
|
|
|
// Calculate means
|
|
for (int j = 0; j < factorCount; j++) {
|
|
for (int i = 0; i < dataLength; i++) {
|
|
means[j] += data[i][j];
|
|
}
|
|
means[j] /= dataLength;
|
|
}
|
|
|
|
// Calculate standard deviations
|
|
for (int j = 0; j < factorCount; j++) {
|
|
for (int i = 0; i < dataLength; i++) {
|
|
double diff = data[i][j] - means[j];
|
|
stdDevs[j] += diff * diff;
|
|
}
|
|
stdDevs[j] = sqrt(stdDevs[j] / dataLength);
|
|
if (stdDevs[j] < 1e-10) stdDevs[j] = 1.0; // Avoid division by zero
|
|
}
|
|
|
|
// Normalize data
|
|
for (int i = 0; i < dataLength; i++) {
|
|
for (int j = 0; j < factorCount; j++) {
|
|
normalizedData[i][j] = (data[i][j] - means[j]) / stdDevs[j];
|
|
}
|
|
}
|
|
|
|
// Find optimal number of clusters (between 2 and maxClusters)
|
|
int optimalClusters = 2;
|
|
double bestSilhouette = -1;
|
|
|
|
// Arrays for k-means algorithm
|
|
std::vector<int> assignments(dataLength);
|
|
std::vector<std::vector<double>> centroids(maxClusters, std::vector<double>(factorCount));
|
|
std::vector<const double*> normalizedDataPtrs(dataLength);
|
|
for (int i = 0; i < dataLength; i++) {
|
|
normalizedDataPtrs[i] = normalizedData[i].data();
|
|
}
|
|
|
|
for (int k = 2; k <= maxClusters; k++) {
|
|
// Run k-means clustering
|
|
std::vector<int> tempAssignments(dataLength, 0);
|
|
std::vector<std::vector<double>> tempCentroids(k, std::vector<double>(factorCount, 0));
|
|
std::vector<double*> centroidPtrs(k);
|
|
for (int i = 0; i < k; i++) {
|
|
centroidPtrs[i] = tempCentroids[i].data();
|
|
}
|
|
|
|
kMeansClustering(normalizedDataPtrs.data(), dataLength, factorCount, k,
|
|
100, centroidPtrs.data(), tempAssignments.data());
|
|
|
|
// Calculate silhouette coefficient
|
|
std::vector<const double*> dataPtrs(dataLength);
|
|
for (int i = 0; i < dataLength; i++) {
|
|
dataPtrs[i] = data[i]; // Use original data for silhouette
|
|
}
|
|
|
|
double silhouette = calculateSilhouetteCoefficient(
|
|
dataPtrs.data(), dataLength, factorCount, tempAssignments.data(), k);
|
|
|
|
// Update if better silhouette found
|
|
if (silhouette > bestSilhouette) {
|
|
bestSilhouette = silhouette;
|
|
optimalClusters = k;
|
|
assignments = tempAssignments;
|
|
centroids = tempCentroids;
|
|
}
|
|
}
|
|
|
|
// Allocate cluster results (plus one for terminator)
|
|
ClusterResult* results = new ClusterResult[optimalClusters + 1];
|
|
|
|
// Count points in each cluster
|
|
std::vector<int> clusterSizes(optimalClusters, 0);
|
|
for (int i = 0; i < dataLength; i++) {
|
|
clusterSizes[assignments[i]]++;
|
|
}
|
|
|
|
// Calculate cluster statistics
|
|
for (int c = 0; c < optimalClusters; c++) {
|
|
ClusterResult& cluster = results[c];
|
|
cluster.clusterId = c;
|
|
cluster.dataPointCount = clusterSizes[c];
|
|
|
|
// Calculate cluster significance based on size and compactness
|
|
double avgDistance = 0;
|
|
int count = 0;
|
|
|
|
for (int i = 0; i < dataLength; i++) {
|
|
if (assignments[i] == c) {
|
|
double dist = 0;
|
|
for (int j = 0; j < factorCount; j++) {
|
|
double diff = normalizedData[i][j] - centroids[c][j];
|
|
dist += diff * diff;
|
|
}
|
|
avgDistance += sqrt(dist);
|
|
count++;
|
|
}
|
|
}
|
|
|
|
if (count > 0) {
|
|
avgDistance /= count;
|
|
}
|
|
|
|
// Higher significance for larger and more compact clusters
|
|
double sizeFactor = static_cast<double>(count) / dataLength;
|
|
double compactnessFactor = 1.0 - std::min(1.0, avgDistance / 3.0);
|
|
cluster.significance = sizeFactor * compactnessFactor;
|
|
|
|
// Identify important factors for this cluster
|
|
std::vector<std::pair<int, double>> factorImportance;
|
|
|
|
for (int j = 0; j < factorCount; j++) {
|
|
// Calculate how different this centroid is from global mean for this factor
|
|
double differenceFromMean = std::abs(centroids[c][j]);
|
|
factorImportance.push_back(std::make_pair(j, differenceFromMean));
|
|
}
|
|
|
|
// Sort factors by importance
|
|
std::sort(factorImportance.begin(), factorImportance.end(),
|
|
[](const std::pair<int, double>& a, const std::pair<int, double>& b) {
|
|
return a.second > b.second;
|
|
});
|
|
|
|
// Store top factors (up to 3)
|
|
int numFactors = std::min(3, factorCount);
|
|
for (int j = 0; j < numFactors; j++) {
|
|
cluster.factorIndices[j] = factorImportance[j].first;
|
|
cluster.factorWeights[j] = std::min(1.0, factorImportance[j].second);
|
|
|
|
// Normalize weights to sum to 1
|
|
double totalWeight = 0;
|
|
for (int k = 0; k < numFactors; k++) {
|
|
totalWeight += cluster.factorWeights[k];
|
|
}
|
|
|
|
if (totalWeight > 0) {
|
|
for (int k = 0; k < numFactors; k++) {
|
|
cluster.factorWeights[k] /= totalWeight;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Generate descriptive cluster name based on key factors
|
|
// In a real implementation, this would use domain knowledge
|
|
snprintf(cluster.clusterName, MAX_STRING_SIZE,
|
|
"Cluster %d - %s", c + 1,
|
|
(cluster.significance > 0.7) ? "High Significance" :
|
|
(cluster.significance > 0.4) ? "Medium Significance" : "Low Significance");
|
|
|
|
// Generate detailed description
|
|
snprintf(cluster.description, MAX_STRING_SIZE,
|
|
"Cluster of %d data points characterized by %s factors. Silhouette: %.2f",
|
|
cluster.dataPointCount,
|
|
(numFactors > 0) ? "multiple interrelated" : "non-specific",
|
|
bestSilhouette);
|
|
}
|
|
|
|
// Mark the end of valid results
|
|
results[optimalClusters].clusterId = -1;
|
|
|
|
return results;
|
|
}
|
|
|
|
/**
|
|
* @brief Free memory for cluster analysis results
|
|
*
|
|
* @param results Pointer to cluster results array
|
|
*/
|
|
void free_cluster_results(ClusterResult* results) {
|
|
delete[] results;
|
|
} |