nokken/native/statistics/clustering.cpp
2025-04-20 11:17:03 -04:00

203 lines
No EOL
6.8 KiB
C++

// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// clustering.cpp
// Implementation of clustering and pattern recognition functions
//
#include "health_analytics_engine.h"
#include "utils.h"
/**
* @brief Perform cluster analysis on multivariate health data
*
* @param data 2D array of data points [n_samples x n_features]
* @param factorCount Number of features per data point
* @param dataLength Number of data points
* @param maxClusters Maximum number of clusters to identify
* @return ClusterResult* Array of cluster results
*/
ClusterResult* perform_cluster_analysis(const double** data,
int factorCount,
int dataLength,
int maxClusters) {
if (factorCount < 2 || dataLength < 5 || maxClusters <= 0) {
ClusterResult* dummy = new ClusterResult[1];
memset(dummy, 0, sizeof(ClusterResult));
dummy[0].clusterId = -1; // Mark as invalid
return dummy;
}
// Normalize data for better clustering
std::vector<std::vector<double>> normalizedData(dataLength, std::vector<double>(factorCount));
std::vector<double> means(factorCount, 0);
std::vector<double> stdDevs(factorCount, 0);
// Calculate means
for (int j = 0; j < factorCount; j++) {
for (int i = 0; i < dataLength; i++) {
means[j] += data[i][j];
}
means[j] /= dataLength;
}
// Calculate standard deviations
for (int j = 0; j < factorCount; j++) {
for (int i = 0; i < dataLength; i++) {
double diff = data[i][j] - means[j];
stdDevs[j] += diff * diff;
}
stdDevs[j] = sqrt(stdDevs[j] / dataLength);
if (stdDevs[j] < 1e-10) stdDevs[j] = 1.0; // Avoid division by zero
}
// Normalize data
for (int i = 0; i < dataLength; i++) {
for (int j = 0; j < factorCount; j++) {
normalizedData[i][j] = (data[i][j] - means[j]) / stdDevs[j];
}
}
// Find optimal number of clusters (between 2 and maxClusters)
int optimalClusters = 2;
double bestSilhouette = -1;
// Arrays for k-means algorithm
std::vector<int> assignments(dataLength);
std::vector<std::vector<double>> centroids(maxClusters, std::vector<double>(factorCount));
std::vector<const double*> normalizedDataPtrs(dataLength);
for (int i = 0; i < dataLength; i++) {
normalizedDataPtrs[i] = normalizedData[i].data();
}
for (int k = 2; k <= maxClusters; k++) {
// Run k-means clustering
std::vector<int> tempAssignments(dataLength, 0);
std::vector<std::vector<double>> tempCentroids(k, std::vector<double>(factorCount, 0));
std::vector<double*> centroidPtrs(k);
for (int i = 0; i < k; i++) {
centroidPtrs[i] = tempCentroids[i].data();
}
kMeansClustering(normalizedDataPtrs.data(), dataLength, factorCount, k,
100, centroidPtrs.data(), tempAssignments.data());
// Calculate silhouette coefficient
std::vector<const double*> dataPtrs(dataLength);
for (int i = 0; i < dataLength; i++) {
dataPtrs[i] = data[i]; // Use original data for silhouette
}
double silhouette = calculateSilhouetteCoefficient(
dataPtrs.data(), dataLength, factorCount, tempAssignments.data(), k);
// Update if better silhouette found
if (silhouette > bestSilhouette) {
bestSilhouette = silhouette;
optimalClusters = k;
assignments = tempAssignments;
centroids = tempCentroids;
}
}
// Allocate cluster results (plus one for terminator)
ClusterResult* results = new ClusterResult[optimalClusters + 1];
// Count points in each cluster
std::vector<int> clusterSizes(optimalClusters, 0);
for (int i = 0; i < dataLength; i++) {
clusterSizes[assignments[i]]++;
}
// Calculate cluster statistics
for (int c = 0; c < optimalClusters; c++) {
ClusterResult& cluster = results[c];
cluster.clusterId = c;
cluster.dataPointCount = clusterSizes[c];
// Calculate cluster significance based on size and compactness
double avgDistance = 0;
int count = 0;
for (int i = 0; i < dataLength; i++) {
if (assignments[i] == c) {
double dist = 0;
for (int j = 0; j < factorCount; j++) {
double diff = normalizedData[i][j] - centroids[c][j];
dist += diff * diff;
}
avgDistance += sqrt(dist);
count++;
}
}
if (count > 0) {
avgDistance /= count;
}
// Higher significance for larger and more compact clusters
double sizeFactor = static_cast<double>(count) / dataLength;
double compactnessFactor = 1.0 - std::min(1.0, avgDistance / 3.0);
cluster.significance = sizeFactor * compactnessFactor;
// Identify important factors for this cluster
std::vector<std::pair<int, double>> factorImportance;
for (int j = 0; j < factorCount; j++) {
// Calculate how different this centroid is from global mean for this factor
double differenceFromMean = std::abs(centroids[c][j]);
factorImportance.push_back(std::make_pair(j, differenceFromMean));
}
// Sort factors by importance
std::sort(factorImportance.begin(), factorImportance.end(),
[](const std::pair<int, double>& a, const std::pair<int, double>& b) {
return a.second > b.second;
});
// Store top factors (up to 3)
int numFactors = std::min(3, factorCount);
for (int j = 0; j < numFactors; j++) {
cluster.factorIndices[j] = factorImportance[j].first;
cluster.factorWeights[j] = std::min(1.0, factorImportance[j].second);
// Normalize weights to sum to 1
double totalWeight = 0;
for (int k = 0; k < numFactors; k++) {
totalWeight += cluster.factorWeights[k];
}
if (totalWeight > 0) {
for (int k = 0; k < numFactors; k++) {
cluster.factorWeights[k] /= totalWeight;
}
}
}
// Generate descriptive cluster name based on key factors
// In a real implementation, this would use domain knowledge
snprintf(cluster.clusterName, MAX_STRING_SIZE,
"Cluster %d - %s", c + 1,
(cluster.significance > 0.7) ? "High Significance" :
(cluster.significance > 0.4) ? "Medium Significance" : "Low Significance");
// Generate detailed description
snprintf(cluster.description, MAX_STRING_SIZE,
"Cluster of %d data points characterized by %s factors. Silhouette: %.2f",
cluster.dataPointCount,
(numFactors > 0) ? "multiple interrelated" : "non-specific",
bestSilhouette);
}
// Mark the end of valid results
results[optimalClusters].clusterId = -1;
return results;
}
/**
* @brief Free memory for cluster analysis results
*
* @param results Pointer to cluster results array
*/
void free_cluster_results(ClusterResult* results) {
delete[] results;
}