nokken/native/statistics/clustering.cpp

// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// clustering.cpp
// Implementation of clustering and pattern recognition functions
//
#include "health_analytics_engine.h"
#include "utils.h"
/**
 * @brief Perform cluster analysis on multivariate health data
 *
 * @param data 2D array of data points [n_samples x n_features]
 * @param factorCount Number of features per data point
 * @param dataLength Number of data points
 * @param maxClusters Maximum number of clusters to identify
 * @return ClusterResult* Array of cluster results
 */
ClusterResult* perform_cluster_analysis(const double** data,
    int factorCount,
    int dataLength,
    int maxClusters) {
if (factorCount < 2 || dataLength < 5 || maxClusters <= 0) {
ClusterResult* dummy = new ClusterResult[1];
memset(dummy, 0, sizeof(ClusterResult));
dummy[0].clusterId = -1;  // Mark as invalid
return dummy;
}

// Normalize data for better clustering
std::vector<std::vector<double>> normalizedData(dataLength, std::vector<double>(factorCount));
std::vector<double> means(factorCount, 0);
std::vector<double> stdDevs(factorCount, 0);

// Calculate means
for (int j = 0; j < factorCount; j++) {
for (int i = 0; i < dataLength; i++) {
means[j] += data[i][j];
}
means[j] /= dataLength;
}

// Calculate standard deviations
for (int j = 0; j < factorCount; j++) {
for (int i = 0; i < dataLength; i++) {
double diff = data[i][j] - means[j];
stdDevs[j] += diff * diff;
}
stdDevs[j] = sqrt(stdDevs[j] / dataLength);
if (stdDevs[j] < 1e-10) stdDevs[j] = 1.0; // Avoid division by zero
}

// Normalize data
for (int i = 0; i < dataLength; i++) {
for (int j = 0; j < factorCount; j++) {
normalizedData[i][j] = (data[i][j] - means[j]) / stdDevs[j];
}
}

// Find optimal number of clusters (between 2 and maxClusters)
int optimalClusters = 2;
double bestSilhouette = -1;

// Arrays for k-means algorithm
std::vector<int> assignments(dataLength);
std::vector<std::vector<double>> centroids(maxClusters, std::vector<double>(factorCount));
std::vector<const double*> normalizedDataPtrs(dataLength);
for (int i = 0; i < dataLength; i++) {
normalizedDataPtrs[i] = normalizedData[i].data();
}

for (int k = 2; k <= maxClusters; k++) {
// Run k-means clustering
std::vector<int> tempAssignments(dataLength, 0);
std::vector<std::vector<double>> tempCentroids(k, std::vector<double>(factorCount, 0));
std::vector<double*> centroidPtrs(k);
for (int i = 0; i < k; i++) {
centroidPtrs[i] = tempCentroids[i].data();
}

kMeansClustering(normalizedDataPtrs.data(), dataLength, factorCount, k,
100, centroidPtrs.data(), tempAssignments.data());

// Calculate silhouette coefficient
std::vector<const double*> dataPtrs(dataLength);
for (int i = 0; i < dataLength; i++) {
dataPtrs[i] = data[i]; // Use original data for silhouette
}

double silhouette = calculateSilhouetteCoefficient(
dataPtrs.data(), dataLength, factorCount, tempAssignments.data(), k);

// Update if better silhouette found
if (silhouette > bestSilhouette) {
bestSilhouette = silhouette;
optimalClusters = k;
assignments = tempAssignments;
centroids = tempCentroids;
}
}

// Allocate cluster results (plus one for terminator)
ClusterResult* results = new ClusterResult[optimalClusters + 1];

// Count points in each cluster
std::vector<int> clusterSizes(optimalClusters, 0);
for (int i = 0; i < dataLength; i++) {
clusterSizes[assignments[i]]++;
}

// Calculate cluster statistics
for (int c = 0; c < optimalClusters; c++) {
ClusterResult& cluster = results[c];
cluster.clusterId = c;
        cluster.dataPointCount = clusterSizes[c];

        // Calculate cluster significance based on size and compactness
        double avgDistance = 0;
        int count = 0;

        for (int i = 0; i < dataLength; i++) {
            if (assignments[i] == c) {
                double dist = 0;
                for (int j = 0; j < factorCount; j++) {
                    double diff = normalizedData[i][j] - centroids[c][j];
                    dist += diff * diff;
                }
                avgDistance += sqrt(dist);
                count++;
            }
        }

        if (count > 0) {
            avgDistance /= count;
        }

        // Higher significance for larger and more compact clusters
        double sizeFactor = static_cast<double>(count) / dataLength;
        double compactnessFactor = 1.0 - std::min(1.0, avgDistance / 3.0);
        cluster.significance = sizeFactor * compactnessFactor;

        // Identify important factors for this cluster
        std::vector<std::pair<int, double>> factorImportance;

        for (int j = 0; j < factorCount; j++) {
            // Calculate how different this centroid is from global mean for this factor
            double differenceFromMean = std::abs(centroids[c][j]);
            factorImportance.push_back(std::make_pair(j, differenceFromMean));
        }

        // Sort factors by importance
        std::sort(factorImportance.begin(), factorImportance.end(),
                 [](const std::pair<int, double>& a, const std::pair<int, double>& b) {
                     return a.second > b.second;
                 });

        // Store top factors (up to 3)
        int numFactors = std::min(3, factorCount);
        for (int j = 0; j < numFactors; j++) {
            cluster.factorIndices[j] = factorImportance[j].first;
            cluster.factorWeights[j] = std::min(1.0, factorImportance[j].second);

            // Normalize weights to sum to 1
            double totalWeight = 0;
            for (int k = 0; k < numFactors; k++) {
                totalWeight += cluster.factorWeights[k];
            }

            if (totalWeight > 0) {
                for (int k = 0; k < numFactors; k++) {
                    cluster.factorWeights[k] /= totalWeight;
                }
            }
        }

        // Generate descriptive cluster name based on key factors
        // In a real implementation, this would use domain knowledge
        snprintf(cluster.clusterName, MAX_STRING_SIZE,
                 "Cluster %d - %s", c + 1,
                 (cluster.significance > 0.7) ? "High Significance" :
                 (cluster.significance > 0.4) ? "Medium Significance" : "Low Significance");

        // Generate detailed description
        snprintf(cluster.description, MAX_STRING_SIZE,
                 "Cluster of %d data points characterized by %s factors. Silhouette: %.2f",
                 cluster.dataPointCount,
                 (numFactors > 0) ? "multiple interrelated" : "non-specific",
                 bestSilhouette);
    }

    // Mark the end of valid results
    results[optimalClusters].clusterId = -1;

    return results;
}

/**
 * @brief Free memory for cluster analysis results
 *
 * @param results Pointer to cluster results array
 */
void free_cluster_results(ClusterResult* results) {
    delete[] results;
}