// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// utils.cpp
// Core utility functions used across the health analytics library
//
#include "health_analytics_engine.h"
#include "utils.h"
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <random>
#include <utility>
#include <vector>

/**
 * @brief Calculate the mean (average) of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @return double The arithmetic mean
 */
double calculateMean(const double* values, int length) {
    if (length <= 0) return 0;

    double sum = 0;
    for (int i = 0; i < length; i++) {
        sum += values[i];
    }
    return sum / length;
}

/**
 * @brief Calculate the weighted mean of a data series
 *
 * @param values Pointer to array of values
 * @param weights Pointer to array of weights for each value
 * @param length Number of elements in the arrays
 * @return double The weighted arithmetic mean
 */
double calculateWeightedMean(const double* values, const double* weights, int length) {
    if (length <= 0) return 0;

    double sum = 0;
    double weightSum = 0;

    for (int i = 0; i < length; i++) {
        sum += values[i] * weights[i];
        weightSum += weights[i];
    }

    return weightSum > 0 ? sum / weightSum : 0;
}

/**
 * @brief Calculate the variance of a data series
 * Uses a two-pass algorithm for numerical stability
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @return double The sample variance (n-1 denominator)
 */
double calculateVariance(const double* values, int length, double mean = 0) {
    if (length <= 1) return 0;

    // Use pre-calculated mean if provided, otherwise calculate it
    if (mean == 0) {
        mean = calculateMean(values, length);
    }

    // Use two-pass algorithm for better numerical stability
    double sumSquaredDiff = 0;
    for (int i = 0; i < length; i++) {
        double diff = values[i] - mean;
        sumSquaredDiff += diff * diff;
    }

    // Return sample variance (n-1 denominator for unbiased estimation)
    return sumSquaredDiff / (length - 1);
}

/**
 * @brief Calculate the standard deviation of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @return double The standard deviation
 */
double calculateStdDev(const double* values, int length, double mean = 0) {
    return std::sqrt(calculateVariance(values, length, mean));
}

/**
 * @brief Calculate the median of a data series
 *
 * @param values Pointer to array of values (not modified; a sorted copy is made internally)
 * @param length Number of elements in the array
 * @return double The median value
 */
double calculateMedian(double* values, int length) {
    if (length == 0) return 0;
    if (length == 1) return values[0];

    // Create a copy and sort it, leaving the caller's array untouched
    std::vector<double> sorted(values, values + length);
    std::sort(sorted.begin(), sorted.end());

    if (length % 2 == 0) {
        // Even number of elements
        return (sorted[length/2 - 1] + sorted[length/2]) / 2.0;
    } else {
        // Odd number of elements
        return sorted[length/2];
    }
}

/**
 * @brief Calculate the Pearson correlation coefficient between two data series
 *
 * @param x First data series
 * @param y Second data series (must be same length as x)
 * @param length Number of elements in both arrays
 * @return double Correlation coefficient (-1 to 1)
 */
double calculateCorrelation(const double* x, const double* y, int length) {
    if (length <= 1) return 0;

    double sum_x = 0, sum_y = 0, sum_xy = 0;
    double sum_x2 = 0, sum_y2 = 0;

    for (int i = 0; i < length; i++) {
        sum_x += x[i];
        sum_y += y[i];
        sum_xy += x[i] * y[i];
        sum_x2 += x[i] * x[i];
        sum_y2 += y[i] * y[i];
    }

    double denominator = std::sqrt((length * sum_x2 - sum_x * sum_x) *
                                   (length * sum_y2 - sum_y * sum_y));

    if (denominator < 1e-10) return 0; // Avoid division by zero

    return (length * sum_xy - sum_x * sum_y) / denominator;
}

/**
 * @brief Calculate Spearman's rank correlation coefficient
 * More robust to outliers than Pearson correlation
 *
 * @param x First data series
 * @param y Second data series (must be same length as x)
 * @param length Number of elements in both arrays
 * @return double Spearman's rank correlation coefficient (-1 to 1)
 */
double calculateSpearmanCorrelation(const double* x, const double* y, int length) {
    if (length <= 1) return 0;

    // Create vectors with indices to perform ranking
    std::vector<std::pair<double, int>> x_indexed(length);
    std::vector<std::pair<double, int>> y_indexed(length);

    for (int i = 0; i < length; i++) {
        x_indexed[i] = std::make_pair(x[i], i);
        y_indexed[i] = std::make_pair(y[i], i);
    }

    // Sort by values to determine ranks
    std::sort(x_indexed.begin(), x_indexed.end());
    std::sort(y_indexed.begin(), y_indexed.end());

    // Assign ranks (handling ties with average rank)
    std::vector<double> x_ranks(length), y_ranks(length);

    for (int i = 0; i < length; i++) {
        int j = i;
        while (j < length - 1 && x_indexed[j].first == x_indexed[j + 1].first) j++;
        double rank = 1.0 * (i + j) / 2 + 1; // average rank over the tied run [i, j]
        for (int k = i; k <= j; k++) {
            x_ranks[x_indexed[k].second] = rank;
        }
        i = j;
    }

    for (int i = 0; i < length; i++) {
        int j = i;
        while (j < length - 1 && y_indexed[j].first == y_indexed[j + 1].first) j++;
        double rank = 1.0 * (i + j) / 2 + 1;
        for (int k = i; k <= j; k++) {
            y_ranks[y_indexed[k].second] = rank;
        }
        i = j;
    }

    // Spearman's rho is the Pearson correlation of the ranks
    return calculateCorrelation(x_ranks.data(), y_ranks.data(), length);
}

/**
 * @brief Calculate a specific quantile value of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param q Quantile to calculate (0-1, e.g., 0.25 for first quartile)
 * @return double The value at the specified quantile
 */
double calculateQuantile(const double* values, int length, double q) {
    if (length == 0) return 0;
    if (length == 1) return values[0];
    if (q < 0) q = 0;
    if (q > 1) q = 1;

    std::vector<double> sorted(values, values + length);
    std::sort(sorted.begin(), sorted.end());

    // Linear interpolation between closest ranks
    double pos = (length - 1) * q;
    int idx_lower = static_cast<int>(pos);
    double frac = pos - idx_lower;

    if (idx_lower + 1 < length) {
        return sorted[idx_lower] * (1 - frac) + sorted[idx_lower + 1] * frac;
    } else {
        return sorted[idx_lower];
    }
}
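
// Usage sketch (illustrative values, not from the library's tests): quartiles
// fall out of the linear interpolation between closest ranks.
//
//     double v[] = {1.0, 2.0, 3.0, 4.0, 5.0};
//     double q1 = calculateQuantile(v, 5, 0.25);  // pos = 1.0 -> 2.0
//     double q3 = calculateQuantile(v, 5, 0.75);  // pos = 3.0 -> 4.0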

/**
 * @brief Calculate the interquartile range (IQR) of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @return double The IQR (Q3-Q1)
 */
double calculateIQR(const double* values, int length) {
    if (length < 4) return 0;

    double q1 = calculateQuantile(values, length, 0.25);
    double q3 = calculateQuantile(values, length, 0.75);

    return q3 - q1;
}

/**
 * @brief Calculate the skewness of a data distribution
 * Measures the asymmetry of the probability distribution
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
 * @return double The skewness value (0 for normal distribution)
 */
double calculateSkewness(const double* values, int length, double mean = 0, double stdDev = 0) {
    if (length <= 2) return 0;

    // Calculate mean and stdDev if not provided
    if (mean == 0) {
        mean = calculateMean(values, length);
    }

    if (stdDev == 0) {
        stdDev = calculateStdDev(values, length, mean);
    }

    if (stdDev < 1e-10) return 0; // Avoid division by zero

    // Calculate third moment (cube of differences)
    double sum = 0;
    for (int i = 0; i < length; i++) {
        double diff = values[i] - mean;
        sum += diff * diff * diff;
    }

    // Return Fisher-Pearson coefficient of skewness
    // Includes adjustment for sample bias
    double n = length;
    double adjustment = std::sqrt(n * (n - 1)) / (n - 2);
    return adjustment * sum / (length * stdDev * stdDev * stdDev);
}

/**
 * @brief Calculate the kurtosis of a data distribution
 * Measures the "tailedness" of the probability distribution
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
 * @return double The excess kurtosis (0 for normal distribution)
 */
double calculateKurtosis(const double* values, int length, double mean = 0, double stdDev = 0) {
    if (length <= 3) return 0;

    // Calculate mean and stdDev if not provided
    if (mean == 0) {
        mean = calculateMean(values, length);
    }

    if (stdDev == 0) {
        stdDev = calculateStdDev(values, length, mean);
    }

    if (stdDev < 1e-10) return 0; // Avoid division by zero

    // Calculate fourth moment
    double sum = 0;
    for (int i = 0; i < length; i++) {
        double diff = values[i] - mean;
        sum += diff * diff * diff * diff;
    }

    // Return excess kurtosis with sample adjustment
    double n = length;
    double adjustment = ((n + 1) * n) / ((n - 1) * (n - 2) * (n - 3));
    double second_term = 3 * (n - 1) * (n - 1) / ((n - 2) * (n - 3));
    return adjustment * sum / (stdDev * stdDev * stdDev * stdDev) - second_term;
}

/**
 * @brief Perform linear regression on two data series
 *
 * @param x Independent variable values
 * @param y Dependent variable values (must be same length as x)
 * @param length Number of elements in both arrays
 * @param slope Output parameter for slope
 * @param intercept Output parameter for y-intercept
 * @param r_squared Output parameter for R² coefficient of determination
 * @return bool True if successful, false if error occurred
 */
bool calculateLinearRegression(const double* x, const double* y, int length,
                               double& slope, double& intercept, double& r_squared) {
    if (length < 2) return false;

    double sum_x = 0, sum_y = 0, sum_xy = 0, sum_x2 = 0;

    for (int i = 0; i < length; i++) {
        sum_x += x[i];
        sum_y += y[i];
        sum_xy += x[i] * y[i];
        sum_x2 += x[i] * x[i];
    }

    double n = static_cast<double>(length);
    double denominator = n * sum_x2 - sum_x * sum_x;

    if (std::abs(denominator) < 1e-10) return false; // All x equal: vertical line, undefined slope

    // Calculate slope and intercept
    slope = (n * sum_xy - sum_x * sum_y) / denominator;
    intercept = (sum_y - slope * sum_x) / n;

    // Calculate R² coefficient of determination
    double mean_y = sum_y / n;
    double ss_total = 0, ss_residual = 0;

    for (int i = 0; i < length; i++) {
        double predicted = intercept + slope * x[i];
        ss_total += (y[i] - mean_y) * (y[i] - mean_y);
        ss_residual += (y[i] - predicted) * (y[i] - predicted);
    }

    if (ss_total < 1e-10) {
        r_squared = 1.0; // All points are on the same horizontal line
    } else {
        r_squared = 1.0 - (ss_residual / ss_total);
    }

    return true;
}
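
// Usage sketch (hypothetical data): the fit comes back through the three
// output parameters.
//
//     double xs[] = {1, 2, 3, 4};
//     double ys[] = {2.1, 3.9, 6.2, 7.8};
//     double m, b, r2;
//     if (calculateLinearRegression(xs, ys, 4, m, b, r2)) {
//         // m ≈ 1.94, b ≈ 0.15, r2 ≈ 0.996
//     }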

/**
 * @brief Calculate the autocorrelation of a time series at specified lag
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param lag The lag to calculate autocorrelation for
 * @return double Autocorrelation coefficient at specified lag (-1 to 1)
 */
double calculateAutocorrelation(const double* values, int length, int lag) {
    if (length <= lag || lag <= 0) return 0;

    double mean = calculateMean(values, length);
    double numerator = 0;
    double denominator = 0;

    for (int i = 0; i < length - lag; i++) {
        numerator += (values[i] - mean) * (values[i + lag] - mean);
    }

    for (int i = 0; i < length; i++) {
        denominator += (values[i] - mean) * (values[i] - mean);
    }

    if (denominator < 1e-10) return 0;

    return numerator / denominator;
}

/**
 * @brief Detect outliers in a data series using modified Z-score method
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param outlierIndices Output vector to store indices of detected outliers
 * @param threshold Z-score threshold to consider a point an outlier (typically 3.5)
 * @return int Number of outliers detected
 */
int detectOutliers(const double* values, int length, std::vector<int>& outlierIndices, double threshold = 3.5) {
    if (length < 3) return 0;

    outlierIndices.clear();

    // Use median and MAD instead of mean and std dev for robustness
    std::vector<double> sorted(values, values + length);
    std::sort(sorted.begin(), sorted.end());

    double median = (length % 2 == 0) ?
        (sorted[length/2 - 1] + sorted[length/2]) / 2.0 : sorted[length/2];

    // Calculate MAD (Median Absolute Deviation)
    std::vector<double> deviations(length);
    for (int i = 0; i < length; i++) {
        deviations[i] = std::abs(values[i] - median);
    }
    std::sort(deviations.begin(), deviations.end());

    double mad = (length % 2 == 0) ?
        (deviations[length/2 - 1] + deviations[length/2]) / 2.0 : deviations[length/2];

    // Consistency constant: sigma ≈ 1.4826 × MAD for normally distributed data
    const double k = 1.4826;

    // Find outliers using modified Z-score
    for (int i = 0; i < length; i++) {
        if (mad < 1e-10) { // If MAD is too small, use simple difference
            if (std::abs(values[i] - median) > threshold) {
                outlierIndices.push_back(i);
            }
        } else {
            // Modified Z-score: deviation measured against the MAD-based
            // estimate of the standard deviation, |x - median| / (k × MAD)
            double modified_z = std::abs(values[i] - median) / (k * mad);
            if (modified_z > threshold) {
                outlierIndices.push_back(i);
            }
        }
    }

    return static_cast<int>(outlierIndices.size());
}
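
// Usage sketch (hypothetical resting-heart-rate readings): indices of the
// flagged points come back through the output vector.
//
//     double hr[] = {62, 64, 61, 63, 118, 62};
//     std::vector<int> idx;
//     int n = detectOutliers(hr, 6, idx);  // default threshold 3.5; idx == {4}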

/**
 * @brief Perform simple moving average on a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param window The window size for the moving average
 * @param result Pre-allocated array to store results (size = length)
 */
void calculateMovingAverage(const double* values, int length, int window, double* result) {
    if (length <= 0 || window <= 0) return;

    // Adjust window if it's larger than the data length
    window = std::min(window, length);

    for (int i = 0; i < length; i++) {
        int start = std::max(0, i - window + 1);
        int end = i + 1;
        int count = end - start;

        double sum = 0;
        for (int j = start; j < end; j++) {
            sum += values[j];
        }

        result[i] = sum / count;
    }
}
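
// Usage sketch (illustrative values): the trailing window shrinks at the
// start of the series, so result[0] is just values[0].
//
//     double v[] = {1, 2, 3, 4, 5};
//     double out[5];
//     calculateMovingAverage(v, 5, 3, out);  // out = {1, 1.5, 2, 3, 4}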

/**
 * @brief Calculate exponential moving average (EMA) of a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param alpha Smoothing factor (0-1)
 * @param result Pre-allocated array to store results (size = length)
 */
void calculateExponentialMovingAverage(const double* values, int length, double alpha, double* result) {
    if (length <= 0 || alpha < 0 || alpha > 1) return;

    // Initialize with first value
    result[0] = values[0];

    // Apply EMA formula: EMA_t = α × value_t + (1 - α) × EMA_{t-1}
    for (int i = 1; i < length; i++) {
        result[i] = alpha * values[i] + (1 - alpha) * result[i - 1];
    }
}

/**
 * @brief Decompose a time series into trend, seasonal, and residual components
 * Classical additive decomposition (moving-average trend plus averaged
 * seasonal offsets); a simplified alternative to full STL/Loess
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param seasonality Length of seasonal cycle (e.g., 7 for weekly, 12 for monthly)
 * @param trend Output array for trend component (size = length)
 * @param seasonal Output array for seasonal component (size = length)
 * @param residual Output array for residual component (size = length)
 * @return bool True if successful, false if error occurred
 */
bool decomposeTimeSeries(const double* values, int length, int seasonality,
                         double* trend, double* seasonal, double* residual) {
    if (length <= 2 * seasonality || seasonality <= 1) return false;

    // Calculate trend with centered moving average
    for (int i = 0; i < length; i++) {
        trend[i] = 0;
    }

    int halfSeason = seasonality / 2;
    // Centered moving average for trend. For an even cycle length the classic
    // 2×S moving average weights the two endpoints by 0.5, so the window stays
    // centered and sums to exactly `seasonality` samples.
    for (int i = halfSeason; i < length - halfSeason; i++) {
        double sum = 0;
        if (seasonality % 2 == 0) {
            sum += 0.5 * (values[i - halfSeason] + values[i + halfSeason]);
            for (int j = i - halfSeason + 1; j <= i + halfSeason - 1; j++) {
                sum += values[j];
            }
        } else {
            for (int j = i - halfSeason; j <= i + halfSeason; j++) {
                sum += values[j];
            }
        }
        trend[i] = sum / seasonality;
    }

    // Extrapolate trend at boundaries, estimating the slope over up to five
    // interior points (fewer when the interior is short)
    int span = std::min(5, length - 2 * halfSeason - 1);
    // Left boundary
    double slope = span > 0 ? (trend[halfSeason + span] - trend[halfSeason]) / span : 0;
    for (int i = 0; i < halfSeason; i++) {
        trend[i] = trend[halfSeason] - (halfSeason - i) * slope;
    }

    // Right boundary
    slope = span > 0 ? (trend[length - halfSeason - 1] - trend[length - halfSeason - 1 - span]) / span : 0;
    for (int i = length - halfSeason; i < length; i++) {
        trend[i] = trend[length - halfSeason - 1] + (i - (length - halfSeason - 1)) * slope;
    }

    // Calculate detrended series
    std::vector<double> detrended(length);
    for (int i = 0; i < length; i++) {
        detrended[i] = values[i] - trend[i];
    }

    // Calculate seasonal component by averaging the detrended values across seasons
    std::vector<double> seasonalAvg(seasonality, 0);
    std::vector<int> seasonalCounts(seasonality, 0);

    for (int i = 0; i < length; i++) {
        int seasonalIndex = i % seasonality;
        seasonalAvg[seasonalIndex] += detrended[i];
        seasonalCounts[seasonalIndex]++;
    }

    for (int i = 0; i < seasonality; i++) {
        if (seasonalCounts[i] > 0) {
            seasonalAvg[i] /= seasonalCounts[i];
        }
    }

    // Normalize seasonal component to sum to zero
    double avgSeasonal = 0;
    for (int i = 0; i < seasonality; i++) {
        avgSeasonal += seasonalAvg[i];
    }
    avgSeasonal /= seasonality;

    for (int i = 0; i < seasonality; i++) {
        seasonalAvg[i] -= avgSeasonal;
    }

    // Apply seasonal component to entire series
    for (int i = 0; i < length; i++) {
        seasonal[i] = seasonalAvg[i % seasonality];
    }

    // Calculate residual component
    for (int i = 0; i < length; i++) {
        residual[i] = values[i] - trend[i] - seasonal[i];
    }

    return true;
}
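
// Usage sketch (hypothetical daily series `daily` of length n with a weekly
// cycle): the caller owns all three output buffers, each sized to the series.
//
//     std::vector<double> t(n), s(n), r(n);
//     if (decomposeTimeSeries(daily.data(), n, 7, t.data(), s.data(), r.data())) {
//         // values[i] == t[i] + s[i] + r[i] by construction
//     }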

/**
 * @brief Calculate partial autocorrelation function for a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param maxLag Maximum lag to calculate
 * @param pacf Pre-allocated array to store results (size = maxLag + 1)
 * @return int Number of valid PACF values calculated
 */
int calculatePACF(const double* values, int length, int maxLag, double* pacf) {
    if (length <= 1 || maxLag <= 0 || maxLag >= length) return 0;

    // Coefficient matrix for solving the Yule-Walker equations
    std::vector<std::vector<double>> phi(maxLag + 1, std::vector<double>(maxLag + 1, 0));

    // Calculate autocorrelations
    std::vector<double> acf(maxLag + 1, 0);
    acf[0] = 1.0; // ACF at lag 0 is always 1

    for (int k = 1; k <= maxLag; k++) {
        acf[k] = calculateAutocorrelation(values, length, k);
    }

    // Set PACF at lag 0 to 1
    pacf[0] = 1.0;

    // Calculate PACF using Levinson-Durbin recursion
    for (int k = 1; k <= maxLag; k++) {
        // Initialize for this order
        double numerator = acf[k];
        for (int j = 1; j < k; j++) {
            numerator -= phi[k-1][j] * acf[k-j];
        }

        double denominator = 1.0;
        for (int j = 1; j < k; j++) {
            denominator -= phi[k-1][j] * acf[j];
        }

        if (std::abs(denominator) < 1e-10) {
            // If denominator is close to zero, set PACF to 0
            phi[k][k] = 0;
        } else {
            phi[k][k] = numerator / denominator;
        }

        // Update remaining coefficients
        for (int j = 1; j < k; j++) {
            phi[k][j] = phi[k-1][j] - phi[k][k] * phi[k-1][k-j];
        }

        // Store the PACF value
        pacf[k] = phi[k][k];
    }

    return maxLag + 1;
}
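
// Usage sketch (hypothetical series `series` of length n): the pacf buffer
// must hold maxLag + 1 doubles, and pacf[0] is fixed at 1.
//
//     std::vector<double> pacf(11);
//     int written = calculatePACF(series.data(), n, 10, pacf.data());
//     // written == 11 on success, 0 on bad arguments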

/**
 * @brief Perform k-means clustering on multivariate data
 *
 * @param data 2D array of data points [n_samples x n_features]
 * @param nSamples Number of data points
 * @param nFeatures Number of features per data point
 * @param k Number of clusters
 * @param maxIter Maximum number of iterations
 * @param centroids Output array for cluster centroids [k x n_features]
 * @param assignments Output array for cluster assignments [n_samples]
 * @return int Number of iterations performed
 */
int kMeansClustering(const double** data, int nSamples, int nFeatures, int k,
                     int maxIter, double** centroids, int* assignments) {
    if (nSamples < k || k <= 0 || nFeatures <= 0) return 0;

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> distrib(0, nSamples - 1);

    // Initialize centroids using k-means++ initialization
    std::vector<int> centroidIndices;
    std::vector<double> minDistances(nSamples, std::numeric_limits<double>::max());

    // Choose first centroid randomly
    int firstCentroid = distrib(gen);
    centroidIndices.push_back(firstCentroid);

    // Choose remaining centroids
    for (int c = 1; c < k; c++) {
        // Update distances to nearest centroid
        for (int i = 0; i < nSamples; i++) {
            double dist = 0;
            for (int j = 0; j < nFeatures; j++) {
                double diff = data[i][j] - data[centroidIndices.back()][j];
                dist += diff * diff;
            }
            minDistances[i] = std::min(minDistances[i], dist);
        }

        // Calculate sum of squared distances
        double sumSquaredDist = 0;
        for (int i = 0; i < nSamples; i++) {
            sumSquaredDist += minDistances[i];
        }

        // Choose next centroid with probability proportional to D²,
        // drawing from the same seeded engine as the first centroid
        std::uniform_real_distribution<double> unif(0.0, sumSquaredDist);
        double threshold = unif(gen);
        double cumulativeProb = 0;
        int nextCentroid = 0;

        for (int i = 0; i < nSamples; i++) {
            cumulativeProb += minDistances[i];
            if (cumulativeProb >= threshold) {
                nextCentroid = i;
                break;
            }
        }

        centroidIndices.push_back(nextCentroid);
    }

    // Copy initial centroids
    for (int i = 0; i < k; i++) {
        for (int j = 0; j < nFeatures; j++) {
            centroids[i][j] = data[centroidIndices[i]][j];
        }
    }

    // Start from a sentinel assignment so the first pass always registers a change
    for (int i = 0; i < nSamples; i++) {
        assignments[i] = -1;
    }

    // Perform k-means iterations
    int iterations = 0;
    bool converged = false;

    while (!converged && iterations < maxIter) {
        // Assign points to nearest centroid
        converged = true;

        for (int i = 0; i < nSamples; i++) {
            double minDist = std::numeric_limits<double>::max();
            int bestCluster = 0;

            for (int c = 0; c < k; c++) {
                double dist = 0;
                for (int j = 0; j < nFeatures; j++) {
                    double diff = data[i][j] - centroids[c][j];
                    dist += diff * diff;
                }

                if (dist < minDist) {
                    minDist = dist;
                    bestCluster = c;
                }
            }

            if (assignments[i] != bestCluster) {
                assignments[i] = bestCluster;
                converged = false;
            }
        }

        // Update centroids
        std::vector<std::vector<double>> newCentroids(k, std::vector<double>(nFeatures, 0));
        std::vector<int> clusterSizes(k, 0);

        for (int i = 0; i < nSamples; i++) {
            int cluster = assignments[i];
            clusterSizes[cluster]++;

            for (int j = 0; j < nFeatures; j++) {
                newCentroids[cluster][j] += data[i][j];
            }
        }

        for (int c = 0; c < k; c++) {
            if (clusterSizes[c] > 0) {
                for (int j = 0; j < nFeatures; j++) {
                    centroids[c][j] = newCentroids[c][j] / clusterSizes[c];
                }
            }
        }

        iterations++;
    }

    return iterations;
}
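
// Usage sketch (hypothetical 2-feature data, with `data` already laid out as
// a const double** of nSamples rows): the caller allocates the centroid rows
// and the assignment array; the row-pointer layout matches the parameters.
//
//     std::vector<std::vector<double>> cbuf(3, std::vector<double>(2));
//     std::vector<double*> cent = {cbuf[0].data(), cbuf[1].data(), cbuf[2].data()};
//     std::vector<int> assign(nSamples);
//     int iters = kMeansClustering(data, nSamples, 2, 3, 100, cent.data(), assign.data());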

/**
 * @brief Calculate the silhouette coefficient for clustering validation
 *
 * @param data 2D array of data points [n_samples x n_features]
 * @param nSamples Number of data points
 * @param nFeatures Number of features per data point
 * @param assignments Cluster assignments for each point
 * @param k Number of clusters
 * @return double Average silhouette coefficient (-1 to 1)
 */
double calculateSilhouetteCoefficient(const double** data, int nSamples, int nFeatures,
                                      const int* assignments, int k) {
    if (nSamples <= k || k <= 1) return 0;

    std::vector<double> silhouettes(nSamples);

    // For each point
    for (int i = 0; i < nSamples; i++) {
        int cluster_i = assignments[i];

        // Calculate a(i) - average distance to points in same cluster
        double a_i = 0;
        int count_same_cluster = 0;

        for (int j = 0; j < nSamples; j++) {
            if (j != i && assignments[j] == cluster_i) {
                double dist = 0;
                for (int f = 0; f < nFeatures; f++) {
                    double diff = data[i][f] - data[j][f];
                    dist += diff * diff;
                }
                dist = std::sqrt(dist);

                a_i += dist;
                count_same_cluster++;
            }
        }

        if (count_same_cluster > 0) {
            a_i /= count_same_cluster;
        } else {
            a_i = 0; // Singleton cluster
        }

        // Calculate b(i) - minimum average distance to points in different clusters
        double b_i = std::numeric_limits<double>::max();

        for (int c = 0; c < k; c++) {
            if (c == cluster_i) continue;

            double avg_dist = 0;
            int count_diff_cluster = 0;

            for (int j = 0; j < nSamples; j++) {
                if (assignments[j] == c) {
                    double dist = 0;
                    for (int f = 0; f < nFeatures; f++) {
                        double diff = data[i][f] - data[j][f];
                        dist += diff * diff;
                    }
                    dist = std::sqrt(dist);

                    avg_dist += dist;
                    count_diff_cluster++;
                }
            }

            if (count_diff_cluster > 0) {
                avg_dist /= count_diff_cluster;
                b_i = std::min(b_i, avg_dist);
            }
        }

        // Calculate silhouette
        if (count_same_cluster > 0 && b_i < std::numeric_limits<double>::max()) {
            silhouettes[i] = (b_i - a_i) / std::max(a_i, b_i);
        } else {
            silhouettes[i] = 0; // Handle edge cases
        }
    }

    // Calculate average silhouette
    double avg_silhouette = 0;
    for (int i = 0; i < nSamples; i++) {
        avg_silhouette += silhouettes[i];
    }

    return avg_silhouette / nSamples;
}
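
// Usage sketch, continuing the k-means example above: values near 1 indicate
// tight, well-separated clusters; values near 0 indicate overlapping clusters.
//
//     double sil = calculateSilhouetteCoefficient(data, nSamples, 2, assign.data(), 3);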