nokken/native/statistics/utils.cpp
2025-04-20 11:17:03 -04:00

863 lines
No EOL
28 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// utils.cpp
// Core utility functions used across the health analytics library
//
#include "health_analytics_engine.h"
#include "utils.h"
#include <random>
#include <limits>
#include <numeric>
/**
* @brief Calculate the mean (average) of a data series
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @return double The arithmetic mean
*/
double calculateMean(const double* values, int length) {
if (length <= 0) return 0;
double sum = 0;
for (int i = 0; i < length; i++) {
sum += values[i];
}
return sum / length;
}
/**
* @brief Calculate the weighted mean of a data series
*
* @param values Pointer to array of values
* @param weights Pointer to array of weights for each value
* @param length Number of elements in the arrays
* @return double The weighted arithmetic mean
*/
double calculateWeightedMean(const double* values, const double* weights, int length) {
if (length <= 0) return 0;
double sum = 0;
double weightSum = 0;
for (int i = 0; i < length; i++) {
sum += values[i] * weights[i];
weightSum += weights[i];
}
return weightSum > 0 ? sum / weightSum : 0;
}
/**
* @brief Calculate the variance of a data series
* Uses Welford's online algorithm for numerical stability
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param mean Pre-calculated mean (if available, otherwise pass 0)
* @return double The variance (population or sample based on implementation)
*/
double calculateVariance(const double* values, int length, double mean = 0) {
if (length <= 1) return 0;
// Use pre-calculated mean if provided, otherwise calculate it
if (mean == 0) {
mean = calculateMean(values, length);
}
// Use two-pass algorithm for better numerical stability
double sumSquaredDiff = 0;
for (int i = 0; i < length; i++) {
double diff = values[i] - mean;
sumSquaredDiff += diff * diff;
}
// Return sample variance (n-1 denominator for unbiased estimation)
return sumSquaredDiff / (length - 1);
}
/**
* @brief Calculate the standard deviation of a data series
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param mean Pre-calculated mean (if available, otherwise pass 0)
* @return double The standard deviation
*/
double calculateStdDev(const double* values, int length, double mean = 0) {
return std::sqrt(calculateVariance(values, length, mean));
}
/**
* @brief Calculate the median of a data series
*
* @param values Pointer to array of values (will be modified by sorting)
* @param length Number of elements in the array
* @return double The median value
*/
double calculateMedian(double* values, int length) {
if (length == 0) return 0;
if (length == 1) return values[0];
// Create a copy and sort it
std::vector<double> sorted(values, values + length);
std::sort(sorted.begin(), sorted.end());
if (length % 2 == 0) {
// Even number of elements
return (sorted[length/2 - 1] + sorted[length/2]) / 2.0;
} else {
// Odd number of elements
return sorted[length/2];
}
}
/**
* @brief Calculate the Pearson correlation coefficient between two data series
*
* @param x First data series
* @param y Second data series (must be same length as x)
* @param length Number of elements in both arrays
* @return double Correlation coefficient (-1 to 1)
*/
double calculateCorrelation(const double* x, const double* y, int length) {
if (length <= 1) return 0;
double sum_x = 0, sum_y = 0, sum_xy = 0;
double sum_x2 = 0, sum_y2 = 0;
for (int i = 0; i < length; i++) {
sum_x += x[i];
sum_y += y[i];
sum_xy += x[i] * y[i];
sum_x2 += x[i] * x[i];
sum_y2 += y[i] * y[i];
}
double denominator = std::sqrt((length * sum_x2 - sum_x * sum_x) *
(length * sum_y2 - sum_y * sum_y));
if (denominator < 1e-10) return 0; // Avoid division by zero
return (length * sum_xy - sum_x * sum_y) / denominator;
}
/**
* @brief Calculate Spearman's rank correlation coefficient
* More robust to outliers than Pearson correlation
*
* @param x First data series
* @param y Second data series (must be same length as x)
* @param length Number of elements in both arrays
* @return double Spearman's rank correlation coefficient (-1 to 1)
*/
double calculateSpearmanCorrelation(const double* x, const double* y, int length) {
if (length <= 1) return 0;
// Create vectors with indices to perform ranking
std::vector<std::pair<double, int>> x_indexed(length);
std::vector<std::pair<double, int>> y_indexed(length);
for (int i = 0; i < length; i++) {
x_indexed[i] = std::make_pair(x[i], i);
y_indexed[i] = std::make_pair(y[i], i);
}
// Sort by values to determine ranks
std::sort(x_indexed.begin(), x_indexed.end());
std::sort(y_indexed.begin(), y_indexed.end());
// Assign ranks (handling ties with average rank)
std::vector<double> x_ranks(length), y_ranks(length);
for (int i = 0; i < length; i++) {
int j = i;
while (j < length - 1 && x_indexed[j].first == x_indexed[j + 1].first) j++;
double rank = 1.0 * (i + j) / 2 + 1;
for (int k = i; k <= j; k++) {
x_ranks[x_indexed[k].second] = rank;
}
i = j;
}
for (int i = 0; i < length; i++) {
int j = i;
while (j < length - 1 && y_indexed[j].first == y_indexed[j + 1].first) j++;
double rank = 1.0 * (i + j) / 2 + 1;
for (int k = i; k <= j; k++) {
y_ranks[y_indexed[k].second] = rank;
}
i = j;
}
// Calculate Pearson correlation on the ranks
double* x_ranks_ptr = x_ranks.data();
double* y_ranks_ptr = y_ranks.data();
return calculateCorrelation(x_ranks_ptr, y_ranks_ptr, length);
}
/**
* @brief Calculate a specific quantile value of a data series
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param q Quantile to calculate (0-1, e.g., 0.25 for first quartile)
* @return double The value at the specified quantile
*/
double calculateQuantile(const double* values, int length, double q) {
if (length == 0) return 0;
if (length == 1) return values[0];
if (q < 0) q = 0;
if (q > 1) q = 1;
std::vector<double> sorted(values, values + length);
std::sort(sorted.begin(), sorted.end());
// Linear interpolation between closest ranks
double pos = (length - 1) * q;
int idx_lower = static_cast<int>(pos);
double frac = pos - idx_lower;
if (idx_lower + 1 < length) {
return sorted[idx_lower] * (1 - frac) + sorted[idx_lower + 1] * frac;
} else {
return sorted[idx_lower];
}
}
/**
* @brief Calculate the interquartile range (IQR) of a data series
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @return double The IQR (Q3-Q1)
*/
double calculateIQR(const double* values, int length) {
if (length < 4) return 0;
double q1 = calculateQuantile(values, length, 0.25);
double q3 = calculateQuantile(values, length, 0.75);
return q3 - q1;
}
/**
* @brief Calculate the skewness of a data distribution
* Measures the asymmetry of the probability distribution
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param mean Pre-calculated mean (if available, otherwise pass 0)
* @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
* @return double The skewness value (0 for normal distribution)
*/
double calculateSkewness(const double* values, int length, double mean = 0, double stdDev = 0) {
if (length <= 2) return 0;
// Calculate mean and stdDev if not provided
if (mean == 0) {
mean = calculateMean(values, length);
}
if (stdDev == 0) {
stdDev = calculateStdDev(values, length, mean);
}
if (stdDev < 1e-10) return 0; // Avoid division by zero
// Calculate third moment (cube of differences)
double sum = 0;
for (int i = 0; i < length; i++) {
double diff = values[i] - mean;
sum += diff * diff * diff;
}
// Return Fisher-Pearson coefficient of skewness
// Includes adjustment for sample bias
double n = length;
double adjustment = std::sqrt(n * (n - 1)) / (n - 2);
return adjustment * sum / (length * stdDev * stdDev * stdDev);
}
/**
* @brief Calculate the kurtosis of a data distribution
* Measures the "tailedness" of the probability distribution
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param mean Pre-calculated mean (if available, otherwise pass 0)
* @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
* @return double The excess kurtosis (0 for normal distribution)
*/
double calculateKurtosis(const double* values, int length, double mean = 0, double stdDev = 0) {
if (length <= 3) return 0;
// Calculate mean and stdDev if not provided
if (mean == 0) {
mean = calculateMean(values, length);
}
if (stdDev == 0) {
stdDev = calculateStdDev(values, length, mean);
}
if (stdDev < 1e-10) return 0; // Avoid division by zero
// Calculate fourth moment
double sum = 0;
for (int i = 0; i < length; i++) {
double diff = values[i] - mean;
sum += diff * diff * diff * diff;
}
// Return excess kurtosis with sample adjustment
double n = length;
double adjustment = ((n + 1) * n) / ((n - 1) * (n - 2) * (n - 3));
double second_term = 3 * (n - 1) * (n - 1) / ((n - 2) * (n - 3));
return adjustment * sum / (stdDev * stdDev * stdDev * stdDev) - second_term;
}
/**
* @brief Perform linear regression on two data series
*
* @param x Independent variable values
* @param y Dependent variable values (must be same length as x)
* @param length Number of elements in both arrays
* @param slope Output parameter for slope
* @param intercept Output parameter for y-intercept
* @param r_squared Output parameter for R² coefficient of determination
* @return bool True if successful, false if error occurred
*/
bool calculateLinearRegression(const double* x, const double* y, int length,
double& slope, double& intercept, double& r_squared) {
if (length < 2) return false;
double sum_x = 0, sum_y = 0, sum_xy = 0, sum_x2 = 0, sum_y2 = 0;
for (int i = 0; i < length; i++) {
sum_x += x[i];
sum_y += y[i];
sum_xy += x[i] * y[i];
sum_x2 += x[i] * x[i];
sum_y2 += y[i] * y[i];
}
double n = static_cast<double>(length);
double denominator = n * sum_x2 - sum_x * sum_x;
if (std::abs(denominator) < 1e-10) return false; // Vertical line, undefined slope
// Calculate slope and intercept
slope = (n * sum_xy - sum_x * sum_y) / denominator;
intercept = (sum_y - slope * sum_x) / n;
// Calculate R² coefficient of determination
double mean_y = sum_y / n;
double ss_total = 0, ss_residual = 0;
for (int i = 0; i < length; i++) {
double predicted = intercept + slope * x[i];
ss_total += (y[i] - mean_y) * (y[i] - mean_y);
ss_residual += (y[i] - predicted) * (y[i] - predicted);
}
if (ss_total < 1e-10) {
r_squared = 1.0; // All points are on the same horizontal line
} else {
r_squared = 1.0 - (ss_residual / ss_total);
}
return true;
}
/**
* @brief Calculate the autocorrelation of a time series at specified lag
*
* @param values Time series data
* @param length Number of elements in the array
* @param lag The lag to calculate autocorrelation for
* @return double Autocorrelation coefficient at specified lag (-1 to 1)
*/
double calculateAutocorrelation(const double* values, int length, int lag) {
if (length <= lag || lag <= 0) return 0;
double mean = calculateMean(values, length);
double numerator = 0;
double denominator = 0;
for (int i = 0; i < length - lag; i++) {
numerator += (values[i] - mean) * (values[i + lag] - mean);
}
for (int i = 0; i < length; i++) {
denominator += (values[i] - mean) * (values[i] - mean);
}
if (denominator < 1e-10) return 0;
return numerator / denominator;
}
/**
* @brief Detect outliers in a data series using modified Z-score method
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param outlierIndices Output vector to store indices of detected outliers
* @param threshold Z-score threshold to consider a point an outlier (typically 3.5)
* @return int Number of outliers detected
*/
int detectOutliers(const double* values, int length, std::vector<int>& outlierIndices, double threshold = 3.5) {
if (length < 3) return 0;
outlierIndices.clear();
// Use median and MAD instead of mean and std dev for robustness
std::vector<double> sorted(values, values + length);
std::sort(sorted.begin(), sorted.end());
double median = (length % 2 == 0) ?
(sorted[length/2 - 1] + sorted[length/2]) / 2.0 : sorted[length/2];
// Calculate MAD (Median Absolute Deviation)
std::vector<double> deviations(length);
for (int i = 0; i < length; i++) {
deviations[i] = std::abs(values[i] - median);
}
std::sort(deviations.begin(), deviations.end());
double mad = (length % 2 == 0) ?
(deviations[length/2 - 1] + deviations[length/2]) / 2.0 : deviations[length/2];
// Constant factor for normal distribution
const double k = 1.4826;
// Find outliers using modified Z-score
for (int i = 0; i < length; i++) {
if (mad < 1e-10) { // If MAD is too small, use simple difference
if (std::abs(values[i] - median) > threshold) {
outlierIndices.push_back(i);
}
} else {
double modified_z = k * std::abs(values[i] - median) / mad;
if (modified_z > threshold) {
outlierIndices.push_back(i);
}
}
}
return outlierIndices.size();
}
/**
* @brief Perform simple moving average on a time series
*
* @param values Time series data
* @param length Number of elements in the array
* @param window The window size for the moving average
* @param result Pre-allocated array to store results (size = length)
*/
void calculateMovingAverage(const double* values, int length, int window, double* result) {
if (length <= 0 || window <= 0) return;
// Adjust window if it's larger than the data length
window = std::min(window, length);
for (int i = 0; i < length; i++) {
int start = std::max(0, i - window + 1);
int end = i + 1;
int count = end - start;
double sum = 0;
for (int j = start; j < end; j++) {
sum += values[j];
}
result[i] = sum / count;
}
}
/**
* @brief Calculate exponential moving average (EMA) of a time series
*
* @param values Time series data
* @param length Number of elements in the array
* @param alpha Smoothing factor (0-1)
* @param result Pre-allocated array to store results (size = length)
*/
void calculateExponentialMovingAverage(const double* values, int length, double alpha, double* result) {
if (length <= 0 || alpha < 0 || alpha > 1) return;
// Initialize with first value
result[0] = values[0];
// Apply EMA formula: EMA_t = α × value_t + (1 - α) × EMA_{t-1}
for (int i = 1; i < length; i++) {
result[i] = alpha * values[i] + (1 - alpha) * result[i - 1];
}
}
/**
* @brief Decompose a time series into trend, seasonal, and residual components
* Implementation of STL (Seasonal and Trend decomposition using Loess)
*
* @param values Time series data
* @param length Number of elements in the array
* @param seasonality Length of seasonal cycle (e.g., 7 for weekly, 12 for monthly)
* @param trend Output array for trend component (size = length)
* @param seasonal Output array for seasonal component (size = length)
* @param residual Output array for residual component (size = length)
* @return bool True if successful, false if error occurred
*/
bool decomposeTimeSeries(const double* values, int length, int seasonality,
double* trend, double* seasonal, double* residual) {
if (length <= 2 * seasonality || seasonality <= 1) return false;
// Calculate trend with centered moving average
for (int i = 0; i < length; i++) {
trend[i] = 0;
}
int halfSeason = seasonality / 2;
// Centered moving average for trend
for (int i = halfSeason; i < length - halfSeason; i++) {
double sum = 0;
for (int j = i - halfSeason; j <= i + halfSeason; j++) {
sum += values[j];
}
trend[i] = sum / seasonality;
}
// Extrapolate trend at boundaries
// Left boundary
double slope = (trend[halfSeason + 5] - trend[halfSeason]) / 5;
for (int i = 0; i < halfSeason; i++) {
trend[i] = trend[halfSeason] - (halfSeason - i) * slope;
}
// Right boundary
slope = (trend[length - halfSeason - 1] - trend[length - halfSeason - 6]) / 5;
for (int i = length - halfSeason; i < length; i++) {
trend[i] = trend[length - halfSeason - 1] + (i - (length - halfSeason - 1)) * slope;
}
// Calculate detrended series
std::vector<double> detrended(length);
for (int i = 0; i < length; i++) {
detrended[i] = values[i] - trend[i];
}
// Calculate seasonal component by averaging the detrended values across seasons
std::vector<double> seasonalAvg(seasonality, 0);
std::vector<int> seasonalCounts(seasonality, 0);
for (int i = 0; i < length; i++) {
int seasonalIndex = i % seasonality;
seasonalAvg[seasonalIndex] += detrended[i];
seasonalCounts[seasonalIndex]++;
}
for (int i = 0; i < seasonality; i++) {
if (seasonalCounts[i] > 0) {
seasonalAvg[i] /= seasonalCounts[i];
}
}
// Normalize seasonal component to sum to zero
double avgSeasonal = 0;
for (int i = 0; i < seasonality; i++) {
avgSeasonal += seasonalAvg[i];
}
avgSeasonal /= seasonality;
for (int i = 0; i < seasonality; i++) {
seasonalAvg[i] -= avgSeasonal;
}
// Apply seasonal component to entire series
for (int i = 0; i < length; i++) {
seasonal[i] = seasonalAvg[i % seasonality];
}
// Calculate residual component
for (int i = 0; i < length; i++) {
residual[i] = values[i] - trend[i] - seasonal[i];
}
return true;
}
/**
* @brief Calculate partial autocorrelation function for a time series
*
* @param values Time series data
* @param length Number of elements in the array
* @param maxLag Maximum lag to calculate
* @param pacf Pre-allocated array to store results (size = maxLag + 1)
* @return int Number of valid PACF values calculated
*/
int calculatePACF(const double* values, int length, int maxLag, double* pacf) {
if (length <= 1 || maxLag <= 0 || maxLag >= length) return 0;
// Allocate Yule-Walker matrices
std::vector<std::vector<double>> phi(maxLag + 1, std::vector<double>(maxLag + 1, 0));
// Calculate autocorrelations
std::vector<double> acf(maxLag + 1, 0);
acf[0] = 1.0; // ACF at lag 0 is always 1
for (int k = 1; k <= maxLag; k++) {
acf[k] = calculateAutocorrelation(values, length, k);
}
// Set PACF at lag 0 to 1
pacf[0] = 1.0;
// Calculate PACF using Levinson-Durbin recursion
for (int k = 1; k <= maxLag; k++) {
// Initialize for this order
double numerator = acf[k];
for (int j = 1; j < k; j++) {
numerator -= phi[k-1][j] * acf[k-j];
}
double denominator = 1.0;
for (int j = 1; j < k; j++) {
denominator -= phi[k-1][j] * acf[j];
}
if (std::abs(denominator) < 1e-10) {
// If denominator is close to zero, set PACF to 0
phi[k][k] = 0;
} else {
phi[k][k] = numerator / denominator;
}
// Update remaining coefficients
for (int j = 1; j < k; j++) {
phi[k][j] = phi[k-1][j] - phi[k][k] * phi[k-1][k-j];
}
// Store the PACF value
pacf[k] = phi[k][k];
}
return maxLag + 1;
}
/**
* @brief Perform k-means clustering on multivariate data
*
* @param data 2D array of data points [n_samples x n_features]
* @param nSamples Number of data points
* @param nFeatures Number of features per data point
* @param k Number of clusters
* @param maxIter Maximum number of iterations
* @param centroids Output array for cluster centroids [k x n_features]
* @param assignments Output array for cluster assignments [n_samples]
* @return int Number of iterations performed
*/
int kMeansClustering(const double** data, int nSamples, int nFeatures, int k,
int maxIter, double** centroids, int* assignments) {
if (nSamples < k || k <= 0 || nFeatures <= 0) return 0;
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> distrib(0, nSamples - 1);
// Initialize centroids using k-means++ initialization
std::vector<int> centroidIndices;
std::vector<double> minDistances(nSamples, std::numeric_limits<double>::max());
// Choose first centroid randomly
int firstCentroid = distrib(gen);
centroidIndices.push_back(firstCentroid);
// Choose remaining centroids
for (int c = 1; c < k; c++) {
// Update distances to nearest centroid
for (int i = 0; i < nSamples; i++) {
double dist = 0;
for (int j = 0; j < nFeatures; j++) {
double diff = data[i][j] - data[centroidIndices.back()][j];
dist += diff * diff;
}
minDistances[i] = std::min(minDistances[i], dist);
}
// Calculate sum of squared distances
double sumSquaredDist = 0;
for (int i = 0; i < nSamples; i++) {
sumSquaredDist += minDistances[i];
}
// Choose next centroid with probability proportional to D²
double threshold = (sumSquaredDist * static_cast<double>(rand()) / RAND_MAX);
double cumulativeProb = 0;
int nextCentroid = 0;
for (int i = 0; i < nSamples; i++) {
cumulativeProb += minDistances[i];
if (cumulativeProb >= threshold) {
nextCentroid = i;
break;
}
}
centroidIndices.push_back(nextCentroid);
}
// Copy initial centroids
for (int i = 0; i < k; i++) {
for (int j = 0; j < nFeatures; j++) {
centroids[i][j] = data[centroidIndices[i]][j];
}
}
// Perform k-means iterations
int iterations = 0;
bool converged = false;
while (!converged && iterations < maxIter) {
// Assign points to nearest centroid
converged = true;
for (int i = 0; i < nSamples; i++) {
double minDist = std::numeric_limits<double>::max();
int bestCluster = 0;
for (int c = 0; c < k; c++) {
double dist = 0;
for (int j = 0; j < nFeatures; j++) {
double diff = data[i][j] - centroids[c][j];
dist += diff * diff;
}
if (dist < minDist) {
minDist = dist;
bestCluster = c;
}
}
if (assignments[i] != bestCluster) {
assignments[i] = bestCluster;
converged = false;
}
}
// Update centroids
std::vector<std::vector<double>> newCentroids(k, std::vector<double>(nFeatures, 0));
std::vector<int> clusterSizes(k, 0);
for (int i = 0; i < nSamples; i++) {
int cluster = assignments[i];
clusterSizes[cluster]++;
for (int j = 0; j < nFeatures; j++) {
newCentroids[cluster][j] += data[i][j];
}
}
for (int c = 0; c < k; c++) {
if (clusterSizes[c] > 0) {
for (int j = 0; j < nFeatures; j++) {
centroids[c][j] = newCentroids[c][j] / clusterSizes[c];
}
}
}
iterations++;
}
return iterations;
}
/**
* @brief Calculate the silhouette coefficient for clustering validation
*
* @param data 2D array of data points [n_samples x n_features]
* @param nSamples Number of data points
* @param nFeatures Number of features per data point
* @param assignments Cluster assignments for each point
* @param k Number of clusters
* @return double Average silhouette coefficient (-1 to 1)
*/
double calculateSilhouetteCoefficient(const double** data, int nSamples, int nFeatures,
const int* assignments, int k) {
if (nSamples <= k || k <= 1) return 0;
std::vector<double> silhouettes(nSamples);
// For each point
for (int i = 0; i < nSamples; i++) {
int cluster_i = assignments[i];
// Calculate a(i) - average distance to points in same cluster
double a_i = 0;
int count_same_cluster = 0;
for (int j = 0; j < nSamples; j++) {
if (j != i && assignments[j] == cluster_i) {
double dist = 0;
for (int f = 0; f < nFeatures; f++) {
double diff = data[i][f] - data[j][f];
dist += diff * diff;
}
dist = std::sqrt(dist);
a_i += dist;
count_same_cluster++;
}
}
if (count_same_cluster > 0) {
a_i /= count_same_cluster;
} else {
a_i = 0; // Singleton cluster
}
// Calculate b(i) - minimum average distance to points in different clusters
double b_i = std::numeric_limits<double>::max();
for (int c = 0; c < k; c++) {
if (c == cluster_i) continue;
double avg_dist = 0;
int count_diff_cluster = 0;
for (int j = 0; j < nSamples; j++) {
if (assignments[j] == c) {
double dist = 0;
for (int f = 0; f < nFeatures; f++) {
double diff = data[i][f] - data[j][f];
dist += diff * diff;
}
dist = std::sqrt(dist);
avg_dist += dist;
count_diff_cluster++;
}
}
if (count_diff_cluster > 0) {
avg_dist /= count_diff_cluster;
b_i = std::min(b_i, avg_dist);
}
}
// Calculate silhouette
if (count_same_cluster > 0 && b_i < std::numeric_limits<double>::max()) {
silhouettes[i] = (b_i - a_i) / std::max(a_i, b_i);
} else {
silhouettes[i] = 0; // Handle edge cases
}
}
// Calculate average silhouette
double avg_silhouette = 0;
for (int i = 0; i < nSamples; i++) {
avg_silhouette += silhouettes[i];
}
return avg_silhouette / nSamples;
}