// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// utils.cpp
// Core utility functions used across the health analytics library
//
#include "health_analytics_engine.h"
#include "utils.h"
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <random>
#include <utility>
#include <vector>

/**
 * @brief Calculate the mean (average) of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @return double The arithmetic mean
 */
double calculateMean(const double* values, int length) {
    if (length <= 0) return 0;

    double sum = 0;
    for (int i = 0; i < length; i++) {
        sum += values[i];
    }
    return sum / length;
}

/**
 * @brief Calculate the weighted mean of a data series
 *
 * @param values Pointer to array of values
 * @param weights Pointer to array of weights for each value
 * @param length Number of elements in the arrays
 * @return double The weighted arithmetic mean
 */
double calculateWeightedMean(const double* values, const double* weights, int length) {
    if (length <= 0) return 0;

    double sum = 0;
    double weightSum = 0;

    for (int i = 0; i < length; i++) {
        sum += values[i] * weights[i];
        weightSum += weights[i];
    }

    return weightSum > 0 ? sum / weightSum : 0;
}

/**
 * @brief Calculate the variance of a data series
 * Uses a two-pass algorithm for numerical stability
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @return double The sample variance (n-1 denominator)
 */
double calculateVariance(const double* values, int length, double mean = 0) {
    if (length <= 1) return 0;

    // Use pre-calculated mean if provided, otherwise calculate it
    if (mean == 0) {
        mean = calculateMean(values, length);
    }

    // Use two-pass algorithm for better numerical stability
    double sumSquaredDiff = 0;
    for (int i = 0; i < length; i++) {
        double diff = values[i] - mean;
        sumSquaredDiff += diff * diff;
    }

    // Return sample variance (n-1 denominator for unbiased estimation)
    return sumSquaredDiff / (length - 1);
}

/**
 * @brief Calculate the standard deviation of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @return double The standard deviation
 */
double calculateStdDev(const double* values, int length, double mean = 0) {
    return std::sqrt(calculateVariance(values, length, mean));
}

/**
 * @brief Calculate the median of a data series
 *
 * @param values Pointer to array of values (not modified; a sorted copy is made internally)
 * @param length Number of elements in the array
 * @return double The median value
 */
double calculateMedian(double* values, int length) {
    if (length == 0) return 0;
    if (length == 1) return values[0];

    // Create a copy and sort it, leaving the caller's array untouched
    std::vector<double> sorted(values, values + length);
    std::sort(sorted.begin(), sorted.end());

    if (length % 2 == 0) {
        // Even number of elements
        return (sorted[length/2 - 1] + sorted[length/2]) / 2.0;
    } else {
        // Odd number of elements
        return sorted[length/2];
    }
}

/**
 * @brief Calculate the Pearson correlation coefficient between two data series
 *
 * @param x First data series
 * @param y Second data series (must be same length as x)
 * @param length Number of elements in both arrays
 * @return double Correlation coefficient (-1 to 1)
 */
double calculateCorrelation(const double* x, const double* y, int length) {
    if (length <= 1) return 0;

    double sum_x = 0, sum_y = 0, sum_xy = 0;
    double sum_x2 = 0, sum_y2 = 0;

    for (int i = 0; i < length; i++) {
        sum_x += x[i];
        sum_y += y[i];
        sum_xy += x[i] * y[i];
        sum_x2 += x[i] * x[i];
        sum_y2 += y[i] * y[i];
    }

    double denominator = std::sqrt((length * sum_x2 - sum_x * sum_x) *
                                   (length * sum_y2 - sum_y * sum_y));

    if (denominator < 1e-10) return 0; // Avoid division by zero

    return (length * sum_xy - sum_x * sum_y) / denominator;
}

/**
 * @brief Calculate Spearman's rank correlation coefficient
 * More robust to outliers than Pearson correlation
 *
 * @param x First data series
 * @param y Second data series (must be same length as x)
 * @param length Number of elements in both arrays
 * @return double Spearman's rank correlation coefficient (-1 to 1)
 */
double calculateSpearmanCorrelation(const double* x, const double* y, int length) {
    if (length <= 1) return 0;

    // Create vectors with indices to perform ranking
    std::vector<std::pair<double, int>> x_indexed(length);
    std::vector<std::pair<double, int>> y_indexed(length);

    for (int i = 0; i < length; i++) {
        x_indexed[i] = std::make_pair(x[i], i);
        y_indexed[i] = std::make_pair(y[i], i);
    }

    // Sort by values to determine ranks
    std::sort(x_indexed.begin(), x_indexed.end());
    std::sort(y_indexed.begin(), y_indexed.end());

    // Assign ranks (handling ties with average rank)
    std::vector<double> x_ranks(length), y_ranks(length);

    for (int i = 0; i < length; i++) {
        int j = i;
        while (j < length - 1 && x_indexed[j].first == x_indexed[j + 1].first) j++;
        double rank = 1.0 * (i + j) / 2 + 1; // average rank over the tied run [i, j]
        for (int k = i; k <= j; k++) {
            x_ranks[x_indexed[k].second] = rank;
        }
        i = j;
    }

    for (int i = 0; i < length; i++) {
        int j = i;
        while (j < length - 1 && y_indexed[j].first == y_indexed[j + 1].first) j++;
        double rank = 1.0 * (i + j) / 2 + 1;
        for (int k = i; k <= j; k++) {
            y_ranks[y_indexed[k].second] = rank;
        }
        i = j;
    }

    // Spearman's rho is the Pearson correlation of the ranks
    return calculateCorrelation(x_ranks.data(), y_ranks.data(), length);
}

/**
 * @brief Calculate a specific quantile value of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param q Quantile to calculate (0-1, e.g., 0.25 for first quartile)
 * @return double The value at the specified quantile
 */
double calculateQuantile(const double* values, int length, double q) {
    if (length == 0) return 0;
    if (length == 1) return values[0];
    if (q < 0) q = 0;
    if (q > 1) q = 1;

    std::vector<double> sorted(values, values + length);
    std::sort(sorted.begin(), sorted.end());

    // Linear interpolation between closest ranks
    double pos = (length - 1) * q;
    int idx_lower = static_cast<int>(pos);
    double frac = pos - idx_lower;

    if (idx_lower + 1 < length) {
        return sorted[idx_lower] * (1 - frac) + sorted[idx_lower + 1] * frac;
    } else {
        return sorted[idx_lower];
    }
}
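
// Usage sketch (illustrative values, not from the library's tests): quartiles
// fall out of the linear interpolation between closest ranks.
//
//     double v[] = {1.0, 2.0, 3.0, 4.0, 5.0};
//     double q1 = calculateQuantile(v, 5, 0.25);  // pos = 1.0 -> 2.0
//     double q3 = calculateQuantile(v, 5, 0.75);  // pos = 3.0 -> 4.0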

/**
 * @brief Calculate the interquartile range (IQR) of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @return double The IQR (Q3-Q1)
 */
double calculateIQR(const double* values, int length) {
    if (length < 4) return 0;

    double q1 = calculateQuantile(values, length, 0.25);
    double q3 = calculateQuantile(values, length, 0.75);

    return q3 - q1;
}

/**
 * @brief Calculate the skewness of a data distribution
 * Measures the asymmetry of the probability distribution
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
 * @return double The skewness value (0 for normal distribution)
 */
double calculateSkewness(const double* values, int length, double mean = 0, double stdDev = 0) {
    if (length <= 2) return 0;

    // Calculate mean and stdDev if not provided
    if (mean == 0) {
        mean = calculateMean(values, length);
    }

    if (stdDev == 0) {
        stdDev = calculateStdDev(values, length, mean);
    }

    if (stdDev < 1e-10) return 0; // Avoid division by zero

    // Calculate third moment (cube of differences)
    double sum = 0;
    for (int i = 0; i < length; i++) {
        double diff = values[i] - mean;
        sum += diff * diff * diff;
    }

    // Return Fisher-Pearson coefficient of skewness
    // Includes adjustment for sample bias
    double n = length;
    double adjustment = std::sqrt(n * (n - 1)) / (n - 2);
    return adjustment * sum / (length * stdDev * stdDev * stdDev);
}

/**
 * @brief Calculate the kurtosis of a data distribution
 * Measures the "tailedness" of the probability distribution
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
 * @return double The excess kurtosis (0 for normal distribution)
 */
double calculateKurtosis(const double* values, int length, double mean = 0, double stdDev = 0) {
    if (length <= 3) return 0;

    // Calculate mean and stdDev if not provided
    if (mean == 0) {
        mean = calculateMean(values, length);
    }

    if (stdDev == 0) {
        stdDev = calculateStdDev(values, length, mean);
    }

    if (stdDev < 1e-10) return 0; // Avoid division by zero

    // Calculate fourth moment
    double sum = 0;
    for (int i = 0; i < length; i++) {
        double diff = values[i] - mean;
        sum += diff * diff * diff * diff;
    }

    // Return excess kurtosis with sample adjustment
    double n = length;
    double adjustment = ((n + 1) * n) / ((n - 1) * (n - 2) * (n - 3));
    double second_term = 3 * (n - 1) * (n - 1) / ((n - 2) * (n - 3));
    return adjustment * sum / (stdDev * stdDev * stdDev * stdDev) - second_term;
}

/**
 * @brief Perform linear regression on two data series
 *
 * @param x Independent variable values
 * @param y Dependent variable values (must be same length as x)
 * @param length Number of elements in both arrays
 * @param slope Output parameter for slope
 * @param intercept Output parameter for y-intercept
 * @param r_squared Output parameter for R² coefficient of determination
 * @return bool True if successful, false if error occurred
 */
bool calculateLinearRegression(const double* x, const double* y, int length,
                               double& slope, double& intercept, double& r_squared) {
    if (length < 2) return false;

    double sum_x = 0, sum_y = 0, sum_xy = 0, sum_x2 = 0;

    for (int i = 0; i < length; i++) {
        sum_x += x[i];
        sum_y += y[i];
        sum_xy += x[i] * y[i];
        sum_x2 += x[i] * x[i];
    }

    double n = static_cast<double>(length);
    double denominator = n * sum_x2 - sum_x * sum_x;

    if (std::abs(denominator) < 1e-10) return false; // All x equal: vertical line, undefined slope

    // Calculate slope and intercept
    slope = (n * sum_xy - sum_x * sum_y) / denominator;
    intercept = (sum_y - slope * sum_x) / n;

    // Calculate R² coefficient of determination
    double mean_y = sum_y / n;
    double ss_total = 0, ss_residual = 0;

    for (int i = 0; i < length; i++) {
        double predicted = intercept + slope * x[i];
        ss_total += (y[i] - mean_y) * (y[i] - mean_y);
        ss_residual += (y[i] - predicted) * (y[i] - predicted);
    }

    if (ss_total < 1e-10) {
        r_squared = 1.0; // All points are on the same horizontal line
    } else {
        r_squared = 1.0 - (ss_residual / ss_total);
    }

    return true;
}
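
// Usage sketch (hypothetical data): the fit comes back through the three
// output parameters.
//
//     double xs[] = {1, 2, 3, 4};
//     double ys[] = {2.1, 3.9, 6.2, 7.8};
//     double m, b, r2;
//     if (calculateLinearRegression(xs, ys, 4, m, b, r2)) {
//         // m ≈ 1.94, b ≈ 0.15, r2 ≈ 0.996
//     }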

/**
 * @brief Calculate the autocorrelation of a time series at specified lag
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param lag The lag to calculate autocorrelation for
 * @return double Autocorrelation coefficient at specified lag (-1 to 1)
 */
double calculateAutocorrelation(const double* values, int length, int lag) {
    if (length <= lag || lag <= 0) return 0;

    double mean = calculateMean(values, length);
    double numerator = 0;
    double denominator = 0;

    for (int i = 0; i < length - lag; i++) {
        numerator += (values[i] - mean) * (values[i + lag] - mean);
    }

    for (int i = 0; i < length; i++) {
        denominator += (values[i] - mean) * (values[i] - mean);
    }

    if (denominator < 1e-10) return 0;

    return numerator / denominator;
}

/**
 * @brief Detect outliers in a data series using modified Z-score method
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param outlierIndices Output vector to store indices of detected outliers
 * @param threshold Z-score threshold to consider a point an outlier (typically 3.5)
 * @return int Number of outliers detected
 */
int detectOutliers(const double* values, int length, std::vector<int>& outlierIndices, double threshold = 3.5) {
    if (length < 3) return 0;

    outlierIndices.clear();

    // Use median and MAD instead of mean and std dev for robustness
    std::vector<double> sorted(values, values + length);
    std::sort(sorted.begin(), sorted.end());

    double median = (length % 2 == 0) ?
        (sorted[length/2 - 1] + sorted[length/2]) / 2.0 : sorted[length/2];

    // Calculate MAD (Median Absolute Deviation)
    std::vector<double> deviations(length);
    for (int i = 0; i < length; i++) {
        deviations[i] = std::abs(values[i] - median);
    }
    std::sort(deviations.begin(), deviations.end());

    double mad = (length % 2 == 0) ?
        (deviations[length/2 - 1] + deviations[length/2]) / 2.0 : deviations[length/2];

    // Consistency constant: sigma ≈ 1.4826 × MAD for normally distributed data
    const double k = 1.4826;

    // Find outliers using modified Z-score
    for (int i = 0; i < length; i++) {
        if (mad < 1e-10) { // If MAD is too small, use simple difference
            if (std::abs(values[i] - median) > threshold) {
                outlierIndices.push_back(i);
            }
        } else {
            // Modified Z-score: deviation measured against the MAD-based
            // estimate of the standard deviation, |x - median| / (k × MAD)
            double modified_z = std::abs(values[i] - median) / (k * mad);
            if (modified_z > threshold) {
                outlierIndices.push_back(i);
            }
        }
    }

    return static_cast<int>(outlierIndices.size());
}
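
// Usage sketch (hypothetical resting-heart-rate readings): indices of the
// flagged points come back through the output vector.
//
//     double hr[] = {62, 64, 61, 63, 118, 62};
//     std::vector<int> idx;
//     int n = detectOutliers(hr, 6, idx);  // default threshold 3.5; idx == {4}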

/**
 * @brief Perform simple moving average on a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param window The window size for the moving average
 * @param result Pre-allocated array to store results (size = length)
 */
void calculateMovingAverage(const double* values, int length, int window, double* result) {
    if (length <= 0 || window <= 0) return;

    // Adjust window if it's larger than the data length
    window = std::min(window, length);

    for (int i = 0; i < length; i++) {
        int start = std::max(0, i - window + 1);
        int end = i + 1;
        int count = end - start;

        double sum = 0;
        for (int j = start; j < end; j++) {
            sum += values[j];
        }

        result[i] = sum / count;
    }
}
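
// Usage sketch (illustrative values): the trailing window shrinks at the
// start of the series, so result[0] is just values[0].
//
//     double v[] = {1, 2, 3, 4, 5};
//     double out[5];
//     calculateMovingAverage(v, 5, 3, out);  // out = {1, 1.5, 2, 3, 4}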

/**
 * @brief Calculate exponential moving average (EMA) of a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param alpha Smoothing factor (0-1)
 * @param result Pre-allocated array to store results (size = length)
 */
void calculateExponentialMovingAverage(const double* values, int length, double alpha, double* result) {
    if (length <= 0 || alpha < 0 || alpha > 1) return;

    // Initialize with first value
    result[0] = values[0];

    // Apply EMA formula: EMA_t = α × value_t + (1 - α) × EMA_{t-1}
    for (int i = 1; i < length; i++) {
        result[i] = alpha * values[i] + (1 - alpha) * result[i - 1];
    }
}

/**
 * @brief Decompose a time series into trend, seasonal, and residual components
 * Classical additive decomposition (moving-average trend plus averaged
 * seasonal offsets); a simplified alternative to full STL/Loess
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param seasonality Length of seasonal cycle (e.g., 7 for weekly, 12 for monthly)
 * @param trend Output array for trend component (size = length)
 * @param seasonal Output array for seasonal component (size = length)
 * @param residual Output array for residual component (size = length)
 * @return bool True if successful, false if error occurred
 */
bool decomposeTimeSeries(const double* values, int length, int seasonality,
                         double* trend, double* seasonal, double* residual) {
    if (length <= 2 * seasonality || seasonality <= 1) return false;

    // Calculate trend with centered moving average
    for (int i = 0; i < length; i++) {
        trend[i] = 0;
    }

    int halfSeason = seasonality / 2;
    // Centered moving average for trend. For an even cycle length the classic
    // 2×S moving average weights the two endpoints by 0.5, so the window stays
    // centered and sums to exactly `seasonality` samples.
    for (int i = halfSeason; i < length - halfSeason; i++) {
        double sum = 0;
        if (seasonality % 2 == 0) {
            sum += 0.5 * (values[i - halfSeason] + values[i + halfSeason]);
            for (int j = i - halfSeason + 1; j <= i + halfSeason - 1; j++) {
                sum += values[j];
            }
        } else {
            for (int j = i - halfSeason; j <= i + halfSeason; j++) {
                sum += values[j];
            }
        }
        trend[i] = sum / seasonality;
    }

    // Extrapolate trend at boundaries, estimating the slope over up to five
    // interior points (fewer when the interior is short)
    int span = std::min(5, length - 2 * halfSeason - 1);
    // Left boundary
    double slope = span > 0 ? (trend[halfSeason + span] - trend[halfSeason]) / span : 0;
    for (int i = 0; i < halfSeason; i++) {
        trend[i] = trend[halfSeason] - (halfSeason - i) * slope;
    }

    // Right boundary
    slope = span > 0 ? (trend[length - halfSeason - 1] - trend[length - halfSeason - 1 - span]) / span : 0;
    for (int i = length - halfSeason; i < length; i++) {
        trend[i] = trend[length - halfSeason - 1] + (i - (length - halfSeason - 1)) * slope;
    }

    // Calculate detrended series
    std::vector<double> detrended(length);
    for (int i = 0; i < length; i++) {
        detrended[i] = values[i] - trend[i];
    }

    // Calculate seasonal component by averaging the detrended values across seasons
    std::vector<double> seasonalAvg(seasonality, 0);
    std::vector<int> seasonalCounts(seasonality, 0);

    for (int i = 0; i < length; i++) {
        int seasonalIndex = i % seasonality;
        seasonalAvg[seasonalIndex] += detrended[i];
        seasonalCounts[seasonalIndex]++;
    }

    for (int i = 0; i < seasonality; i++) {
        if (seasonalCounts[i] > 0) {
            seasonalAvg[i] /= seasonalCounts[i];
        }
    }

    // Normalize seasonal component to sum to zero
    double avgSeasonal = 0;
    for (int i = 0; i < seasonality; i++) {
        avgSeasonal += seasonalAvg[i];
    }
    avgSeasonal /= seasonality;

    for (int i = 0; i < seasonality; i++) {
        seasonalAvg[i] -= avgSeasonal;
    }

    // Apply seasonal component to entire series
    for (int i = 0; i < length; i++) {
        seasonal[i] = seasonalAvg[i % seasonality];
    }

    // Calculate residual component
    for (int i = 0; i < length; i++) {
        residual[i] = values[i] - trend[i] - seasonal[i];
    }

    return true;
}
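
// Usage sketch (hypothetical daily series `daily` of length n with a weekly
// cycle): the caller owns all three output buffers, each sized to the series.
//
//     std::vector<double> t(n), s(n), r(n);
//     if (decomposeTimeSeries(daily.data(), n, 7, t.data(), s.data(), r.data())) {
//         // values[i] == t[i] + s[i] + r[i] by construction
//     }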

/**
 * @brief Calculate partial autocorrelation function for a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param maxLag Maximum lag to calculate
 * @param pacf Pre-allocated array to store results (size = maxLag + 1)
 * @return int Number of valid PACF values calculated
 */
int calculatePACF(const double* values, int length, int maxLag, double* pacf) {
    if (length <= 1 || maxLag <= 0 || maxLag >= length) return 0;

    // Coefficient matrix for solving the Yule-Walker equations
    std::vector<std::vector<double>> phi(maxLag + 1, std::vector<double>(maxLag + 1, 0));

    // Calculate autocorrelations
    std::vector<double> acf(maxLag + 1, 0);
    acf[0] = 1.0; // ACF at lag 0 is always 1

    for (int k = 1; k <= maxLag; k++) {
        acf[k] = calculateAutocorrelation(values, length, k);
    }

    // Set PACF at lag 0 to 1
    pacf[0] = 1.0;

    // Calculate PACF using Levinson-Durbin recursion
    for (int k = 1; k <= maxLag; k++) {
        // Initialize for this order
        double numerator = acf[k];
        for (int j = 1; j < k; j++) {
            numerator -= phi[k-1][j] * acf[k-j];
        }

        double denominator = 1.0;
        for (int j = 1; j < k; j++) {
            denominator -= phi[k-1][j] * acf[j];
        }

        if (std::abs(denominator) < 1e-10) {
            // If denominator is close to zero, set PACF to 0
            phi[k][k] = 0;
        } else {
            phi[k][k] = numerator / denominator;
        }

        // Update remaining coefficients
        for (int j = 1; j < k; j++) {
            phi[k][j] = phi[k-1][j] - phi[k][k] * phi[k-1][k-j];
        }

        // Store the PACF value
        pacf[k] = phi[k][k];
    }

    return maxLag + 1;
}
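
// Usage sketch (hypothetical series `series` of length n): the pacf buffer
// must hold maxLag + 1 doubles, and pacf[0] is fixed at 1.
//
//     std::vector<double> pacf(11);
//     int written = calculatePACF(series.data(), n, 10, pacf.data());
//     // written == 11 on success, 0 on bad arguments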

/**
 * @brief Perform k-means clustering on multivariate data
 *
 * @param data 2D array of data points [n_samples x n_features]
 * @param nSamples Number of data points
 * @param nFeatures Number of features per data point
 * @param k Number of clusters
 * @param maxIter Maximum number of iterations
 * @param centroids Output array for cluster centroids [k x n_features]
 * @param assignments Output array for cluster assignments [n_samples]
 * @return int Number of iterations performed
 */
int kMeansClustering(const double** data, int nSamples, int nFeatures, int k,
                     int maxIter, double** centroids, int* assignments) {
    if (nSamples < k || k <= 0 || nFeatures <= 0) return 0;

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> distrib(0, nSamples - 1);

    // Initialize centroids using k-means++ initialization
    std::vector<int> centroidIndices;
    std::vector<double> minDistances(nSamples, std::numeric_limits<double>::max());

    // Choose first centroid randomly
    int firstCentroid = distrib(gen);
    centroidIndices.push_back(firstCentroid);

    // Choose remaining centroids
    for (int c = 1; c < k; c++) {
        // Update distances to nearest centroid
        for (int i = 0; i < nSamples; i++) {
            double dist = 0;
            for (int j = 0; j < nFeatures; j++) {
                double diff = data[i][j] - data[centroidIndices.back()][j];
                dist += diff * diff;
            }
            minDistances[i] = std::min(minDistances[i], dist);
        }

        // Calculate sum of squared distances
        double sumSquaredDist = 0;
        for (int i = 0; i < nSamples; i++) {
            sumSquaredDist += minDistances[i];
        }

        // Choose next centroid with probability proportional to D²,
        // drawing from the same seeded engine as the first centroid
        std::uniform_real_distribution<double> unif(0.0, sumSquaredDist);
        double threshold = unif(gen);
        double cumulativeProb = 0;
        int nextCentroid = 0;

        for (int i = 0; i < nSamples; i++) {
            cumulativeProb += minDistances[i];
            if (cumulativeProb >= threshold) {
                nextCentroid = i;
                break;
            }
        }

        centroidIndices.push_back(nextCentroid);
    }

    // Copy initial centroids
    for (int i = 0; i < k; i++) {
        for (int j = 0; j < nFeatures; j++) {
            centroids[i][j] = data[centroidIndices[i]][j];
        }
    }

    // Start from a sentinel assignment so the first pass always registers a change
    for (int i = 0; i < nSamples; i++) {
        assignments[i] = -1;
    }

    // Perform k-means iterations
    int iterations = 0;
    bool converged = false;

    while (!converged && iterations < maxIter) {
        // Assign points to nearest centroid
        converged = true;

        for (int i = 0; i < nSamples; i++) {
            double minDist = std::numeric_limits<double>::max();
            int bestCluster = 0;

            for (int c = 0; c < k; c++) {
                double dist = 0;
                for (int j = 0; j < nFeatures; j++) {
                    double diff = data[i][j] - centroids[c][j];
                    dist += diff * diff;
                }

                if (dist < minDist) {
                    minDist = dist;
                    bestCluster = c;
                }
            }

            if (assignments[i] != bestCluster) {
                assignments[i] = bestCluster;
                converged = false;
            }
        }

        // Update centroids
        std::vector<std::vector<double>> newCentroids(k, std::vector<double>(nFeatures, 0));
        std::vector<int> clusterSizes(k, 0);

        for (int i = 0; i < nSamples; i++) {
            int cluster = assignments[i];
            clusterSizes[cluster]++;

            for (int j = 0; j < nFeatures; j++) {
                newCentroids[cluster][j] += data[i][j];
            }
        }

        for (int c = 0; c < k; c++) {
            if (clusterSizes[c] > 0) {
                for (int j = 0; j < nFeatures; j++) {
                    centroids[c][j] = newCentroids[c][j] / clusterSizes[c];
                }
            }
        }

        iterations++;
    }

    return iterations;
}
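
// Usage sketch (hypothetical 2-feature data, with `data` already laid out as
// a const double** of nSamples rows): the caller allocates the centroid rows
// and the assignment array; the row-pointer layout matches the parameters.
//
//     std::vector<std::vector<double>> cbuf(3, std::vector<double>(2));
//     std::vector<double*> cent = {cbuf[0].data(), cbuf[1].data(), cbuf[2].data()};
//     std::vector<int> assign(nSamples);
//     int iters = kMeansClustering(data, nSamples, 2, 3, 100, cent.data(), assign.data());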

/**
 * @brief Calculate the silhouette coefficient for clustering validation
 *
 * @param data 2D array of data points [n_samples x n_features]
 * @param nSamples Number of data points
 * @param nFeatures Number of features per data point
 * @param assignments Cluster assignments for each point
 * @param k Number of clusters
 * @return double Average silhouette coefficient (-1 to 1)
 */
double calculateSilhouetteCoefficient(const double** data, int nSamples, int nFeatures,
                                      const int* assignments, int k) {
    if (nSamples <= k || k <= 1) return 0;

    std::vector<double> silhouettes(nSamples);

    // For each point
    for (int i = 0; i < nSamples; i++) {
        int cluster_i = assignments[i];

        // Calculate a(i) - average distance to points in same cluster
        double a_i = 0;
        int count_same_cluster = 0;

        for (int j = 0; j < nSamples; j++) {
            if (j != i && assignments[j] == cluster_i) {
                double dist = 0;
                for (int f = 0; f < nFeatures; f++) {
                    double diff = data[i][f] - data[j][f];
                    dist += diff * diff;
                }
                dist = std::sqrt(dist);

                a_i += dist;
                count_same_cluster++;
            }
        }

        if (count_same_cluster > 0) {
            a_i /= count_same_cluster;
        } else {
            a_i = 0; // Singleton cluster
        }

        // Calculate b(i) - minimum average distance to points in different clusters
        double b_i = std::numeric_limits<double>::max();

        for (int c = 0; c < k; c++) {
            if (c == cluster_i) continue;

            double avg_dist = 0;
            int count_diff_cluster = 0;

            for (int j = 0; j < nSamples; j++) {
                if (assignments[j] == c) {
                    double dist = 0;
                    for (int f = 0; f < nFeatures; f++) {
                        double diff = data[i][f] - data[j][f];
                        dist += diff * diff;
                    }
                    dist = std::sqrt(dist);

                    avg_dist += dist;
                    count_diff_cluster++;
                }
            }

            if (count_diff_cluster > 0) {
                avg_dist /= count_diff_cluster;
                b_i = std::min(b_i, avg_dist);
            }
        }

        // Calculate silhouette
        if (count_same_cluster > 0 && b_i < std::numeric_limits<double>::max()) {
            silhouettes[i] = (b_i - a_i) / std::max(a_i, b_i);
        } else {
            silhouettes[i] = 0; // Handle edge cases
        }
    }

    // Calculate average silhouette
    double avg_silhouette = 0;
    for (int i = 0; i < nSamples; i++) {
        avg_silhouette += silhouettes[i];
    }

    return avg_silhouette / nSamples;
}
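
// Usage sketch, continuing the k-means example above: values near 1 indicate
// tight, well-separated clusters; values near 0 indicate overlapping clusters.
//
//     double sil = calculateSilhouetteCoefficient(data, nSamples, 2, assign.data(), 3);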