// SPDX-FileCopyrightText: © 2025 Nøkken.io // SPDX-License-Identifier: AGPL-3.0 // // utils.cpp // Core utility functions used across the health analytics library // #include "health_analytics_engine.h" #include "utils.h" #include #include #include /** * @brief Calculate the mean (average) of a data series * * @param values Pointer to array of values * @param length Number of elements in the array * @return double The arithmetic mean */ double calculateMean(const double* values, int length) { if (length <= 0) return 0; double sum = 0; for (int i = 0; i < length; i++) { sum += values[i]; } return sum / length; } /** * @brief Calculate the weighted mean of a data series * * @param values Pointer to array of values * @param weights Pointer to array of weights for each value * @param length Number of elements in the arrays * @return double The weighted arithmetic mean */ double calculateWeightedMean(const double* values, const double* weights, int length) { if (length <= 0) return 0; double sum = 0; double weightSum = 0; for (int i = 0; i < length; i++) { sum += values[i] * weights[i]; weightSum += weights[i]; } return weightSum > 0 ? sum / weightSum : 0; } /** * @brief Calculate the variance of a data series * Uses Welford's online algorithm for numerical stability * * @param values Pointer to array of values * @param length Number of elements in the array * @param mean Pre-calculated mean (if available, otherwise pass 0) * @return double The variance (population or sample based on implementation) */ double calculateVariance(const double* values, int length, double mean = 0) { if (length <= 1) return 0; // Use pre-calculated mean if provided, otherwise calculate it if (mean == 0) { mean = calculateMean(values, length); } // Use two-pass algorithm for better numerical stability double sumSquaredDiff = 0; for (int i = 0; i < length; i++) { double diff = values[i] - mean; sumSquaredDiff += diff * diff; } // Return sample variance (n-1 denominator for unbiased estimation) return sumSquaredDiff / (length - 1); } /** * @brief Calculate the standard deviation of a data series * * @param values Pointer to array of values * @param length Number of elements in the array * @param mean Pre-calculated mean (if available, otherwise pass 0) * @return double The standard deviation */ double calculateStdDev(const double* values, int length, double mean = 0) { return std::sqrt(calculateVariance(values, length, mean)); } /** * @brief Calculate the median of a data series * * @param values Pointer to array of values (will be modified by sorting) * @param length Number of elements in the array * @return double The median value */ double calculateMedian(double* values, int length) { if (length == 0) return 0; if (length == 1) return values[0]; // Create a copy and sort it std::vector sorted(values, values + length); std::sort(sorted.begin(), sorted.end()); if (length % 2 == 0) { // Even number of elements return (sorted[length/2 - 1] + sorted[length/2]) / 2.0; } else { // Odd number of elements return sorted[length/2]; } } /** * @brief Calculate the Pearson correlation coefficient between two data series * * @param x First data series * @param y Second data series (must be same length as x) * @param length Number of elements in both arrays * @return double Correlation coefficient (-1 to 1) */ double calculateCorrelation(const double* x, const double* y, int length) { if (length <= 1) return 0; double sum_x = 0, sum_y = 0, sum_xy = 0; double sum_x2 = 0, sum_y2 = 0; for (int i = 0; i < length; i++) { sum_x += x[i]; sum_y += y[i]; sum_xy += x[i] * y[i]; sum_x2 += x[i] * x[i]; sum_y2 += y[i] * y[i]; } double denominator = std::sqrt((length * sum_x2 - sum_x * sum_x) * (length * sum_y2 - sum_y * sum_y)); if (denominator < 1e-10) return 0; // Avoid division by zero return (length * sum_xy - sum_x * sum_y) / denominator; } /** * @brief Calculate Spearman's rank correlation coefficient * More robust to outliers than Pearson correlation * * @param x First data series * @param y Second data series (must be same length as x) * @param length Number of elements in both arrays * @return double Spearman's rank correlation coefficient (-1 to 1) */ double calculateSpearmanCorrelation(const double* x, const double* y, int length) { if (length <= 1) return 0; // Create vectors with indices to perform ranking std::vector> x_indexed(length); std::vector> y_indexed(length); for (int i = 0; i < length; i++) { x_indexed[i] = std::make_pair(x[i], i); y_indexed[i] = std::make_pair(y[i], i); } // Sort by values to determine ranks std::sort(x_indexed.begin(), x_indexed.end()); std::sort(y_indexed.begin(), y_indexed.end()); // Assign ranks (handling ties with average rank) std::vector x_ranks(length), y_ranks(length); for (int i = 0; i < length; i++) { int j = i; while (j < length - 1 && x_indexed[j].first == x_indexed[j + 1].first) j++; double rank = 1.0 * (i + j) / 2 + 1; for (int k = i; k <= j; k++) { x_ranks[x_indexed[k].second] = rank; } i = j; } for (int i = 0; i < length; i++) { int j = i; while (j < length - 1 && y_indexed[j].first == y_indexed[j + 1].first) j++; double rank = 1.0 * (i + j) / 2 + 1; for (int k = i; k <= j; k++) { y_ranks[y_indexed[k].second] = rank; } i = j; } // Calculate Pearson correlation on the ranks double* x_ranks_ptr = x_ranks.data(); double* y_ranks_ptr = y_ranks.data(); return calculateCorrelation(x_ranks_ptr, y_ranks_ptr, length); } /** * @brief Calculate a specific quantile value of a data series * * @param values Pointer to array of values * @param length Number of elements in the array * @param q Quantile to calculate (0-1, e.g., 0.25 for first quartile) * @return double The value at the specified quantile */ double calculateQuantile(const double* values, int length, double q) { if (length == 0) return 0; if (length == 1) return values[0]; if (q < 0) q = 0; if (q > 1) q = 1; std::vector sorted(values, values + length); std::sort(sorted.begin(), sorted.end()); // Linear interpolation between closest ranks double pos = (length - 1) * q; int idx_lower = static_cast(pos); double frac = pos - idx_lower; if (idx_lower + 1 < length) { return sorted[idx_lower] * (1 - frac) + sorted[idx_lower + 1] * frac; } else { return sorted[idx_lower]; } } /** * @brief Calculate the interquartile range (IQR) of a data series * * @param values Pointer to array of values * @param length Number of elements in the array * @return double The IQR (Q3-Q1) */ double calculateIQR(const double* values, int length) { if (length < 4) return 0; double q1 = calculateQuantile(values, length, 0.25); double q3 = calculateQuantile(values, length, 0.75); return q3 - q1; } /** * @brief Calculate the skewness of a data distribution * Measures the asymmetry of the probability distribution * * @param values Pointer to array of values * @param length Number of elements in the array * @param mean Pre-calculated mean (if available, otherwise pass 0) * @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0) * @return double The skewness value (0 for normal distribution) */ double calculateSkewness(const double* values, int length, double mean = 0, double stdDev = 0) { if (length <= 2) return 0; // Calculate mean and stdDev if not provided if (mean == 0) { mean = calculateMean(values, length); } if (stdDev == 0) { stdDev = calculateStdDev(values, length, mean); } if (stdDev < 1e-10) return 0; // Avoid division by zero // Calculate third moment (cube of differences) double sum = 0; for (int i = 0; i < length; i++) { double diff = values[i] - mean; sum += diff * diff * diff; } // Return Fisher-Pearson coefficient of skewness // Includes adjustment for sample bias double n = length; double adjustment = std::sqrt(n * (n - 1)) / (n - 2); return adjustment * sum / (length * stdDev * stdDev * stdDev); } /** * @brief Calculate the kurtosis of a data distribution * Measures the "tailedness" of the probability distribution * * @param values Pointer to array of values * @param length Number of elements in the array * @param mean Pre-calculated mean (if available, otherwise pass 0) * @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0) * @return double The excess kurtosis (0 for normal distribution) */ double calculateKurtosis(const double* values, int length, double mean = 0, double stdDev = 0) { if (length <= 3) return 0; // Calculate mean and stdDev if not provided if (mean == 0) { mean = calculateMean(values, length); } if (stdDev == 0) { stdDev = calculateStdDev(values, length, mean); } if (stdDev < 1e-10) return 0; // Avoid division by zero // Calculate fourth moment double sum = 0; for (int i = 0; i < length; i++) { double diff = values[i] - mean; sum += diff * diff * diff * diff; } // Return excess kurtosis with sample adjustment double n = length; double adjustment = ((n + 1) * n) / ((n - 1) * (n - 2) * (n - 3)); double second_term = 3 * (n - 1) * (n - 1) / ((n - 2) * (n - 3)); return adjustment * sum / (stdDev * stdDev * stdDev * stdDev) - second_term; } /** * @brief Perform linear regression on two data series * * @param x Independent variable values * @param y Dependent variable values (must be same length as x) * @param length Number of elements in both arrays * @param slope Output parameter for slope * @param intercept Output parameter for y-intercept * @param r_squared Output parameter for R² coefficient of determination * @return bool True if successful, false if error occurred */ bool calculateLinearRegression(const double* x, const double* y, int length, double& slope, double& intercept, double& r_squared) { if (length < 2) return false; double sum_x = 0, sum_y = 0, sum_xy = 0, sum_x2 = 0, sum_y2 = 0; for (int i = 0; i < length; i++) { sum_x += x[i]; sum_y += y[i]; sum_xy += x[i] * y[i]; sum_x2 += x[i] * x[i]; sum_y2 += y[i] * y[i]; } double n = static_cast(length); double denominator = n * sum_x2 - sum_x * sum_x; if (std::abs(denominator) < 1e-10) return false; // Vertical line, undefined slope // Calculate slope and intercept slope = (n * sum_xy - sum_x * sum_y) / denominator; intercept = (sum_y - slope * sum_x) / n; // Calculate R² coefficient of determination double mean_y = sum_y / n; double ss_total = 0, ss_residual = 0; for (int i = 0; i < length; i++) { double predicted = intercept + slope * x[i]; ss_total += (y[i] - mean_y) * (y[i] - mean_y); ss_residual += (y[i] - predicted) * (y[i] - predicted); } if (ss_total < 1e-10) { r_squared = 1.0; // All points are on the same horizontal line } else { r_squared = 1.0 - (ss_residual / ss_total); } return true; } /** * @brief Calculate the autocorrelation of a time series at specified lag * * @param values Time series data * @param length Number of elements in the array * @param lag The lag to calculate autocorrelation for * @return double Autocorrelation coefficient at specified lag (-1 to 1) */ double calculateAutocorrelation(const double* values, int length, int lag) { if (length <= lag || lag <= 0) return 0; double mean = calculateMean(values, length); double numerator = 0; double denominator = 0; for (int i = 0; i < length - lag; i++) { numerator += (values[i] - mean) * (values[i + lag] - mean); } for (int i = 0; i < length; i++) { denominator += (values[i] - mean) * (values[i] - mean); } if (denominator < 1e-10) return 0; return numerator / denominator; } /** * @brief Detect outliers in a data series using modified Z-score method * * @param values Pointer to array of values * @param length Number of elements in the array * @param outlierIndices Output vector to store indices of detected outliers * @param threshold Z-score threshold to consider a point an outlier (typically 3.5) * @return int Number of outliers detected */ int detectOutliers(const double* values, int length, std::vector& outlierIndices, double threshold = 3.5) { if (length < 3) return 0; outlierIndices.clear(); // Use median and MAD instead of mean and std dev for robustness std::vector sorted(values, values + length); std::sort(sorted.begin(), sorted.end()); double median = (length % 2 == 0) ? (sorted[length/2 - 1] + sorted[length/2]) / 2.0 : sorted[length/2]; // Calculate MAD (Median Absolute Deviation) std::vector deviations(length); for (int i = 0; i < length; i++) { deviations[i] = std::abs(values[i] - median); } std::sort(deviations.begin(), deviations.end()); double mad = (length % 2 == 0) ? (deviations[length/2 - 1] + deviations[length/2]) / 2.0 : deviations[length/2]; // Constant factor for normal distribution const double k = 1.4826; // Find outliers using modified Z-score for (int i = 0; i < length; i++) { if (mad < 1e-10) { // If MAD is too small, use simple difference if (std::abs(values[i] - median) > threshold) { outlierIndices.push_back(i); } } else { double modified_z = k * std::abs(values[i] - median) / mad; if (modified_z > threshold) { outlierIndices.push_back(i); } } } return outlierIndices.size(); } /** * @brief Perform simple moving average on a time series * * @param values Time series data * @param length Number of elements in the array * @param window The window size for the moving average * @param result Pre-allocated array to store results (size = length) */ void calculateMovingAverage(const double* values, int length, int window, double* result) { if (length <= 0 || window <= 0) return; // Adjust window if it's larger than the data length window = std::min(window, length); for (int i = 0; i < length; i++) { int start = std::max(0, i - window + 1); int end = i + 1; int count = end - start; double sum = 0; for (int j = start; j < end; j++) { sum += values[j]; } result[i] = sum / count; } } /** * @brief Calculate exponential moving average (EMA) of a time series * * @param values Time series data * @param length Number of elements in the array * @param alpha Smoothing factor (0-1) * @param result Pre-allocated array to store results (size = length) */ void calculateExponentialMovingAverage(const double* values, int length, double alpha, double* result) { if (length <= 0 || alpha < 0 || alpha > 1) return; // Initialize with first value result[0] = values[0]; // Apply EMA formula: EMA_t = α × value_t + (1 - α) × EMA_{t-1} for (int i = 1; i < length; i++) { result[i] = alpha * values[i] + (1 - alpha) * result[i - 1]; } } /** * @brief Decompose a time series into trend, seasonal, and residual components * Implementation of STL (Seasonal and Trend decomposition using Loess) * * @param values Time series data * @param length Number of elements in the array * @param seasonality Length of seasonal cycle (e.g., 7 for weekly, 12 for monthly) * @param trend Output array for trend component (size = length) * @param seasonal Output array for seasonal component (size = length) * @param residual Output array for residual component (size = length) * @return bool True if successful, false if error occurred */ bool decomposeTimeSeries(const double* values, int length, int seasonality, double* trend, double* seasonal, double* residual) { if (length <= 2 * seasonality || seasonality <= 1) return false; // Calculate trend with centered moving average for (int i = 0; i < length; i++) { trend[i] = 0; } int halfSeason = seasonality / 2; // Centered moving average for trend for (int i = halfSeason; i < length - halfSeason; i++) { double sum = 0; for (int j = i - halfSeason; j <= i + halfSeason; j++) { sum += values[j]; } trend[i] = sum / seasonality; } // Extrapolate trend at boundaries // Left boundary double slope = (trend[halfSeason + 5] - trend[halfSeason]) / 5; for (int i = 0; i < halfSeason; i++) { trend[i] = trend[halfSeason] - (halfSeason - i) * slope; } // Right boundary slope = (trend[length - halfSeason - 1] - trend[length - halfSeason - 6]) / 5; for (int i = length - halfSeason; i < length; i++) { trend[i] = trend[length - halfSeason - 1] + (i - (length - halfSeason - 1)) * slope; } // Calculate detrended series std::vector detrended(length); for (int i = 0; i < length; i++) { detrended[i] = values[i] - trend[i]; } // Calculate seasonal component by averaging the detrended values across seasons std::vector seasonalAvg(seasonality, 0); std::vector seasonalCounts(seasonality, 0); for (int i = 0; i < length; i++) { int seasonalIndex = i % seasonality; seasonalAvg[seasonalIndex] += detrended[i]; seasonalCounts[seasonalIndex]++; } for (int i = 0; i < seasonality; i++) { if (seasonalCounts[i] > 0) { seasonalAvg[i] /= seasonalCounts[i]; } } // Normalize seasonal component to sum to zero double avgSeasonal = 0; for (int i = 0; i < seasonality; i++) { avgSeasonal += seasonalAvg[i]; } avgSeasonal /= seasonality; for (int i = 0; i < seasonality; i++) { seasonalAvg[i] -= avgSeasonal; } // Apply seasonal component to entire series for (int i = 0; i < length; i++) { seasonal[i] = seasonalAvg[i % seasonality]; } // Calculate residual component for (int i = 0; i < length; i++) { residual[i] = values[i] - trend[i] - seasonal[i]; } return true; } /** * @brief Calculate partial autocorrelation function for a time series * * @param values Time series data * @param length Number of elements in the array * @param maxLag Maximum lag to calculate * @param pacf Pre-allocated array to store results (size = maxLag + 1) * @return int Number of valid PACF values calculated */ int calculatePACF(const double* values, int length, int maxLag, double* pacf) { if (length <= 1 || maxLag <= 0 || maxLag >= length) return 0; // Allocate Yule-Walker matrices std::vector> phi(maxLag + 1, std::vector(maxLag + 1, 0)); // Calculate autocorrelations std::vector acf(maxLag + 1, 0); acf[0] = 1.0; // ACF at lag 0 is always 1 for (int k = 1; k <= maxLag; k++) { acf[k] = calculateAutocorrelation(values, length, k); } // Set PACF at lag 0 to 1 pacf[0] = 1.0; // Calculate PACF using Levinson-Durbin recursion for (int k = 1; k <= maxLag; k++) { // Initialize for this order double numerator = acf[k]; for (int j = 1; j < k; j++) { numerator -= phi[k-1][j] * acf[k-j]; } double denominator = 1.0; for (int j = 1; j < k; j++) { denominator -= phi[k-1][j] * acf[j]; } if (std::abs(denominator) < 1e-10) { // If denominator is close to zero, set PACF to 0 phi[k][k] = 0; } else { phi[k][k] = numerator / denominator; } // Update remaining coefficients for (int j = 1; j < k; j++) { phi[k][j] = phi[k-1][j] - phi[k][k] * phi[k-1][k-j]; } // Store the PACF value pacf[k] = phi[k][k]; } return maxLag + 1; } /** * @brief Perform k-means clustering on multivariate data * * @param data 2D array of data points [n_samples x n_features] * @param nSamples Number of data points * @param nFeatures Number of features per data point * @param k Number of clusters * @param maxIter Maximum number of iterations * @param centroids Output array for cluster centroids [k x n_features] * @param assignments Output array for cluster assignments [n_samples] * @return int Number of iterations performed */ int kMeansClustering(const double** data, int nSamples, int nFeatures, int k, int maxIter, double** centroids, int* assignments) { if (nSamples < k || k <= 0 || nFeatures <= 0) return 0; std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<> distrib(0, nSamples - 1); // Initialize centroids using k-means++ initialization std::vector centroidIndices; std::vector minDistances(nSamples, std::numeric_limits::max()); // Choose first centroid randomly int firstCentroid = distrib(gen); centroidIndices.push_back(firstCentroid); // Choose remaining centroids for (int c = 1; c < k; c++) { // Update distances to nearest centroid for (int i = 0; i < nSamples; i++) { double dist = 0; for (int j = 0; j < nFeatures; j++) { double diff = data[i][j] - data[centroidIndices.back()][j]; dist += diff * diff; } minDistances[i] = std::min(minDistances[i], dist); } // Calculate sum of squared distances double sumSquaredDist = 0; for (int i = 0; i < nSamples; i++) { sumSquaredDist += minDistances[i]; } // Choose next centroid with probability proportional to D² double threshold = (sumSquaredDist * static_cast(rand()) / RAND_MAX); double cumulativeProb = 0; int nextCentroid = 0; for (int i = 0; i < nSamples; i++) { cumulativeProb += minDistances[i]; if (cumulativeProb >= threshold) { nextCentroid = i; break; } } centroidIndices.push_back(nextCentroid); } // Copy initial centroids for (int i = 0; i < k; i++) { for (int j = 0; j < nFeatures; j++) { centroids[i][j] = data[centroidIndices[i]][j]; } } // Perform k-means iterations int iterations = 0; bool converged = false; while (!converged && iterations < maxIter) { // Assign points to nearest centroid converged = true; for (int i = 0; i < nSamples; i++) { double minDist = std::numeric_limits::max(); int bestCluster = 0; for (int c = 0; c < k; c++) { double dist = 0; for (int j = 0; j < nFeatures; j++) { double diff = data[i][j] - centroids[c][j]; dist += diff * diff; } if (dist < minDist) { minDist = dist; bestCluster = c; } } if (assignments[i] != bestCluster) { assignments[i] = bestCluster; converged = false; } } // Update centroids std::vector> newCentroids(k, std::vector(nFeatures, 0)); std::vector clusterSizes(k, 0); for (int i = 0; i < nSamples; i++) { int cluster = assignments[i]; clusterSizes[cluster]++; for (int j = 0; j < nFeatures; j++) { newCentroids[cluster][j] += data[i][j]; } } for (int c = 0; c < k; c++) { if (clusterSizes[c] > 0) { for (int j = 0; j < nFeatures; j++) { centroids[c][j] = newCentroids[c][j] / clusterSizes[c]; } } } iterations++; } return iterations; } /** * @brief Calculate the silhouette coefficient for clustering validation * * @param data 2D array of data points [n_samples x n_features] * @param nSamples Number of data points * @param nFeatures Number of features per data point * @param assignments Cluster assignments for each point * @param k Number of clusters * @return double Average silhouette coefficient (-1 to 1) */ double calculateSilhouetteCoefficient(const double** data, int nSamples, int nFeatures, const int* assignments, int k) { if (nSamples <= k || k <= 1) return 0; std::vector silhouettes(nSamples); // For each point for (int i = 0; i < nSamples; i++) { int cluster_i = assignments[i]; // Calculate a(i) - average distance to points in same cluster double a_i = 0; int count_same_cluster = 0; for (int j = 0; j < nSamples; j++) { if (j != i && assignments[j] == cluster_i) { double dist = 0; for (int f = 0; f < nFeatures; f++) { double diff = data[i][f] - data[j][f]; dist += diff * diff; } dist = std::sqrt(dist); a_i += dist; count_same_cluster++; } } if (count_same_cluster > 0) { a_i /= count_same_cluster; } else { a_i = 0; // Singleton cluster } // Calculate b(i) - minimum average distance to points in different clusters double b_i = std::numeric_limits::max(); for (int c = 0; c < k; c++) { if (c == cluster_i) continue; double avg_dist = 0; int count_diff_cluster = 0; for (int j = 0; j < nSamples; j++) { if (assignments[j] == c) { double dist = 0; for (int f = 0; f < nFeatures; f++) { double diff = data[i][f] - data[j][f]; dist += diff * diff; } dist = std::sqrt(dist); avg_dist += dist; count_diff_cluster++; } } if (count_diff_cluster > 0) { avg_dist /= count_diff_cluster; b_i = std::min(b_i, avg_dist); } } // Calculate silhouette if (count_same_cluster > 0 && b_i < std::numeric_limits::max()) { silhouettes[i] = (b_i - a_i) / std::max(a_i, b_i); } else { silhouettes[i] = 0; // Handle edge cases } } // Calculate average silhouette double avg_silhouette = 0; for (int i = 0; i < nSamples; i++) { avg_silhouette += silhouettes[i]; } return avg_silhouette / nSamples; }