first commit - migrated from codeberg
Commit 5ead03e1f7
567 changed files with 102721 additions and 0 deletions
native/statistics/anomaly_detection.cpp (new file, 494 lines)
@@ -0,0 +1,494 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// anomaly_detection.cpp
|
||||
// Implementation of anomaly and outlier detection functions
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
/**
|
||||
* @brief Detect anomalies in time series data
|
||||
*
|
||||
* @param timeSeries Time series data
|
||||
* @param dataLength Length of time series
|
||||
* @param threshold Z-score threshold to consider a point an anomaly
|
||||
* @param dates Array of dates corresponding to time series points
|
||||
* @param factorName Name of the factor being analyzed
|
||||
* @return AnomalyResult* Array of detected anomalies
|
||||
*/
|
||||
AnomalyResult* detect_anomalies(const double* timeSeries,
|
||||
int dataLength,
|
||||
double threshold,
|
||||
const DateStruct* dates,
|
||||
const char* factorName) {
|
||||
if (dataLength < 5) {
|
||||
AnomalyResult* dummy = new AnomalyResult[1];
|
||||
memset(dummy, 0, sizeof(AnomalyResult));
|
||||
dummy[0].dataPointIndex = -1; // Mark as invalid
|
||||
return dummy;
|
||||
}
|
||||
|
||||
// Constants
|
||||
const int MAX_RESULTS = 100;
|
||||
|
||||
// Allocate space for results
|
||||
AnomalyResult* results = new AnomalyResult[MAX_RESULTS + 1];
|
||||
int resultCount = 0;
|
||||
|
||||
// Decompose time series if enough data
|
||||
std::vector<double> trend(dataLength);
|
||||
std::vector<double> seasonal(dataLength);
|
||||
std::vector<double> residual(dataLength);
|
||||
|
||||
bool hasDecomposition = false;
|
||||
int seasonality = 0;
|
||||
|
||||
// Try to detect seasonality for decomposition
|
||||
double maxAutocorr = 0;
|
||||
for (int lag = 2; lag <= dataLength/3; lag++) {
|
||||
double acf = calculateAutocorrelation(timeSeries, dataLength, lag);
|
||||
if (acf > 0.3 && acf > maxAutocorr) {
|
||||
maxAutocorr = acf;
|
||||
seasonality = lag;
|
||||
}
|
||||
}
|
||||
|
||||
if (seasonality >= 2) {
|
||||
hasDecomposition = decomposeTimeSeries(timeSeries, dataLength, seasonality,
|
||||
trend.data(), seasonal.data(), residual.data());
|
||||
}
|
||||
|
||||
if (!hasDecomposition) {
|
||||
// Simple moving average for trend if decomposition failed
|
||||
int windowSize = std::min(7, dataLength/3);
|
||||
if (windowSize < 2) windowSize = 2;
|
||||
|
||||
calculateMovingAverage(timeSeries, dataLength, windowSize, trend.data());
|
||||
|
||||
// Residuals = original - trend
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
seasonal[i] = 0; // No seasonal component
|
||||
residual[i] = timeSeries[i] - trend[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate residual statistics for outlier detection
|
||||
double mean = 0, sumSquared = 0;
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
mean += residual[i];
|
||||
}
|
||||
mean /= dataLength;
|
||||
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
double diff = residual[i] - mean;
|
||||
sumSquared += diff * diff;
|
||||
}
|
||||
double stdDev = sqrt(sumSquared / dataLength);
|
||||
|
||||
if (stdDev <= 0) {
|
||||
results[0].dataPointIndex = -1;
|
||||
return results;
|
||||
}
|
||||
|
||||
// Detect global outliers using z-score
|
||||
for (int i = 0; i < dataLength && resultCount < MAX_RESULTS; i++) {
|
||||
double zScore = (residual[i] - mean) / stdDev;
|
||||
|
||||
if (std::abs(zScore) > threshold) {
|
||||
AnomalyResult& anomaly = results[resultCount++];
|
||||
|
||||
anomaly.dataPointIndex = i;
|
||||
anomaly.anomalyScore = std::abs(zScore);
|
||||
anomaly.originalValue = timeSeries[i];
|
||||
anomaly.expectedValue = trend[i] + seasonal[i] + mean;
|
||||
|
||||
// Higher confidence for more extreme anomalies
|
||||
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(zScore) - threshold) / 5.0);
|
||||
|
||||
// Copy date if available
|
||||
if (dates != nullptr) {
|
||||
anomaly.date = dates[i];
|
||||
} else {
|
||||
memset(&anomaly.date, 0, sizeof(DateStruct));
|
||||
}
|
||||
|
||||
// Copy factor name
|
||||
if (factorName != nullptr) {
|
||||
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
|
||||
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
|
||||
} else {
|
||||
anomaly.factorName[0] = '\0';
|
||||
}
|
||||
|
||||
// Set anomaly type
|
||||
anomaly.anomalyType = ANOMALY_OUTLIER;
|
||||
|
||||
// Generate description
|
||||
if (zScore > 0) {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Unusually high value (%.2f standard deviations above expected)",
|
||||
zScore);
|
||||
} else {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Unusually low value (%.2f standard deviations below expected)",
|
||||
-zScore);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Detect context-based anomalies using local statistics
|
||||
const int LOCAL_WINDOW = std::min(7, dataLength/5);
|
||||
if (LOCAL_WINDOW >= 3) {
|
||||
for (int i = LOCAL_WINDOW; i < dataLength - LOCAL_WINDOW && resultCount < MAX_RESULTS; i++) {
|
||||
// Calculate local statistics
|
||||
double localSum = 0, localSumSquared = 0;
|
||||
for (int j = i - LOCAL_WINDOW; j <= i + LOCAL_WINDOW; j++) {
|
||||
if (j != i) { // Exclude the point itself
|
||||
localSum += timeSeries[j];
|
||||
localSumSquared += timeSeries[j] * timeSeries[j];
|
||||
}
|
||||
}
|
||||
|
||||
double localMean = localSum / (2 * LOCAL_WINDOW);
|
||||
double localVar = localSumSquared / (2 * LOCAL_WINDOW) - localMean * localMean;
|
||||
double localStdDev = sqrt(std::max(localVar, 1e-6)); // Prevent division by zero
|
||||
|
||||
// Calculate local z-score
|
||||
double localZScore = (timeSeries[i] - localMean) / localStdDev;
|
||||
|
||||
// Check if it's a local anomaly but not already a global anomaly
|
||||
if (std::abs(localZScore) > threshold * 1.2) {
|
||||
// Check if this point was already detected as a global anomaly
|
||||
bool alreadyDetected = false;
|
||||
for (int j = 0; j < resultCount; j++) {
|
||||
if (results[j].dataPointIndex == i) {
|
||||
alreadyDetected = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!alreadyDetected) {
|
||||
AnomalyResult& anomaly = results[resultCount++];
|
||||
|
||||
anomaly.dataPointIndex = i;
|
||||
anomaly.anomalyScore = std::abs(localZScore);
|
||||
anomaly.originalValue = timeSeries[i];
|
||||
anomaly.expectedValue = localMean;
|
||||
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(localZScore) - threshold) / 5.0);
|
||||
|
||||
// Copy date if available
|
||||
if (dates != nullptr) {
|
||||
anomaly.date = dates[i];
|
||||
} else {
|
||||
memset(&anomaly.date, 0, sizeof(DateStruct));
|
||||
}
|
||||
|
||||
// Copy factor name
|
||||
if (factorName != nullptr) {
|
||||
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
|
||||
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
|
||||
} else {
|
||||
anomaly.factorName[0] = '\0';
|
||||
}
|
||||
|
||||
// Set anomaly type
|
||||
anomaly.anomalyType = ANOMALY_CONTEXTUAL;
|
||||
|
||||
// Generate description
|
||||
if (localZScore > 0) {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Context anomaly: value is high compared to local neighborhood (%.2f local std dev)",
|
||||
localZScore);
|
||||
} else {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Context anomaly: value is low compared to local neighborhood (%.2f local std dev)",
|
||||
-localZScore);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Detect trend changes
|
||||
if (dataLength >= 10) {
|
||||
for (int i = 5; i < dataLength - 5 && resultCount < MAX_RESULTS; i++) {
|
||||
// Calculate slope before and after
|
||||
double slopeBefore = 0, slopeAfter = 0;
|
||||
double interceptBefore = 0, interceptAfter = 0;
|
||||
double r2Before = 0, r2After = 0;
|
||||
|
||||
// Create time vectors
|
||||
std::vector<double> time1(5), time2(5);
|
||||
std::vector<double> values1(5), values2(5);
|
||||
|
||||
for (int j = 0; j < 5; j++) {
|
||||
time1[j] = j;
|
||||
time2[j] = j;
|
||||
values1[j] = timeSeries[i - 5 + j];
|
||||
values2[j] = timeSeries[i + j];
|
||||
}
|
||||
|
||||
bool validBefore = calculateLinearRegression(time1.data(), values1.data(), 5,
|
||||
slopeBefore, interceptBefore, r2Before);
|
||||
bool validAfter = calculateLinearRegression(time2.data(), values2.data(), 5,
|
||||
slopeAfter, interceptAfter, r2After);
|
||||
|
||||
if (validBefore && validAfter) {
|
||||
// Check for significant change in slope
|
||||
double slopeChange = slopeAfter - slopeBefore;
|
||||
double meanSlope = (std::abs(slopeBefore) + std::abs(slopeAfter)) / 2;
|
||||
|
||||
if (meanSlope > 0 && std::abs(slopeChange) / meanSlope > 0.5) {
|
||||
AnomalyResult& anomaly = results[resultCount++];
|
||||
|
||||
anomaly.dataPointIndex = i;
|
||||
anomaly.anomalyScore = std::abs(slopeChange) / (meanSlope + 1e-6);
|
||||
anomaly.originalValue = timeSeries[i];
|
||||
anomaly.expectedValue = timeSeries[i]; // Same value, change is in the trend
|
||||
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, anomaly.anomalyScore / 2.0);
|
||||
|
||||
// Copy date if available
|
||||
if (dates != nullptr) {
|
||||
anomaly.date = dates[i];
|
||||
} else {
|
||||
memset(&anomaly.date, 0, sizeof(DateStruct));
|
||||
}
|
||||
|
||||
// Copy factor name
|
||||
if (factorName != nullptr) {
|
||||
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
|
||||
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
|
||||
} else {
|
||||
anomaly.factorName[0] = '\0';
|
||||
}
|
||||
|
||||
// Set anomaly type
|
||||
anomaly.anomalyType = ANOMALY_TREND_CHANGE;
|
||||
|
||||
// Generate description
|
||||
if (slopeBefore < 0 && slopeAfter > 0) {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Trend reversal: changed from decreasing (%.2f/day) to increasing (%.2f/day)",
|
||||
-slopeBefore, slopeAfter);
|
||||
} else if (slopeBefore > 0 && slopeAfter < 0) {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Trend reversal: changed from increasing (%.2f/day) to decreasing (%.2f/day)",
|
||||
slopeBefore, -slopeAfter);
|
||||
} else if (slopeAfter > slopeBefore) {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Trend acceleration: rate of change increased from %.2f/day to %.2f/day",
|
||||
slopeBefore, slopeAfter);
|
||||
} else {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Trend deceleration: rate of change decreased from %.2f/day to %.2f/day",
|
||||
slopeBefore, slopeAfter);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort anomalies by score (most significant first)
|
||||
std::sort(results, results + resultCount,
|
||||
[](const AnomalyResult& a, const AnomalyResult& b) {
|
||||
return a.anomalyScore > b.anomalyScore;
|
||||
});
|
||||
|
||||
// Mark the end of valid results
|
||||
results[resultCount].dataPointIndex = -1;
|
||||
|
||||
return results;
|
||||
}
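// --- Illustrative usage sketch (not part of the original commit) ---
// A minimal example of how a caller might consume detect_anomalies(), relying
// on the terminator convention used above (dataPointIndex == -1 marks the end
// of valid results). The sample series, threshold, and factor name are
// assumptions chosen only for illustration.
static void example_detect_anomalies_usage() {
    double series[] = {1.0, 1.1, 0.9, 1.0, 5.0, 1.05, 0.95, 1.0, 1.1, 0.9};
    int n = sizeof(series) / sizeof(series[0]);

    AnomalyResult* anomalies = detect_anomalies(series, n, 3.0, nullptr, "heart_rate");
    for (int i = 0; anomalies[i].dataPointIndex != -1; i++) {
        // anomalies[i].anomalyScore, anomalies[i].anomalyType and
        // anomalies[i].description describe each flagged point
    }
    free_anomaly_results(anomalies);  // caller owns the returned array
}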
|
||||
|
||||
/**
|
||||
* @brief Free memory for anomaly detection results
|
||||
*
|
||||
* @param results Pointer to anomaly results array
|
||||
*/
|
||||
void free_anomaly_results(AnomalyResult* results) {
|
||||
delete[] results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Analyze patterns related to dates (e.g., day of week effects)
|
||||
*
|
||||
* @param values Array of values
|
||||
* @param dates Array of corresponding dates
|
||||
* @param data_length Length of the arrays
|
||||
* @param factor_name Name of the factor being analyzed
|
||||
* @return DatePatternResult* Array of detected patterns
|
||||
*/
|
||||
DatePatternResult* analyze_date_patterns(
|
||||
const double* values,
|
||||
const DateStruct* dates,
|
||||
int data_length,
|
||||
const char* factor_name) {
|
||||
|
||||
// Allocate space for results (3 patterns max + terminator)
|
||||
DatePatternResult* results = new DatePatternResult[4];
|
||||
memset(results, 0, 4 * sizeof(DatePatternResult));
|
||||
|
||||
// Initialize terminator
|
||||
results[0].patternType = PATTERN_NONE;
|
||||
|
||||
// Return empty result for insufficient data
|
||||
if (data_length < 14) {
|
||||
return results;
|
||||
}
|
||||
|
||||
// Count occurrences by day of week
|
||||
double dayOfWeekSums[7] = {0};
|
||||
int dayOfWeekCounts[7] = {0};
|
||||
|
||||
for (int i = 0; i < data_length; i++) {
|
||||
// Convert date to day of week (0 = Sunday, 6 = Saturday)
|
||||
// This is a simplified calculation and might need adjustment
|
||||
int year = dates[i].year;
|
||||
int month = dates[i].month;
|
||||
int day = dates[i].day;
|
||||
|
||||
// Zeller's congruence for finding day of week
|
||||
if (month < 3) {
|
||||
month += 12;
|
||||
year--;
|
||||
}
|
||||
int h = (day + (13 * (month + 1)) / 5 + year + year / 4 - year / 100 + year / 400) % 7;
|
||||
// Convert to 0-based where 0 is Sunday
|
||||
int dayOfWeek = (h + 6) % 7;
|
||||
|
||||
dayOfWeekSums[dayOfWeek] += values[i];
|
||||
dayOfWeekCounts[dayOfWeek]++;
|
||||
}
|
||||
|
||||
// Calculate average by day of week
|
||||
double dayOfWeekAvgs[7] = {0};
|
||||
for (int i = 0; i < 7; i++) {
|
||||
if (dayOfWeekCounts[i] > 0) {
|
||||
dayOfWeekAvgs[i] = dayOfWeekSums[i] / dayOfWeekCounts[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Find peak day
|
||||
int peakDay = 0;
|
||||
double peakValue = dayOfWeekAvgs[0];
|
||||
for (int i = 1; i < 7; i++) {
|
||||
if (dayOfWeekAvgs[i] > peakValue) {
|
||||
peakValue = dayOfWeekAvgs[i];
|
||||
peakDay = i;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate variance to determine if there's a weekly pattern
|
||||
double mean = 0;
|
||||
for (int i = 0; i < 7; i++) {
|
||||
if (dayOfWeekCounts[i] > 0) {
|
||||
mean += dayOfWeekAvgs[i];
|
||||
}
|
||||
}
|
||||
mean /= 7;
|
||||
|
||||
double variance = 0;
|
||||
for (int i = 0; i < 7; i++) {
|
||||
if (dayOfWeekCounts[i] > 0) {
|
||||
double diff = dayOfWeekAvgs[i] - mean;
|
||||
variance += diff * diff;
|
||||
}
|
||||
}
|
||||
variance /= 7;
|
||||
|
||||
// Calculate pattern strength
|
||||
double strength = std::min(1.0, variance / (mean * mean + 0.001));
|
||||
|
||||
// Only report if pattern is significant
|
||||
if (strength > 0.1) {
|
||||
results[0].patternType = PATTERN_WEEKLY;
|
||||
results[0].periodicity = 7;
|
||||
results[0].strength = strength;
|
||||
results[0].peakDayOfWeek = peakDay;
|
||||
|
||||
// Copy peak values
|
||||
for (int i = 0; i < 7; i++) {
|
||||
results[0].peakValues[i] = dayOfWeekAvgs[i];
|
||||
}
|
||||
|
||||
// Generate description
|
||||
const char* dayNames[] = {"Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"};
|
||||
snprintf(results[0].description, MAX_STRING_SIZE,
|
||||
"Weekly pattern detected with peak on %s (strength: %.2f)",
|
||||
dayNames[peakDay], strength);
|
||||
}
|
||||
|
||||
return results;
|
||||
}
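// --- Illustrative check (not part of the original commit) ---
// A minimal sketch verifying the Zeller's congruence variant used above: with
// the March-based month shift, h == 0 corresponds to Saturday, and (h + 6) % 7
// maps Sunday to 0. The reference date below is a known fixed point.
static int example_day_of_week(int year, int month, int day) {
    if (month < 3) { month += 12; year--; }
    int h = (day + (13 * (month + 1)) / 5 + year + year / 4 - year / 100 + year / 400) % 7;
    return (h + 6) % 7;  // 0 = Sunday ... 6 = Saturday
}
// example_day_of_week(2000, 1, 1) == 6, i.e. 1 January 2000 was a Saturday.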
|
||||
|
||||
/**
|
||||
* @brief Free memory for date pattern results
|
||||
*
|
||||
* @param results Pointer to date pattern results array
|
||||
*/
|
||||
void free_date_pattern_results(DatePatternResult* results) {
|
||||
delete[] results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Analyze cyclical patterns in time series data
|
||||
*
|
||||
* @param values Array of values
|
||||
* @param dates Array of corresponding dates
|
||||
* @param data_length Length of the arrays
|
||||
* @param factor_name Name of the factor being analyzed
|
||||
* @return CycleAnalysisResult Structure containing cycle analysis results
|
||||
*/
|
||||
CycleAnalysisResult analyze_cycles(
|
||||
const double* values,
|
||||
const DateStruct* dates,
|
||||
int data_length,
|
||||
const char* factor_name) {
|
||||
|
||||
CycleAnalysisResult result;
|
||||
memset(&result, 0, sizeof(CycleAnalysisResult));
|
||||
|
||||
// Minimum data points required for cycle analysis
|
||||
if (data_length < 20) {
|
||||
strncpy(result.description, "Insufficient data for cycle analysis", MAX_STRING_SIZE-1);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Simple autocorrelation-based cycle detection
|
||||
// Find the peak in autocorrelation function after lag 0
|
||||
int maxLag = data_length / 3; // Look for cycles up to 1/3 of data length
|
||||
double maxCorr = 0;
|
||||
int bestLag = 0;
|
||||
|
||||
for (int lag = 2; lag < maxLag; lag++) {
|
||||
double sum = 0;
|
||||
double count = 0;
|
||||
|
||||
for (int i = 0; i < data_length - lag; i++) {
|
||||
sum += values[i] * values[i + lag];
|
||||
count++;
|
||||
}
|
||||
|
||||
double corr = sum / count;
|
||||
|
||||
if (corr > maxCorr) {
|
||||
maxCorr = corr;
|
||||
bestLag = lag;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate average cycle length in days
|
||||
if (bestLag > 0 && maxCorr > 0.2) {
|
||||
result.cycleLength = bestLag;
|
||||
result.amplitude = maxCorr;
|
||||
result.confidence = maxCorr;
|
||||
result.cycleLengthVariance = bestLag * 0.2; // Simple estimation
|
||||
|
||||
snprintf(result.description, MAX_STRING_SIZE,
|
||||
"Cycle detected with approximate length of %d days (confidence: %.2f)",
|
||||
bestLag, maxCorr);
|
||||
} else {
|
||||
strncpy(result.description, "No significant cycle detected", MAX_STRING_SIZE-1);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
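// --- Illustrative sketch (not part of the original commit) ---
// The cycle detection above scores lags with raw products, so the 0.2
// threshold depends on the series' offset and scale. A mean-centered,
// variance-normalized autocorrelation would make that threshold
// scale-independent; a minimal version under that assumption:
static double example_centered_autocorrelation(const double* x, int n, int lag) {
    double mean = 0;
    for (int i = 0; i < n; i++) mean += x[i];
    mean /= n;

    double num = 0, den = 0;
    for (int i = 0; i < n; i++) den += (x[i] - mean) * (x[i] - mean);
    for (int i = 0; i + lag < n; i++) num += (x[i] - mean) * (x[i + lag] - mean);
    return (den > 1e-12) ? num / den : 0.0;  // roughly in [-1, 1]
}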
|
native/statistics/arm64_fixes.h (new file, 9 lines)
@@ -0,0 +1,9 @@
#pragma once
|
||||
|
||||
#ifdef __aarch64__
|
||||
// Fix uintptr_t definition for ARM64
|
||||
#include <stdint.h>
|
||||
// We need to undefine and redefine uintptr_t to ensure it's 64-bit on ARM64
|
||||
#undef uintptr_t
|
||||
typedef uint64_t uintptr_t;
|
||||
#endif
|
native/statistics/basic_stats.cpp (new file, 47 lines)
@@ -0,0 +1,47 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// basic_stats.cpp
|
||||
// Implementation of basic statistical analysis functions
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
#include "utils.h"
|
||||
/**
|
||||
* @brief Calculate basic statistical properties of a data series
|
||||
*
|
||||
* @param values Pointer to array of values
|
||||
* @param length Number of elements in the array
|
||||
* @return BasicStats Structure containing calculated statistics
|
||||
*/
|
||||
BasicStats calculate_basic_stats(const double* values, int length) {
|
||||
BasicStats stats;
|
||||
|
||||
if (length == 0) {
|
||||
memset(&stats, 0, sizeof(BasicStats));
|
||||
return stats;
|
||||
}
|
||||
|
||||
// Create a copy for calculations that require sorting
|
||||
std::vector<double> sorted(values, values + length);
|
||||
std::sort(sorted.begin(), sorted.end());
|
||||
|
||||
// Calculate basic statistics
|
||||
stats.mean = calculateMean(values, length);
|
||||
stats.variance = calculateVariance(values, length, stats.mean);
|
||||
stats.stdDev = std::sqrt(stats.variance);
|
||||
stats.min = sorted.front();
|
||||
stats.max = sorted.back();
|
||||
stats.median = (length % 2 == 0) ?
|
||||
(sorted[length/2 - 1] + sorted[length/2]) / 2.0 : sorted[length/2];
|
||||
|
||||
// Calculate quartiles and IQR
|
||||
stats.q1 = calculateQuantile(sorted.data(), length, 0.25);
|
||||
stats.q3 = calculateQuantile(sorted.data(), length, 0.75);
|
||||
stats.iqr = stats.q3 - stats.q1;
|
||||
|
||||
// Calculate higher-order statistics
|
||||
stats.skewness = calculateSkewness(values, length, stats.mean, stats.stdDev);
|
||||
stats.kurtosis = calculateKurtosis(values, length, stats.mean, stats.stdDev);
|
||||
|
||||
return stats;
|
||||
}
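// --- Illustrative usage sketch (not part of the original commit) ---
// Minimal example of calling calculate_basic_stats() from C++. The sample
// values are assumptions; the expected mean assumes calculateMean() is the
// arithmetic mean, and the median follows directly from the code above.
static void example_basic_stats_usage() {
    double values[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    BasicStats s = calculate_basic_stats(values, 10);
    // s.mean == 5.5, s.median == 5.5, s.min == 1, s.max == 10
    (void)s;
}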
|
native/statistics/clustering.cpp (new file, 203 lines)
@@ -0,0 +1,203 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// clustering.cpp
|
||||
// Implementation of clustering and pattern recognition functions
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
#include "utils.h"
|
||||
/**
|
||||
* @brief Perform cluster analysis on multivariate health data
|
||||
*
|
||||
* @param data 2D array of data points [n_samples x n_features]
|
||||
* @param factorCount Number of features per data point
|
||||
* @param dataLength Number of data points
|
||||
* @param maxClusters Maximum number of clusters to identify
|
||||
* @return ClusterResult* Array of cluster results
|
||||
*/
|
||||
ClusterResult* perform_cluster_analysis(const double** data,
|
||||
int factorCount,
|
||||
int dataLength,
|
||||
int maxClusters) {
|
||||
if (factorCount < 2 || dataLength < 5 || maxClusters <= 0) {
|
||||
ClusterResult* dummy = new ClusterResult[1];
|
||||
memset(dummy, 0, sizeof(ClusterResult));
|
||||
dummy[0].clusterId = -1; // Mark as invalid
|
||||
return dummy;
|
||||
}
|
||||
|
||||
// Normalize data for better clustering
|
||||
std::vector<std::vector<double>> normalizedData(dataLength, std::vector<double>(factorCount));
|
||||
std::vector<double> means(factorCount, 0);
|
||||
std::vector<double> stdDevs(factorCount, 0);
|
||||
|
||||
// Calculate means
|
||||
for (int j = 0; j < factorCount; j++) {
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
means[j] += data[i][j];
|
||||
}
|
||||
means[j] /= dataLength;
|
||||
}
|
||||
|
||||
// Calculate standard deviations
|
||||
for (int j = 0; j < factorCount; j++) {
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
double diff = data[i][j] - means[j];
|
||||
stdDevs[j] += diff * diff;
|
||||
}
|
||||
stdDevs[j] = sqrt(stdDevs[j] / dataLength);
|
||||
if (stdDevs[j] < 1e-10) stdDevs[j] = 1.0; // Avoid division by zero
|
||||
}
|
||||
|
||||
// Normalize data
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
for (int j = 0; j < factorCount; j++) {
|
||||
normalizedData[i][j] = (data[i][j] - means[j]) / stdDevs[j];
|
||||
}
|
||||
}
|
||||
|
||||
// Find optimal number of clusters (between 2 and maxClusters)
|
||||
int optimalClusters = 2;
|
||||
double bestSilhouette = -1;
|
||||
|
||||
// Arrays for k-means algorithm
|
||||
std::vector<int> assignments(dataLength);
|
||||
std::vector<std::vector<double>> centroids(maxClusters, std::vector<double>(factorCount));
|
||||
std::vector<const double*> normalizedDataPtrs(dataLength);
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
normalizedDataPtrs[i] = normalizedData[i].data();
|
||||
}
|
||||
|
||||
for (int k = 2; k <= maxClusters; k++) {
|
||||
// Run k-means clustering
|
||||
std::vector<int> tempAssignments(dataLength, 0);
|
||||
std::vector<std::vector<double>> tempCentroids(k, std::vector<double>(factorCount, 0));
|
||||
std::vector<double*> centroidPtrs(k);
|
||||
for (int i = 0; i < k; i++) {
|
||||
centroidPtrs[i] = tempCentroids[i].data();
|
||||
}
|
||||
|
||||
kMeansClustering(normalizedDataPtrs.data(), dataLength, factorCount, k,
|
||||
100, centroidPtrs.data(), tempAssignments.data());
|
||||
|
||||
// Calculate silhouette coefficient
|
||||
std::vector<const double*> dataPtrs(dataLength);
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
dataPtrs[i] = data[i]; // Use original data for silhouette
|
||||
}
|
||||
|
||||
double silhouette = calculateSilhouetteCoefficient(
|
||||
dataPtrs.data(), dataLength, factorCount, tempAssignments.data(), k);
|
||||
|
||||
// Update if better silhouette found
|
||||
if (silhouette > bestSilhouette) {
|
||||
bestSilhouette = silhouette;
|
||||
optimalClusters = k;
|
||||
assignments = tempAssignments;
|
||||
centroids = tempCentroids;
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate cluster results (plus one for terminator)
|
||||
ClusterResult* results = new ClusterResult[optimalClusters + 1];
|
||||
|
||||
// Count points in each cluster
|
||||
std::vector<int> clusterSizes(optimalClusters, 0);
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
clusterSizes[assignments[i]]++;
|
||||
}
|
||||
|
||||
// Calculate cluster statistics
|
||||
for (int c = 0; c < optimalClusters; c++) {
|
||||
ClusterResult& cluster = results[c];
|
||||
cluster.clusterId = c;
|
||||
cluster.dataPointCount = clusterSizes[c];
|
||||
|
||||
// Calculate cluster significance based on size and compactness
|
||||
double avgDistance = 0;
|
||||
int count = 0;
|
||||
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
if (assignments[i] == c) {
|
||||
double dist = 0;
|
||||
for (int j = 0; j < factorCount; j++) {
|
||||
double diff = normalizedData[i][j] - centroids[c][j];
|
||||
dist += diff * diff;
|
||||
}
|
||||
avgDistance += sqrt(dist);
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
if (count > 0) {
|
||||
avgDistance /= count;
|
||||
}
|
||||
|
||||
// Higher significance for larger and more compact clusters
|
||||
double sizeFactor = static_cast<double>(count) / dataLength;
|
||||
double compactnessFactor = 1.0 - std::min(1.0, avgDistance / 3.0);
|
||||
cluster.significance = sizeFactor * compactnessFactor;
|
||||
|
||||
// Identify important factors for this cluster
|
||||
std::vector<std::pair<int, double>> factorImportance;
|
||||
|
||||
for (int j = 0; j < factorCount; j++) {
|
||||
// Calculate how different this centroid is from global mean for this factor
|
||||
double differenceFromMean = std::abs(centroids[c][j]);
|
||||
factorImportance.push_back(std::make_pair(j, differenceFromMean));
|
||||
}
|
||||
|
||||
// Sort factors by importance
|
||||
std::sort(factorImportance.begin(), factorImportance.end(),
|
||||
[](const std::pair<int, double>& a, const std::pair<int, double>& b) {
|
||||
return a.second > b.second;
|
||||
});
|
||||
|
||||
        // Store top factors (up to 3)
        int numFactors = std::min(3, factorCount);
        for (int j = 0; j < numFactors; j++) {
            cluster.factorIndices[j] = factorImportance[j].first;
            cluster.factorWeights[j] = std::min(1.0, factorImportance[j].second);
        }

        // Normalize weights to sum to 1, only after all weights have been
        // assigned so that no uninitialized entries are read
        double totalWeight = 0;
        for (int k = 0; k < numFactors; k++) {
            totalWeight += cluster.factorWeights[k];
        }
        if (totalWeight > 0) {
            for (int k = 0; k < numFactors; k++) {
                cluster.factorWeights[k] /= totalWeight;
            }
        }
|
||||
|
||||
// Generate descriptive cluster name based on key factors
|
||||
// In a real implementation, this would use domain knowledge
|
||||
snprintf(cluster.clusterName, MAX_STRING_SIZE,
|
||||
"Cluster %d - %s", c + 1,
|
||||
(cluster.significance > 0.7) ? "High Significance" :
|
||||
(cluster.significance > 0.4) ? "Medium Significance" : "Low Significance");
|
||||
|
||||
// Generate detailed description
|
||||
snprintf(cluster.description, MAX_STRING_SIZE,
|
||||
"Cluster of %d data points characterized by %s factors. Silhouette: %.2f",
|
||||
cluster.dataPointCount,
|
||||
(numFactors > 0) ? "multiple interrelated" : "non-specific",
|
||||
bestSilhouette);
|
||||
}
|
||||
|
||||
// Mark the end of valid results
|
||||
results[optimalClusters].clusterId = -1;
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Free memory for cluster analysis results
|
||||
*
|
||||
* @param results Pointer to cluster results array
|
||||
*/
|
||||
void free_cluster_results(ClusterResult* results) {
|
||||
delete[] results;
|
||||
}
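// --- Illustrative usage sketch (not part of the original commit) ---
// perform_cluster_analysis() takes row pointers (one per data point), so a
// caller typically builds a pointer array over its feature rows. The sample
// data and maxClusters value are assumptions; results end at clusterId == -1.
static void example_cluster_usage() {
    const int points = 6, features = 2;
    double rows[points][features] = {
        {1.0, 1.1}, {0.9, 1.0}, {1.1, 0.9},   // one tight group
        {8.0, 8.2}, {7.9, 8.1}, {8.1, 7.8}    // another tight group
    };
    const double* rowPtrs[points];
    for (int i = 0; i < points; i++) rowPtrs[i] = rows[i];

    ClusterResult* clusters = perform_cluster_analysis(rowPtrs, features, points, 3);
    for (int c = 0; clusters[c].clusterId != -1; c++) {
        // clusters[c].dataPointCount, clusters[c].significance,
        // clusters[c].description summarize each cluster
    }
    free_cluster_results(clusters);
}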
|
native/statistics/correlation.cpp (new file, 369 lines)
@@ -0,0 +1,369 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// correlation.cpp
|
||||
// Implementation of correlation analysis functions
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
#include "utils.h"
|
||||
/**
|
||||
* @brief Find factors with strongest correlation to target variable
|
||||
*
|
||||
* @param target Target variable time series
|
||||
* @param factors Array of factor time series
|
||||
* @param factorNames Array of factor names
|
||||
* @param targetLength Length of target time series
|
||||
* @param factorCount Number of factors
|
||||
* @return CorrelationResult* Array of correlation results (sorted by strength)
|
||||
*/
|
||||
CorrelationResult* find_strongest_correlations(const double* target,
|
||||
const double** factors,
|
||||
const char** factorNames,
|
||||
int targetLength,
|
||||
int factorCount) {
|
||||
// Allocate results array (one more than needed to mark the end)
|
||||
CorrelationResult* results = new CorrelationResult[factorCount + 1];
|
||||
|
||||
// Calculate correlations for each factor
|
||||
for (int i = 0; i < factorCount; i++) {
|
||||
// Calculate both Pearson and Spearman correlations
|
||||
double pearson_corr = calculateCorrelation(target, factors[i], targetLength);
|
||||
double spearman_corr = calculateSpearmanCorrelation(target, factors[i], targetLength);
|
||||
|
||||
// Use the correlation with higher absolute value
|
||||
double corr = (std::abs(pearson_corr) > std::abs(spearman_corr)) ?
|
||||
pearson_corr : spearman_corr;
|
||||
|
||||
results[i].factorIndex = i;
|
||||
results[i].correlation = corr;
|
||||
|
||||
// Estimate p-value based on correlation and sample size
|
||||
// This is an approximation; a real implementation would use t-distribution
|
||||
double t = corr * std::sqrt((targetLength - 2) / (1 - corr * corr));
|
||||
// Simplified 2-tailed p-value approximation
|
||||
results[i].pValue = 2 * (1 - std::min(1.0, std::exp(-0.717 * std::abs(t) - 0.416 * t * t)));
|
||||
|
||||
// Copy factor name
|
||||
strncpy(results[i].factorName, factorNames[i], MAX_STRING_SIZE - 1);
|
||||
results[i].factorName[MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
// Calculate confidence based on sample size, correlation strength, and p-value
|
||||
double sample_size_factor = 1.0 - 1.0 / std::sqrt(targetLength);
|
||||
double p_value_factor = 1.0 - results[i].pValue;
|
||||
results[i].confidence = std::abs(corr) * sample_size_factor * p_value_factor;
|
||||
}
|
||||
|
||||
// Sort by absolute correlation value
|
||||
std::sort(results, results + factorCount,
|
||||
[](const CorrelationResult& a, const CorrelationResult& b) {
|
||||
return std::abs(a.correlation) > std::abs(b.correlation);
|
||||
});
|
||||
|
||||
// Mark the end of valid results
|
||||
results[factorCount].factorIndex = -1;
|
||||
results[factorCount].correlation = 0;
|
||||
results[factorCount].confidence = 0;
|
||||
results[factorCount].factorName[0] = '\0';
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Free memory for correlation results
|
||||
*
|
||||
* @param results Pointer to correlation results array
|
||||
*/
|
||||
void free_correlation_results(CorrelationResult* results) {
|
||||
delete[] results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Find multivariate correlations between factors
|
||||
*
|
||||
* @param data Array of factor time series
|
||||
* @param factorNames Array of factor names
|
||||
* @param factorCount Number of factors
|
||||
* @param dataLength Length of time series
|
||||
* @return MultivariateCorrelation* Array of multivariate correlation results
|
||||
*/
|
||||
MultivariateCorrelation* find_multivariate_correlations(const double** data,
|
||||
const char** factorNames,
|
||||
int factorCount,
|
||||
int dataLength) {
|
||||
if (factorCount < 2 || dataLength < 3) {
|
||||
MultivariateCorrelation* dummy = new MultivariateCorrelation[1];
|
||||
memset(dummy, 0, sizeof(MultivariateCorrelation));
|
||||
dummy[0].factorCount = -1; // Mark as invalid
|
||||
return dummy;
|
||||
}
|
||||
|
||||
// Constants
|
||||
const int MAX_RESULTS = 100;
|
||||
const double MIN_CORRELATION_THRESHOLD = 0.3;
|
||||
|
||||
// Allocate space for up to MAX_RESULTS correlations + 1 terminator
|
||||
MultivariateCorrelation* results = new MultivariateCorrelation[MAX_RESULTS + 1];
|
||||
int resultCount = 0;
|
||||
|
||||
// Create correlation matrix for all pairwise correlations
|
||||
std::vector<std::vector<double>> corrMatrix(factorCount, std::vector<double>(factorCount, 0));
|
||||
|
||||
for (int i = 0; i < factorCount; i++) {
|
||||
corrMatrix[i][i] = 1.0; // Self-correlation is 1
|
||||
|
||||
for (int j = i + 1; j < factorCount; j++) {
|
||||
// Calculate correlation using both Pearson and Spearman
|
||||
double pearson = calculateCorrelation(data[i], data[j], dataLength);
|
||||
double spearman = calculateSpearmanCorrelation(data[i], data[j], dataLength);
|
||||
|
||||
// Use the one with higher absolute value
|
||||
double corr = (std::abs(pearson) > std::abs(spearman)) ? pearson : spearman;
|
||||
|
||||
// Store in both positions (symmetric matrix)
|
||||
corrMatrix[i][j] = corr;
|
||||
corrMatrix[j][i] = corr;
|
||||
}
|
||||
}
|
||||
|
||||
// Find pairwise correlations
|
||||
for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) {
|
||||
for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) {
|
||||
double corr = corrMatrix[i][j];
|
||||
|
||||
// Only store significant correlations
|
||||
if (std::abs(corr) >= MIN_CORRELATION_THRESHOLD) {
|
||||
MultivariateCorrelation& result = results[resultCount];
|
||||
|
||||
result.factorCount = 2;
|
||||
result.correlationStrength = corr;
|
||||
result.primaryFactorIndex = i;
|
||||
result.secondaryFactorIndex = j;
|
||||
result.tertiaryFactorIndex = -1;
|
||||
|
||||
// Calculate confidence based on sample size and correlation strength
|
||||
double t_stat = std::abs(corr) * std::sqrt((dataLength - 2) / (1 - corr * corr));
|
||||
double p_value = 2 * (1 - std::min(1.0, std::exp(-0.717 * t_stat - 0.416 * t_stat * t_stat)));
|
||||
result.confidence = std::abs(corr) * (1.0 - p_value) * (1.0 - 1.0 / std::sqrt(dataLength));
|
||||
|
||||
// Copy factor names
|
||||
strncpy(result.factorNames[0], factorNames[i], MAX_STRING_SIZE - 1);
|
||||
result.factorNames[0][MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
strncpy(result.factorNames[1], factorNames[j], MAX_STRING_SIZE - 1);
|
||||
result.factorNames[1][MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
// Set factor weights based on correlation directionality
|
||||
if (corr > 0) {
|
||||
result.factorWeights[0] = 0.5;
|
||||
result.factorWeights[1] = 0.5;
|
||||
} else {
|
||||
result.factorWeights[0] = 0.5;
|
||||
result.factorWeights[1] = -0.5;
|
||||
}
|
||||
|
||||
// Determine relationship type
|
||||
result.relationshipType = RELATIONSHIP_CORRELATION;
|
||||
|
||||
// Generate description
|
||||
const char* strength_text =
|
||||
(std::abs(corr) > 0.7) ? "strong" :
|
||||
(std::abs(corr) > 0.5) ? "moderate" : "weak";
|
||||
|
||||
const char* direction_text = (corr > 0) ? "positive" : "negative";
|
||||
|
||||
snprintf(result.description, MAX_STRING_SIZE,
|
||||
"There is a %s %s correlation between %s and %s (r=%.2f, p=%.3f)",
|
||||
strength_text, direction_text,
|
||||
factorNames[i], factorNames[j], corr,
|
||||
(result.confidence > 0.99) ? 0.001 : 1.0 - result.confidence);
|
||||
|
||||
resultCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find partial correlations and three-way relationships
|
||||
if (factorCount >= 3) {
|
||||
// Calculate partial correlations
|
||||
std::vector<std::vector<std::vector<double>>> partialCorr(
|
||||
factorCount, std::vector<std::vector<double>>(
|
||||
factorCount, std::vector<double>(factorCount, 0.0)));
|
||||
|
||||
for (int i = 0; i < factorCount; i++) {
|
||||
for (int j = i + 1; j < factorCount; j++) {
|
||||
for (int k = 0; k < factorCount; k++) {
|
||||
if (k == i || k == j) continue;
|
||||
|
||||
// Calculate partial correlation between i and j controlling for k
|
||||
double r_ij = corrMatrix[i][j];
|
||||
double r_ik = corrMatrix[i][k];
|
||||
double r_jk = corrMatrix[j][k];
|
||||
|
||||
double denominator = std::sqrt((1 - r_ik * r_ik) * (1 - r_jk * r_jk));
|
||||
|
||||
if (denominator > 1e-10) {
|
||||
double partial = (r_ij - r_ik * r_jk) / denominator;
|
||||
partialCorr[i][j][k] = partial;
|
||||
partialCorr[j][i][k] = partial;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find three-way relationships
|
||||
for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) {
|
||||
for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) {
|
||||
for (int k = j + 1; k < factorCount && resultCount < MAX_RESULTS; k++) {
|
||||
// Get pairwise correlations
|
||||
double r_ij = corrMatrix[i][j];
|
||||
double r_ik = corrMatrix[i][k];
|
||||
double r_jk = corrMatrix[j][k];
|
||||
|
||||
// Check if all pairs are correlated
|
||||
if (std::abs(r_ij) >= MIN_CORRELATION_THRESHOLD &&
|
||||
std::abs(r_ik) >= MIN_CORRELATION_THRESHOLD &&
|
||||
std::abs(r_jk) >= MIN_CORRELATION_THRESHOLD) {
|
||||
|
||||
// Get partial correlations
|
||||
double p_ij_k = partialCorr[i][j][k]; // i,j controlling for k
|
||||
double p_ik_j = partialCorr[i][k][j]; // i,k controlling for j
|
||||
double p_jk_i = partialCorr[j][k][i]; // j,k controlling for i
|
||||
|
||||
// Determine if mediation or confounding is present
|
||||
bool is_mediation = false;
|
||||
int mediator = -1;
|
||||
|
||||
// Check if k mediates i->j
|
||||
if (std::abs(p_ij_k) < std::abs(r_ij) * 0.5) {
|
||||
is_mediation = true;
|
||||
mediator = k;
|
||||
}
|
||||
// Check if j mediates i->k
|
||||
else if (std::abs(p_ik_j) < std::abs(r_ik) * 0.5) {
|
||||
is_mediation = true;
|
||||
mediator = j;
|
||||
}
|
||||
// Check if i mediates j->k
|
||||
else if (std::abs(p_jk_i) < std::abs(r_jk) * 0.5) {
|
||||
is_mediation = true;
|
||||
mediator = i;
|
||||
}
|
||||
|
||||
MultivariateCorrelation& result = results[resultCount];
|
||||
|
||||
result.factorCount = 3;
|
||||
// Use average correlation as strength
|
||||
result.correlationStrength = (std::abs(r_ij) + std::abs(r_ik) + std::abs(r_jk)) / 3.0;
|
||||
result.primaryFactorIndex = i;
|
||||
result.secondaryFactorIndex = j;
|
||||
result.tertiaryFactorIndex = k;
|
||||
|
||||
// Lower confidence for three-way relationships
|
||||
result.confidence = result.correlationStrength *
|
||||
(1.0 - 2.0 / std::sqrt(dataLength));
|
||||
|
||||
// Copy factor names
|
||||
strncpy(result.factorNames[0], factorNames[i], MAX_STRING_SIZE - 1);
|
||||
result.factorNames[0][MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
strncpy(result.factorNames[1], factorNames[j], MAX_STRING_SIZE - 1);
|
||||
result.factorNames[1][MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
strncpy(result.factorNames[2], factorNames[k], MAX_STRING_SIZE - 1);
|
||||
result.factorNames[2][MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
// Set relationship type
|
||||
if (is_mediation) {
|
||||
result.relationshipType = RELATIONSHIP_MEDIATION;
|
||||
|
||||
// Set weights based on mediation path
|
||||
if (mediator == i) {
|
||||
result.factorWeights[0] = 0.5; // Mediator
|
||||
result.factorWeights[1] = 0.3; // Source
|
||||
result.factorWeights[2] = 0.3; // Target
|
||||
} else if (mediator == j) {
|
||||
result.factorWeights[0] = 0.3; // Source
|
||||
result.factorWeights[1] = 0.5; // Mediator
|
||||
result.factorWeights[2] = 0.3; // Target
|
||||
} else {
|
||||
result.factorWeights[0] = 0.3; // Source
|
||||
result.factorWeights[1] = 0.3; // Target
|
||||
result.factorWeights[2] = 0.5; // Mediator
|
||||
}
|
||||
|
||||
                        // Generate description for mediation, naming the two
                        // factors whose relationship the mediator may explain
                        int sourceIdx = (mediator == i) ? j : i;
                        int targetIdx = (mediator == k) ? j : k;
                        snprintf(result.description, MAX_STRING_SIZE,
                            "Potential mediation detected: %s may mediate the relationship between %s and %s",
                            factorNames[mediator],
                            factorNames[sourceIdx],
                            factorNames[targetIdx]);
|
||||
} else {
|
||||
result.relationshipType = RELATIONSHIP_NETWORK;
|
||||
|
||||
// Equal weights for network relationship
|
||||
result.factorWeights[0] = 0.33;
|
||||
result.factorWeights[1] = 0.33;
|
||||
result.factorWeights[2] = 0.33;
|
||||
|
||||
// Generate description for network
|
||||
snprintf(result.description, MAX_STRING_SIZE,
|
||||
"Network relationship detected between %s, %s, and %s (average correlation: %.2f)",
|
||||
factorNames[i], factorNames[j], factorNames[k],
|
||||
result.correlationStrength);
|
||||
}
|
||||
|
||||
resultCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Mark the end of valid results
|
||||
if (resultCount < MAX_RESULTS) {
|
||||
results[resultCount].factorCount = -1;
|
||||
} else {
|
||||
results[MAX_RESULTS].factorCount = -1;
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Free memory for multivariate correlation results
|
||||
*
|
||||
* @param results Pointer to multivariate correlation results array
|
||||
*/
|
||||
void free_multivariate_correlations(MultivariateCorrelation* results) {
|
||||
delete[] results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Direct API access to correlation calculation
|
||||
*
|
||||
* @param x First data series
|
||||
* @param y Second data series
|
||||
* @param length Length of data series
|
||||
* @return double Correlation coefficient
|
||||
*/
|
||||
double calculate_correlation(const double* x, const double* y, int length) {
|
||||
if (length <= 1) return 0;
|
||||
|
||||
double sum_x = 0, sum_y = 0, sum_xy = 0;
|
||||
double sum_x2 = 0, sum_y2 = 0;
|
||||
|
||||
for (int i = 0; i < length; i++) {
|
||||
sum_x += x[i];
|
||||
sum_y += y[i];
|
||||
sum_xy += x[i] * y[i];
|
||||
sum_x2 += x[i] * x[i];
|
||||
sum_y2 += y[i] * y[i];
|
||||
}
|
||||
|
||||
double denominator = sqrt((length * sum_x2 - sum_x * sum_x) *
|
||||
(length * sum_y2 - sum_y * sum_y));
|
||||
|
||||
if (denominator < 1e-10) return 0; // Avoid division by zero
|
||||
|
||||
return (length * sum_xy - sum_x * sum_y) / denominator;
|
||||
}
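// --- Illustrative check (not part of the original commit) ---
// A small sanity check of calculate_correlation(): a perfectly linear
// relationship gives r = 1, and reversing the second series gives r = -1.
// The sample arrays are assumptions used only for this check.
static void example_correlation_check() {
    double x[] = {1, 2, 3};
    double y_up[] = {2, 4, 6};
    double y_down[] = {6, 4, 2};
    double r1 = calculate_correlation(x, y_up, 3);    // == 1.0
    double r2 = calculate_correlation(x, y_down, 3);  // == -1.0
    (void)r1; (void)r2;
}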
|
native/statistics/health_analytics.cpp (new file, 30 lines)
@@ -0,0 +1,30 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// health_analytics.cpp
|
||||
// Comprehensive C++ analytics engine for health data processing
|
||||
// Provides statistical analysis, pattern detection, and predictive modeling for health metrics
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <numeric>
|
||||
#include <random>
|
||||
#include <limits>
|
||||
|
||||
// Include all module files
|
||||
#include "utils.cpp"
|
||||
#include "basic_stats.cpp"
|
||||
#include "correlation.cpp"
|
||||
#include "time_series.cpp"
|
||||
#include "clustering.cpp"
|
||||
#include "anomaly_detection.cpp"
|
||||
#include "impact_analysis.cpp"
|
native/statistics/health_analytics_engine.h (new file, 316 lines)
@@ -0,0 +1,316 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// HealthAnalyticsEngine.h
|
||||
// Core C++ header for health analytics calculations
|
||||
//
|
||||
#ifndef HEALTH_ANALYTICS_ENGINE_H
|
||||
#define HEALTH_ANALYTICS_ENGINE_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <memory>
|
||||
#include <cstdint>
|
||||
|
||||
// Constants
|
||||
constexpr int MAX_STRING_SIZE = 200;
|
||||
|
||||
// Forward declarations
|
||||
struct DateStruct;
|
||||
struct BasicStats;
|
||||
struct TrendResult;
|
||||
struct CorrelationResult;
|
||||
struct MultivariateCorrelation;
|
||||
struct ClusterResult;
|
||||
struct TimeSeriesForecast;
|
||||
struct AnomalyResult;
|
||||
struct FactorImpactResult;
|
||||
struct DatePatternResult;
|
||||
struct CycleAnalysisResult;
|
||||
struct MedicationImpactAnalysis;
|
||||
struct HormoneImpactAnalysis;
|
||||
|
||||
// Data structures for FFI communication
|
||||
struct DateStruct {
|
||||
int year;
|
||||
int month;
|
||||
int day;
|
||||
};
|
||||
|
||||
struct BasicStats {
|
||||
double mean;
|
||||
double median;
|
||||
double min;
|
||||
double max;
|
||||
double stdDev;
|
||||
double variance;
|
||||
double skewness;
|
||||
double kurtosis;
|
||||
double q1;
|
||||
double q3;
|
||||
double iqr;
|
||||
};
|
||||
|
||||
enum TrendType {
|
||||
TREND_NONE = 0,
|
||||
TREND_INCREASING = 1,
|
||||
TREND_DECREASING = 2,
|
||||
TREND_CYCLIC = 3,
|
||||
TREND_VARIABLE = 4
|
||||
};
|
||||
|
||||
struct TrendResult {
|
||||
TrendType trendType;
|
||||
double strength;
|
||||
char description[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
struct CorrelationResult {
|
||||
int factorIndex;
|
||||
double correlation;
|
||||
double pValue;
|
||||
double confidence;
|
||||
char factorName[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
enum RelationshipType {
|
||||
RELATIONSHIP_CORRELATION = 0,
|
||||
RELATIONSHIP_CAUSATION = 1,
|
||||
RELATIONSHIP_COINCIDENTAL = 2,
|
||||
RELATIONSHIP_MEDIATION = 3,
|
||||
RELATIONSHIP_NETWORK = 4
|
||||
};
|
||||
|
||||
struct MultivariateCorrelation {
|
||||
int factorCount;
|
||||
char factorNames[MAX_STRING_SIZE][MAX_STRING_SIZE];
|
||||
double factorWeights[50]; // Using a constant size
|
||||
double correlationStrength;
|
||||
char description[MAX_STRING_SIZE];
|
||||
double confidence;
|
||||
RelationshipType relationshipType;
|
||||
int primaryFactorIndex;
|
||||
int secondaryFactorIndex;
|
||||
int tertiaryFactorIndex;
|
||||
};
|
||||
|
||||
struct ClusterResult {
|
||||
int clusterId;
|
||||
char clusterName[MAX_STRING_SIZE];
|
||||
char description[MAX_STRING_SIZE];
|
||||
int dataPointCount;
|
||||
double significance;
|
||||
int factorIndices[50]; // Using a constant size
|
||||
double factorWeights[50]; // Using a constant size
|
||||
double centeroid[50]; // Using a constant size
|
||||
double radius;
|
||||
};
|
||||
|
||||
enum TimeUnit {
|
||||
TIME_UNIT_DAYS = 0,
|
||||
TIME_UNIT_WEEKS = 1,
|
||||
TIME_UNIT_MONTHS = 2
|
||||
};
|
||||
|
||||
struct TimeSeriesForecast {
|
||||
double predictions[30];
|
||||
double confidenceIntervals[30][2];
|
||||
double overallConfidence;
|
||||
int seasonalityPeriod;
|
||||
TimeUnit timeUnit;
|
||||
char factorName[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
enum AnomalyType {
|
||||
ANOMALY_OUTLIER = 0,
|
||||
ANOMALY_TREND_CHANGE = 1,
|
||||
ANOMALY_SEASONALITY_CHANGE = 2,
|
||||
ANOMALY_CONTEXTUAL = 3
|
||||
};
|
||||
|
||||
struct AnomalyResult {
|
||||
int dataPointIndex;
|
||||
double anomalyScore;
|
||||
char description[MAX_STRING_SIZE];
|
||||
double originalValue;
|
||||
double expectedValue;
|
||||
DateStruct date;
|
||||
double confidence;
|
||||
AnomalyType anomalyType;
|
||||
char factorName[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
struct FactorImpactResult {
|
||||
int factorIndex;
|
||||
char factorName[MAX_STRING_SIZE];
|
||||
double impactScore;
|
||||
double directEffect;
|
||||
double indirectEffect;
|
||||
double confidence;
|
||||
char mechanism[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
enum PatternType {
|
||||
PATTERN_NONE = 0,
|
||||
PATTERN_DAILY = 1,
|
||||
PATTERN_WEEKLY = 2,
|
||||
PATTERN_MONTHLY = 3,
|
||||
PATTERN_CUSTOM = 4
|
||||
};
|
||||
|
||||
struct DatePatternResult {
|
||||
PatternType patternType;
|
||||
int periodicity;
|
||||
double strength;
|
||||
char description[MAX_STRING_SIZE];
|
||||
double peakValues[7]; // For weekly patterns
|
||||
int peakDayOfWeek; // 0-6, where 0 is Sunday
|
||||
int peakDayOfMonth; // 1-31
|
||||
int peakMonth; // 1-12
|
||||
};
|
||||
|
||||
struct CycleAnalysisResult {
|
||||
double cycleLength;
|
||||
double cycleLengthVariance;
|
||||
double amplitude;
|
||||
double phaseShift;
|
||||
double confidence;
|
||||
char description[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
struct MedicationImpactAnalysis {
|
||||
char medicationName[MAX_STRING_SIZE];
|
||||
double beforeMean;
|
||||
double afterMean;
|
||||
double changeMagnitude;
|
||||
double changeSignificance;
|
||||
double overallImpact;
|
||||
int daysToEffect;
|
||||
char description[MAX_STRING_SIZE];
|
||||
char factorName[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
struct HormoneImpactAnalysis {
|
||||
char hormoneName[MAX_STRING_SIZE];
|
||||
double currentLevel;
|
||||
double optimalLevel;
|
||||
double optimalRangeLower;
|
||||
double optimalRangeUpper;
|
||||
double deviation;
|
||||
double impactOnMood;
|
||||
double impactOnEnergy;
|
||||
double impactOnOtherFactors[50]; // Using a constant size
|
||||
char factorNames[50][MAX_STRING_SIZE]; // Using a constant size
|
||||
char description[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
// Removed the HealthAnalyticsEngine class since it's not implemented
|
||||
|
||||
// C-style API for FFI
|
||||
extern "C" {
|
||||
// Basic statistics
|
||||
BasicStats calculate_basic_stats(const double* values, int length);
|
||||
|
||||
// Trend analysis
|
||||
TrendType detect_trend(const double* values, int length, double* strength_out);
|
||||
|
||||
// Correlation analysis
|
||||
double calculate_correlation(const double* x, const double* y, int length);
|
||||
CorrelationResult* find_strongest_correlations(
|
||||
const double* target_values,
|
||||
const double** factor_values,
|
||||
const char** factor_names,
|
||||
int data_length,
|
||||
int factor_count);
|
||||
void free_correlation_results(CorrelationResult* results);
|
||||
|
||||
// Multivariate analysis
|
||||
MultivariateCorrelation* find_multivariate_correlations(
|
||||
const double** factor_data,
|
||||
const char** factor_names,
|
||||
int factor_count,
|
||||
int data_length);
|
||||
void free_multivariate_correlations(MultivariateCorrelation* correlations);
|
||||
|
||||
// Cluster analysis
|
||||
ClusterResult* perform_cluster_analysis(
|
||||
const double** factor_data,
|
||||
int factor_count,
|
||||
int data_length,
|
||||
int max_clusters);
|
||||
void free_cluster_results(ClusterResult* results);
|
||||
|
||||
// Time series forecasting
|
||||
TimeSeriesForecast predict_time_series(
|
||||
const double* time_series_data,
|
||||
int data_length,
|
||||
int steps_ahead,
|
||||
const char* factor_name);
|
||||
|
||||
// Anomaly detection
|
||||
AnomalyResult* detect_anomalies(
|
||||
const double* time_series_data,
|
||||
int data_length,
|
||||
double threshold,
|
||||
const DateStruct* dates,
|
||||
const char* factor_name);
|
||||
void free_anomaly_results(AnomalyResult* results);
|
||||
|
||||
// Factor impact ranking
|
||||
FactorImpactResult* rank_factor_impacts(
|
||||
const double** factor_data,
|
||||
const double* target_data,
|
||||
const char** factor_names,
|
||||
int factor_count,
|
||||
int data_length);
|
||||
void free_factor_impact_results(FactorImpactResult* results);
|
||||
|
||||
// Date pattern analysis
|
||||
DatePatternResult* analyze_date_patterns(
|
||||
const double* values,
|
||||
const DateStruct* dates,
|
||||
int data_length,
|
||||
const char* factor_name);
|
||||
void free_date_pattern_results(DatePatternResult* results);
|
||||
|
||||
// Cycle analysis
|
||||
CycleAnalysisResult analyze_cycles(
|
||||
const double* values,
|
||||
const DateStruct* dates,
|
||||
int data_length,
|
||||
const char* factor_name);
|
||||
|
||||
// Medication impact analysis
|
||||
MedicationImpactAnalysis* analyze_medication_impact(
|
||||
const double* before_data,
|
||||
int before_length,
|
||||
const double* after_data,
|
||||
int after_length,
|
||||
const char* medication_name,
|
||||
const char* factor_name);
|
||||
void free_medication_impact_analysis(MedicationImpactAnalysis* analysis);
|
||||
|
||||
// Hormone impact analysis
|
||||
HormoneImpactAnalysis* analyze_hormone_impact(
|
||||
const double* hormone_levels,
|
||||
int data_length,
|
||||
const double** factor_data,
|
||||
const char** factor_names,
|
||||
int factor_count,
|
||||
const char* hormone_name,
|
||||
double min_optimal_level,
|
||||
double max_optimal_level);
|
||||
void free_hormone_impact_analysis(HormoneImpactAnalysis* analysis);
|
||||
}
|
||||
// Utility functions
|
||||
void normalize_data(const double* data, int length, double minValue, double maxValue, double* normalizedData);
|
||||
int detect_change_points(const double* data, int length, double threshold, int* changePoints, int maxChangePoints);
|
||||
void optimize_svr_parameters(const double** x_data, const double* y_data, int length,
|
||||
double& bestC, double& bestEpsilon, double& bestGamma, double& bestScore);
|
||||
|
||||
#endif // HEALTH_ANALYTICS_ENGINE_H
|
native/statistics/impact_analysis.cpp (new file, 361 lines)
@@ -0,0 +1,361 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// impact_analysis.cpp
|
||||
// Implementation of factor impact and medication analysis functions
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
#include "utils.h"
|
||||
/**
|
||||
* @brief Rank factors by their impact on a target variable
|
||||
*
|
||||
* @param factors Array of factor time series
|
||||
* @param target Target variable time series
|
||||
* @param factorNames Array of factor names
|
||||
* @param factorCount Number of factors
|
||||
* @param dataLength Length of time series
|
||||
* @return FactorImpactResult* Array of factor impact results
|
||||
*/
|
||||
FactorImpactResult* rank_factor_impacts(const double** factors,
|
||||
const double* target,
|
||||
const char** factorNames,
|
||||
int factorCount,
|
||||
int dataLength) {
|
||||
  if (factorCount <= 0 || dataLength <= 2) {
    FactorImpactResult* dummy = new FactorImpactResult[1];
    memset(dummy, 0, sizeof(FactorImpactResult));
    dummy[0].factorIndex = -1; // Mark as invalid
    return dummy;
  }

  // Allocate space for results (plus one for terminator)
  FactorImpactResult* results = new FactorImpactResult[factorCount + 1];

  // Calculate correlation matrix for all factors + target
  std::vector<std::vector<double>> corrMatrix(factorCount + 1, std::vector<double>(factorCount + 1, 0));

  for (int i = 0; i < factorCount; i++) {
    // Correlation between factor i and target
    corrMatrix[i][factorCount] = calculateCorrelation(factors[i], target, dataLength);
    corrMatrix[factorCount][i] = corrMatrix[i][factorCount];

    // Correlations between factors
    for (int j = i + 1; j < factorCount; j++) {
      corrMatrix[i][j] = calculateCorrelation(factors[i], factors[j], dataLength);
      corrMatrix[j][i] = corrMatrix[i][j];
    }

    // Self-correlation is 1
    corrMatrix[i][i] = 1.0;
  }
  corrMatrix[factorCount][factorCount] = 1.0;

  // Calculate the impact of each factor
  for (int i = 0; i < factorCount; i++) {
    FactorImpactResult& impact = results[i];

    // Direct effect is correlation with target
    double directEffect = corrMatrix[i][factorCount];

    // Calculate indirect effects through other factors
    double indirectEffect = 0;
    for (int j = 0; j < factorCount; j++) {
      if (j != i) {
        // Indirect effect through factor j
        indirectEffect += corrMatrix[i][j] * corrMatrix[j][factorCount];
      }
    }
    // Normalize indirect effect
    indirectEffect /= std::max(1, factorCount - 1);

    // Calculate partial correlation (direct effect controlling for other factors)
    // This is a simplified approach - real implementation would use matrix operations
    double partialCorr = directEffect;
    if (factorCount > 1) {
      double sumControlledVar = 0;
      for (int j = 0; j < factorCount; j++) {
        if (j != i) {
          // Remove effect of factor j from both target and factor i
          double controlEffect = corrMatrix[i][j] * corrMatrix[j][factorCount];
          partialCorr -= controlEffect / (factorCount - 1);
          sumControlledVar += corrMatrix[i][j] * corrMatrix[i][j];
        }
      }
      sumControlledVar /= (factorCount - 1);
      // Normalize partial correlation
      if (sumControlledVar < 0.98) { // Avoid division by near-zero
        partialCorr /= sqrt((1 - sumControlledVar));
      }
    }

    // Calculate total impact score
    impact.factorIndex = i;
    impact.directEffect = directEffect;
    impact.indirectEffect = indirectEffect;

    // Total impact is weighted sum of direct and partial correlation
    double partialWeight = 0.7; // Weight more toward direct unique contribution
    impact.impactScore = partialWeight * std::abs(partialCorr) + (1 - partialWeight) * std::abs(directEffect);

    // Calculate confidence based on correlation strength and sample size
    double t_stat = std::abs(directEffect) * std::sqrt((dataLength - 2) / (1 - directEffect * directEffect));
    double p_value = 2 * (1 - std::min(1.0, std::exp(-0.717 * t_stat - 0.416 * t_stat * t_stat)));
    impact.confidence = std::min(0.95, (1.0 - p_value) * (1.0 - 1.0 / std::sqrt(dataLength)));

    // Copy factor name
    strncpy(impact.factorName, factorNames[i], MAX_STRING_SIZE - 1);
    impact.factorName[MAX_STRING_SIZE - 1] = '\0';

    // Generate mechanism description based on direct and indirect effects
    const char* direction = (directEffect > 0) ? "positive" : "negative";
    const char* strength =
        (std::abs(directEffect) > 0.7) ? "strong" :
        (std::abs(directEffect) > 0.4) ? "moderate" : "weak";

    // Check for mediation effects
    bool hasMediationEffect = std::abs(indirectEffect) > 0.2 &&
                              std::abs(indirectEffect) > std::abs(directEffect) * 0.5;

    if (hasMediationEffect) {
      snprintf(impact.mechanism, MAX_STRING_SIZE,
               "%s %s impact: %s affects the target both directly (%.2f) and through other factors (%.2f)",
               strength, direction, factorNames[i], directEffect, indirectEffect);
    } else {
      snprintf(impact.mechanism, MAX_STRING_SIZE,
               "%s %s impact: changes in %s are associated with changes in the target (r=%.2f)",
               strength, direction, factorNames[i], directEffect);
    }
  }

  // Sort by impact score (descending)
  std::sort(results, results + factorCount,
            [](const FactorImpactResult& a, const FactorImpactResult& b) {
              return a.impactScore > b.impactScore;
            });

  // Mark the end of valid results
  results[factorCount].factorIndex = -1;

  return results;
}

/**
 * @brief Free memory for factor impact results
 *
 * @param results Pointer to factor impact results array
 */
void free_factor_impact_results(FactorImpactResult* results) {
  delete[] results;
}

/**
 * @brief Analyze the impact of medication on a health metric
 *
 * @param before_data Values before medication
 * @param before_length Length of before data
 * @param after_data Values after medication
 * @param after_length Length of after data
 * @param medication_name Name of the medication
 * @param factor_name Name of the health factor
 * @return MedicationImpactAnalysis* Pointer to impact analysis result
 */
MedicationImpactAnalysis* analyze_medication_impact(
    const double* before_data,
    int before_length,
    const double* after_data,
    int after_length,
    const char* medication_name,
    const char* factor_name) {

  MedicationImpactAnalysis* result = new MedicationImpactAnalysis();
  memset(result, 0, sizeof(MedicationImpactAnalysis));

  // Copy names
  strncpy(result->medicationName, medication_name, MAX_STRING_SIZE-1);
  strncpy(result->factorName, factor_name, MAX_STRING_SIZE-1);

  // Return if insufficient data
  if (before_length < 5 || after_length < 5) {
    strncpy(result->description, "Insufficient data for analysis", MAX_STRING_SIZE-1);
    return result;
  }

  // Calculate means
  double beforeSum = 0, afterSum = 0;
  for (int i = 0; i < before_length; i++) {
    beforeSum += before_data[i];
  }
  for (int i = 0; i < after_length; i++) {
    afterSum += after_data[i];
  }

  result->beforeMean = beforeSum / before_length;
  result->afterMean = afterSum / after_length;

  // Calculate change magnitude
  result->changeMagnitude = result->afterMean - result->beforeMean;

  // Calculate significance with simple t-test
  double beforeVar = 0, afterVar = 0;
  for (int i = 0; i < before_length; i++) {
    double diff = before_data[i] - result->beforeMean;
    beforeVar += diff * diff;
  }
  for (int i = 0; i < after_length; i++) {
    double diff = after_data[i] - result->afterMean;
    afterVar += diff * diff;
  }

  beforeVar /= (before_length - 1);
  afterVar /= (after_length - 1);

  double se = sqrt(beforeVar/before_length + afterVar/after_length);
  double tStat = fabs(result->changeMagnitude) / (se + 0.0001);

  // Simple significance estimation (0-1)
  result->changeSignificance = std::min(1.0, tStat / 5.0);

  // Overall impact combines magnitude and significance
  result->overallImpact = fabs(result->changeMagnitude) * result->changeSignificance;

  // Estimate days to effect (placeholder implementation)
  result->daysToEffect = 7; // Assumed 1 week

  // Generate description
  const char* direction = (result->changeMagnitude > 0) ? "increased" : "decreased";
  const char* significance = (result->changeSignificance > 0.7) ? "significant" :
                             (result->changeSignificance > 0.3) ? "moderate" : "slight";

  snprintf(result->description, MAX_STRING_SIZE,
           "%s shows a %s %s effect on %s (%.1f → %.1f)",
           medication_name, significance, direction, factor_name,
           result->beforeMean, result->afterMean);

  return result;
}
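
// Illustrative usage sketch (not part of the library): how a caller might
// compare a metric before and after starting a medication. The sample arrays
// and names below are hypothetical; only the signature above is assumed.
//
//   double before[7] = {6.1, 5.8, 6.0, 6.3, 5.9, 6.2, 6.0};
//   double after[7]  = {7.0, 7.2, 6.9, 7.4, 7.1, 7.3, 7.2};
//   MedicationImpactAnalysis* mi =
//       analyze_medication_impact(before, 7, after, 7, "Sertraline", "Sleep quality");
//   printf("%s\n", mi->description);
//   free_medication_impact_analysis(mi);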

/**
 * @brief Free memory for medication impact analysis
 *
 * @param analysis Pointer to medication impact analysis
 */
void free_medication_impact_analysis(MedicationImpactAnalysis* analysis) {
  delete analysis;
}

/**
 * @brief Analyze hormone impact on health metrics
 *
 * @param hormone_levels Array of hormone levels
 * @param data_length Length of hormone data
 * @param factor_data Array of factor data arrays
 * @param factor_names Array of factor names
 * @param factor_count Number of factors
 * @param hormone_name Name of the hormone
 * @param min_optimal_level Lower bound of optimal range
 * @param max_optimal_level Upper bound of optimal range
 * @return HormoneImpactAnalysis* Pointer to impact analysis result
 */
HormoneImpactAnalysis* analyze_hormone_impact(
    const double* hormone_levels,
    int data_length,
    const double** factor_data,
    const char** factor_names,
    int factor_count,
    const char* hormone_name,
    double min_optimal_level,
    double max_optimal_level) {

  HormoneImpactAnalysis* result = new HormoneImpactAnalysis();
  memset(result, 0, sizeof(HormoneImpactAnalysis));

  // Copy hormone name
  strncpy(result->hormoneName, hormone_name, MAX_STRING_SIZE-1);

  // Return if insufficient data
  if (data_length < 3 || factor_count <= 0) {
    strncpy(result->description, "Insufficient data for analysis", MAX_STRING_SIZE-1);
    return result;
  }

  // Calculate current hormone level (average of recent readings)
  double sum = 0;
  for (int i = 0; i < data_length; i++) {
    sum += hormone_levels[i];
  }
  result->currentLevel = sum / data_length;

  // Set optimal levels
  result->optimalRangeLower = min_optimal_level;
  result->optimalRangeUpper = max_optimal_level;
  result->optimalLevel = (min_optimal_level + max_optimal_level) / 2;

  // Calculate deviation from optimal range
  if (result->currentLevel < min_optimal_level) {
    result->deviation = (result->currentLevel - min_optimal_level) / min_optimal_level;
  } else if (result->currentLevel > max_optimal_level) {
    result->deviation = (result->currentLevel - max_optimal_level) / max_optimal_level;
  } else {
    result->deviation = 0; // Within optimal range
  }

  // Calculate correlations with factors
  for (int i = 0; i < factor_count && i < 50; i++) {
    // Calculate correlation
    double correlation = 0;
    double sum_xy = 0, sum_x2 = 0, sum_y2 = 0;
    double sum_x = 0, sum_y = 0;

    for (int j = 0; j < data_length; j++) {
      sum_x += hormone_levels[j];
      sum_y += factor_data[i][j];
      sum_xy += hormone_levels[j] * factor_data[i][j];
      sum_x2 += hormone_levels[j] * hormone_levels[j];
      sum_y2 += factor_data[i][j] * factor_data[i][j];
    }

    double n = data_length;
    double denominator = sqrt((n * sum_x2 - sum_x * sum_x) * (n * sum_y2 - sum_y * sum_y));

    if (denominator > 0) {
      correlation = (n * sum_xy - sum_x * sum_y) / denominator;
    }

    // Store impact and factor name
    result->impactOnOtherFactors[i] = correlation;
    strncpy(result->factorNames[i], factor_names[i], MAX_STRING_SIZE-1);

    // Set impact on mood and energy if found
    if (strcmp(factor_names[i], "Mood") == 0) {
      result->impactOnMood = correlation;
    } else if (strcmp(factor_names[i], "Energy") == 0) {
      result->impactOnEnergy = correlation;
    }
  }

  // Generate description
  const char* status;
  if (fabs(result->deviation) < 0.1) {
    status = "within optimal range";
  } else if (result->deviation < 0) {
    status = "below optimal range";
  } else {
    status = "above optimal range";
  }

  snprintf(result->description, MAX_STRING_SIZE,
           "%s level is %s (%.1f, range: %.1f-%.1f)",
           hormone_name, status, result->currentLevel,
           min_optimal_level, max_optimal_level);

  return result;
}

/**
 * @brief Free memory for hormone impact analysis
 *
 * @param analysis Pointer to hormone impact analysis
 */
void free_hormone_impact_analysis(HormoneImpactAnalysis* analysis) {
  delete analysis;
}
272
native/statistics/time_series.cpp
Normal file
@@ -0,0 +1,272 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// time_series.cpp
// Implementation of time series analysis functions
//
#include "health_analytics_engine.h"
#include "utils.h"

/**
 * @brief Detect trends in time series data
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param strength Output parameter for trend strength
 * @return TrendType Enum indicating trend direction and type
 */
TrendType detect_trend(const double* values, int length, double* strength) {
  if (length < 3) {
    *strength = 0;
    return TREND_NONE;
  }

  // Generate time vector (0, 1, 2, ...)
  std::vector<double> time(length);
  for (int i = 0; i < length; i++) {
    time[i] = i;
  }

  // Calculate linear regression
  double slope, intercept, r_squared;
  if (!calculateLinearRegression(time.data(), values, length, slope, intercept, r_squared)) {
    *strength = 0;
    return TREND_NONE;
  }

  // Detrend the data for further analysis
  std::vector<double> detrended(length);
  for (int i = 0; i < length; i++) {
    detrended[i] = values[i] - (intercept + slope * i);
  }

  // Check for cyclical patterns using autocorrelation
  bool has_cycle = false;
  int cycle_length = 0;
  double max_autocorr = 0;

  // Check autocorrelation for various lags
  const int MIN_LAG = 2;
  const int MAX_LAG = length / 3; // Look for cycles up to 1/3 of series length

  for (int lag = MIN_LAG; lag < MAX_LAG; lag++) {
    double autocorr = calculateAutocorrelation(detrended.data(), length, lag);

    // If strong positive autocorrelation found
    if (autocorr > 0.3 && autocorr > max_autocorr) {
      max_autocorr = autocorr;
      cycle_length = lag;
      has_cycle = true;
    }
  }

  // Check if cycle pattern is stronger than linear trend
  if (has_cycle && max_autocorr > std::abs(r_squared)) {
    *strength = max_autocorr;
    return TREND_CYCLIC;
  }

  // Determine trend direction based on slope and strength
  *strength = std::abs(r_squared);

  // Require minimum strength to declare a trend
  if (*strength < 0.2) {
    return TREND_NONE;
  } else if (slope > 0) {
    return TREND_INCREASING;
  } else {
    return TREND_DECREASING;
  }
}
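
// Illustrative usage sketch (not part of the library): reading the trend of a
// short daily series. The data values below are hypothetical.
//
//   double hr[10] = {62, 63, 61, 64, 65, 66, 65, 67, 68, 69};
//   double strength = 0;
//   TrendType t = detect_trend(hr, 10, &strength);
//   // t == TREND_INCREASING here, since the fitted slope is positive and R² >= 0.2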

/**
 * @brief Predict future values of a time series using ARIMA-like approach
 *
 * @param timeSeries Time series data
 * @param dataLength Length of time series
 * @param stepsAhead Number of future steps to predict
 * @param factorName Name of the factor being predicted
 * @return TimeSeriesForecast Structure containing predictions and confidence intervals
 */
TimeSeriesForecast predict_time_series(const double* timeSeries,
                                       int dataLength,
                                       int stepsAhead,
                                       const char* factorName) {
  TimeSeriesForecast forecast;
  memset(&forecast, 0, sizeof(TimeSeriesForecast));

  if (dataLength < 5 || stepsAhead <= 0) {
    forecast.overallConfidence = 0;
    return forecast;
  }

  // Copy factor name
  strncpy(forecast.factorName, factorName, MAX_STRING_SIZE - 1);
  forecast.factorName[MAX_STRING_SIZE - 1] = '\0';

  // First, check for seasonality
  int potentialSeasonality = 0;
  double maxAutocorr = 0;

  // Look for seasonality in range 2 to dataLength/3
  for (int lag = 2; lag <= dataLength/3; lag++) {
    double acf = calculateAutocorrelation(timeSeries, dataLength, lag);
    if (acf > 0.3 && acf > maxAutocorr) {
      maxAutocorr = acf;
      potentialSeasonality = lag;
    }
  }

  // Set seasonality period if detected
  forecast.seasonalityPeriod = potentialSeasonality;

  // Decompose time series if seasonality detected
  std::vector<double> trend(dataLength);
  std::vector<double> seasonal(dataLength);
  std::vector<double> residual(dataLength);

  bool hasSeasonality = potentialSeasonality > 0 && maxAutocorr > 0.3;

  if (hasSeasonality) {
    // Decompose the time series
    decomposeTimeSeries(timeSeries, dataLength, potentialSeasonality,
                        trend.data(), seasonal.data(), residual.data());
  } else {
    // No seasonality, just use simple moving average for trend
    calculateMovingAverage(timeSeries, dataLength, std::min(7, dataLength/3), trend.data());

    // No seasonal component
    for (int i = 0; i < dataLength; i++) {
      seasonal[i] = 0;
      residual[i] = timeSeries[i] - trend[i];
    }
  }

  // Fit AR model to residuals for short-term dynamics
  // Determine optimal AR order using PACF
  int maxLag = std::min(10, dataLength/5);
  std::vector<double> pacf(maxLag + 1);
  calculatePACF(residual.data(), dataLength, maxLag, pacf.data());

  // Find significant AR terms (PACF > 0.2)
  std::vector<int> significantLags;
  for (int i = 1; i <= maxLag; i++) {
    if (std::abs(pacf[i]) > 0.2) {
      significantLags.push_back(i);
    }
  }

  // Limit to 3 most significant terms
  if (significantLags.size() > 3) {
    std::sort(significantLags.begin(), significantLags.end(),
              [&pacf](int a, int b) {
                return std::abs(pacf[a]) > std::abs(pacf[b]);
              });
    significantLags.resize(3);
  }

  // Fit AR coefficients using linear regression
  int arOrder = significantLags.size();
  std::vector<double> arCoefficients(arOrder, 0);

  if (arOrder > 0) {
    // Prepare training data for AR model
    int trainingSize = dataLength - significantLags.back();
    std::vector<std::vector<double>> X(trainingSize, std::vector<double>(arOrder));
    std::vector<double> y(trainingSize);

    for (int i = 0; i < trainingSize; i++) {
      int t = i + significantLags.back();
      y[i] = residual[t];

      for (int j = 0; j < arOrder; j++) {
        X[i][j] = residual[t - significantLags[j]];
      }
    }

    // Very simplified AR coefficient estimation
    // Real implementation would use matrix operations
    for (int j = 0; j < arOrder; j++) {
      double sumXY = 0, sumX2 = 0;
      for (int i = 0; i < trainingSize; i++) {
        sumXY += X[i][j] * y[i];
        sumX2 += X[i][j] * X[i][j];
      }
      if (sumX2 > 0) {
        arCoefficients[j] = sumXY / sumX2;
      }
    }
  }

  // Set time unit (days by default)
  forecast.timeUnit = TIME_UNIT_DAYS;

  // Generate forecasts
  double trendGrowth = 0;
  if (dataLength > 10) {
    // Calculate average trend growth over last 10 points
    trendGrowth = (trend[dataLength-1] - trend[dataLength-11]) / 10.0;
  }

  // Last observed values
  std::vector<double> lastResiduals(dataLength);
  for (int i = 0; i < dataLength; i++) {
    lastResiduals[i] = residual[i];
  }

  // Generate predictions
  for (int i = 0; i < stepsAhead && i < 30; i++) {
    // Forecast trend component
    double trendForecast = trend[dataLength-1] + trendGrowth * (i + 1);

    // Forecast seasonal component (if any)
    double seasonalForecast = 0;
    if (hasSeasonality && potentialSeasonality > 0) {
      seasonalForecast = seasonal[dataLength - potentialSeasonality + (i % potentialSeasonality)];
    }

    // Forecast residual component using AR model
    double residualForecast = 0;
    for (int j = 0; j < arOrder; j++) {
      int lag = significantLags[j];
      if (i >= lag) {
        // Use previously forecasted residuals
        residualForecast += arCoefficients[j] * lastResiduals[dataLength + i - lag];
      } else {
        // Use observed residuals
        residualForecast += arCoefficients[j] * residual[dataLength - lag + i];
      }
    }

    // Store forecasted residual
    lastResiduals.push_back(residualForecast);

    // Combine components for final forecast
    forecast.predictions[i] = trendForecast + seasonalForecast + residualForecast;

    // Calculate confidence intervals (widen with forecast horizon)
    double stdError = 0;
    for (int j = 0; j < dataLength; j++) {
      stdError += residual[j] * residual[j];
    }
    stdError = sqrt(stdError / dataLength);

    // Wider intervals for longer forecasts
    double multiplier = 1.96 * sqrt(1.0 + 0.25 * i); // Roughly 95% CI with growing uncertainty

    forecast.confidenceIntervals[i][0] = forecast.predictions[i] - multiplier * stdError;
    forecast.confidenceIntervals[i][1] = forecast.predictions[i] + multiplier * stdError;
  }

  // Set overall confidence based on model quality and forecast distance
  double modelAccuracy = 0.8; // Would be calculated from validation in real model
  if (hasSeasonality) modelAccuracy += 0.1;
  if (arOrder > 0) modelAccuracy += 0.1 * std::min(arOrder, 2);

  forecast.overallConfidence = modelAccuracy * exp(-0.05 * stepsAhead);
  if (forecast.overallConfidence > 0.95) forecast.overallConfidence = 0.95;
  if (forecast.overallConfidence < 0.2) forecast.overallConfidence = 0.2;

  return forecast;
}
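
// Illustrative usage sketch (not part of the library): forecasting one week
// ahead. The input array name is hypothetical; predictions per call are capped
// at 30 steps in the loop above.
//
//   TimeSeriesForecast f = predict_time_series(dailyMood, 60, 7, "Mood");
//   for (int i = 0; i < 7; i++) {
//     printf("day +%d: %.2f [%.2f, %.2f]\n", i + 1, f.predictions[i],
//            f.confidenceIntervals[i][0], f.confidenceIntervals[i][1]);
//   }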
863
native/statistics/utils.cpp
Normal file
@@ -0,0 +1,863 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// utils.cpp
// Core utility functions used across the health analytics library
//
#include "health_analytics_engine.h"
#include "utils.h"
#include <random>
#include <limits>
#include <numeric>

/**
 * @brief Calculate the mean (average) of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @return double The arithmetic mean
 */
double calculateMean(const double* values, int length) {
  if (length <= 0) return 0;

  double sum = 0;
  for (int i = 0; i < length; i++) {
    sum += values[i];
  }
  return sum / length;
}

/**
 * @brief Calculate the weighted mean of a data series
 *
 * @param values Pointer to array of values
 * @param weights Pointer to array of weights for each value
 * @param length Number of elements in the arrays
 * @return double The weighted arithmetic mean
 */
double calculateWeightedMean(const double* values, const double* weights, int length) {
  if (length <= 0) return 0;

  double sum = 0;
  double weightSum = 0;

  for (int i = 0; i < length; i++) {
    sum += values[i] * weights[i];
    weightSum += weights[i];
  }

  return weightSum > 0 ? sum / weightSum : 0;
}

/**
 * @brief Calculate the variance of a data series
 * Uses a two-pass algorithm for numerical stability
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @return double The sample variance (n-1 denominator)
 */
double calculateVariance(const double* values, int length, double mean = 0) {
  if (length <= 1) return 0;

  // Use pre-calculated mean if provided, otherwise calculate it
  if (mean == 0) {
    mean = calculateMean(values, length);
  }

  // Use two-pass algorithm for better numerical stability
  double sumSquaredDiff = 0;
  for (int i = 0; i < length; i++) {
    double diff = values[i] - mean;
    sumSquaredDiff += diff * diff;
  }

  // Return sample variance (n-1 denominator for unbiased estimation)
  return sumSquaredDiff / (length - 1);
}

/**
 * @brief Calculate the standard deviation of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @return double The standard deviation
 */
double calculateStdDev(const double* values, int length, double mean = 0) {
  return std::sqrt(calculateVariance(values, length, mean));
}

/**
 * @brief Calculate the median of a data series
 *
 * @param values Pointer to array of values (a sorted copy is used; the input is not modified)
 * @param length Number of elements in the array
 * @return double The median value
 */
double calculateMedian(double* values, int length) {
  if (length == 0) return 0;
  if (length == 1) return values[0];

  // Create a copy and sort it
  std::vector<double> sorted(values, values + length);
  std::sort(sorted.begin(), sorted.end());

  if (length % 2 == 0) {
    // Even number of elements
    return (sorted[length/2 - 1] + sorted[length/2]) / 2.0;
  } else {
    // Odd number of elements
    return sorted[length/2];
  }
}

/**
 * @brief Calculate the Pearson correlation coefficient between two data series
 *
 * @param x First data series
 * @param y Second data series (must be same length as x)
 * @param length Number of elements in both arrays
 * @return double Correlation coefficient (-1 to 1)
 */
double calculateCorrelation(const double* x, const double* y, int length) {
  if (length <= 1) return 0;

  double sum_x = 0, sum_y = 0, sum_xy = 0;
  double sum_x2 = 0, sum_y2 = 0;

  for (int i = 0; i < length; i++) {
    sum_x += x[i];
    sum_y += y[i];
    sum_xy += x[i] * y[i];
    sum_x2 += x[i] * x[i];
    sum_y2 += y[i] * y[i];
  }

  double denominator = std::sqrt((length * sum_x2 - sum_x * sum_x) *
                                 (length * sum_y2 - sum_y * sum_y));

  if (denominator < 1e-10) return 0; // Avoid division by zero

  return (length * sum_xy - sum_x * sum_y) / denominator;
}

/**
 * @brief Calculate Spearman's rank correlation coefficient
 * More robust to outliers than Pearson correlation
 *
 * @param x First data series
 * @param y Second data series (must be same length as x)
 * @param length Number of elements in both arrays
 * @return double Spearman's rank correlation coefficient (-1 to 1)
 */
double calculateSpearmanCorrelation(const double* x, const double* y, int length) {
  if (length <= 1) return 0;

  // Create vectors with indices to perform ranking
  std::vector<std::pair<double, int>> x_indexed(length);
  std::vector<std::pair<double, int>> y_indexed(length);

  for (int i = 0; i < length; i++) {
    x_indexed[i] = std::make_pair(x[i], i);
    y_indexed[i] = std::make_pair(y[i], i);
  }

  // Sort by values to determine ranks
  std::sort(x_indexed.begin(), x_indexed.end());
  std::sort(y_indexed.begin(), y_indexed.end());

  // Assign ranks (handling ties with average rank)
  std::vector<double> x_ranks(length), y_ranks(length);

  for (int i = 0; i < length; i++) {
    int j = i;
    while (j < length - 1 && x_indexed[j].first == x_indexed[j + 1].first) j++;
    double rank = 1.0 * (i + j) / 2 + 1;
    for (int k = i; k <= j; k++) {
      x_ranks[x_indexed[k].second] = rank;
    }
    i = j;
  }

  for (int i = 0; i < length; i++) {
    int j = i;
    while (j < length - 1 && y_indexed[j].first == y_indexed[j + 1].first) j++;
    double rank = 1.0 * (i + j) / 2 + 1;
    for (int k = i; k <= j; k++) {
      y_ranks[y_indexed[k].second] = rank;
    }
    i = j;
  }

  // Calculate Pearson correlation on the ranks
  double* x_ranks_ptr = x_ranks.data();
  double* y_ranks_ptr = y_ranks.data();
  return calculateCorrelation(x_ranks_ptr, y_ranks_ptr, length);
}

/**
 * @brief Calculate a specific quantile value of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param q Quantile to calculate (0-1, e.g., 0.25 for first quartile)
 * @return double The value at the specified quantile
 */
double calculateQuantile(const double* values, int length, double q) {
  if (length == 0) return 0;
  if (length == 1) return values[0];
  if (q < 0) q = 0;
  if (q > 1) q = 1;

  std::vector<double> sorted(values, values + length);
  std::sort(sorted.begin(), sorted.end());

  // Linear interpolation between closest ranks
  double pos = (length - 1) * q;
  int idx_lower = static_cast<int>(pos);
  double frac = pos - idx_lower;

  if (idx_lower + 1 < length) {
    return sorted[idx_lower] * (1 - frac) + sorted[idx_lower + 1] * frac;
  } else {
    return sorted[idx_lower];
  }
}

/**
 * @brief Calculate the interquartile range (IQR) of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @return double The IQR (Q3-Q1)
 */
double calculateIQR(const double* values, int length) {
  if (length < 4) return 0;

  double q1 = calculateQuantile(values, length, 0.25);
  double q3 = calculateQuantile(values, length, 0.75);

  return q3 - q1;
}

/**
 * @brief Calculate the skewness of a data distribution
 * Measures the asymmetry of the probability distribution
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
 * @return double The skewness value (0 for normal distribution)
 */
double calculateSkewness(const double* values, int length, double mean = 0, double stdDev = 0) {
  if (length <= 2) return 0;

  // Calculate mean and stdDev if not provided
  if (mean == 0) {
    mean = calculateMean(values, length);
  }

  if (stdDev == 0) {
    stdDev = calculateStdDev(values, length, mean);
  }

  if (stdDev < 1e-10) return 0; // Avoid division by zero

  // Calculate third moment (cube of differences)
  double sum = 0;
  for (int i = 0; i < length; i++) {
    double diff = values[i] - mean;
    sum += diff * diff * diff;
  }

  // Return Fisher-Pearson coefficient of skewness
  // Includes adjustment for sample bias
  double n = length;
  double adjustment = std::sqrt(n * (n - 1)) / (n - 2);
  return adjustment * sum / (length * stdDev * stdDev * stdDev);
}

/**
 * @brief Calculate the kurtosis of a data distribution
 * Measures the "tailedness" of the probability distribution
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
 * @return double The excess kurtosis (0 for normal distribution)
 */
double calculateKurtosis(const double* values, int length, double mean = 0, double stdDev = 0) {
  if (length <= 3) return 0;

  // Calculate mean and stdDev if not provided
  if (mean == 0) {
    mean = calculateMean(values, length);
  }

  if (stdDev == 0) {
    stdDev = calculateStdDev(values, length, mean);
  }

  if (stdDev < 1e-10) return 0; // Avoid division by zero

  // Calculate fourth moment
  double sum = 0;
  for (int i = 0; i < length; i++) {
    double diff = values[i] - mean;
    sum += diff * diff * diff * diff;
  }

  // Return excess kurtosis with sample adjustment
  double n = length;
  double adjustment = ((n + 1) * n) / ((n - 1) * (n - 2) * (n - 3));
  double second_term = 3 * (n - 1) * (n - 1) / ((n - 2) * (n - 3));
  return adjustment * sum / (stdDev * stdDev * stdDev * stdDev) - second_term;
}

/**
 * @brief Perform linear regression on two data series
 *
 * @param x Independent variable values
 * @param y Dependent variable values (must be same length as x)
 * @param length Number of elements in both arrays
 * @param slope Output parameter for slope
 * @param intercept Output parameter for y-intercept
 * @param r_squared Output parameter for R² coefficient of determination
 * @return bool True if successful, false if error occurred
 */
bool calculateLinearRegression(const double* x, const double* y, int length,
                               double& slope, double& intercept, double& r_squared) {
  if (length < 2) return false;

  double sum_x = 0, sum_y = 0, sum_xy = 0, sum_x2 = 0, sum_y2 = 0;

  for (int i = 0; i < length; i++) {
    sum_x += x[i];
    sum_y += y[i];
    sum_xy += x[i] * y[i];
    sum_x2 += x[i] * x[i];
    sum_y2 += y[i] * y[i];
  }

  double n = static_cast<double>(length);
  double denominator = n * sum_x2 - sum_x * sum_x;

  if (std::abs(denominator) < 1e-10) return false; // Vertical line, undefined slope

  // Calculate slope and intercept
  slope = (n * sum_xy - sum_x * sum_y) / denominator;
  intercept = (sum_y - slope * sum_x) / n;

  // Calculate R² coefficient of determination
  double mean_y = sum_y / n;
  double ss_total = 0, ss_residual = 0;

  for (int i = 0; i < length; i++) {
    double predicted = intercept + slope * x[i];
    ss_total += (y[i] - mean_y) * (y[i] - mean_y);
    ss_residual += (y[i] - predicted) * (y[i] - predicted);
  }

  if (ss_total < 1e-10) {
    r_squared = 1.0; // All points are on the same horizontal line
  } else {
    r_squared = 1.0 - (ss_residual / ss_total);
  }

  return true;
}
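
// Worked check (illustrative only): for x = {0,1,2,3} and y = {1,3,5,7} the sums are
// Σx = 6, Σy = 16, Σxy = 34, Σx² = 14, so slope = (4·34 − 6·16)/(4·14 − 36) = 40/20 = 2
// and intercept = (16 − 2·6)/4 = 1, with R² = 1 because the fit is exact.
//
//   double x[4] = {0, 1, 2, 3}, y[4] = {1, 3, 5, 7};
//   double m, b, r2;
//   calculateLinearRegression(x, y, 4, m, b, r2);  // m == 2, b == 1, r2 == 1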

/**
 * @brief Calculate the autocorrelation of a time series at specified lag
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param lag The lag to calculate autocorrelation for
 * @return double Autocorrelation coefficient at specified lag (-1 to 1)
 */
double calculateAutocorrelation(const double* values, int length, int lag) {
  if (length <= lag || lag <= 0) return 0;

  double mean = calculateMean(values, length);
  double numerator = 0;
  double denominator = 0;

  for (int i = 0; i < length - lag; i++) {
    numerator += (values[i] - mean) * (values[i + lag] - mean);
  }

  for (int i = 0; i < length; i++) {
    denominator += (values[i] - mean) * (values[i] - mean);
  }

  if (denominator < 1e-10) return 0;

  return numerator / denominator;
}

/**
 * @brief Detect outliers in a data series using modified Z-score method
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param outlierIndices Output vector to store indices of detected outliers
 * @param threshold Z-score threshold to consider a point an outlier (typically 3.5)
 * @return int Number of outliers detected
 */
int detectOutliers(const double* values, int length, std::vector<int>& outlierIndices, double threshold = 3.5) {
  if (length < 3) return 0;

  outlierIndices.clear();

  // Use median and MAD instead of mean and std dev for robustness
  std::vector<double> sorted(values, values + length);
  std::sort(sorted.begin(), sorted.end());

  double median = (length % 2 == 0) ?
      (sorted[length/2 - 1] + sorted[length/2]) / 2.0 : sorted[length/2];

  // Calculate MAD (Median Absolute Deviation)
  std::vector<double> deviations(length);
  for (int i = 0; i < length; i++) {
    deviations[i] = std::abs(values[i] - median);
  }
  std::sort(deviations.begin(), deviations.end());

  double mad = (length % 2 == 0) ?
      (deviations[length/2 - 1] + deviations[length/2]) / 2.0 : deviations[length/2];

  // Scale factor relating MAD to the standard deviation of normal data
  const double k = 1.4826;

  // Find outliers using modified Z-score
  for (int i = 0; i < length; i++) {
    if (mad < 1e-10) { // If MAD is too small, use simple difference
      if (std::abs(values[i] - median) > threshold) {
        outlierIndices.push_back(i);
      }
    } else {
      // Estimated z-score: |x - median| divided by the MAD-based sigma estimate (k * MAD)
      double modified_z = std::abs(values[i] - median) / (k * mad);
      if (modified_z > threshold) {
        outlierIndices.push_back(i);
      }
    }
  }

  return outlierIndices.size();
}
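
// Illustrative usage sketch (not part of the library): flagging a single spike.
// The sample values below are hypothetical.
//
//   double readings[8] = {5.0, 5.1, 4.9, 5.2, 5.0, 12.0, 5.1, 4.8};
//   std::vector<int> idx;
//   int n = detectOutliers(readings, 8, idx, 3.5);  // expect n == 1, idx == {5}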

/**
 * @brief Perform simple moving average on a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param window The window size for the moving average
 * @param result Pre-allocated array to store results (size = length)
 */
void calculateMovingAverage(const double* values, int length, int window, double* result) {
  if (length <= 0 || window <= 0) return;

  // Adjust window if it's larger than the data length
  window = std::min(window, length);

  for (int i = 0; i < length; i++) {
    int start = std::max(0, i - window + 1);
    int end = i + 1;
    int count = end - start;

    double sum = 0;
    for (int j = start; j < end; j++) {
      sum += values[j];
    }

    result[i] = sum / count;
  }
}

/**
 * @brief Calculate exponential moving average (EMA) of a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param alpha Smoothing factor (0-1)
 * @param result Pre-allocated array to store results (size = length)
 */
void calculateExponentialMovingAverage(const double* values, int length, double alpha, double* result) {
  if (length <= 0 || alpha < 0 || alpha > 1) return;

  // Initialize with first value
  result[0] = values[0];

  // Apply EMA formula: EMA_t = α × value_t + (1 - α) × EMA_{t-1}
  for (int i = 1; i < length; i++) {
    result[i] = alpha * values[i] + (1 - alpha) * result[i - 1];
  }
}

/**
 * @brief Decompose a time series into trend, seasonal, and residual components
 * Implementation of STL (Seasonal and Trend decomposition using Loess)
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param seasonality Length of seasonal cycle (e.g., 7 for weekly, 12 for monthly)
 * @param trend Output array for trend component (size = length)
 * @param seasonal Output array for seasonal component (size = length)
 * @param residual Output array for residual component (size = length)
 * @return bool True if successful, false if error occurred
 */
bool decomposeTimeSeries(const double* values, int length, int seasonality,
                         double* trend, double* seasonal, double* residual) {
  if (length <= 2 * seasonality || seasonality <= 1) return false;

  // Calculate trend with centered moving average
  for (int i = 0; i < length; i++) {
    trend[i] = 0;
  }

  int halfSeason = seasonality / 2;
  // Centered moving average for trend
  for (int i = halfSeason; i < length - halfSeason; i++) {
    double sum = 0;
    for (int j = i - halfSeason; j <= i + halfSeason; j++) {
      sum += values[j];
    }
    trend[i] = sum / seasonality;
  }

  // Extrapolate trend at boundaries
  // Left boundary
  double slope = (trend[halfSeason + 5] - trend[halfSeason]) / 5;
  for (int i = 0; i < halfSeason; i++) {
    trend[i] = trend[halfSeason] - (halfSeason - i) * slope;
  }

  // Right boundary
  slope = (trend[length - halfSeason - 1] - trend[length - halfSeason - 6]) / 5;
  for (int i = length - halfSeason; i < length; i++) {
    trend[i] = trend[length - halfSeason - 1] + (i - (length - halfSeason - 1)) * slope;
  }

  // Calculate detrended series
  std::vector<double> detrended(length);
  for (int i = 0; i < length; i++) {
    detrended[i] = values[i] - trend[i];
  }

  // Calculate seasonal component by averaging the detrended values across seasons
  std::vector<double> seasonalAvg(seasonality, 0);
  std::vector<int> seasonalCounts(seasonality, 0);

  for (int i = 0; i < length; i++) {
    int seasonalIndex = i % seasonality;
    seasonalAvg[seasonalIndex] += detrended[i];
    seasonalCounts[seasonalIndex]++;
  }

  for (int i = 0; i < seasonality; i++) {
    if (seasonalCounts[i] > 0) {
      seasonalAvg[i] /= seasonalCounts[i];
    }
  }

  // Normalize seasonal component to sum to zero
  double avgSeasonal = 0;
  for (int i = 0; i < seasonality; i++) {
    avgSeasonal += seasonalAvg[i];
  }
  avgSeasonal /= seasonality;

  for (int i = 0; i < seasonality; i++) {
    seasonalAvg[i] -= avgSeasonal;
  }

  // Apply seasonal component to entire series
  for (int i = 0; i < length; i++) {
    seasonal[i] = seasonalAvg[i % seasonality];
  }

  // Calculate residual component
  for (int i = 0; i < length; i++) {
    residual[i] = values[i] - trend[i] - seasonal[i];
  }

  return true;
}
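
// Illustrative usage sketch (not part of the library): splitting a daily series
// with a weekly cycle into components. The array names below are hypothetical.
//
//   std::vector<double> tr(n), se(n), re(n);
//   if (decomposeTimeSeries(series, n, 7, tr.data(), se.data(), re.data())) {
//     // series[i] == tr[i] + se[i] + re[i] for every i, by construction
//   }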

/**
 * @brief Calculate partial autocorrelation function for a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param maxLag Maximum lag to calculate
 * @param pacf Pre-allocated array to store results (size = maxLag + 1)
 * @return int Number of valid PACF values calculated
 */
int calculatePACF(const double* values, int length, int maxLag, double* pacf) {
  if (length <= 1 || maxLag <= 0 || maxLag >= length) return 0;

  // Allocate Yule-Walker matrices
  std::vector<std::vector<double>> phi(maxLag + 1, std::vector<double>(maxLag + 1, 0));

  // Calculate autocorrelations
  std::vector<double> acf(maxLag + 1, 0);
  acf[0] = 1.0; // ACF at lag 0 is always 1

  for (int k = 1; k <= maxLag; k++) {
    acf[k] = calculateAutocorrelation(values, length, k);
  }

  // Set PACF at lag 0 to 1
  pacf[0] = 1.0;

  // Calculate PACF using Levinson-Durbin recursion
  for (int k = 1; k <= maxLag; k++) {
    // Initialize for this order
    double numerator = acf[k];
    for (int j = 1; j < k; j++) {
      numerator -= phi[k-1][j] * acf[k-j];
    }

    double denominator = 1.0;
    for (int j = 1; j < k; j++) {
      denominator -= phi[k-1][j] * acf[j];
    }

    if (std::abs(denominator) < 1e-10) {
      // If denominator is close to zero, set PACF to 0
      phi[k][k] = 0;
    } else {
      phi[k][k] = numerator / denominator;
    }

    // Update remaining coefficients
    for (int j = 1; j < k; j++) {
      phi[k][j] = phi[k-1][j] - phi[k][k] * phi[k-1][k-j];
    }

    // Store the PACF value
    pacf[k] = phi[k][k];
  }

  return maxLag + 1;
}

/**
 * @brief Perform k-means clustering on multivariate data
 *
 * @param data 2D array of data points [n_samples x n_features]
 * @param nSamples Number of data points
 * @param nFeatures Number of features per data point
 * @param k Number of clusters
 * @param maxIter Maximum number of iterations
 * @param centroids Output array for cluster centroids [k x n_features]
 * @param assignments Output array for cluster assignments [n_samples]
 * @return int Number of iterations performed
 */
int kMeansClustering(const double** data, int nSamples, int nFeatures, int k,
                     int maxIter, double** centroids, int* assignments) {
  if (nSamples < k || k <= 0 || nFeatures <= 0) return 0;

  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> distrib(0, nSamples - 1);

  // Initialize centroids using k-means++ initialization
  std::vector<int> centroidIndices;
  std::vector<double> minDistances(nSamples, std::numeric_limits<double>::max());

  // Choose first centroid randomly
  int firstCentroid = distrib(gen);
  centroidIndices.push_back(firstCentroid);

  // Choose remaining centroids
  for (int c = 1; c < k; c++) {
    // Update distances to nearest centroid
    for (int i = 0; i < nSamples; i++) {
      double dist = 0;
      for (int j = 0; j < nFeatures; j++) {
        double diff = data[i][j] - data[centroidIndices.back()][j];
        dist += diff * diff;
      }
      minDistances[i] = std::min(minDistances[i], dist);
    }

    // Calculate sum of squared distances
    double sumSquaredDist = 0;
    for (int i = 0; i < nSamples; i++) {
      sumSquaredDist += minDistances[i];
    }

    // Choose next centroid with probability proportional to D²
    double threshold = (sumSquaredDist * static_cast<double>(rand()) / RAND_MAX);
    double cumulativeProb = 0;
    int nextCentroid = 0;

    for (int i = 0; i < nSamples; i++) {
      cumulativeProb += minDistances[i];
      if (cumulativeProb >= threshold) {
        nextCentroid = i;
        break;
      }
    }

    centroidIndices.push_back(nextCentroid);
  }

  // Copy initial centroids
  for (int i = 0; i < k; i++) {
    for (int j = 0; j < nFeatures; j++) {
      centroids[i][j] = data[centroidIndices[i]][j];
    }
  }

  // Perform k-means iterations
  int iterations = 0;
  bool converged = false;

  while (!converged && iterations < maxIter) {
    // Assign points to nearest centroid
    converged = true;

    for (int i = 0; i < nSamples; i++) {
      double minDist = std::numeric_limits<double>::max();
      int bestCluster = 0;

      for (int c = 0; c < k; c++) {
        double dist = 0;
        for (int j = 0; j < nFeatures; j++) {
          double diff = data[i][j] - centroids[c][j];
          dist += diff * diff;
        }

        if (dist < minDist) {
          minDist = dist;
          bestCluster = c;
        }
      }

      if (assignments[i] != bestCluster) {
        assignments[i] = bestCluster;
        converged = false;
      }
    }

    // Update centroids
    std::vector<std::vector<double>> newCentroids(k, std::vector<double>(nFeatures, 0));
    std::vector<int> clusterSizes(k, 0);

    for (int i = 0; i < nSamples; i++) {
      int cluster = assignments[i];
      clusterSizes[cluster]++;

      for (int j = 0; j < nFeatures; j++) {
        newCentroids[cluster][j] += data[i][j];
      }
    }

    for (int c = 0; c < k; c++) {
      if (clusterSizes[c] > 0) {
        for (int j = 0; j < nFeatures; j++) {
          centroids[c][j] = newCentroids[c][j] / clusterSizes[c];
        }
      }
    }

    iterations++;
  }

  return iterations;
}

/**
 * @brief Calculate the silhouette coefficient for clustering validation
 *
 * @param data 2D array of data points [n_samples x n_features]
 * @param nSamples Number of data points
 * @param nFeatures Number of features per data point
 * @param assignments Cluster assignments for each point
 * @param k Number of clusters
 * @return double Average silhouette coefficient (-1 to 1)
 */
double calculateSilhouetteCoefficient(const double** data, int nSamples, int nFeatures,
                                      const int* assignments, int k) {
  if (nSamples <= k || k <= 1) return 0;

  std::vector<double> silhouettes(nSamples);

  // For each point
  for (int i = 0; i < nSamples; i++) {
    int cluster_i = assignments[i];

    // Calculate a(i) - average distance to points in same cluster
    double a_i = 0;
    int count_same_cluster = 0;

    for (int j = 0; j < nSamples; j++) {
      if (j != i && assignments[j] == cluster_i) {
        double dist = 0;
        for (int f = 0; f < nFeatures; f++) {
          double diff = data[i][f] - data[j][f];
          dist += diff * diff;
        }
        dist = std::sqrt(dist);

        a_i += dist;
        count_same_cluster++;
      }
    }

    if (count_same_cluster > 0) {
      a_i /= count_same_cluster;
    } else {
      a_i = 0; // Singleton cluster
    }

    // Calculate b(i) - minimum average distance to points in different clusters
    double b_i = std::numeric_limits<double>::max();

    for (int c = 0; c < k; c++) {
      if (c == cluster_i) continue;

      double avg_dist = 0;
      int count_diff_cluster = 0;

      for (int j = 0; j < nSamples; j++) {
        if (assignments[j] == c) {
          double dist = 0;
          for (int f = 0; f < nFeatures; f++) {
            double diff = data[i][f] - data[j][f];
            dist += diff * diff;
          }
          dist = std::sqrt(dist);

          avg_dist += dist;
          count_diff_cluster++;
        }
      }

      if (count_diff_cluster > 0) {
        avg_dist /= count_diff_cluster;
        b_i = std::min(b_i, avg_dist);
      }
    }

    // Calculate silhouette
    if (count_same_cluster > 0 && b_i < std::numeric_limits<double>::max()) {
      silhouettes[i] = (b_i - a_i) / std::max(a_i, b_i);
    } else {
      silhouettes[i] = 0; // Handle edge cases
    }
  }

  // Calculate average silhouette
  double avg_silhouette = 0;
  for (int i = 0; i < nSamples; i++) {
    avg_silhouette += silhouettes[i];
  }

  return avg_silhouette / nSamples;
}
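
// Illustrative usage sketch (not part of the library): clustering nSamples points
// with nFeatures features into k groups and scoring the split. Buffer setup is the
// caller's responsibility; the names below are hypothetical.
//
//   std::vector<int> assignments(nSamples, 0);
//   // centroids must be a pre-allocated k x nFeatures array of rows
//   int iters = kMeansClustering(data, nSamples, nFeatures, k, 100,
//                                centroids, assignments.data());
//   double quality = calculateSilhouetteCoefficient(data, nSamples, nFeatures,
//                                                   assignments.data(), k);
//   // quality near 1 means tight, well-separated clusters; near 0 means overlap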
32
native/statistics/utils.h
Normal file
@@ -0,0 +1,32 @@
// utils.h
#ifndef UTILS_H
#define UTILS_H

#include "health_analytics_engine.h"
#include <random>
#include <limits>
#include <numeric>

// Declare all utility functions from utils.cpp
double calculateMean(const double* values, int length);
double calculateWeightedMean(const double* values, const double* weights, int length);
double calculateVariance(const double* values, int length, double mean);
double calculateStdDev(const double* values, int length, double mean);
double calculateMedian(double* values, int length);
double calculateCorrelation(const double* x, const double* y, int length);
double calculateSpearmanCorrelation(const double* x, const double* y, int length);
double calculateQuantile(const double* values, int length, double q);
double calculateIQR(const double* values, int length);
double calculateSkewness(const double* values, int length, double mean, double stdDev);
double calculateKurtosis(const double* values, int length, double mean, double stdDev);
bool calculateLinearRegression(const double* x, const double* y, int length, double& slope, double& intercept, double& r_squared);
double calculateAutocorrelation(const double* values, int length, int lag);
int detectOutliers(const double* values, int length, std::vector<int>& outlierIndices, double threshold);
void calculateMovingAverage(const double* values, int length, int window, double* result);
void calculateExponentialMovingAverage(const double* values, int length, double alpha, double* result);
bool decomposeTimeSeries(const double* values, int length, int seasonality, double* trend, double* seasonal, double* residual);
int calculatePACF(const double* values, int length, int maxLag, double* pacf);
int kMeansClustering(const double** data, int nSamples, int nFeatures, int k, int maxIter, double** centroids, int* assignments);
double calculateSilhouetteCoefficient(const double** data, int nSamples, int nFeatures, const int* assignments, int k);

#endif // UTILS_H