first commit - migrated from codeberg

Charlotte Croce 2025-04-20 11:17:03 -04:00
commit 5ead03e1f7
567 changed files with 102721 additions and 0 deletions


@@ -0,0 +1,494 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// anomaly_detection.cpp
// Implementation of anomaly and outlier detection functions
//
#include "health_analytics_engine.h"
/**
* @brief Detect anomalies in time series data
*
* @param timeSeries Time series data
* @param dataLength Length of time series
* @param threshold Z-score threshold to consider a point an anomaly
* @param dates Array of dates corresponding to time series points
* @param factorName Name of the factor being analyzed
* @return AnomalyResult* Array of detected anomalies
*/
AnomalyResult* detect_anomalies(const double* timeSeries,
int dataLength,
double threshold,
const DateStruct* dates,
const char* factorName) {
if (dataLength < 5) {
AnomalyResult* dummy = new AnomalyResult[1];
memset(dummy, 0, sizeof(AnomalyResult));
dummy[0].dataPointIndex = -1; // Mark as invalid
return dummy;
}
// Constants
const int MAX_RESULTS = 100;
// Allocate space for results
AnomalyResult* results = new AnomalyResult[MAX_RESULTS + 1];
int resultCount = 0;
// Decompose time series if enough data
std::vector<double> trend(dataLength);
std::vector<double> seasonal(dataLength);
std::vector<double> residual(dataLength);
bool hasDecomposition = false;
int seasonality = 0;
// Try to detect seasonality for decomposition
double maxAutocorr = 0;
for (int lag = 2; lag <= dataLength/3; lag++) {
double acf = calculateAutocorrelation(timeSeries, dataLength, lag);
if (acf > 0.3 && acf > maxAutocorr) {
maxAutocorr = acf;
seasonality = lag;
}
}
if (seasonality >= 2) {
hasDecomposition = decomposeTimeSeries(timeSeries, dataLength, seasonality,
trend.data(), seasonal.data(), residual.data());
}
if (!hasDecomposition) {
// Simple moving average for trend if decomposition failed
int windowSize = std::min(7, dataLength/3);
if (windowSize < 2) windowSize = 2;
calculateMovingAverage(timeSeries, dataLength, windowSize, trend.data());
// Residuals = original - trend
for (int i = 0; i < dataLength; i++) {
seasonal[i] = 0; // No seasonal component
residual[i] = timeSeries[i] - trend[i];
}
}
// Calculate residual statistics for outlier detection
double mean = 0, sumSquared = 0;
for (int i = 0; i < dataLength; i++) {
mean += residual[i];
}
mean /= dataLength;
for (int i = 0; i < dataLength; i++) {
double diff = residual[i] - mean;
sumSquared += diff * diff;
}
double stdDev = sqrt(sumSquared / dataLength);
if (stdDev <= 0) {
results[0].dataPointIndex = -1;
return results;
}
// Detect global outliers using z-score
for (int i = 0; i < dataLength && resultCount < MAX_RESULTS; i++) {
double zScore = (residual[i] - mean) / stdDev;
if (std::abs(zScore) > threshold) {
AnomalyResult& anomaly = results[resultCount++];
anomaly.dataPointIndex = i;
anomaly.anomalyScore = std::abs(zScore);
anomaly.originalValue = timeSeries[i];
anomaly.expectedValue = trend[i] + seasonal[i] + mean;
// Higher confidence for more extreme anomalies
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(zScore) - threshold) / 5.0);
// Copy date if available
if (dates != nullptr) {
anomaly.date = dates[i];
} else {
memset(&anomaly.date, 0, sizeof(DateStruct));
}
// Copy factor name
if (factorName != nullptr) {
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
} else {
anomaly.factorName[0] = '\0';
}
// Set anomaly type
anomaly.anomalyType = ANOMALY_OUTLIER;
// Generate description
if (zScore > 0) {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Unusually high value (%.2f standard deviations above expected)",
zScore);
} else {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Unusually low value (%.2f standard deviations below expected)",
-zScore);
}
}
}
// Detect context-based anomalies using local statistics
const int LOCAL_WINDOW = std::min(7, dataLength/5);
if (LOCAL_WINDOW >= 3) {
for (int i = LOCAL_WINDOW; i < dataLength - LOCAL_WINDOW && resultCount < MAX_RESULTS; i++) {
// Calculate local statistics
double localSum = 0, localSumSquared = 0;
for (int j = i - LOCAL_WINDOW; j <= i + LOCAL_WINDOW; j++) {
if (j != i) { // Exclude the point itself
localSum += timeSeries[j];
localSumSquared += timeSeries[j] * timeSeries[j];
}
}
double localMean = localSum / (2 * LOCAL_WINDOW);
double localVar = localSumSquared / (2 * LOCAL_WINDOW) - localMean * localMean;
double localStdDev = sqrt(std::max(localVar, 1e-6)); // Prevent division by zero
// Calculate local z-score
double localZScore = (timeSeries[i] - localMean) / localStdDev;
// Check if it's a local anomaly but not already a global anomaly
if (std::abs(localZScore) > threshold * 1.2) {
// Check if this point was already detected as a global anomaly
bool alreadyDetected = false;
for (int j = 0; j < resultCount; j++) {
if (results[j].dataPointIndex == i) {
alreadyDetected = true;
break;
}
}
if (!alreadyDetected) {
AnomalyResult& anomaly = results[resultCount++];
anomaly.dataPointIndex = i;
anomaly.anomalyScore = std::abs(localZScore);
anomaly.originalValue = timeSeries[i];
anomaly.expectedValue = localMean;
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(localZScore) - threshold) / 5.0);
// Copy date if available
if (dates != nullptr) {
anomaly.date = dates[i];
} else {
memset(&anomaly.date, 0, sizeof(DateStruct));
}
// Copy factor name
if (factorName != nullptr) {
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
} else {
anomaly.factorName[0] = '\0';
}
// Set anomaly type
anomaly.anomalyType = ANOMALY_CONTEXTUAL;
// Generate description
if (localZScore > 0) {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Context anomaly: value is high compared to local neighborhood (%.2f local std dev)",
localZScore);
} else {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Context anomaly: value is low compared to local neighborhood (%.2f local std dev)",
-localZScore);
}
}
}
}
}
// Detect trend changes
if (dataLength >= 10) {
for (int i = 5; i < dataLength - 5 && resultCount < MAX_RESULTS; i++) {
// Calculate slope before and after
double slopeBefore = 0, slopeAfter = 0;
double interceptBefore = 0, interceptAfter = 0;
double r2Before = 0, r2After = 0;
// Create time vectors
std::vector<double> time1(5), time2(5);
std::vector<double> values1(5), values2(5);
for (int j = 0; j < 5; j++) {
time1[j] = j;
time2[j] = j;
values1[j] = timeSeries[i - 5 + j];
values2[j] = timeSeries[i + j];
}
bool validBefore = calculateLinearRegression(time1.data(), values1.data(), 5,
slopeBefore, interceptBefore, r2Before);
bool validAfter = calculateLinearRegression(time2.data(), values2.data(), 5,
slopeAfter, interceptAfter, r2After);
if (validBefore && validAfter) {
// Check for significant change in slope
double slopeChange = slopeAfter - slopeBefore;
double meanSlope = (std::abs(slopeBefore) + std::abs(slopeAfter)) / 2;
if (meanSlope > 0 && std::abs(slopeChange) / meanSlope > 0.5) {
AnomalyResult& anomaly = results[resultCount++];
anomaly.dataPointIndex = i;
anomaly.anomalyScore = std::abs(slopeChange) / (meanSlope + 1e-6);
anomaly.originalValue = timeSeries[i];
anomaly.expectedValue = timeSeries[i]; // Same value, change is in the trend
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, anomaly.anomalyScore / 2.0);
// Copy date if available
if (dates != nullptr) {
anomaly.date = dates[i];
} else {
memset(&anomaly.date, 0, sizeof(DateStruct));
}
// Copy factor name
if (factorName != nullptr) {
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
} else {
anomaly.factorName[0] = '\0';
}
// Set anomaly type
anomaly.anomalyType = ANOMALY_TREND_CHANGE;
// Generate description
if (slopeBefore < 0 && slopeAfter > 0) {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Trend reversal: changed from decreasing (%.2f/day) to increasing (%.2f/day)",
-slopeBefore, slopeAfter);
} else if (slopeBefore > 0 && slopeAfter < 0) {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Trend reversal: changed from increasing (%.2f/day) to decreasing (%.2f/day)",
slopeBefore, -slopeAfter);
} else if (slopeAfter > slopeBefore) {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Trend acceleration: rate of change increased from %.2f/day to %.2f/day",
slopeBefore, slopeAfter);
} else {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Trend deceleration: rate of change decreased from %.2f/day to %.2f/day",
slopeBefore, slopeAfter);
}
}
}
}
}
// Sort anomalies by score (most significant first)
std::sort(results, results + resultCount,
[](const AnomalyResult& a, const AnomalyResult& b) {
return a.anomalyScore > b.anomalyScore;
});
// Mark the end of valid results
results[resultCount].dataPointIndex = -1;
return results;
}
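// Illustrative usage sketch (editorial addition, not part of the engine API):
// the result array is terminator-delimited, so callers walk it until
// dataPointIndex == -1 and then release it with free_anomaly_results().
// The threshold (3.0) and factor name here are arbitrary example values.
static int example_count_anomalies(const double* series, const DateStruct* dates, int n) {
    AnomalyResult* anomalies = detect_anomalies(series, n, 3.0, dates, "Heart Rate");
    int count = 0;
    while (anomalies[count].dataPointIndex != -1) {
        count++;
    }
    free_anomaly_results(anomalies);
    return count;
}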
/**
* @brief Free memory for anomaly detection results
*
* @param results Pointer to anomaly results array
*/
void free_anomaly_results(AnomalyResult* results) {
delete[] results;
}
/**
* @brief Analyze patterns related to dates (e.g., day of week effects)
*
* @param values Array of values
* @param dates Array of corresponding dates
* @param data_length Length of the arrays
* @param factor_name Name of the factor being analyzed
* @return DatePatternResult* Array of detected patterns
*/
DatePatternResult* analyze_date_patterns(
const double* values,
const DateStruct* dates,
int data_length,
const char* factor_name) {
// Allocate space for results (3 patterns max + terminator)
DatePatternResult* results = new DatePatternResult[4];
memset(results, 0, 4 * sizeof(DatePatternResult));
// Initialize terminator
results[0].patternType = PATTERN_NONE;
// Return empty result for insufficient data
if (data_length < 14) {
return results;
}
// Count occurrences by day of week
double dayOfWeekSums[7] = {0};
int dayOfWeekCounts[7] = {0};
for (int i = 0; i < data_length; i++) {
// Convert date to day of week (0 = Sunday, 6 = Saturday)
// via Zeller's congruence; assumes Gregorian calendar dates
int year = dates[i].year;
int month = dates[i].month;
int day = dates[i].day;
// Zeller's congruence for finding day of week
if (month < 3) {
month += 12;
year--;
}
int h = (day + (13 * (month + 1)) / 5 + year + year / 4 - year / 100 + year / 400) % 7;
// Convert to 0-based where 0 is Sunday
int dayOfWeek = (h + 6) % 7;
dayOfWeekSums[dayOfWeek] += values[i];
dayOfWeekCounts[dayOfWeek]++;
}
// Calculate average by day of week
double dayOfWeekAvgs[7] = {0};
for (int i = 0; i < 7; i++) {
if (dayOfWeekCounts[i] > 0) {
dayOfWeekAvgs[i] = dayOfWeekSums[i] / dayOfWeekCounts[i];
}
}
// Find peak day
int peakDay = 0;
double peakValue = dayOfWeekAvgs[0];
for (int i = 1; i < 7; i++) {
if (dayOfWeekAvgs[i] > peakValue) {
peakValue = dayOfWeekAvgs[i];
peakDay = i;
}
}
// Calculate variance across the day-of-week averages to determine
// whether there is a weekly pattern; only days that have data count
double mean = 0;
int daysWithData = 0;
for (int i = 0; i < 7; i++) {
    if (dayOfWeekCounts[i] > 0) {
        mean += dayOfWeekAvgs[i];
        daysWithData++;
    }
}
mean /= daysWithData; // data_length >= 14 guarantees at least one day
double variance = 0;
for (int i = 0; i < 7; i++) {
    if (dayOfWeekCounts[i] > 0) {
        double diff = dayOfWeekAvgs[i] - mean;
        variance += diff * diff;
    }
}
variance /= daysWithData;
// Calculate pattern strength
double strength = std::min(1.0, variance / (mean * mean + 0.001));
// Only report if pattern is significant
if (strength > 0.1) {
results[0].patternType = PATTERN_WEEKLY;
results[0].periodicity = 7;
results[0].strength = strength;
results[0].peakDayOfWeek = peakDay;
// Copy peak values
for (int i = 0; i < 7; i++) {
results[0].peakValues[i] = dayOfWeekAvgs[i];
}
// Generate description
const char* dayNames[] = {"Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"};
snprintf(results[0].description, MAX_STRING_SIZE,
"Weekly pattern detected with peak on %s (strength: %.2f)",
dayNames[peakDay], strength);
}
return results;
}
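// Illustrative usage sketch (editorial addition): patterns come back in a
// fixed 4-slot array whose unused slots hold PATTERN_NONE, so callers check
// patternType before reading the other fields. The factor name is arbitrary.
static bool example_has_weekly_pattern(const double* values, const DateStruct* dates, int n) {
    DatePatternResult* patterns = analyze_date_patterns(values, dates, n, "Sleep Quality");
    bool weekly = (patterns[0].patternType == PATTERN_WEEKLY);
    free_date_pattern_results(patterns);
    return weekly;
}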
/**
* @brief Free memory for date pattern results
*
* @param results Pointer to date pattern results array
*/
void free_date_pattern_results(DatePatternResult* results) {
delete[] results;
}
/**
* @brief Analyze cyclical patterns in time series data
*
* @param values Array of values
* @param dates Array of corresponding dates
* @param data_length Length of the arrays
* @param factor_name Name of the factor being analyzed
* @return CycleAnalysisResult Structure containing cycle analysis results
*/
CycleAnalysisResult analyze_cycles(
const double* values,
const DateStruct* dates,
int data_length,
const char* factor_name) {
CycleAnalysisResult result;
memset(&result, 0, sizeof(CycleAnalysisResult));
// Minimum data points required for cycle analysis
if (data_length < 20) {
strncpy(result.description, "Insufficient data for cycle analysis", MAX_STRING_SIZE-1);
return result;
}
// Simple autocorrelation-based cycle detection:
// find the peak in the autocorrelation function after lag 0
int maxLag = data_length / 3; // Look for cycles up to 1/3 of data length
double maxCorr = 0;
int bestLag = 0;
for (int lag = 2; lag < maxLag; lag++) {
    // Use the mean-centered, normalized autocorrelation helper; a raw
    // product sum would be dominated by the series' mean level and
    // report spurious "cycles" for any positive-valued series
    double corr = calculateAutocorrelation(values, data_length, lag);
    if (corr > maxCorr) {
        maxCorr = corr;
        bestLag = lag;
    }
}
// Calculate average cycle length in days
if (bestLag > 0 && maxCorr > 0.2) {
result.cycleLength = bestLag;
result.amplitude = maxCorr;
result.confidence = maxCorr;
result.cycleLengthVariance = bestLag * 0.2; // Simple estimation
snprintf(result.description, MAX_STRING_SIZE,
"Cycle detected with approximate length of %d days (confidence: %.2f)",
bestLag, maxCorr);
} else {
strncpy(result.description, "No significant cycle detected", MAX_STRING_SIZE-1);
}
return result;
}
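// Illustrative usage sketch (editorial addition): analyze_cycles returns by
// value, so there is nothing to free; a cycleLength of 0 means no cycle
// cleared the autocorrelation threshold. The factor name is arbitrary.
static bool example_has_cycle(const double* values, const DateStruct* dates, int n) {
    CycleAnalysisResult cycle = analyze_cycles(values, dates, n, "Energy");
    return cycle.cycleLength > 0;
}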


@@ -0,0 +1,9 @@
#pragma once
#ifdef __aarch64__
// Fix uintptr_t definition for ARM64
#include <stdint.h>
// We need to undefine and redefine uintptr_t to ensure it's 64-bit on ARM64
#undef uintptr_t
typedef uint64_t uintptr_t;
#endif


@@ -0,0 +1,47 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// basic_stats.cpp
// Implementation of basic statistical analysis functions
//
#include "health_analytics_engine.h"
#include "utils.h"
/**
* @brief Calculate basic statistical properties of a data series
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @return BasicStats Structure containing calculated statistics
*/
BasicStats calculate_basic_stats(const double* values, int length) {
BasicStats stats;
if (length == 0) {
memset(&stats, 0, sizeof(BasicStats));
return stats;
}
// Create a copy for calculations that require sorting
std::vector<double> sorted(values, values + length);
std::sort(sorted.begin(), sorted.end());
// Calculate basic statistics
stats.mean = calculateMean(values, length);
stats.variance = calculateVariance(values, length, stats.mean);
stats.stdDev = std::sqrt(stats.variance);
stats.min = sorted.front();
stats.max = sorted.back();
stats.median = (length % 2 == 0) ?
(sorted[length/2 - 1] + sorted[length/2]) / 2.0 : sorted[length/2];
// Calculate quartiles and IQR
stats.q1 = calculateQuantile(sorted.data(), length, 0.25);
stats.q3 = calculateQuantile(sorted.data(), length, 0.75);
stats.iqr = stats.q3 - stats.q1;
// Calculate higher-order statistics
stats.skewness = calculateSkewness(values, length, stats.mean, stats.stdDev);
stats.kurtosis = calculateKurtosis(values, length, stats.mean, stats.stdDev);
return stats;
}
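// Illustrative sketch (editorial addition): BasicStats is returned by value,
// so no free function is involved. The 1.5*IQR fence below is the usual
// Tukey convention, not something the engine itself applies.
static bool example_is_iqr_outlier(const double* values, int length, double candidate) {
    BasicStats stats = calculate_basic_stats(values, length);
    double lowFence = stats.q1 - 1.5 * stats.iqr;
    double highFence = stats.q3 + 1.5 * stats.iqr;
    return candidate < lowFence || candidate > highFence;
}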


@@ -0,0 +1,203 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// clustering.cpp
// Implementation of clustering and pattern recognition functions
//
#include "health_analytics_engine.h"
#include "utils.h"
/**
* @brief Perform cluster analysis on multivariate health data
*
* @param data 2D array of data points [n_samples x n_features]
* @param factorCount Number of features per data point
* @param dataLength Number of data points
* @param maxClusters Maximum number of clusters to identify
* @return ClusterResult* Array of cluster results
*/
ClusterResult* perform_cluster_analysis(const double** data,
int factorCount,
int dataLength,
int maxClusters) {
// Need at least 2 clusters: the k-means search below starts at k = 2,
// and the centroid arrays are sized by maxClusters
if (factorCount < 2 || dataLength < 5 || maxClusters < 2) {
ClusterResult* dummy = new ClusterResult[1];
memset(dummy, 0, sizeof(ClusterResult));
dummy[0].clusterId = -1; // Mark as invalid
return dummy;
}
// Normalize data for better clustering
std::vector<std::vector<double>> normalizedData(dataLength, std::vector<double>(factorCount));
std::vector<double> means(factorCount, 0);
std::vector<double> stdDevs(factorCount, 0);
// Calculate means
for (int j = 0; j < factorCount; j++) {
for (int i = 0; i < dataLength; i++) {
means[j] += data[i][j];
}
means[j] /= dataLength;
}
// Calculate standard deviations
for (int j = 0; j < factorCount; j++) {
for (int i = 0; i < dataLength; i++) {
double diff = data[i][j] - means[j];
stdDevs[j] += diff * diff;
}
stdDevs[j] = sqrt(stdDevs[j] / dataLength);
if (stdDevs[j] < 1e-10) stdDevs[j] = 1.0; // Avoid division by zero
}
// Normalize data
for (int i = 0; i < dataLength; i++) {
for (int j = 0; j < factorCount; j++) {
normalizedData[i][j] = (data[i][j] - means[j]) / stdDevs[j];
}
}
// Find optimal number of clusters (between 2 and maxClusters)
int optimalClusters = 2;
double bestSilhouette = -1;
// Arrays for k-means algorithm
std::vector<int> assignments(dataLength);
std::vector<std::vector<double>> centroids(maxClusters, std::vector<double>(factorCount));
std::vector<const double*> normalizedDataPtrs(dataLength);
for (int i = 0; i < dataLength; i++) {
normalizedDataPtrs[i] = normalizedData[i].data();
}
for (int k = 2; k <= maxClusters; k++) {
// Run k-means clustering
std::vector<int> tempAssignments(dataLength, 0);
std::vector<std::vector<double>> tempCentroids(k, std::vector<double>(factorCount, 0));
std::vector<double*> centroidPtrs(k);
for (int i = 0; i < k; i++) {
centroidPtrs[i] = tempCentroids[i].data();
}
kMeansClustering(normalizedDataPtrs.data(), dataLength, factorCount, k,
100, centroidPtrs.data(), tempAssignments.data());
// Calculate silhouette coefficient
std::vector<const double*> dataPtrs(dataLength);
for (int i = 0; i < dataLength; i++) {
dataPtrs[i] = data[i]; // Use original data for silhouette
}
double silhouette = calculateSilhouetteCoefficient(
dataPtrs.data(), dataLength, factorCount, tempAssignments.data(), k);
// Update if better silhouette found
if (silhouette > bestSilhouette) {
bestSilhouette = silhouette;
optimalClusters = k;
assignments = tempAssignments;
centroids = tempCentroids;
}
}
// Allocate cluster results (plus one for terminator)
ClusterResult* results = new ClusterResult[optimalClusters + 1];
// Count points in each cluster
std::vector<int> clusterSizes(optimalClusters, 0);
for (int i = 0; i < dataLength; i++) {
clusterSizes[assignments[i]]++;
}
// Calculate cluster statistics
for (int c = 0; c < optimalClusters; c++) {
ClusterResult& cluster = results[c];
cluster.clusterId = c;
cluster.dataPointCount = clusterSizes[c];
// Calculate cluster significance based on size and compactness
double avgDistance = 0;
int count = 0;
for (int i = 0; i < dataLength; i++) {
if (assignments[i] == c) {
double dist = 0;
for (int j = 0; j < factorCount; j++) {
double diff = normalizedData[i][j] - centroids[c][j];
dist += diff * diff;
}
avgDistance += sqrt(dist);
count++;
}
}
if (count > 0) {
avgDistance /= count;
}
// Higher significance for larger and more compact clusters
double sizeFactor = static_cast<double>(count) / dataLength;
double compactnessFactor = 1.0 - std::min(1.0, avgDistance / 3.0);
cluster.significance = sizeFactor * compactnessFactor;
// Identify important factors for this cluster
std::vector<std::pair<int, double>> factorImportance;
for (int j = 0; j < factorCount; j++) {
// Calculate how different this centroid is from global mean for this factor
double differenceFromMean = std::abs(centroids[c][j]);
factorImportance.push_back(std::make_pair(j, differenceFromMean));
}
// Sort factors by importance
std::sort(factorImportance.begin(), factorImportance.end(),
[](const std::pair<int, double>& a, const std::pair<int, double>& b) {
return a.second > b.second;
});
// Store top factors (up to 3)
int numFactors = std::min(3, factorCount);
for (int j = 0; j < numFactors; j++) {
    cluster.factorIndices[j] = factorImportance[j].first;
    cluster.factorWeights[j] = std::min(1.0, factorImportance[j].second);
}
// Normalize weights to sum to 1, only after all of them are filled in;
// normalizing inside the loop above would read uninitialized entries
double totalWeight = 0;
for (int k = 0; k < numFactors; k++) {
    totalWeight += cluster.factorWeights[k];
}
if (totalWeight > 0) {
    for (int k = 0; k < numFactors; k++) {
        cluster.factorWeights[k] /= totalWeight;
    }
}
// Generate descriptive cluster name based on key factors
// In a real implementation, this would use domain knowledge
snprintf(cluster.clusterName, MAX_STRING_SIZE,
"Cluster %d - %s", c + 1,
(cluster.significance > 0.7) ? "High Significance" :
(cluster.significance > 0.4) ? "Medium Significance" : "Low Significance");
// Generate detailed description
snprintf(cluster.description, MAX_STRING_SIZE,
"Cluster of %d data points characterized by %s factors. Silhouette: %.2f",
cluster.dataPointCount,
(numFactors > 0) ? "multiple interrelated" : "non-specific",
bestSilhouette);
}
// Mark the end of valid results
results[optimalClusters].clusterId = -1;
return results;
}
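// Illustrative usage sketch (editorial addition): cluster results use the
// same terminator convention as the other result arrays (clusterId == -1),
// and significance is a 0..1 blend of cluster size and compactness. The
// cap of 5 clusters is an arbitrary example value.
static int example_count_significant_clusters(const double** data, int factorCount,
                                              int dataLength, double minSignificance) {
    ClusterResult* clusters = perform_cluster_analysis(data, factorCount, dataLength, 5);
    int count = 0;
    for (int i = 0; clusters[i].clusterId != -1; i++) {
        if (clusters[i].significance >= minSignificance) count++;
    }
    free_cluster_results(clusters);
    return count;
}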
/**
* @brief Free memory for cluster analysis results
*
* @param results Pointer to cluster results array
*/
void free_cluster_results(ClusterResult* results) {
delete[] results;
}


@@ -0,0 +1,369 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// correlation.cpp
// Implementation of correlation analysis functions
//
#include "health_analytics_engine.h"
#include "utils.h"
/**
* @brief Find factors with strongest correlation to target variable
*
* @param target Target variable time series
* @param factors Array of factor time series
* @param factorNames Array of factor names
* @param targetLength Length of target time series
* @param factorCount Number of factors
* @return CorrelationResult* Array of correlation results (sorted by strength)
*/
CorrelationResult* find_strongest_correlations(const double* target,
const double** factors,
const char** factorNames,
int targetLength,
int factorCount) {
// Allocate results array (one more than needed to mark the end)
CorrelationResult* results = new CorrelationResult[factorCount + 1];
// Guard against series too short to estimate a correlation or p-value
if (targetLength < 3) {
    results[0].factorIndex = -1;
    results[0].correlation = 0;
    results[0].confidence = 0;
    results[0].factorName[0] = '\0';
    return results;
}
// Calculate correlations for each factor
for (int i = 0; i < factorCount; i++) {
// Calculate both Pearson and Spearman correlations
double pearson_corr = calculateCorrelation(target, factors[i], targetLength);
double spearman_corr = calculateSpearmanCorrelation(target, factors[i], targetLength);
// Use the correlation with higher absolute value
double corr = (std::abs(pearson_corr) > std::abs(spearman_corr)) ?
pearson_corr : spearman_corr;
results[i].factorIndex = i;
results[i].correlation = corr;
// Estimate p-value based on correlation and sample size
// This is an approximation; a real implementation would use the t-distribution
double t = corr * std::sqrt((targetLength - 2) / (1 - corr * corr + 1e-12));
// Two-tailed p-value via the exponential tail approximation
// p ~= exp(-0.717*|t| - 0.416*t^2), which is 1 at t=0 and ~0.05 at |t|=1.96
results[i].pValue = std::min(1.0, std::exp(-0.717 * std::abs(t) - 0.416 * t * t));
// Copy factor name
strncpy(results[i].factorName, factorNames[i], MAX_STRING_SIZE - 1);
results[i].factorName[MAX_STRING_SIZE - 1] = '\0';
// Calculate confidence based on sample size, correlation strength, and p-value
double sample_size_factor = 1.0 - 1.0 / std::sqrt(targetLength);
double p_value_factor = 1.0 - results[i].pValue;
results[i].confidence = std::abs(corr) * sample_size_factor * p_value_factor;
}
// Sort by absolute correlation value
std::sort(results, results + factorCount,
[](const CorrelationResult& a, const CorrelationResult& b) {
return std::abs(a.correlation) > std::abs(b.correlation);
});
// Mark the end of valid results
results[factorCount].factorIndex = -1;
results[factorCount].correlation = 0;
results[factorCount].confidence = 0;
results[factorCount].factorName[0] = '\0';
return results;
}
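// Illustrative usage sketch (editorial addition): results arrive sorted by
// |correlation| and are terminated by factorIndex == -1, so the first entry
// passing a significance cutoff is the strongest usable association. The
// p < 0.05 cutoff is an example convention, not an engine rule.
static int example_strongest_significant_factor(const double* target, const double** factors,
                                                const char** names, int length, int factorCount) {
    CorrelationResult* corr = find_strongest_correlations(target, factors, names,
                                                          length, factorCount);
    int best = -1;
    for (int i = 0; corr[i].factorIndex != -1; i++) {
        if (corr[i].pValue < 0.05) {
            best = corr[i].factorIndex;
            break;
        }
    }
    free_correlation_results(corr);
    return best;
}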
/**
* @brief Free memory for correlation results
*
* @param results Pointer to correlation results array
*/
void free_correlation_results(CorrelationResult* results) {
delete[] results;
}
/**
* @brief Find multivariate correlations between factors
*
* @param data Array of factor time series
* @param factorNames Array of factor names
* @param factorCount Number of factors
* @param dataLength Length of time series
* @return MultivariateCorrelation* Array of multivariate correlation results
*/
MultivariateCorrelation* find_multivariate_correlations(const double** data,
const char** factorNames,
int factorCount,
int dataLength) {
if (factorCount < 2 || dataLength < 3) {
MultivariateCorrelation* dummy = new MultivariateCorrelation[1];
memset(dummy, 0, sizeof(MultivariateCorrelation));
dummy[0].factorCount = -1; // Mark as invalid
return dummy;
}
// Constants
const int MAX_RESULTS = 100;
const double MIN_CORRELATION_THRESHOLD = 0.3;
// Allocate space for up to MAX_RESULTS correlations + 1 terminator
MultivariateCorrelation* results = new MultivariateCorrelation[MAX_RESULTS + 1];
int resultCount = 0;
// Create correlation matrix for all pairwise correlations
std::vector<std::vector<double>> corrMatrix(factorCount, std::vector<double>(factorCount, 0));
for (int i = 0; i < factorCount; i++) {
corrMatrix[i][i] = 1.0; // Self-correlation is 1
for (int j = i + 1; j < factorCount; j++) {
// Calculate correlation using both Pearson and Spearman
double pearson = calculateCorrelation(data[i], data[j], dataLength);
double spearman = calculateSpearmanCorrelation(data[i], data[j], dataLength);
// Use the one with higher absolute value
double corr = (std::abs(pearson) > std::abs(spearman)) ? pearson : spearman;
// Store in both positions (symmetric matrix)
corrMatrix[i][j] = corr;
corrMatrix[j][i] = corr;
}
}
// Find pairwise correlations
for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) {
for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) {
double corr = corrMatrix[i][j];
// Only store significant correlations
if (std::abs(corr) >= MIN_CORRELATION_THRESHOLD) {
MultivariateCorrelation& result = results[resultCount];
result.factorCount = 2;
result.correlationStrength = corr;
result.primaryFactorIndex = i;
result.secondaryFactorIndex = j;
result.tertiaryFactorIndex = -1;
// Calculate confidence based on sample size and correlation strength
double t_stat = std::abs(corr) * std::sqrt((dataLength - 2) / (1 - corr * corr + 1e-12));
// Same two-tailed tail approximation as in find_strongest_correlations
double p_value = std::min(1.0, std::exp(-0.717 * t_stat - 0.416 * t_stat * t_stat));
result.confidence = std::abs(corr) * (1.0 - p_value) * (1.0 - 1.0 / std::sqrt(dataLength));
// Copy factor names
strncpy(result.factorNames[0], factorNames[i], MAX_STRING_SIZE - 1);
result.factorNames[0][MAX_STRING_SIZE - 1] = '\0';
strncpy(result.factorNames[1], factorNames[j], MAX_STRING_SIZE - 1);
result.factorNames[1][MAX_STRING_SIZE - 1] = '\0';
// Set factor weights based on correlation directionality
if (corr > 0) {
result.factorWeights[0] = 0.5;
result.factorWeights[1] = 0.5;
} else {
result.factorWeights[0] = 0.5;
result.factorWeights[1] = -0.5;
}
// Determine relationship type
result.relationshipType = RELATIONSHIP_CORRELATION;
// Generate description
const char* strength_text =
(std::abs(corr) > 0.7) ? "strong" :
(std::abs(corr) > 0.5) ? "moderate" : "weak";
const char* direction_text = (corr > 0) ? "positive" : "negative";
snprintf(result.description, MAX_STRING_SIZE,
         "There is a %s %s correlation between %s and %s (r=%.2f, p=%.3f)",
         strength_text, direction_text,
         factorNames[i], factorNames[j], corr, p_value);
resultCount++;
}
}
}
// Find partial correlations and three-way relationships
if (factorCount >= 3) {
// Calculate partial correlations
std::vector<std::vector<std::vector<double>>> partialCorr(
factorCount, std::vector<std::vector<double>>(
factorCount, std::vector<double>(factorCount, 0.0)));
for (int i = 0; i < factorCount; i++) {
for (int j = i + 1; j < factorCount; j++) {
for (int k = 0; k < factorCount; k++) {
if (k == i || k == j) continue;
// Calculate partial correlation between i and j controlling for k
double r_ij = corrMatrix[i][j];
double r_ik = corrMatrix[i][k];
double r_jk = corrMatrix[j][k];
double denominator = std::sqrt((1 - r_ik * r_ik) * (1 - r_jk * r_jk));
if (denominator > 1e-10) {
double partial = (r_ij - r_ik * r_jk) / denominator;
partialCorr[i][j][k] = partial;
partialCorr[j][i][k] = partial;
}
}
}
}
// Find three-way relationships
for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) {
for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) {
for (int k = j + 1; k < factorCount && resultCount < MAX_RESULTS; k++) {
// Get pairwise correlations
double r_ij = corrMatrix[i][j];
double r_ik = corrMatrix[i][k];
double r_jk = corrMatrix[j][k];
// Check if all pairs are correlated
if (std::abs(r_ij) >= MIN_CORRELATION_THRESHOLD &&
std::abs(r_ik) >= MIN_CORRELATION_THRESHOLD &&
std::abs(r_jk) >= MIN_CORRELATION_THRESHOLD) {
// Get partial correlations
double p_ij_k = partialCorr[i][j][k]; // i,j controlling for k
double p_ik_j = partialCorr[i][k][j]; // i,k controlling for j
double p_jk_i = partialCorr[j][k][i]; // j,k controlling for i
// Determine if mediation or confounding is present
bool is_mediation = false;
int mediator = -1;
// Check if k mediates i->j
if (std::abs(p_ij_k) < std::abs(r_ij) * 0.5) {
is_mediation = true;
mediator = k;
}
// Check if j mediates i->k
else if (std::abs(p_ik_j) < std::abs(r_ik) * 0.5) {
is_mediation = true;
mediator = j;
}
// Check if i mediates j->k
else if (std::abs(p_jk_i) < std::abs(r_jk) * 0.5) {
is_mediation = true;
mediator = i;
}
MultivariateCorrelation& result = results[resultCount];
result.factorCount = 3;
// Use average correlation as strength
result.correlationStrength = (std::abs(r_ij) + std::abs(r_ik) + std::abs(r_jk)) / 3.0;
result.primaryFactorIndex = i;
result.secondaryFactorIndex = j;
result.tertiaryFactorIndex = k;
// Lower confidence for three-way relationships
result.confidence = result.correlationStrength *
(1.0 - 2.0 / std::sqrt(dataLength));
// Copy factor names
strncpy(result.factorNames[0], factorNames[i], MAX_STRING_SIZE - 1);
result.factorNames[0][MAX_STRING_SIZE - 1] = '\0';
strncpy(result.factorNames[1], factorNames[j], MAX_STRING_SIZE - 1);
result.factorNames[1][MAX_STRING_SIZE - 1] = '\0';
strncpy(result.factorNames[2], factorNames[k], MAX_STRING_SIZE - 1);
result.factorNames[2][MAX_STRING_SIZE - 1] = '\0';
// Set relationship type
if (is_mediation) {
result.relationshipType = RELATIONSHIP_MEDIATION;
// Set weights based on mediation path
if (mediator == i) {
result.factorWeights[0] = 0.5; // Mediator
result.factorWeights[1] = 0.3; // Source
result.factorWeights[2] = 0.3; // Target
} else if (mediator == j) {
result.factorWeights[0] = 0.3; // Source
result.factorWeights[1] = 0.5; // Mediator
result.factorWeights[2] = 0.3; // Target
} else {
result.factorWeights[0] = 0.3; // Source
result.factorWeights[1] = 0.3; // Target
result.factorWeights[2] = 0.5; // Mediator
}
// Generate description for mediation
snprintf(result.description, MAX_STRING_SIZE,
"Potential mediation detected: %s may mediate the relationship between %s and %s",
factorNames[mediator],
factorNames[(mediator+1) % 3],
factorNames[(mediator+2) % 3]);
} else {
result.relationshipType = RELATIONSHIP_NETWORK;
// Equal weights for network relationship
result.factorWeights[0] = 0.33;
result.factorWeights[1] = 0.33;
result.factorWeights[2] = 0.33;
// Generate description for network
snprintf(result.description, MAX_STRING_SIZE,
"Network relationship detected between %s, %s, and %s (average correlation: %.2f)",
factorNames[i], factorNames[j], factorNames[k],
result.correlationStrength);
}
resultCount++;
}
}
}
}
}
// Mark the end of valid results (resultCount never exceeds MAX_RESULTS,
// and the array has MAX_RESULTS + 1 slots)
results[resultCount].factorCount = -1;
return results;
}
/**
* @brief Free memory for multivariate correlation results
*
* @param results Pointer to multivariate correlation results array
*/
void free_multivariate_correlations(MultivariateCorrelation* results) {
delete[] results;
}
/**
* @brief Direct API access to correlation calculation
*
* @param x First data series
* @param y Second data series
* @param length Length of data series
* @return double Correlation coefficient
*/
double calculate_correlation(const double* x, const double* y, int length) {
if (length <= 1) return 0;
double sum_x = 0, sum_y = 0, sum_xy = 0;
double sum_x2 = 0, sum_y2 = 0;
for (int i = 0; i < length; i++) {
sum_x += x[i];
sum_y += y[i];
sum_xy += x[i] * y[i];
sum_x2 += x[i] * x[i];
sum_y2 += y[i] * y[i];
}
double denominator = sqrt((length * sum_x2 - sum_x * sum_x) *
(length * sum_y2 - sum_y * sum_y));
if (denominator < 1e-10) return 0; // Avoid division by zero
return (length * sum_xy - sum_x * sum_y) / denominator;
}
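// Worked example (editorial addition): x and y below are exactly linearly
// related, so the coefficient is 1.0 up to floating-point rounding;
// negating y would flip it to -1.0.
static double example_perfect_correlation() {
    const double x[] = {1, 2, 3, 4};
    const double y[] = {2, 4, 6, 8};
    return calculate_correlation(x, y, 4); // ~= 1.0
}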


@@ -0,0 +1,30 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// health_analytics.cpp
// Comprehensive C++ analytics engine for health data processing
// Provides statistical analysis, pattern detection, and predictive modeling for health metrics
//
#include "health_analytics_engine.h"
#include <vector>
#include <string>
#include <cmath>
#include <algorithm>
#include <map>
#include <set>
#include <unordered_map>
#include <memory>
#include <cstring>
#include <ctime>
#include <numeric>
#include <random>
#include <limits>
// Include all module files
#include "utils.cpp"
#include "basic_stats.cpp"
#include "correlation.cpp"
#include "time_series.cpp"
#include "clustering.cpp"
#include "anomaly_detection.cpp"
#include "impact_analysis.cpp"


@@ -0,0 +1,316 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// health_analytics_engine.h
// Core C++ header for health analytics calculations
//
#ifndef HEALTH_ANALYTICS_ENGINE_H
#define HEALTH_ANALYTICS_ENGINE_H
#include <vector>
#include <string>
#include <cstring>
#include <map>
#include <unordered_map>
#include <algorithm>
#include <cmath>
#include <memory>
#include <cstdint>
// Constants
constexpr int MAX_STRING_SIZE = 200;
constexpr int MAX_FACTORS = 50; // Maximum factors a single result struct can reference
// Forward declarations
struct DateStruct;
struct BasicStats;
struct TrendResult;
struct CorrelationResult;
struct MultivariateCorrelation;
struct ClusterResult;
struct TimeSeriesForecast;
struct AnomalyResult;
struct FactorImpactResult;
struct DatePatternResult;
struct CycleAnalysisResult;
struct MedicationImpactAnalysis;
struct HormoneImpactAnalysis;
// Data structures for FFI communication
struct DateStruct {
int year;
int month;
int day;
};
struct BasicStats {
double mean;
double median;
double min;
double max;
double stdDev;
double variance;
double skewness;
double kurtosis;
double q1;
double q3;
double iqr;
};
enum TrendType {
TREND_NONE = 0,
TREND_INCREASING = 1,
TREND_DECREASING = 2,
TREND_CYCLIC = 3,
TREND_VARIABLE = 4
};
struct TrendResult {
TrendType trendType;
double strength;
char description[MAX_STRING_SIZE];
};
struct CorrelationResult {
int factorIndex;
double correlation;
double pValue;
double confidence;
char factorName[MAX_STRING_SIZE];
};
enum RelationshipType {
RELATIONSHIP_CORRELATION = 0,
RELATIONSHIP_CAUSATION = 1,
RELATIONSHIP_COINCIDENTAL = 2,
RELATIONSHIP_MEDIATION = 3,
RELATIONSHIP_NETWORK = 4
};
struct MultivariateCorrelation {
int factorCount;
char factorNames[MAX_FACTORS][MAX_STRING_SIZE];
double factorWeights[MAX_FACTORS];
double correlationStrength;
char description[MAX_STRING_SIZE];
double confidence;
RelationshipType relationshipType;
int primaryFactorIndex;
int secondaryFactorIndex;
int tertiaryFactorIndex;
};
struct ClusterResult {
int clusterId;
char clusterName[MAX_STRING_SIZE];
char description[MAX_STRING_SIZE];
int dataPointCount;
double significance;
int factorIndices[MAX_FACTORS];
double factorWeights[MAX_FACTORS];
double centroid[MAX_FACTORS];
double radius;
};
enum TimeUnit {
TIME_UNIT_DAYS = 0,
TIME_UNIT_WEEKS = 1,
TIME_UNIT_MONTHS = 2
};
struct TimeSeriesForecast {
double predictions[30];
double confidenceIntervals[30][2];
double overallConfidence;
int seasonalityPeriod;
TimeUnit timeUnit;
char factorName[MAX_STRING_SIZE];
};
enum AnomalyType {
ANOMALY_OUTLIER = 0,
ANOMALY_TREND_CHANGE = 1,
ANOMALY_SEASONALITY_CHANGE = 2,
ANOMALY_CONTEXTUAL = 3
};
struct AnomalyResult {
int dataPointIndex;
double anomalyScore;
char description[MAX_STRING_SIZE];
double originalValue;
double expectedValue;
DateStruct date;
double confidence;
AnomalyType anomalyType;
char factorName[MAX_STRING_SIZE];
};
struct FactorImpactResult {
int factorIndex;
char factorName[MAX_STRING_SIZE];
double impactScore;
double directEffect;
double indirectEffect;
double confidence;
char mechanism[MAX_STRING_SIZE];
};
enum PatternType {
PATTERN_NONE = 0,
PATTERN_DAILY = 1,
PATTERN_WEEKLY = 2,
PATTERN_MONTHLY = 3,
PATTERN_CUSTOM = 4
};
struct DatePatternResult {
PatternType patternType;
int periodicity;
double strength;
char description[MAX_STRING_SIZE];
double peakValues[7]; // For weekly patterns
int peakDayOfWeek; // 0-6, where 0 is Sunday
int peakDayOfMonth; // 1-31
int peakMonth; // 1-12
};
struct CycleAnalysisResult {
double cycleLength;
double cycleLengthVariance;
double amplitude;
double phaseShift;
double confidence;
char description[MAX_STRING_SIZE];
};
struct MedicationImpactAnalysis {
char medicationName[MAX_STRING_SIZE];
double beforeMean;
double afterMean;
double changeMagnitude;
double changeSignificance;
double overallImpact;
int daysToEffect;
char description[MAX_STRING_SIZE];
char factorName[MAX_STRING_SIZE];
};
struct HormoneImpactAnalysis {
char hormoneName[MAX_STRING_SIZE];
double currentLevel;
double optimalLevel;
double optimalRangeLower;
double optimalRangeUpper;
double deviation;
double impactOnMood;
double impactOnEnergy;
double impactOnOtherFactors[MAX_FACTORS];
char factorNames[MAX_FACTORS][MAX_STRING_SIZE];
char description[MAX_STRING_SIZE];
};
// Removed the HealthAnalyticsEngine class since it's not implemented
// C-style API for FFI
extern "C" {
// Basic statistics
BasicStats calculate_basic_stats(const double* values, int length);
// Trend analysis
TrendType detect_trend(const double* values, int length, double* strength_out);
// Correlation analysis
double calculate_correlation(const double* x, const double* y, int length);
CorrelationResult* find_strongest_correlations(
const double* target_values,
const double** factor_values,
const char** factor_names,
int data_length,
int factor_count);
void free_correlation_results(CorrelationResult* results);
// Multivariate analysis
MultivariateCorrelation* find_multivariate_correlations(
const double** factor_data,
const char** factor_names,
int factor_count,
int data_length);
void free_multivariate_correlations(MultivariateCorrelation* correlations);
// Cluster analysis
ClusterResult* perform_cluster_analysis(
const double** factor_data,
int factor_count,
int data_length,
int max_clusters);
void free_cluster_results(ClusterResult* results);
// Time series forecasting
TimeSeriesForecast predict_time_series(
const double* time_series_data,
int data_length,
int steps_ahead,
const char* factor_name);
// Anomaly detection
AnomalyResult* detect_anomalies(
const double* time_series_data,
int data_length,
double threshold,
const DateStruct* dates,
const char* factor_name);
void free_anomaly_results(AnomalyResult* results);
// Factor impact ranking
FactorImpactResult* rank_factor_impacts(
const double** factor_data,
const double* target_data,
const char** factor_names,
int factor_count,
int data_length);
void free_factor_impact_results(FactorImpactResult* results);
// Date pattern analysis
DatePatternResult* analyze_date_patterns(
const double* values,
const DateStruct* dates,
int data_length,
const char* factor_name);
void free_date_pattern_results(DatePatternResult* results);
// Cycle analysis
CycleAnalysisResult analyze_cycles(
const double* values,
const DateStruct* dates,
int data_length,
const char* factor_name);
// Medication impact analysis
MedicationImpactAnalysis* analyze_medication_impact(
const double* before_data,
int before_length,
const double* after_data,
int after_length,
const char* medication_name,
const char* factor_name);
void free_medication_impact_analysis(MedicationImpactAnalysis* analysis);
// Hormone impact analysis
HormoneImpactAnalysis* analyze_hormone_impact(
const double* hormone_levels,
int data_length,
const double** factor_data,
const char** factor_names,
int factor_count,
const char* hormone_name,
double min_optimal_level,
double max_optimal_level);
void free_hormone_impact_analysis(HormoneImpactAnalysis* analysis);
}
// Utility functions
void normalize_data(const double* data, int length, double minValue, double maxValue, double* normalizedData);
int detect_change_points(const double* data, int length, double threshold, int* changePoints, int maxChangePoints);
void optimize_svr_parameters(const double** x_data, const double* y_data, int length,
double& bestC, double& bestEpsilon, double& bestGamma, double& bestScore);
#endif // HEALTH_ANALYTICS_ENGINE_H


@@ -0,0 +1,361 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// impact_analysis.cpp
// Implementation of factor impact and medication analysis functions
//
#include "health_analytics_engine.h"
#include "utils.h"
/**
* @brief Rank factors by their impact on a target variable
*
* @param factors Array of factor time series
* @param target Target variable time series
* @param factorNames Array of factor names
* @param factorCount Number of factors
* @param dataLength Length of time series
* @return FactorImpactResult* Array of factor impact results
*/
FactorImpactResult* rank_factor_impacts(const double** factors,
const double* target,
const char** factorNames,
int factorCount,
int dataLength) {
if (factorCount <= 0 || dataLength <= 2) {
FactorImpactResult* dummy = new FactorImpactResult[1];
memset(dummy, 0, sizeof(FactorImpactResult));
dummy[0].factorIndex = -1; // Mark as invalid
return dummy;
}
// Allocate space for results (plus one for terminator)
FactorImpactResult* results = new FactorImpactResult[factorCount + 1];
// Calculate correlation matrix for all factors + target
std::vector<std::vector<double>> corrMatrix(factorCount + 1, std::vector<double>(factorCount + 1, 0));
for (int i = 0; i < factorCount; i++) {
// Correlation between factor i and target
corrMatrix[i][factorCount] = calculateCorrelation(factors[i], target, dataLength);
corrMatrix[factorCount][i] = corrMatrix[i][factorCount];
// Correlations between factors
for (int j = i + 1; j < factorCount; j++) {
corrMatrix[i][j] = calculateCorrelation(factors[i], factors[j], dataLength);
corrMatrix[j][i] = corrMatrix[i][j];
}
// Self-correlation is 1
corrMatrix[i][i] = 1.0;
}
corrMatrix[factorCount][factorCount] = 1.0;
// Calculate the impact of each factor
for (int i = 0; i < factorCount; i++) {
FactorImpactResult& impact = results[i];
// Direct effect is correlation with target
double directEffect = corrMatrix[i][factorCount];
// Calculate indirect effects through other factors
double indirectEffect = 0;
for (int j = 0; j < factorCount; j++) {
if (j != i) {
// Indirect effect through factor j
indirectEffect += corrMatrix[i][j] * corrMatrix[j][factorCount];
}
}
// Normalize indirect effect
indirectEffect /= std::max(1, factorCount - 1);
// Calculate partial correlation (direct effect controlling for other factors)
// This is a simplified approach - real implementation would use matrix operations
double partialCorr = directEffect;
if (factorCount > 1) {
double sumControlledVar = 0;
for (int j = 0; j < factorCount; j++) {
if (j != i) {
// Remove effect of factor j from both target and factor i
double controlEffect = corrMatrix[i][j] * corrMatrix[j][factorCount];
partialCorr -= controlEffect / (factorCount - 1);
sumControlledVar += corrMatrix[i][j] * corrMatrix[i][j];
}
}
sumControlledVar /= (factorCount - 1);
// Normalize partial correlation
if (sumControlledVar < 0.98) { // Avoid division by near-zero
partialCorr /= sqrt((1 - sumControlledVar));
}
}
// Calculate total impact score
impact.factorIndex = i;
impact.directEffect = directEffect;
impact.indirectEffect = indirectEffect;
// Total impact is weighted sum of direct and partial correlation
double partialWeight = 0.7; // Weight more toward direct unique contribution
impact.impactScore = partialWeight * std::abs(partialCorr) + (1 - partialWeight) * std::abs(directEffect);
// Calculate confidence based on correlation strength and sample size
double t_stat = std::abs(directEffect) * std::sqrt((dataLength - 2) / (1 - directEffect * directEffect + 1e-12));
// Same two-tailed tail approximation used in the correlation module
double p_value = std::min(1.0, std::exp(-0.717 * t_stat - 0.416 * t_stat * t_stat));
impact.confidence = std::min(0.95, (1.0 - p_value) * (1.0 - 1.0 / std::sqrt(dataLength)));
// Copy factor name
strncpy(impact.factorName, factorNames[i], MAX_STRING_SIZE - 1);
impact.factorName[MAX_STRING_SIZE - 1] = '\0';
// Generate mechanism description based on direct and indirect effects
const char* direction = (directEffect > 0) ? "positive" : "negative";
const char* strength =
(std::abs(directEffect) > 0.7) ? "strong" :
(std::abs(directEffect) > 0.4) ? "moderate" : "weak";
// Check for mediation effects
bool hasMediationEffect = std::abs(indirectEffect) > 0.2 &&
std::abs(indirectEffect) > std::abs(directEffect) * 0.5;
if (hasMediationEffect) {
snprintf(impact.mechanism, MAX_STRING_SIZE,
"%s %s impact: %s affects the target both directly (%.2f) and through other factors (%.2f)",
strength, direction, factorNames[i], directEffect, indirectEffect);
} else {
snprintf(impact.mechanism, MAX_STRING_SIZE,
"%s %s impact: changes in %s are %s associated with changes in the target (r=%.2f)",
strength, direction, factorNames[i], direction, directEffect);
}
}
// Sort by impact score (descending)
std::sort(results, results + factorCount,
[](const FactorImpactResult& a, const FactorImpactResult& b) {
return a.impactScore > b.impactScore;
});
// Mark the end of valid results
results[factorCount].factorIndex = -1;
return results;
}
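// Illustrative usage sketch (editorial addition): impacts come back sorted
// by impactScore with a factorIndex == -1 terminator, so the top-ranked
// factor is simply the first entry (or -1 when the inputs were rejected).
static int example_top_impact_factor(const double** factors, const double* target,
                                     const char** names, int factorCount, int dataLength) {
    FactorImpactResult* impacts = rank_factor_impacts(factors, target, names,
                                                      factorCount, dataLength);
    int top = impacts[0].factorIndex;
    free_factor_impact_results(impacts);
    return top;
}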
/**
* @brief Free memory for factor impact results
*
* @param results Pointer to factor impact results array
*/
void free_factor_impact_results(FactorImpactResult* results) {
delete[] results;
}
/**
* @brief Analyze the impact of medication on a health metric
*
* @param before_data Values before medication
* @param before_length Length of before data
* @param after_data Values after medication
* @param after_length Length of after data
* @param medication_name Name of the medication
* @param factor_name Name of the health factor
* @return MedicationImpactAnalysis* Pointer to impact analysis result
*/
MedicationImpactAnalysis* analyze_medication_impact(
const double* before_data,
int before_length,
const double* after_data,
int after_length,
const char* medication_name,
const char* factor_name) {
MedicationImpactAnalysis* result = new MedicationImpactAnalysis();
memset(result, 0, sizeof(MedicationImpactAnalysis));
// Copy names
strncpy(result->medicationName, medication_name, MAX_STRING_SIZE-1);
strncpy(result->factorName, factor_name, MAX_STRING_SIZE-1);
// Return if insufficient data
if (before_length < 5 || after_length < 5) {
strncpy(result->description, "Insufficient data for analysis", MAX_STRING_SIZE-1);
return result;
}
// Calculate means
double beforeSum = 0, afterSum = 0;
for (int i = 0; i < before_length; i++) {
beforeSum += before_data[i];
}
for (int i = 0; i < after_length; i++) {
afterSum += after_data[i];
}
result->beforeMean = beforeSum / before_length;
result->afterMean = afterSum / after_length;
// Calculate change magnitude
result->changeMagnitude = result->afterMean - result->beforeMean;
// Calculate significance with simple t-test
double beforeVar = 0, afterVar = 0;
for (int i = 0; i < before_length; i++) {
double diff = before_data[i] - result->beforeMean;
beforeVar += diff * diff;
}
for (int i = 0; i < after_length; i++) {
double diff = after_data[i] - result->afterMean;
afterVar += diff * diff;
}
beforeVar /= (before_length - 1);
afterVar /= (after_length - 1);
double se = sqrt(beforeVar/before_length + afterVar/after_length);
double tStat = fabs(result->changeMagnitude) / (se + 0.0001);
// Simple significance estimation (0-1)
result->changeSignificance = std::min(1.0, tStat / 5.0);
// Overall impact combines magnitude and significance
result->overallImpact = fabs(result->changeMagnitude) * result->changeSignificance;
// Estimate days to effect (placeholder implementation)
result->daysToEffect = 7; // Assumed 1 week
// Generate description
const char* direction = (result->changeMagnitude > 0) ? "increased" : "decreased";
const char* significance = (result->changeSignificance > 0.7) ? "significant" :
(result->changeSignificance > 0.3) ? "moderate" : "slight";
snprintf(result->description, MAX_STRING_SIZE,
"%s shows a %s %s effect on %s (%.1f → %.1f)",
medication_name, significance, direction, factor_name,
result->beforeMean, result->afterMean);
return result;
}
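// Illustrative usage sketch (editorial addition): the analysis is
// heap-allocated and caller-owned, so every call is paired with
// free_medication_impact_analysis(). The medication and factor names are
// arbitrary example values.
static double example_medication_effect(const double* before, int beforeLength,
                                        const double* after, int afterLength) {
    MedicationImpactAnalysis* impact = analyze_medication_impact(
        before, beforeLength, after, afterLength, "Example Med", "Mood");
    double overall = impact->overallImpact;
    free_medication_impact_analysis(impact);
    return overall;
}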
/**
* @brief Free memory for medication impact analysis
*
* @param analysis Pointer to medication impact analysis
*/
void free_medication_impact_analysis(MedicationImpactAnalysis* analysis) {
delete analysis;
}
/**
* @brief Analyze hormone impact on health metrics
*
* @param hormone_levels Array of hormone levels
* @param data_length Length of hormone data
* @param factor_data Array of factor data arrays
* @param factor_names Array of factor names
* @param factor_count Number of factors
* @param hormone_name Name of the hormone
* @param min_optimal_level Lower bound of optimal range
* @param max_optimal_level Upper bound of optimal range
* @return HormoneImpactAnalysis* Pointer to impact analysis result
*/
HormoneImpactAnalysis* analyze_hormone_impact(
const double* hormone_levels,
int data_length,
const double** factor_data,
const char** factor_names,
int factor_count,
const char* hormone_name,
double min_optimal_level,
double max_optimal_level) {
HormoneImpactAnalysis* result = new HormoneImpactAnalysis();
memset(result, 0, sizeof(HormoneImpactAnalysis));
// Copy hormone name
strncpy(result->hormoneName, hormone_name, MAX_STRING_SIZE-1);
// Return if insufficient data
if (data_length < 3 || factor_count <= 0) {
strncpy(result->description, "Insufficient data for analysis", MAX_STRING_SIZE-1);
return result;
}
// Calculate current hormone level (average of recent readings)
double sum = 0;
for (int i = 0; i < data_length; i++) {
sum += hormone_levels[i];
}
result->currentLevel = sum / data_length;
// Set optimal levels
result->optimalRangeLower = min_optimal_level;
result->optimalRangeUpper = max_optimal_level;
result->optimalLevel = (min_optimal_level + max_optimal_level) / 2;
// Calculate deviation from optimal range
if (result->currentLevel < min_optimal_level) {
result->deviation = (result->currentLevel - min_optimal_level) / min_optimal_level;
} else if (result->currentLevel > max_optimal_level) {
result->deviation = (result->currentLevel - max_optimal_level) / max_optimal_level;
} else {
result->deviation = 0; // Within optimal range
}
// Calculate correlations with factors, reusing the engine's shared
// correlation helper instead of re-deriving the same formula inline
for (int i = 0; i < factor_count && i < MAX_FACTORS; i++) {
    double correlation = calculate_correlation(hormone_levels, factor_data[i], data_length);
// Store impact and factor name
result->impactOnOtherFactors[i] = correlation;
strncpy(result->factorNames[i], factor_names[i], MAX_STRING_SIZE-1);
// Set impact on mood and energy if found
if (strcmp(factor_names[i], "Mood") == 0) {
result->impactOnMood = correlation;
} else if (strcmp(factor_names[i], "Energy") == 0) {
result->impactOnEnergy = correlation;
}
}
// Generate description
const char* status;
if (fabs(result->deviation) < 0.1) {
status = "within optimal range";
} else if (result->deviation < 0) {
status = "below optimal range";
} else {
status = "above optimal range";
}
snprintf(result->description, MAX_STRING_SIZE,
"%s level is %s (%.1f, range: %.1f-%.1f)",
hormone_name, status, result->currentLevel,
min_optimal_level, max_optimal_level);
return result;
}
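// Illustrative usage sketch (editorial addition): like the medication
// analysis, the hormone analysis is heap-allocated and caller-owned. The
// hormone name and optimal range below are arbitrary example values, not
// clinical reference ranges.
static double example_hormone_deviation(const double* levels, int dataLength,
                                        const double** factors, const char** names,
                                        int factorCount) {
    HormoneImpactAnalysis* analysis = analyze_hormone_impact(
        levels, dataLength, factors, names, factorCount, "Estradiol", 40.0, 200.0);
    double deviation = analysis->deviation;
    free_hormone_impact_analysis(analysis);
    return deviation;
}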
/**
* @brief Free memory for hormone impact analysis
*
* @param analysis Pointer to hormone impact analysis
*/
void free_hormone_impact_analysis(HormoneImpactAnalysis* analysis) {
delete analysis;
}


@@ -0,0 +1,272 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// time_series.cpp
// Implementation of time series analysis functions
//
#include "health_analytics_engine.h"
#include "utils.h"
/**
* @brief Detect trends in time series data
*
* @param values Time series data
* @param length Number of elements in the array
* @param strength Output parameter for trend strength
* @return TrendType Enum indicating trend direction and type
*/
TrendType detect_trend(const double* values, int length, double* strength) {
if (length < 3) {
*strength = 0;
return TREND_NONE;
}
// Generate time vector (0, 1, 2, ...)
std::vector<double> time(length);
for (int i = 0; i < length; i++) {
time[i] = i;
}
// Calculate linear regression
double slope, intercept, r_squared;
if (!calculateLinearRegression(time.data(), values, length, slope, intercept, r_squared)) {
*strength = 0;
return TREND_NONE;
}
// Detrend the data for further analysis
std::vector<double> detrended(length);
for (int i = 0; i < length; i++) {
detrended[i] = values[i] - (intercept + slope * i);
}
// Check for cyclical patterns using autocorrelation
bool has_cycle = false;
int cycle_length = 0;
double max_autocorr = 0;
// Check autocorrelation for various lags
const int MIN_LAG = 2;
const int MAX_LAG = length / 3; // Look for cycles up to 1/3 of series length
for (int lag = MIN_LAG; lag < MAX_LAG; lag++) {
double autocorr = calculateAutocorrelation(detrended.data(), length, lag);
// If strong positive autocorrelation found
if (autocorr > 0.3 && autocorr > max_autocorr) {
max_autocorr = autocorr;
cycle_length = lag;
has_cycle = true;
}
}
// Check if cycle pattern is stronger than linear trend
if (has_cycle && max_autocorr > std::abs(r_squared)) {
*strength = max_autocorr;
return TREND_CYCLIC;
}
// Determine trend direction based on slope and strength
*strength = std::abs(r_squared);
// Require minimum strength to declare a trend
if (*strength < 0.2) {
return TREND_NONE;
} else if (slope > 0) {
return TREND_INCREASING;
} else {
return TREND_DECREASING;
}
}
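// Illustrative sketch (editorial addition): strength is reported through the
// out parameter, so callers pair the enum with the magnitude. A strictly
// increasing ramp like this one yields TREND_INCREASING with strength
// close to 1.0.
static TrendType example_ramp_trend() {
    const double ramp[] = {1, 2, 3, 4, 5, 6, 7, 8};
    double strength = 0;
    return detect_trend(ramp, 8, &strength); // TREND_INCREASING, strength ~= 1.0
}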
/**
* @brief Predict future values of a time series using ARIMA-like approach
*
* @param timeSeries Time series data
* @param dataLength Length of time series
* @param stepsAhead Number of future steps to predict
* @param factorName Name of the factor being predicted
* @return TimeSeriesForecast Structure containing predictions and confidence intervals
*/
TimeSeriesForecast predict_time_series(const double* timeSeries,
int dataLength,
int stepsAhead,
const char* factorName) {
TimeSeriesForecast forecast;
memset(&forecast, 0, sizeof(TimeSeriesForecast));
if (dataLength < 5 || stepsAhead <= 0) {
forecast.overallConfidence = 0;
return forecast;
}
// Copy factor name
strncpy(forecast.factorName, factorName, MAX_STRING_SIZE - 1);
forecast.factorName[MAX_STRING_SIZE - 1] = '\0';
// First, check for seasonality
int potentialSeasonality = 0;
double maxAutocorr = 0;
// Look for seasonality in range 2 to dataLength/3
for (int lag = 2; lag <= dataLength/3; lag++) {
double acf = calculateAutocorrelation(timeSeries, dataLength, lag);
if (acf > 0.3 && acf > maxAutocorr) {
maxAutocorr = acf;
potentialSeasonality = lag;
}
}
// Set seasonality period if detected
forecast.seasonalityPeriod = potentialSeasonality;
// Decompose time series if seasonality detected
std::vector<double> trend(dataLength);
std::vector<double> seasonal(dataLength);
std::vector<double> residual(dataLength);
bool hasSeasonality = potentialSeasonality > 0 && maxAutocorr > 0.3;
if (hasSeasonality) {
// Decompose the time series
decomposeTimeSeries(timeSeries, dataLength, potentialSeasonality,
trend.data(), seasonal.data(), residual.data());
} else {
    // No seasonality; use a simple moving average for the trend
    // (clamp the window to at least 2, matching detect_anomalies)
    int windowSize = std::max(2, std::min(7, dataLength/3));
    calculateMovingAverage(timeSeries, dataLength, windowSize, trend.data());
// No seasonal component
for (int i = 0; i < dataLength; i++) {
seasonal[i] = 0;
residual[i] = timeSeries[i] - trend[i];
}
}
// Fit AR model to residuals for short-term dynamics
// Determine optimal AR order using PACF
int maxLag = std::min(10, dataLength/5);
std::vector<double> pacf(maxLag + 1);
calculatePACF(residual.data(), dataLength, maxLag, pacf.data());
// Find significant AR terms (PACF > 0.2)
std::vector<int> significantLags;
for (int i = 1; i <= maxLag; i++) {
if (std::abs(pacf[i]) > 0.2) {
significantLags.push_back(i);
}
}
  // Limit to the 3 most significant terms, then restore ascending order
  // so significantLags.back() is again the largest lag used below
  if (significantLags.size() > 3) {
    std::sort(significantLags.begin(), significantLags.end(),
              [&pacf](int a, int b) {
                return std::abs(pacf[a]) > std::abs(pacf[b]);
              });
    significantLags.resize(3);
    std::sort(significantLags.begin(), significantLags.end());
  }
// Fit AR coefficients using linear regression
int arOrder = significantLags.size();
std::vector<double> arCoefficients(arOrder, 0);
if (arOrder > 0) {
// Prepare training data for AR model
int trainingSize = dataLength - significantLags.back();
std::vector<std::vector<double>> X(trainingSize, std::vector<double>(arOrder));
std::vector<double> y(trainingSize);
for (int i = 0; i < trainingSize; i++) {
int t = i + significantLags.back();
y[i] = residual[t];
for (int j = 0; j < arOrder; j++) {
X[i][j] = residual[t - significantLags[j]];
}
}
// Very simplified AR coefficient estimation
// Real implementation would use matrix operations
for (int j = 0; j < arOrder; j++) {
double sumXY = 0, sumX2 = 0;
for (int i = 0; i < trainingSize; i++) {
sumXY += X[i][j] * y[i];
sumX2 += X[i][j] * X[i][j];
}
if (sumX2 > 0) {
arCoefficients[j] = sumXY / sumX2;
}
}
}
// Set time unit (days by default)
forecast.timeUnit = TIME_UNIT_DAYS;
// Generate forecasts
double trendGrowth = 0;
if (dataLength > 10) {
// Calculate average trend growth over last 10 points
trendGrowth = (trend[dataLength-1] - trend[dataLength-11]) / 10.0;
}
// Last observed values
std::vector<double> lastResiduals(dataLength);
for (int i = 0; i < dataLength; i++) {
lastResiduals[i] = residual[i];
}
  // Residual standard error, used below for the confidence intervals
  // (constant across the horizon, so compute it once outside the loop)
  double stdError = 0;
  for (int j = 0; j < dataLength; j++) {
    stdError += residual[j] * residual[j];
  }
  stdError = sqrt(stdError / dataLength);
  // Generate predictions
  for (int i = 0; i < stepsAhead && i < 30; i++) {
// Forecast trend component
double trendForecast = trend[dataLength-1] + trendGrowth * (i + 1);
// Forecast seasonal component (if any)
double seasonalForecast = 0;
if (hasSeasonality && potentialSeasonality > 0) {
seasonalForecast = seasonal[dataLength - potentialSeasonality + (i % potentialSeasonality)];
}
// Forecast residual component using AR model
double residualForecast = 0;
for (int j = 0; j < arOrder; j++) {
int lag = significantLags[j];
if (i >= lag) {
// Use previously forecasted residuals
residualForecast += arCoefficients[j] * lastResiduals[dataLength + i - lag];
} else {
// Use observed residuals
residualForecast += arCoefficients[j] * residual[dataLength - lag + i];
}
}
// Store forecasted residual
lastResiduals.push_back(residualForecast);
// Combine components for final forecast
forecast.predictions[i] = trendForecast + seasonalForecast + residualForecast;
    // Confidence intervals, based on the residual standard error above
// Wider intervals for longer forecasts
double multiplier = 1.96 * sqrt(1.0 + 0.25 * i); // Roughly 95% CI with growing uncertainty
forecast.confidenceIntervals[i][0] = forecast.predictions[i] - multiplier * stdError;
forecast.confidenceIntervals[i][1] = forecast.predictions[i] + multiplier * stdError;
}
// Set overall confidence based on model quality and forecast distance
double modelAccuracy = 0.8; // Would be calculated from validation in real model
if (hasSeasonality) modelAccuracy += 0.1;
if (arOrder > 0) modelAccuracy += 0.1 * std::min(arOrder, 2);
forecast.overallConfidence = modelAccuracy * exp(-0.05 * stepsAhead);
if (forecast.overallConfidence > 0.95) forecast.overallConfidence = 0.95;
if (forecast.overallConfidence < 0.2) forecast.overallConfidence = 0.2;
return forecast;
}
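// Illustrative usage sketch (not part of the library API): requesting a
// 7-step forecast. Field accesses mirror the assignments above; the input
// values and factor name are hypothetical.
static void example_forecast_usage() {
    double hr[14] = {62, 61, 63, 62, 64, 63, 62, 61, 63, 64, 62, 63, 61, 62};
    TimeSeriesForecast f = predict_time_series(hr, 14, 7, "resting_heart_rate");
    double point = f.predictions[0];          // first-step point forecast
    double lo = f.confidenceIntervals[0][0];  // ~95% lower bound
    double hi = f.confidenceIntervals[0][1];  // ~95% upper bound
    // f.overallConfidence lies in [0.2, 0.95] and decays with the horizon
    (void)point; (void)lo; (void)hi;
}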

863
native/statistics/utils.cpp Normal file
View file

@ -0,0 +1,863 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// utils.cpp
// Core utility functions used across the health analytics library
//
#include "health_analytics_engine.h"
#include "utils.h"
#include <random>
#include <limits>
#include <numeric>
#include <algorithm>  // std::sort, std::min, std::max
#include <cmath>      // std::sqrt, std::abs
/**
* @brief Calculate the mean (average) of a data series
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @return double The arithmetic mean
*/
double calculateMean(const double* values, int length) {
if (length <= 0) return 0;
double sum = 0;
for (int i = 0; i < length; i++) {
sum += values[i];
}
return sum / length;
}
/**
* @brief Calculate the weighted mean of a data series
*
* @param values Pointer to array of values
* @param weights Pointer to array of weights for each value
* @param length Number of elements in the arrays
* @return double The weighted arithmetic mean
*/
double calculateWeightedMean(const double* values, const double* weights, int length) {
if (length <= 0) return 0;
double sum = 0;
double weightSum = 0;
for (int i = 0; i < length; i++) {
sum += values[i] * weights[i];
weightSum += weights[i];
}
return weightSum > 0 ? sum / weightSum : 0;
}
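// Illustrative sketch (not part of the library API): a recency-weighted
// average of four readings. The values and weights are hypothetical and
// the weights need not sum to 1.
static void example_weighted_mean_usage() {
    double readings[] = {7.2, 7.0, 6.8, 7.5};
    double weights[]  = {1.0, 2.0, 3.0, 4.0}; // newest reading weighted most
    double wm = calculateWeightedMean(readings, weights, 4);
    // wm = (7.2*1 + 7.0*2 + 6.8*3 + 7.5*4) / 10 = 71.6 / 10 = 7.16
    (void)wm;
}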
/**
* @brief Calculate the variance of a data series
 * Uses a two-pass algorithm for numerical stability
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param mean Pre-calculated mean (if available, otherwise pass 0)
* @return double The variance (population or sample based on implementation)
*/
double calculateVariance(const double* values, int length, double mean = 0) {
if (length <= 1) return 0;
// Use pre-calculated mean if provided, otherwise calculate it
if (mean == 0) {
mean = calculateMean(values, length);
}
// Use two-pass algorithm for better numerical stability
double sumSquaredDiff = 0;
for (int i = 0; i < length; i++) {
double diff = values[i] - mean;
sumSquaredDiff += diff * diff;
}
// Return sample variance (n-1 denominator for unbiased estimation)
return sumSquaredDiff / (length - 1);
}
/**
* @brief Calculate the standard deviation of a data series
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param mean Pre-calculated mean (if available, otherwise pass 0)
* @return double The standard deviation
*/
double calculateStdDev(const double* values, int length, double mean = 0) {
return std::sqrt(calculateVariance(values, length, mean));
}
/**
* @brief Calculate the median of a data series
*
 * @param values Pointer to array of values (not modified; sorting is done on a copy)
* @param length Number of elements in the array
* @return double The median value
*/
double calculateMedian(double* values, int length) {
if (length == 0) return 0;
if (length == 1) return values[0];
// Create a copy and sort it
std::vector<double> sorted(values, values + length);
std::sort(sorted.begin(), sorted.end());
if (length % 2 == 0) {
// Even number of elements
return (sorted[length/2 - 1] + sorted[length/2]) / 2.0;
} else {
// Odd number of elements
return sorted[length/2];
}
}
/**
* @brief Calculate the Pearson correlation coefficient between two data series
*
* @param x First data series
* @param y Second data series (must be same length as x)
* @param length Number of elements in both arrays
* @return double Correlation coefficient (-1 to 1)
*/
double calculateCorrelation(const double* x, const double* y, int length) {
if (length <= 1) return 0;
double sum_x = 0, sum_y = 0, sum_xy = 0;
double sum_x2 = 0, sum_y2 = 0;
for (int i = 0; i < length; i++) {
sum_x += x[i];
sum_y += y[i];
sum_xy += x[i] * y[i];
sum_x2 += x[i] * x[i];
sum_y2 += y[i] * y[i];
}
double denominator = std::sqrt((length * sum_x2 - sum_x * sum_x) *
(length * sum_y2 - sum_y * sum_y));
if (denominator < 1e-10) return 0; // Avoid division by zero
return (length * sum_xy - sum_x * sum_y) / denominator;
}
/**
* @brief Calculate Spearman's rank correlation coefficient
* More robust to outliers than Pearson correlation
*
* @param x First data series
* @param y Second data series (must be same length as x)
* @param length Number of elements in both arrays
* @return double Spearman's rank correlation coefficient (-1 to 1)
*/
double calculateSpearmanCorrelation(const double* x, const double* y, int length) {
if (length <= 1) return 0;
// Create vectors with indices to perform ranking
std::vector<std::pair<double, int>> x_indexed(length);
std::vector<std::pair<double, int>> y_indexed(length);
for (int i = 0; i < length; i++) {
x_indexed[i] = std::make_pair(x[i], i);
y_indexed[i] = std::make_pair(y[i], i);
}
// Sort by values to determine ranks
std::sort(x_indexed.begin(), x_indexed.end());
std::sort(y_indexed.begin(), y_indexed.end());
// Assign ranks (handling ties with average rank)
std::vector<double> x_ranks(length), y_ranks(length);
for (int i = 0; i < length; i++) {
int j = i;
while (j < length - 1 && x_indexed[j].first == x_indexed[j + 1].first) j++;
double rank = 1.0 * (i + j) / 2 + 1;
for (int k = i; k <= j; k++) {
x_ranks[x_indexed[k].second] = rank;
}
i = j;
}
for (int i = 0; i < length; i++) {
int j = i;
while (j < length - 1 && y_indexed[j].first == y_indexed[j + 1].first) j++;
double rank = 1.0 * (i + j) / 2 + 1;
for (int k = i; k <= j; k++) {
y_ranks[y_indexed[k].second] = rank;
}
i = j;
}
    // Calculate Pearson correlation on the ranks
    return calculateCorrelation(x_ranks.data(), y_ranks.data(), length);
}
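// Illustrative sketch (not part of the library API): one extreme point
// depresses Pearson correlation far more than Spearman, which only sees
// ranks. The values are hypothetical.
static void example_rank_vs_pearson() {
    double x[] = {1, 2, 3, 4, 5};
    double y[] = {2, 4, 6, 8, 100}; // strictly monotone, extreme last value
    double pearson  = calculateCorrelation(x, y, 5);          // roughly 0.74
    double spearman = calculateSpearmanCorrelation(x, y, 5);  // exactly 1.0
    (void)pearson; (void)spearman;
}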
/**
* @brief Calculate a specific quantile value of a data series
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param q Quantile to calculate (0-1, e.g., 0.25 for first quartile)
* @return double The value at the specified quantile
*/
double calculateQuantile(const double* values, int length, double q) {
if (length == 0) return 0;
if (length == 1) return values[0];
if (q < 0) q = 0;
if (q > 1) q = 1;
std::vector<double> sorted(values, values + length);
std::sort(sorted.begin(), sorted.end());
// Linear interpolation between closest ranks
double pos = (length - 1) * q;
int idx_lower = static_cast<int>(pos);
double frac = pos - idx_lower;
if (idx_lower + 1 < length) {
return sorted[idx_lower] * (1 - frac) + sorted[idx_lower + 1] * frac;
} else {
return sorted[idx_lower];
}
}
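// Illustrative sketch (not part of the library API): the interpolation
// step worked through for q = 0.25 on four sorted values.
static void example_quantile_usage() {
    double vals[] = {10, 20, 30, 40};
    double q1 = calculateQuantile(vals, 4, 0.25);
    // pos = (4 - 1) * 0.25 = 0.75, so idx_lower = 0 and frac = 0.75:
    // q1 = 10 * 0.25 + 20 * 0.75 = 17.5
    (void)q1;
}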
/**
* @brief Calculate the interquartile range (IQR) of a data series
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @return double The IQR (Q3-Q1)
*/
double calculateIQR(const double* values, int length) {
if (length < 4) return 0;
double q1 = calculateQuantile(values, length, 0.25);
double q3 = calculateQuantile(values, length, 0.75);
return q3 - q1;
}
/**
* @brief Calculate the skewness of a data distribution
* Measures the asymmetry of the probability distribution
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param mean Pre-calculated mean (if available, otherwise pass 0)
* @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
* @return double The skewness value (0 for normal distribution)
*/
double calculateSkewness(const double* values, int length, double mean = 0, double stdDev = 0) {
if (length <= 2) return 0;
// Calculate mean and stdDev if not provided
if (mean == 0) {
mean = calculateMean(values, length);
}
if (stdDev == 0) {
stdDev = calculateStdDev(values, length, mean);
}
if (stdDev < 1e-10) return 0; // Avoid division by zero
// Calculate third moment (cube of differences)
double sum = 0;
for (int i = 0; i < length; i++) {
double diff = values[i] - mean;
sum += diff * diff * diff;
}
// Return Fisher-Pearson coefficient of skewness
// Includes adjustment for sample bias
double n = length;
double adjustment = std::sqrt(n * (n - 1)) / (n - 2);
return adjustment * sum / (length * stdDev * stdDev * stdDev);
}
/**
* @brief Calculate the kurtosis of a data distribution
* Measures the "tailedness" of the probability distribution
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param mean Pre-calculated mean (if available, otherwise pass 0)
* @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
* @return double The excess kurtosis (0 for normal distribution)
*/
double calculateKurtosis(const double* values, int length, double mean = 0, double stdDev = 0) {
if (length <= 3) return 0;
// Calculate mean and stdDev if not provided
if (mean == 0) {
mean = calculateMean(values, length);
}
if (stdDev == 0) {
stdDev = calculateStdDev(values, length, mean);
}
if (stdDev < 1e-10) return 0; // Avoid division by zero
// Calculate fourth moment
double sum = 0;
for (int i = 0; i < length; i++) {
double diff = values[i] - mean;
sum += diff * diff * diff * diff;
}
// Return excess kurtosis with sample adjustment
double n = length;
double adjustment = ((n + 1) * n) / ((n - 1) * (n - 2) * (n - 3));
double second_term = 3 * (n - 1) * (n - 1) / ((n - 2) * (n - 3));
return adjustment * sum / (stdDev * stdDev * stdDev * stdDev) - second_term;
}
/**
* @brief Perform linear regression on two data series
*
* @param x Independent variable values
* @param y Dependent variable values (must be same length as x)
* @param length Number of elements in both arrays
* @param slope Output parameter for slope
* @param intercept Output parameter for y-intercept
* @param r_squared Output parameter for R² coefficient of determination
* @return bool True if successful, false if error occurred
*/
bool calculateLinearRegression(const double* x, const double* y, int length,
double& slope, double& intercept, double& r_squared) {
if (length < 2) return false;
double sum_x = 0, sum_y = 0, sum_xy = 0, sum_x2 = 0, sum_y2 = 0;
for (int i = 0; i < length; i++) {
sum_x += x[i];
sum_y += y[i];
sum_xy += x[i] * y[i];
sum_x2 += x[i] * x[i];
sum_y2 += y[i] * y[i];
}
double n = static_cast<double>(length);
double denominator = n * sum_x2 - sum_x * sum_x;
if (std::abs(denominator) < 1e-10) return false; // Vertical line, undefined slope
// Calculate slope and intercept
slope = (n * sum_xy - sum_x * sum_y) / denominator;
intercept = (sum_y - slope * sum_x) / n;
// Calculate R² coefficient of determination
double mean_y = sum_y / n;
double ss_total = 0, ss_residual = 0;
for (int i = 0; i < length; i++) {
double predicted = intercept + slope * x[i];
ss_total += (y[i] - mean_y) * (y[i] - mean_y);
ss_residual += (y[i] - predicted) * (y[i] - predicted);
}
if (ss_total < 1e-10) {
r_squared = 1.0; // All points are on the same horizontal line
} else {
r_squared = 1.0 - (ss_residual / ss_total);
}
return true;
}
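// Illustrative sketch (not part of the library API): fitting perfectly
// linear data, where the outputs can be verified by eye.
static void example_regression_usage() {
    double x[] = {0, 1, 2, 3};
    double y[] = {1, 3, 5, 7}; // exactly y = 2x + 1
    double slope = 0, intercept = 0, r2 = 0;
    if (calculateLinearRegression(x, y, 4, slope, intercept, r2)) {
        // slope == 2, intercept == 1, r2 == 1 for this data
    }
}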
/**
* @brief Calculate the autocorrelation of a time series at specified lag
*
* @param values Time series data
* @param length Number of elements in the array
* @param lag The lag to calculate autocorrelation for
* @return double Autocorrelation coefficient at specified lag (-1 to 1)
*/
double calculateAutocorrelation(const double* values, int length, int lag) {
if (length <= lag || lag <= 0) return 0;
double mean = calculateMean(values, length);
double numerator = 0;
double denominator = 0;
for (int i = 0; i < length - lag; i++) {
numerator += (values[i] - mean) * (values[i + lag] - mean);
}
for (int i = 0; i < length; i++) {
denominator += (values[i] - mean) * (values[i] - mean);
}
if (denominator < 1e-10) return 0;
return numerator / denominator;
}
/**
* @brief Detect outliers in a data series using modified Z-score method
*
* @param values Pointer to array of values
* @param length Number of elements in the array
* @param outlierIndices Output vector to store indices of detected outliers
* @param threshold Z-score threshold to consider a point an outlier (typically 3.5)
* @return int Number of outliers detected
*/
int detectOutliers(const double* values, int length, std::vector<int>& outlierIndices, double threshold = 3.5) {
if (length < 3) return 0;
outlierIndices.clear();
// Use median and MAD instead of mean and std dev for robustness
std::vector<double> sorted(values, values + length);
std::sort(sorted.begin(), sorted.end());
double median = (length % 2 == 0) ?
(sorted[length/2 - 1] + sorted[length/2]) / 2.0 : sorted[length/2];
// Calculate MAD (Median Absolute Deviation)
std::vector<double> deviations(length);
for (int i = 0; i < length; i++) {
deviations[i] = std::abs(values[i] - median);
}
std::sort(deviations.begin(), deviations.end());
double mad = (length % 2 == 0) ?
(deviations[length/2 - 1] + deviations[length/2]) / 2.0 : deviations[length/2];
// Constant factor for normal distribution
const double k = 1.4826;
// Find outliers using modified Z-score
for (int i = 0; i < length; i++) {
if (mad < 1e-10) { // If MAD is too small, use simple difference
if (std::abs(values[i] - median) > threshold) {
outlierIndices.push_back(i);
}
} else {
double modified_z = k * std::abs(values[i] - median) / mad;
if (modified_z > threshold) {
outlierIndices.push_back(i);
}
}
}
return outlierIndices.size();
}
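// Illustrative sketch (not part of the library API): the MAD-based test
// flags the spike at index 4 (median 5.1, MAD 0.1, so its modified
// z-score is about 102) while leaving ordinary variation alone.
static void example_detect_outliers_usage() {
    double vals[] = {5.1, 5.0, 4.9, 5.2, 12.0, 5.1, 5.0};
    std::vector<int> outliers;
    int n = detectOutliers(vals, 7, outliers, 3.5);
    // n == 1 and outliers == {4} for this series
    (void)n;
}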
/**
* @brief Perform simple moving average on a time series
*
* @param values Time series data
* @param length Number of elements in the array
* @param window The window size for the moving average
* @param result Pre-allocated array to store results (size = length)
*/
void calculateMovingAverage(const double* values, int length, int window, double* result) {
if (length <= 0 || window <= 0) return;
// Adjust window if it's larger than the data length
window = std::min(window, length);
for (int i = 0; i < length; i++) {
int start = std::max(0, i - window + 1);
int end = i + 1;
int count = end - start;
double sum = 0;
for (int j = start; j < end; j++) {
sum += values[j];
}
result[i] = sum / count;
}
}
/**
* @brief Calculate exponential moving average (EMA) of a time series
*
* @param values Time series data
* @param length Number of elements in the array
* @param alpha Smoothing factor (0-1)
* @param result Pre-allocated array to store results (size = length)
*/
void calculateExponentialMovingAverage(const double* values, int length, double alpha, double* result) {
if (length <= 0 || alpha < 0 || alpha > 1) return;
// Initialize with first value
result[0] = values[0];
// Apply EMA formula: EMA_t = α × value_t + (1 - α) × EMA_{t-1}
for (int i = 1; i < length; i++) {
result[i] = alpha * values[i] + (1 - alpha) * result[i - 1];
}
}
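// Illustrative sketch (not part of the library API): choosing alpha via
// the common 2/(N+1) mapping from an N-day window to an EMA smoothing
// factor. The window choice here is hypothetical.
static void example_ema_usage() {
    double vals[] = {10, 11, 12, 11, 10, 12, 13};
    double smoothed[7];
    const int windowDays = 5;
    double alpha = 2.0 / (windowDays + 1); // ~0.33
    calculateExponentialMovingAverage(vals, 7, alpha, smoothed);
    // smoothed[0] == vals[0]; later entries weight recent values more heavily
}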
/**
* @brief Decompose a time series into trend, seasonal, and residual components
 * Classical additive decomposition: moving-average trend plus averaged
 * seasonal indices (a simplified alternative to full STL/Loess)
*
* @param values Time series data
* @param length Number of elements in the array
* @param seasonality Length of seasonal cycle (e.g., 7 for weekly, 12 for monthly)
* @param trend Output array for trend component (size = length)
* @param seasonal Output array for seasonal component (size = length)
* @param residual Output array for residual component (size = length)
* @return bool True if successful, false if error occurred
*/
bool decomposeTimeSeries(const double* values, int length, int seasonality,
double* trend, double* seasonal, double* residual) {
if (length <= 2 * seasonality || seasonality <= 1) return false;
// Calculate trend with centered moving average
for (int i = 0; i < length; i++) {
trend[i] = 0;
}
int halfSeason = seasonality / 2;
  // Centered moving average for trend; for even seasonality the two
  // endpoint values get half weight (classical 2xMA), so the total
  // weight always equals seasonality
  for (int i = halfSeason; i < length - halfSeason; i++) {
    double sum = 0;
    if (seasonality % 2 == 0) {
      sum += 0.5 * values[i - halfSeason] + 0.5 * values[i + halfSeason];
      for (int j = i - halfSeason + 1; j <= i + halfSeason - 1; j++) {
        sum += values[j];
      }
    } else {
      for (int j = i - halfSeason; j <= i + halfSeason; j++) {
        sum += values[j];
      }
    }
    trend[i] = sum / seasonality;
  }
  // Extrapolate trend at the boundaries using the slope of the nearest
  // computed points; the offset is clamped so all indices stay in range
  // (a fixed offset of 5 could read past the computed trend for short series)
  int off = std::min(5, length - 2 * halfSeason - 1);
  // Left boundary
  double slope = (trend[halfSeason + off] - trend[halfSeason]) / off;
  for (int i = 0; i < halfSeason; i++) {
    trend[i] = trend[halfSeason] - (halfSeason - i) * slope;
  }
  // Right boundary
  slope = (trend[length - halfSeason - 1] - trend[length - halfSeason - 1 - off]) / off;
  for (int i = length - halfSeason; i < length; i++) {
    trend[i] = trend[length - halfSeason - 1] + (i - (length - halfSeason - 1)) * slope;
  }
// Calculate detrended series
std::vector<double> detrended(length);
for (int i = 0; i < length; i++) {
detrended[i] = values[i] - trend[i];
}
// Calculate seasonal component by averaging the detrended values across seasons
std::vector<double> seasonalAvg(seasonality, 0);
std::vector<int> seasonalCounts(seasonality, 0);
for (int i = 0; i < length; i++) {
int seasonalIndex = i % seasonality;
seasonalAvg[seasonalIndex] += detrended[i];
seasonalCounts[seasonalIndex]++;
}
for (int i = 0; i < seasonality; i++) {
if (seasonalCounts[i] > 0) {
seasonalAvg[i] /= seasonalCounts[i];
}
}
// Normalize seasonal component to sum to zero
double avgSeasonal = 0;
for (int i = 0; i < seasonality; i++) {
avgSeasonal += seasonalAvg[i];
}
avgSeasonal /= seasonality;
for (int i = 0; i < seasonality; i++) {
seasonalAvg[i] -= avgSeasonal;
}
// Apply seasonal component to entire series
for (int i = 0; i < length; i++) {
seasonal[i] = seasonalAvg[i % seasonality];
}
// Calculate residual component
for (int i = 0; i < length; i++) {
residual[i] = values[i] - trend[i] - seasonal[i];
}
return true;
}
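// Illustrative sketch (not part of the library API): decomposing a daily
// series with a weekly bump. Requires length > 2 * seasonality, so 15+
// points for a period of 7; the synthetic series here is hypothetical.
static void example_decompose_usage() {
    double series[21];
    for (int i = 0; i < 21; i++) {
        series[i] = 0.1 * i + ((i % 7 == 0) ? 2.0 : 0.0); // trend + weekly bump
    }
    std::vector<double> trend(21), seasonal(21), residual(21);
    if (decomposeTimeSeries(series, 21, 7,
                            trend.data(), seasonal.data(), residual.data())) {
        // values[i] == trend[i] + seasonal[i] + residual[i] exactly,
        // by construction of the residual component
    }
}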
/**
* @brief Calculate partial autocorrelation function for a time series
*
* @param values Time series data
* @param length Number of elements in the array
* @param maxLag Maximum lag to calculate
* @param pacf Pre-allocated array to store results (size = maxLag + 1)
* @return int Number of valid PACF values calculated
*/
int calculatePACF(const double* values, int length, int maxLag, double* pacf) {
if (length <= 1 || maxLag <= 0 || maxLag >= length) return 0;
// Allocate Yule-Walker matrices
std::vector<std::vector<double>> phi(maxLag + 1, std::vector<double>(maxLag + 1, 0));
// Calculate autocorrelations
std::vector<double> acf(maxLag + 1, 0);
acf[0] = 1.0; // ACF at lag 0 is always 1
for (int k = 1; k <= maxLag; k++) {
acf[k] = calculateAutocorrelation(values, length, k);
}
// Set PACF at lag 0 to 1
pacf[0] = 1.0;
// Calculate PACF using Levinson-Durbin recursion
for (int k = 1; k <= maxLag; k++) {
// Initialize for this order
double numerator = acf[k];
for (int j = 1; j < k; j++) {
numerator -= phi[k-1][j] * acf[k-j];
}
double denominator = 1.0;
for (int j = 1; j < k; j++) {
denominator -= phi[k-1][j] * acf[j];
}
if (std::abs(denominator) < 1e-10) {
// If denominator is close to zero, set PACF to 0
phi[k][k] = 0;
} else {
phi[k][k] = numerator / denominator;
}
// Update remaining coefficients
for (int j = 1; j < k; j++) {
phi[k][j] = phi[k-1][j] - phi[k][k] * phi[k-1][k-j];
}
// Store the PACF value
pacf[k] = phi[k][k];
}
return maxLag + 1;
}
/**
* @brief Perform k-means clustering on multivariate data
*
* @param data 2D array of data points [n_samples x n_features]
* @param nSamples Number of data points
* @param nFeatures Number of features per data point
* @param k Number of clusters
* @param maxIter Maximum number of iterations
* @param centroids Output array for cluster centroids [k x n_features]
* @param assignments Output array for cluster assignments [n_samples]
* @return int Number of iterations performed
*/
int kMeansClustering(const double** data, int nSamples, int nFeatures, int k,
int maxIter, double** centroids, int* assignments) {
if (nSamples < k || k <= 0 || nFeatures <= 0) return 0;
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> distrib(0, nSamples - 1);
  std::uniform_real_distribution<double> unif(0.0, 1.0);
// Initialize centroids using k-means++ initialization
std::vector<int> centroidIndices;
std::vector<double> minDistances(nSamples, std::numeric_limits<double>::max());
// Choose first centroid randomly
int firstCentroid = distrib(gen);
centroidIndices.push_back(firstCentroid);
// Choose remaining centroids
for (int c = 1; c < k; c++) {
// Update distances to nearest centroid
for (int i = 0; i < nSamples; i++) {
double dist = 0;
for (int j = 0; j < nFeatures; j++) {
double diff = data[i][j] - data[centroidIndices.back()][j];
dist += diff * diff;
}
minDistances[i] = std::min(minDistances[i], dist);
}
// Calculate sum of squared distances
double sumSquaredDist = 0;
for (int i = 0; i < nSamples; i++) {
sumSquaredDist += minDistances[i];
}
// Choose next centroid with probability proportional to D²
    double threshold = sumSquaredDist * unif(gen); // use the seeded engine, not rand()
double cumulativeProb = 0;
int nextCentroid = 0;
for (int i = 0; i < nSamples; i++) {
cumulativeProb += minDistances[i];
if (cumulativeProb >= threshold) {
nextCentroid = i;
break;
}
}
centroidIndices.push_back(nextCentroid);
}
// Copy initial centroids
for (int i = 0; i < k; i++) {
for (int j = 0; j < nFeatures; j++) {
centroids[i][j] = data[centroidIndices[i]][j];
}
}
  // Initialize assignments to an invalid cluster so the first pass always
  // records a change (the caller's array may arrive uninitialized)
  for (int i = 0; i < nSamples; i++) {
    assignments[i] = -1;
  }
  // Perform k-means iterations
  int iterations = 0;
  bool converged = false;
while (!converged && iterations < maxIter) {
// Assign points to nearest centroid
converged = true;
for (int i = 0; i < nSamples; i++) {
double minDist = std::numeric_limits<double>::max();
int bestCluster = 0;
for (int c = 0; c < k; c++) {
double dist = 0;
for (int j = 0; j < nFeatures; j++) {
double diff = data[i][j] - centroids[c][j];
dist += diff * diff;
}
if (dist < minDist) {
minDist = dist;
bestCluster = c;
}
}
if (assignments[i] != bestCluster) {
assignments[i] = bestCluster;
converged = false;
}
}
// Update centroids
std::vector<std::vector<double>> newCentroids(k, std::vector<double>(nFeatures, 0));
std::vector<int> clusterSizes(k, 0);
for (int i = 0; i < nSamples; i++) {
int cluster = assignments[i];
clusterSizes[cluster]++;
for (int j = 0; j < nFeatures; j++) {
newCentroids[cluster][j] += data[i][j];
}
}
for (int c = 0; c < k; c++) {
if (clusterSizes[c] > 0) {
for (int j = 0; j < nFeatures; j++) {
centroids[c][j] = newCentroids[c][j] / clusterSizes[c];
}
}
}
iterations++;
}
return iterations;
}
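// Illustrative usage sketch (not part of the library API): preparing the
// pointer-based buffers kMeansClustering expects. The data values are
// hypothetical: two well-separated groups.
static void example_kmeans_usage() {
    const int nSamples = 6, nFeatures = 2, k = 2;
    double raw[nSamples][nFeatures] = {
        {1.0, 1.0}, {1.2, 0.9}, {0.8, 1.1},   // group near (1, 1)
        {5.0, 5.0}, {5.2, 4.9}, {4.8, 5.1}    // group near (5, 5)
    };
    const double* data[nSamples];
    for (int i = 0; i < nSamples; i++) data[i] = raw[i];
    std::vector<std::vector<double>> store(k, std::vector<double>(nFeatures));
    double* centroids[k] = {store[0].data(), store[1].data()};
    int assignments[nSamples];
    int iters = kMeansClustering(data, nSamples, nFeatures, k, 100,
                                 centroids, assignments);
    // assignments now maps each sample to cluster 0 or 1;
    // centroids hold the two cluster means
    (void)iters;
}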
/**
* @brief Calculate the silhouette coefficient for clustering validation
*
* @param data 2D array of data points [n_samples x n_features]
* @param nSamples Number of data points
* @param nFeatures Number of features per data point
* @param assignments Cluster assignments for each point
* @param k Number of clusters
* @return double Average silhouette coefficient (-1 to 1)
*/
double calculateSilhouetteCoefficient(const double** data, int nSamples, int nFeatures,
const int* assignments, int k) {
if (nSamples <= k || k <= 1) return 0;
std::vector<double> silhouettes(nSamples);
// For each point
for (int i = 0; i < nSamples; i++) {
int cluster_i = assignments[i];
// Calculate a(i) - average distance to points in same cluster
double a_i = 0;
int count_same_cluster = 0;
for (int j = 0; j < nSamples; j++) {
if (j != i && assignments[j] == cluster_i) {
double dist = 0;
for (int f = 0; f < nFeatures; f++) {
double diff = data[i][f] - data[j][f];
dist += diff * diff;
}
dist = std::sqrt(dist);
a_i += dist;
count_same_cluster++;
}
}
if (count_same_cluster > 0) {
a_i /= count_same_cluster;
} else {
a_i = 0; // Singleton cluster
}
// Calculate b(i) - minimum average distance to points in different clusters
double b_i = std::numeric_limits<double>::max();
for (int c = 0; c < k; c++) {
if (c == cluster_i) continue;
double avg_dist = 0;
int count_diff_cluster = 0;
for (int j = 0; j < nSamples; j++) {
if (assignments[j] == c) {
double dist = 0;
for (int f = 0; f < nFeatures; f++) {
double diff = data[i][f] - data[j][f];
dist += diff * diff;
}
dist = std::sqrt(dist);
avg_dist += dist;
count_diff_cluster++;
}
}
if (count_diff_cluster > 0) {
avg_dist /= count_diff_cluster;
b_i = std::min(b_i, avg_dist);
}
}
    // Calculate silhouette (also guard max(a, b) == 0, which can occur
    // with duplicate points and would otherwise divide by zero)
    if (count_same_cluster > 0 && b_i < std::numeric_limits<double>::max() &&
        std::max(a_i, b_i) > 1e-10) {
      silhouettes[i] = (b_i - a_i) / std::max(a_i, b_i);
    } else {
      silhouettes[i] = 0; // Handle edge cases
    }
}
// Calculate average silhouette
double avg_silhouette = 0;
for (int i = 0; i < nSamples; i++) {
avg_silhouette += silhouettes[i];
}
return avg_silhouette / nSamples;
}
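// Illustrative sketch (not part of the library API): using the silhouette
// coefficient to choose k. Buffer setup follows the k-means sketch above;
// centroid storage must be sized for the largest k tried.
static int example_choose_k(const double** data, int nSamples, int nFeatures) {
    const int maxK = 5;
    std::vector<std::vector<double>> store(maxK, std::vector<double>(nFeatures));
    std::vector<double*> centroids(maxK);
    for (int c = 0; c < maxK; c++) centroids[c] = store[c].data();
    std::vector<int> assignments(nSamples);
    int bestK = 2;
    double bestScore = -1.0;
    for (int k = 2; k <= maxK && k < nSamples; k++) {
        kMeansClustering(data, nSamples, nFeatures, k, 100,
                         centroids.data(), assignments.data());
        double s = calculateSilhouetteCoefficient(data, nSamples, nFeatures,
                                                  assignments.data(), k);
        if (s > bestScore) { bestScore = s; bestK = k; }
    }
    return bestK; // k with the highest average silhouette
}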

32
native/statistics/utils.h Normal file
View file

@ -0,0 +1,32 @@
// utils.h
#ifndef UTILS_H
#define UTILS_H
#include "health_analytics_engine.h"
#include <random>
#include <limits>
#include <numeric>
#include <vector>  // std::vector appears in declarations below
// Declare all utility functions from utils.cpp
double calculateMean(const double* values, int length);
double calculateWeightedMean(const double* values, const double* weights, int length);
double calculateVariance(const double* values, int length, double mean);
double calculateStdDev(const double* values, int length, double mean);
double calculateMedian(double* values, int length);
double calculateCorrelation(const double* x, const double* y, int length);
double calculateSpearmanCorrelation(const double* x, const double* y, int length);
double calculateQuantile(const double* values, int length, double q);
double calculateIQR(const double* values, int length);
double calculateSkewness(const double* values, int length, double mean, double stdDev);
double calculateKurtosis(const double* values, int length, double mean, double stdDev);
bool calculateLinearRegression(const double* x, const double* y, int length, double& slope, double& intercept, double& r_squared);
double calculateAutocorrelation(const double* values, int length, int lag);
int detectOutliers(const double* values, int length, std::vector<int>& outlierIndices, double threshold);
void calculateMovingAverage(const double* values, int length, int window, double* result);
void calculateExponentialMovingAverage(const double* values, int length, double alpha, double* result);
bool decomposeTimeSeries(const double* values, int length, int seasonality, double* trend, double* seasonal, double* residual);
int calculatePACF(const double* values, int length, int maxLag, double* pacf);
int kMeansClustering(const double** data, int nSamples, int nFeatures, int k, int maxIter, double** centroids, int* assignments);
double calculateSilhouetteCoefficient(const double** data, int nSamples, int nFeatures, const int* assignments, int k);
#endif // UTILS_H