first commit - migrated from codeberg
Commit 5ead03e1f7
567 changed files with 102721 additions and 0 deletions
native/statistics/anomaly_detection.cpp (new file, 494 lines)
@@ -0,0 +1,494 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// anomaly_detection.cpp
|
||||
// Implementation of anomaly and outlier detection functions
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
/**
|
||||
* @brief Detect anomalies in time series data
|
||||
*
|
||||
* @param timeSeries Time series data
|
||||
* @param dataLength Length of time series
|
||||
* @param threshold Z-score threshold to consider a point an anomaly
|
||||
* @param dates Array of dates corresponding to time series points
|
||||
* @param factorName Name of the factor being analyzed
|
||||
* @return AnomalyResult* Array of detected anomalies
|
||||
*/
|
||||
AnomalyResult* detect_anomalies(const double* timeSeries,
|
||||
int dataLength,
|
||||
double threshold,
|
||||
const DateStruct* dates,
|
||||
const char* factorName) {
|
||||
if (dataLength < 5) {
|
||||
AnomalyResult* dummy = new AnomalyResult[1];
|
||||
memset(dummy, 0, sizeof(AnomalyResult));
|
||||
dummy[0].dataPointIndex = -1; // Mark as invalid
|
||||
return dummy;
|
||||
}
|
||||
|
||||
// Constants
|
||||
const int MAX_RESULTS = 100;
|
||||
|
||||
// Allocate space for results
|
||||
AnomalyResult* results = new AnomalyResult[MAX_RESULTS + 1];
|
||||
int resultCount = 0;
|
||||
|
||||
// Decompose time series if enough data
|
||||
std::vector<double> trend(dataLength);
|
||||
std::vector<double> seasonal(dataLength);
|
||||
std::vector<double> residual(dataLength);
|
||||
|
||||
bool hasDecomposition = false;
|
||||
int seasonality = 0;
|
||||
|
||||
// Try to detect seasonality for decomposition
|
||||
double maxAutocorr = 0;
|
||||
for (int lag = 2; lag <= dataLength/3; lag++) {
|
||||
double acf = calculateAutocorrelation(timeSeries, dataLength, lag);
|
||||
if (acf > 0.3 && acf > maxAutocorr) {
|
||||
maxAutocorr = acf;
|
||||
seasonality = lag;
|
||||
}
|
||||
}
|
||||
|
||||
if (seasonality >= 2) {
|
||||
hasDecomposition = decomposeTimeSeries(timeSeries, dataLength, seasonality,
|
||||
trend.data(), seasonal.data(), residual.data());
|
||||
}
|
||||
|
||||
if (!hasDecomposition) {
|
||||
// Simple moving average for trend if decomposition failed
|
||||
int windowSize = std::min(7, dataLength/3);
|
||||
if (windowSize < 2) windowSize = 2;
|
||||
|
||||
calculateMovingAverage(timeSeries, dataLength, windowSize, trend.data());
|
||||
|
||||
// Residuals = original - trend
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
seasonal[i] = 0; // No seasonal component
|
||||
residual[i] = timeSeries[i] - trend[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate residual statistics for outlier detection
|
||||
double mean = 0, sumSquared = 0;
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
mean += residual[i];
|
||||
}
|
||||
mean /= dataLength;
|
||||
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
double diff = residual[i] - mean;
|
||||
sumSquared += diff * diff;
|
||||
}
|
||||
double stdDev = sqrt(sumSquared / dataLength);
|
||||
|
||||
if (stdDev <= 0) {
|
||||
results[0].dataPointIndex = -1;
|
||||
return results;
|
||||
}
|
||||
|
||||
// Detect global outliers using z-score
|
||||
for (int i = 0; i < dataLength && resultCount < MAX_RESULTS; i++) {
|
||||
double zScore = (residual[i] - mean) / stdDev;
|
||||
|
||||
if (std::abs(zScore) > threshold) {
|
||||
AnomalyResult& anomaly = results[resultCount++];
|
||||
|
||||
anomaly.dataPointIndex = i;
|
||||
anomaly.anomalyScore = std::abs(zScore);
|
||||
anomaly.originalValue = timeSeries[i];
|
||||
anomaly.expectedValue = trend[i] + seasonal[i] + mean;
|
||||
|
||||
// Higher confidence for more extreme anomalies
|
||||
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(zScore) - threshold) / 5.0);
|
||||
|
||||
// Copy date if available
|
||||
if (dates != nullptr) {
|
||||
anomaly.date = dates[i];
|
||||
} else {
|
||||
memset(&anomaly.date, 0, sizeof(DateStruct));
|
||||
}
|
||||
|
||||
// Copy factor name
|
||||
if (factorName != nullptr) {
|
||||
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
|
||||
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
|
||||
} else {
|
||||
anomaly.factorName[0] = '\0';
|
||||
}
|
||||
|
||||
// Set anomaly type
|
||||
anomaly.anomalyType = ANOMALY_OUTLIER;
|
||||
|
||||
// Generate description
|
||||
if (zScore > 0) {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Unusually high value (%.2f standard deviations above expected)",
|
||||
zScore);
|
||||
} else {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Unusually low value (%.2f standard deviations below expected)",
|
||||
-zScore);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Detect context-based anomalies using local statistics
|
||||
const int LOCAL_WINDOW = std::min(7, dataLength/5);
|
||||
if (LOCAL_WINDOW >= 3) {
|
||||
for (int i = LOCAL_WINDOW; i < dataLength - LOCAL_WINDOW && resultCount < MAX_RESULTS; i++) {
|
||||
// Calculate local statistics
|
||||
double localSum = 0, localSumSquared = 0;
|
||||
for (int j = i - LOCAL_WINDOW; j <= i + LOCAL_WINDOW; j++) {
|
||||
if (j != i) { // Exclude the point itself
|
||||
localSum += timeSeries[j];
|
||||
localSumSquared += timeSeries[j] * timeSeries[j];
|
||||
}
|
||||
}
|
||||
|
||||
double localMean = localSum / (2 * LOCAL_WINDOW);
|
||||
double localVar = localSumSquared / (2 * LOCAL_WINDOW) - localMean * localMean;
|
||||
double localStdDev = sqrt(std::max(localVar, 1e-6)); // Prevent division by zero
|
||||
|
||||
// Calculate local z-score
|
||||
double localZScore = (timeSeries[i] - localMean) / localStdDev;
|
||||
|
||||
// Check if it's a local anomaly but not already a global anomaly
|
||||
if (std::abs(localZScore) > threshold * 1.2) {
|
||||
// Check if this point was already detected as a global anomaly
|
||||
bool alreadyDetected = false;
|
||||
for (int j = 0; j < resultCount; j++) {
|
||||
if (results[j].dataPointIndex == i) {
|
||||
alreadyDetected = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!alreadyDetected) {
|
||||
AnomalyResult& anomaly = results[resultCount++];
|
||||
|
||||
anomaly.dataPointIndex = i;
|
||||
anomaly.anomalyScore = std::abs(localZScore);
|
||||
anomaly.originalValue = timeSeries[i];
|
||||
anomaly.expectedValue = localMean;
|
||||
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(localZScore) - threshold) / 5.0);
|
||||
|
||||
// Copy date if available
|
||||
if (dates != nullptr) {
|
||||
anomaly.date = dates[i];
|
||||
} else {
|
||||
memset(&anomaly.date, 0, sizeof(DateStruct));
|
||||
}
|
||||
|
||||
// Copy factor name
|
||||
if (factorName != nullptr) {
|
||||
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
|
||||
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
|
||||
} else {
|
||||
anomaly.factorName[0] = '\0';
|
||||
}
|
||||
|
||||
// Set anomaly type
|
||||
anomaly.anomalyType = ANOMALY_CONTEXTUAL;
|
||||
|
||||
// Generate description
|
||||
if (localZScore > 0) {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Context anomaly: value is high compared to local neighborhood (%.2f local std dev)",
|
||||
localZScore);
|
||||
} else {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Context anomaly: value is low compared to local neighborhood (%.2f local std dev)",
|
||||
-localZScore);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Detect trend changes
|
||||
if (dataLength >= 10) {
|
||||
for (int i = 5; i < dataLength - 5 && resultCount < MAX_RESULTS; i++) {
|
||||
// Calculate slope before and after
|
||||
double slopeBefore = 0, slopeAfter = 0;
|
||||
double interceptBefore = 0, interceptAfter = 0;
|
||||
double r2Before = 0, r2After = 0;
|
||||
|
||||
// Create time vectors
|
||||
std::vector<double> time1(5), time2(5);
|
||||
std::vector<double> values1(5), values2(5);
|
||||
|
||||
for (int j = 0; j < 5; j++) {
|
||||
time1[j] = j;
|
||||
time2[j] = j;
|
||||
values1[j] = timeSeries[i - 5 + j];
|
||||
values2[j] = timeSeries[i + j];
|
||||
}
|
||||
|
||||
bool validBefore = calculateLinearRegression(time1.data(), values1.data(), 5,
|
||||
slopeBefore, interceptBefore, r2Before);
|
||||
bool validAfter = calculateLinearRegression(time2.data(), values2.data(), 5,
|
||||
slopeAfter, interceptAfter, r2After);
|
||||
|
||||
if (validBefore && validAfter) {
|
||||
// Check for significant change in slope
|
||||
double slopeChange = slopeAfter - slopeBefore;
|
||||
double meanSlope = (std::abs(slopeBefore) + std::abs(slopeAfter)) / 2;
|
||||
|
||||
if (meanSlope > 0 && std::abs(slopeChange) / meanSlope > 0.5) {
|
||||
AnomalyResult& anomaly = results[resultCount++];
|
||||
|
||||
anomaly.dataPointIndex = i;
|
||||
anomaly.anomalyScore = std::abs(slopeChange) / (meanSlope + 1e-6);
|
||||
anomaly.originalValue = timeSeries[i];
|
||||
anomaly.expectedValue = timeSeries[i]; // Same value, change is in the trend
|
||||
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, anomaly.anomalyScore / 2.0);
|
||||
|
||||
// Copy date if available
|
||||
if (dates != nullptr) {
|
||||
anomaly.date = dates[i];
|
||||
} else {
|
||||
memset(&anomaly.date, 0, sizeof(DateStruct));
|
||||
}
|
||||
|
||||
// Copy factor name
|
||||
if (factorName != nullptr) {
|
||||
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
|
||||
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
|
||||
} else {
|
||||
anomaly.factorName[0] = '\0';
|
||||
}
|
||||
|
||||
// Set anomaly type
|
||||
anomaly.anomalyType = ANOMALY_TREND_CHANGE;
|
||||
|
||||
// Generate description
|
||||
if (slopeBefore < 0 && slopeAfter > 0) {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Trend reversal: changed from decreasing (%.2f/day) to increasing (%.2f/day)",
|
||||
-slopeBefore, slopeAfter);
|
||||
} else if (slopeBefore > 0 && slopeAfter < 0) {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Trend reversal: changed from increasing (%.2f/day) to decreasing (%.2f/day)",
|
||||
slopeBefore, -slopeAfter);
|
||||
} else if (slopeAfter > slopeBefore) {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Trend acceleration: rate of change increased from %.2f/day to %.2f/day",
|
||||
slopeBefore, slopeAfter);
|
||||
} else {
|
||||
snprintf(anomaly.description, MAX_STRING_SIZE,
|
||||
"Trend deceleration: rate of change decreased from %.2f/day to %.2f/day",
|
||||
slopeBefore, slopeAfter);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort anomalies by score (most significant first)
|
||||
std::sort(results, results + resultCount,
|
||||
[](const AnomalyResult& a, const AnomalyResult& b) {
|
||||
return a.anomalyScore > b.anomalyScore;
|
||||
});
|
||||
|
||||
// Mark the end of valid results
|
||||
results[resultCount].dataPointIndex = -1;
|
||||
|
||||
return results;
|
||||
}
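// --- Illustrative usage sketch (not part of the original commit) ---
// A minimal example of how a caller might consume detect_anomalies(), relying
// on the terminator convention used above (dataPointIndex == -1 marks the end
// of valid results). The sample series, threshold, and factor name are
// assumptions chosen only for illustration.
static void example_detect_anomalies_usage() {
    double series[] = {1.0, 1.1, 0.9, 1.0, 5.0, 1.05, 0.95, 1.0, 1.1, 0.9};
    int n = sizeof(series) / sizeof(series[0]);

    AnomalyResult* anomalies = detect_anomalies(series, n, 3.0, nullptr, "heart_rate");
    for (int i = 0; anomalies[i].dataPointIndex != -1; i++) {
        // anomalies[i].anomalyScore, anomalies[i].anomalyType and
        // anomalies[i].description describe each flagged point
    }
    free_anomaly_results(anomalies);  // caller owns the returned array
}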
|
||||
|
||||
/**
|
||||
* @brief Free memory for anomaly detection results
|
||||
*
|
||||
* @param results Pointer to anomaly results array
|
||||
*/
|
||||
void free_anomaly_results(AnomalyResult* results) {
|
||||
delete[] results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Analyze patterns related to dates (e.g., day of week effects)
|
||||
*
|
||||
* @param values Array of values
|
||||
* @param dates Array of corresponding dates
|
||||
* @param data_length Length of the arrays
|
||||
* @param factor_name Name of the factor being analyzed
|
||||
* @return DatePatternResult* Array of detected patterns
|
||||
*/
|
||||
DatePatternResult* analyze_date_patterns(
|
||||
const double* values,
|
||||
const DateStruct* dates,
|
||||
int data_length,
|
||||
const char* factor_name) {
|
||||
|
||||
// Allocate space for results (3 patterns max + terminator)
|
||||
DatePatternResult* results = new DatePatternResult[4];
|
||||
memset(results, 0, 4 * sizeof(DatePatternResult));
|
||||
|
||||
// Initialize terminator
|
||||
results[0].patternType = PATTERN_NONE;
|
||||
|
||||
// Return empty result for insufficient data
|
||||
if (data_length < 14) {
|
||||
return results;
|
||||
}
|
||||
|
||||
// Count occurrences by day of week
|
||||
double dayOfWeekSums[7] = {0};
|
||||
int dayOfWeekCounts[7] = {0};
|
||||
|
||||
for (int i = 0; i < data_length; i++) {
|
||||
// Convert date to day of week (0 = Sunday, 6 = Saturday)
|
||||
// This is a simplified calculation and might need adjustment
|
||||
int year = dates[i].year;
|
||||
int month = dates[i].month;
|
||||
int day = dates[i].day;
|
||||
|
||||
// Zeller's congruence for finding day of week
|
||||
if (month < 3) {
|
||||
month += 12;
|
||||
year--;
|
||||
}
|
||||
int h = (day + (13 * (month + 1)) / 5 + year + year / 4 - year / 100 + year / 400) % 7;
|
||||
// Convert to 0-based where 0 is Sunday
|
||||
int dayOfWeek = (h + 6) % 7;
|
||||
|
||||
dayOfWeekSums[dayOfWeek] += values[i];
|
||||
dayOfWeekCounts[dayOfWeek]++;
|
||||
}
|
||||
|
||||
// Calculate average by day of week
|
||||
double dayOfWeekAvgs[7] = {0};
|
||||
for (int i = 0; i < 7; i++) {
|
||||
if (dayOfWeekCounts[i] > 0) {
|
||||
dayOfWeekAvgs[i] = dayOfWeekSums[i] / dayOfWeekCounts[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Find peak day
|
||||
int peakDay = 0;
|
||||
double peakValue = dayOfWeekAvgs[0];
|
||||
for (int i = 1; i < 7; i++) {
|
||||
if (dayOfWeekAvgs[i] > peakValue) {
|
||||
peakValue = dayOfWeekAvgs[i];
|
||||
peakDay = i;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate variance to determine if there's a weekly pattern
|
||||
double mean = 0;
|
||||
for (int i = 0; i < 7; i++) {
|
||||
if (dayOfWeekCounts[i] > 0) {
|
||||
mean += dayOfWeekAvgs[i];
|
||||
}
|
||||
}
|
||||
mean /= 7;
|
||||
|
||||
double variance = 0;
|
||||
for (int i = 0; i < 7; i++) {
|
||||
if (dayOfWeekCounts[i] > 0) {
|
||||
double diff = dayOfWeekAvgs[i] - mean;
|
||||
variance += diff * diff;
|
||||
}
|
||||
}
|
||||
variance /= 7;
|
||||
|
||||
// Calculate pattern strength
|
||||
double strength = std::min(1.0, variance / (mean * mean + 0.001));
|
||||
|
||||
// Only report if pattern is significant
|
||||
if (strength > 0.1) {
|
||||
results[0].patternType = PATTERN_WEEKLY;
|
||||
results[0].periodicity = 7;
|
||||
results[0].strength = strength;
|
||||
results[0].peakDayOfWeek = peakDay;
|
||||
|
||||
// Copy peak values
|
||||
for (int i = 0; i < 7; i++) {
|
||||
results[0].peakValues[i] = dayOfWeekAvgs[i];
|
||||
}
|
||||
|
||||
// Generate description
|
||||
const char* dayNames[] = {"Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"};
|
||||
snprintf(results[0].description, MAX_STRING_SIZE,
|
||||
"Weekly pattern detected with peak on %s (strength: %.2f)",
|
||||
dayNames[peakDay], strength);
|
||||
}
|
||||
|
||||
return results;
|
||||
}
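// --- Illustrative check (not part of the original commit) ---
// A minimal sketch verifying the Zeller's congruence variant used above: with
// the March-based month shift, h == 0 corresponds to Saturday, and (h + 6) % 7
// maps Sunday to 0. The reference date below is a known fixed point.
static int example_day_of_week(int year, int month, int day) {
    if (month < 3) { month += 12; year--; }
    int h = (day + (13 * (month + 1)) / 5 + year + year / 4 - year / 100 + year / 400) % 7;
    return (h + 6) % 7;  // 0 = Sunday ... 6 = Saturday
}
// example_day_of_week(2000, 1, 1) == 6, i.e. 1 January 2000 was a Saturday.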
|
||||
|
||||
/**
|
||||
* @brief Free memory for date pattern results
|
||||
*
|
||||
* @param results Pointer to date pattern results array
|
||||
*/
|
||||
void free_date_pattern_results(DatePatternResult* results) {
|
||||
delete[] results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Analyze cyclical patterns in time series data
|
||||
*
|
||||
* @param values Array of values
|
||||
* @param dates Array of corresponding dates
|
||||
* @param data_length Length of the arrays
|
||||
* @param factor_name Name of the factor being analyzed
|
||||
* @return CycleAnalysisResult Structure containing cycle analysis results
|
||||
*/
|
||||
CycleAnalysisResult analyze_cycles(
|
||||
const double* values,
|
||||
const DateStruct* dates,
|
||||
int data_length,
|
||||
const char* factor_name) {
|
||||
|
||||
CycleAnalysisResult result;
|
||||
memset(&result, 0, sizeof(CycleAnalysisResult));
|
||||
|
||||
// Minimum data points required for cycle analysis
|
||||
if (data_length < 20) {
|
||||
strncpy(result.description, "Insufficient data for cycle analysis", MAX_STRING_SIZE-1);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Simple autocorrelation-based cycle detection
|
||||
// Find the peak in autocorrelation function after lag 0
|
||||
int maxLag = data_length / 3; // Look for cycles up to 1/3 of data length
|
||||
double maxCorr = 0;
|
||||
int bestLag = 0;
|
||||
|
||||
for (int lag = 2; lag < maxLag; lag++) {
|
||||
double sum = 0;
|
||||
double count = 0;
|
||||
|
||||
for (int i = 0; i < data_length - lag; i++) {
|
||||
sum += values[i] * values[i + lag];
|
||||
count++;
|
||||
}
|
||||
|
||||
double corr = sum / count;
|
||||
|
||||
if (corr > maxCorr) {
|
||||
maxCorr = corr;
|
||||
bestLag = lag;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate average cycle length in days
|
||||
if (bestLag > 0 && maxCorr > 0.2) {
|
||||
result.cycleLength = bestLag;
|
||||
result.amplitude = maxCorr;
|
||||
result.confidence = maxCorr;
|
||||
result.cycleLengthVariance = bestLag * 0.2; // Simple estimation
|
||||
|
||||
snprintf(result.description, MAX_STRING_SIZE,
|
||||
"Cycle detected with approximate length of %d days (confidence: %.2f)",
|
||||
bestLag, maxCorr);
|
||||
} else {
|
||||
strncpy(result.description, "No significant cycle detected", MAX_STRING_SIZE-1);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
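// --- Illustrative sketch (not part of the original commit) ---
// The cycle detection above scores lags with raw products, so the 0.2
// threshold depends on the series' offset and scale. A mean-centered,
// variance-normalized autocorrelation would make that threshold
// scale-independent; a minimal version under that assumption:
static double example_centered_autocorrelation(const double* x, int n, int lag) {
    double mean = 0;
    for (int i = 0; i < n; i++) mean += x[i];
    mean /= n;

    double num = 0, den = 0;
    for (int i = 0; i < n; i++) den += (x[i] - mean) * (x[i] - mean);
    for (int i = 0; i + lag < n; i++) num += (x[i] - mean) * (x[i + lag] - mean);
    return (den > 1e-12) ? num / den : 0.0;  // roughly in [-1, 1]
}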
|
native/statistics/arm64_fixes.h (new file, 9 lines)
@@ -0,0 +1,9 @@
#pragma once
|
||||
|
||||
#ifdef __aarch64__
|
||||
// Fix uintptr_t definition for ARM64
|
||||
#include <stdint.h>
|
||||
// We need to undefine and redefine uintptr_t to ensure it's 64-bit on ARM64
|
||||
#undef uintptr_t
|
||||
typedef uint64_t uintptr_t;
|
||||
#endif
|
native/statistics/basic_stats.cpp (new file, 47 lines)
@@ -0,0 +1,47 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// basic_stats.cpp
|
||||
// Implementation of basic statistical analysis functions
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
#include "utils.h"
|
||||
/**
|
||||
* @brief Calculate basic statistical properties of a data series
|
||||
*
|
||||
* @param values Pointer to array of values
|
||||
* @param length Number of elements in the array
|
||||
* @return BasicStats Structure containing calculated statistics
|
||||
*/
|
||||
BasicStats calculate_basic_stats(const double* values, int length) {
|
||||
BasicStats stats;
|
||||
|
||||
if (length == 0) {
|
||||
memset(&stats, 0, sizeof(BasicStats));
|
||||
return stats;
|
||||
}
|
||||
|
||||
// Create a copy for calculations that require sorting
|
||||
std::vector<double> sorted(values, values + length);
|
||||
std::sort(sorted.begin(), sorted.end());
|
||||
|
||||
// Calculate basic statistics
|
||||
stats.mean = calculateMean(values, length);
|
||||
stats.variance = calculateVariance(values, length, stats.mean);
|
||||
stats.stdDev = std::sqrt(stats.variance);
|
||||
stats.min = sorted.front();
|
||||
stats.max = sorted.back();
|
||||
stats.median = (length % 2 == 0) ?
|
||||
(sorted[length/2 - 1] + sorted[length/2]) / 2.0 : sorted[length/2];
|
||||
|
||||
// Calculate quartiles and IQR
|
||||
stats.q1 = calculateQuantile(sorted.data(), length, 0.25);
|
||||
stats.q3 = calculateQuantile(sorted.data(), length, 0.75);
|
||||
stats.iqr = stats.q3 - stats.q1;
|
||||
|
||||
// Calculate higher-order statistics
|
||||
stats.skewness = calculateSkewness(values, length, stats.mean, stats.stdDev);
|
||||
stats.kurtosis = calculateKurtosis(values, length, stats.mean, stats.stdDev);
|
||||
|
||||
return stats;
|
||||
}
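// --- Illustrative usage sketch (not part of the original commit) ---
// Minimal example of calling calculate_basic_stats() from C++. The sample
// values are assumptions; the expected mean assumes calculateMean() is the
// arithmetic mean, and the median follows directly from the code above.
static void example_basic_stats_usage() {
    double values[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    BasicStats s = calculate_basic_stats(values, 10);
    // s.mean == 5.5, s.median == 5.5, s.min == 1, s.max == 10
    (void)s;
}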
|
native/statistics/clustering.cpp (new file, 203 lines)
@@ -0,0 +1,203 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// clustering.cpp
|
||||
// Implementation of clustering and pattern recognition functions
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
#include "utils.h"
|
||||
/**
|
||||
* @brief Perform cluster analysis on multivariate health data
|
||||
*
|
||||
* @param data 2D array of data points [n_samples x n_features]
|
||||
* @param factorCount Number of features per data point
|
||||
* @param dataLength Number of data points
|
||||
* @param maxClusters Maximum number of clusters to identify
|
||||
* @return ClusterResult* Array of cluster results
|
||||
*/
|
||||
ClusterResult* perform_cluster_analysis(const double** data,
|
||||
int factorCount,
|
||||
int dataLength,
|
||||
int maxClusters) {
|
||||
if (factorCount < 2 || dataLength < 5 || maxClusters <= 0) {
|
||||
ClusterResult* dummy = new ClusterResult[1];
|
||||
memset(dummy, 0, sizeof(ClusterResult));
|
||||
dummy[0].clusterId = -1; // Mark as invalid
|
||||
return dummy;
|
||||
}
|
||||
|
||||
// Normalize data for better clustering
|
||||
std::vector<std::vector<double>> normalizedData(dataLength, std::vector<double>(factorCount));
|
||||
std::vector<double> means(factorCount, 0);
|
||||
std::vector<double> stdDevs(factorCount, 0);
|
||||
|
||||
// Calculate means
|
||||
for (int j = 0; j < factorCount; j++) {
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
means[j] += data[i][j];
|
||||
}
|
||||
means[j] /= dataLength;
|
||||
}
|
||||
|
||||
// Calculate standard deviations
|
||||
for (int j = 0; j < factorCount; j++) {
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
double diff = data[i][j] - means[j];
|
||||
stdDevs[j] += diff * diff;
|
||||
}
|
||||
stdDevs[j] = sqrt(stdDevs[j] / dataLength);
|
||||
if (stdDevs[j] < 1e-10) stdDevs[j] = 1.0; // Avoid division by zero
|
||||
}
|
||||
|
||||
// Normalize data
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
for (int j = 0; j < factorCount; j++) {
|
||||
normalizedData[i][j] = (data[i][j] - means[j]) / stdDevs[j];
|
||||
}
|
||||
}
|
||||
|
||||
// Find optimal number of clusters (between 2 and maxClusters)
|
||||
int optimalClusters = 2;
|
||||
double bestSilhouette = -1;
|
||||
|
||||
// Arrays for k-means algorithm
|
||||
std::vector<int> assignments(dataLength);
|
||||
std::vector<std::vector<double>> centroids(maxClusters, std::vector<double>(factorCount));
|
||||
std::vector<const double*> normalizedDataPtrs(dataLength);
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
normalizedDataPtrs[i] = normalizedData[i].data();
|
||||
}
|
||||
|
||||
for (int k = 2; k <= maxClusters; k++) {
|
||||
// Run k-means clustering
|
||||
std::vector<int> tempAssignments(dataLength, 0);
|
||||
std::vector<std::vector<double>> tempCentroids(k, std::vector<double>(factorCount, 0));
|
||||
std::vector<double*> centroidPtrs(k);
|
||||
for (int i = 0; i < k; i++) {
|
||||
centroidPtrs[i] = tempCentroids[i].data();
|
||||
}
|
||||
|
||||
kMeansClustering(normalizedDataPtrs.data(), dataLength, factorCount, k,
|
||||
100, centroidPtrs.data(), tempAssignments.data());
|
||||
|
||||
// Calculate silhouette coefficient
|
||||
std::vector<const double*> dataPtrs(dataLength);
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
dataPtrs[i] = data[i]; // Use original data for silhouette
|
||||
}
|
||||
|
||||
double silhouette = calculateSilhouetteCoefficient(
|
||||
dataPtrs.data(), dataLength, factorCount, tempAssignments.data(), k);
|
||||
|
||||
// Update if better silhouette found
|
||||
if (silhouette > bestSilhouette) {
|
||||
bestSilhouette = silhouette;
|
||||
optimalClusters = k;
|
||||
assignments = tempAssignments;
|
||||
centroids = tempCentroids;
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate cluster results (plus one for terminator)
|
||||
ClusterResult* results = new ClusterResult[optimalClusters + 1];
|
||||
|
||||
// Count points in each cluster
|
||||
std::vector<int> clusterSizes(optimalClusters, 0);
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
clusterSizes[assignments[i]]++;
|
||||
}
|
||||
|
||||
// Calculate cluster statistics
|
||||
for (int c = 0; c < optimalClusters; c++) {
|
||||
ClusterResult& cluster = results[c];
|
||||
cluster.clusterId = c;
|
||||
cluster.dataPointCount = clusterSizes[c];
|
||||
|
||||
// Calculate cluster significance based on size and compactness
|
||||
double avgDistance = 0;
|
||||
int count = 0;
|
||||
|
||||
for (int i = 0; i < dataLength; i++) {
|
||||
if (assignments[i] == c) {
|
||||
double dist = 0;
|
||||
for (int j = 0; j < factorCount; j++) {
|
||||
double diff = normalizedData[i][j] - centroids[c][j];
|
||||
dist += diff * diff;
|
||||
}
|
||||
avgDistance += sqrt(dist);
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
if (count > 0) {
|
||||
avgDistance /= count;
|
||||
}
|
||||
|
||||
// Higher significance for larger and more compact clusters
|
||||
double sizeFactor = static_cast<double>(count) / dataLength;
|
||||
double compactnessFactor = 1.0 - std::min(1.0, avgDistance / 3.0);
|
||||
cluster.significance = sizeFactor * compactnessFactor;
|
||||
|
||||
// Identify important factors for this cluster
|
||||
std::vector<std::pair<int, double>> factorImportance;
|
||||
|
||||
for (int j = 0; j < factorCount; j++) {
|
||||
// Calculate how different this centroid is from global mean for this factor
|
||||
double differenceFromMean = std::abs(centroids[c][j]);
|
||||
factorImportance.push_back(std::make_pair(j, differenceFromMean));
|
||||
}
|
||||
|
||||
// Sort factors by importance
|
||||
std::sort(factorImportance.begin(), factorImportance.end(),
|
||||
[](const std::pair<int, double>& a, const std::pair<int, double>& b) {
|
||||
return a.second > b.second;
|
||||
});
|
||||
|
||||
        // Store top factors (up to 3)
        int numFactors = std::min(3, factorCount);
        for (int j = 0; j < numFactors; j++) {
            cluster.factorIndices[j] = factorImportance[j].first;
            cluster.factorWeights[j] = std::min(1.0, factorImportance[j].second);
        }

        // Normalize weights to sum to 1, only after all weights have been
        // assigned so that no uninitialized entries are read
        double totalWeight = 0;
        for (int k = 0; k < numFactors; k++) {
            totalWeight += cluster.factorWeights[k];
        }
        if (totalWeight > 0) {
            for (int k = 0; k < numFactors; k++) {
                cluster.factorWeights[k] /= totalWeight;
            }
        }
|
||||
|
||||
// Generate descriptive cluster name based on key factors
|
||||
// In a real implementation, this would use domain knowledge
|
||||
snprintf(cluster.clusterName, MAX_STRING_SIZE,
|
||||
"Cluster %d - %s", c + 1,
|
||||
(cluster.significance > 0.7) ? "High Significance" :
|
||||
(cluster.significance > 0.4) ? "Medium Significance" : "Low Significance");
|
||||
|
||||
// Generate detailed description
|
||||
snprintf(cluster.description, MAX_STRING_SIZE,
|
||||
"Cluster of %d data points characterized by %s factors. Silhouette: %.2f",
|
||||
cluster.dataPointCount,
|
||||
(numFactors > 0) ? "multiple interrelated" : "non-specific",
|
||||
bestSilhouette);
|
||||
}
|
||||
|
||||
// Mark the end of valid results
|
||||
results[optimalClusters].clusterId = -1;
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Free memory for cluster analysis results
|
||||
*
|
||||
* @param results Pointer to cluster results array
|
||||
*/
|
||||
void free_cluster_results(ClusterResult* results) {
|
||||
delete[] results;
|
||||
}
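// --- Illustrative usage sketch (not part of the original commit) ---
// perform_cluster_analysis() takes row pointers (one per data point), so a
// caller typically builds a pointer array over its feature rows. The sample
// data and maxClusters value are assumptions; results end at clusterId == -1.
static void example_cluster_usage() {
    const int points = 6, features = 2;
    double rows[points][features] = {
        {1.0, 1.1}, {0.9, 1.0}, {1.1, 0.9},   // one tight group
        {8.0, 8.2}, {7.9, 8.1}, {8.1, 7.8}    // another tight group
    };
    const double* rowPtrs[points];
    for (int i = 0; i < points; i++) rowPtrs[i] = rows[i];

    ClusterResult* clusters = perform_cluster_analysis(rowPtrs, features, points, 3);
    for (int c = 0; clusters[c].clusterId != -1; c++) {
        // clusters[c].dataPointCount, clusters[c].significance,
        // clusters[c].description summarize each cluster
    }
    free_cluster_results(clusters);
}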
|
native/statistics/correlation.cpp (new file, 369 lines)
@@ -0,0 +1,369 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// correlation.cpp
|
||||
// Implementation of correlation analysis functions
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
#include "utils.h"
|
||||
/**
|
||||
* @brief Find factors with strongest correlation to target variable
|
||||
*
|
||||
* @param target Target variable time series
|
||||
* @param factors Array of factor time series
|
||||
* @param factorNames Array of factor names
|
||||
* @param targetLength Length of target time series
|
||||
* @param factorCount Number of factors
|
||||
* @return CorrelationResult* Array of correlation results (sorted by strength)
|
||||
*/
|
||||
CorrelationResult* find_strongest_correlations(const double* target,
|
||||
const double** factors,
|
||||
const char** factorNames,
|
||||
int targetLength,
|
||||
int factorCount) {
|
||||
// Allocate results array (one more than needed to mark the end)
|
||||
CorrelationResult* results = new CorrelationResult[factorCount + 1];
|
||||
|
||||
// Calculate correlations for each factor
|
||||
for (int i = 0; i < factorCount; i++) {
|
||||
// Calculate both Pearson and Spearman correlations
|
||||
double pearson_corr = calculateCorrelation(target, factors[i], targetLength);
|
||||
double spearman_corr = calculateSpearmanCorrelation(target, factors[i], targetLength);
|
||||
|
||||
// Use the correlation with higher absolute value
|
||||
double corr = (std::abs(pearson_corr) > std::abs(spearman_corr)) ?
|
||||
pearson_corr : spearman_corr;
|
||||
|
||||
results[i].factorIndex = i;
|
||||
results[i].correlation = corr;
|
||||
|
||||
// Estimate p-value based on correlation and sample size
|
||||
// This is an approximation; a real implementation would use t-distribution
|
||||
double t = corr * std::sqrt((targetLength - 2) / (1 - corr * corr));
|
||||
// Simplified 2-tailed p-value approximation
|
||||
results[i].pValue = 2 * (1 - std::min(1.0, std::exp(-0.717 * std::abs(t) - 0.416 * t * t)));
|
||||
|
||||
// Copy factor name
|
||||
strncpy(results[i].factorName, factorNames[i], MAX_STRING_SIZE - 1);
|
||||
results[i].factorName[MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
// Calculate confidence based on sample size, correlation strength, and p-value
|
||||
double sample_size_factor = 1.0 - 1.0 / std::sqrt(targetLength);
|
||||
double p_value_factor = 1.0 - results[i].pValue;
|
||||
results[i].confidence = std::abs(corr) * sample_size_factor * p_value_factor;
|
||||
}
|
||||
|
||||
// Sort by absolute correlation value
|
||||
std::sort(results, results + factorCount,
|
||||
[](const CorrelationResult& a, const CorrelationResult& b) {
|
||||
return std::abs(a.correlation) > std::abs(b.correlation);
|
||||
});
|
||||
|
||||
// Mark the end of valid results
|
||||
results[factorCount].factorIndex = -1;
|
||||
results[factorCount].correlation = 0;
|
||||
results[factorCount].confidence = 0;
|
||||
results[factorCount].factorName[0] = '\0';
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Free memory for correlation results
|
||||
*
|
||||
* @param results Pointer to correlation results array
|
||||
*/
|
||||
void free_correlation_results(CorrelationResult* results) {
|
||||
delete[] results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Find multivariate correlations between factors
|
||||
*
|
||||
* @param data Array of factor time series
|
||||
* @param factorNames Array of factor names
|
||||
* @param factorCount Number of factors
|
||||
* @param dataLength Length of time series
|
||||
* @return MultivariateCorrelation* Array of multivariate correlation results
|
||||
*/
|
||||
MultivariateCorrelation* find_multivariate_correlations(const double** data,
|
||||
const char** factorNames,
|
||||
int factorCount,
|
||||
int dataLength) {
|
||||
if (factorCount < 2 || dataLength < 3) {
|
||||
MultivariateCorrelation* dummy = new MultivariateCorrelation[1];
|
||||
memset(dummy, 0, sizeof(MultivariateCorrelation));
|
||||
dummy[0].factorCount = -1; // Mark as invalid
|
||||
return dummy;
|
||||
}
|
||||
|
||||
// Constants
|
||||
const int MAX_RESULTS = 100;
|
||||
const double MIN_CORRELATION_THRESHOLD = 0.3;
|
||||
|
||||
// Allocate space for up to MAX_RESULTS correlations + 1 terminator
|
||||
MultivariateCorrelation* results = new MultivariateCorrelation[MAX_RESULTS + 1];
|
||||
int resultCount = 0;
|
||||
|
||||
// Create correlation matrix for all pairwise correlations
|
||||
std::vector<std::vector<double>> corrMatrix(factorCount, std::vector<double>(factorCount, 0));
|
||||
|
||||
for (int i = 0; i < factorCount; i++) {
|
||||
corrMatrix[i][i] = 1.0; // Self-correlation is 1
|
||||
|
||||
for (int j = i + 1; j < factorCount; j++) {
|
||||
// Calculate correlation using both Pearson and Spearman
|
||||
double pearson = calculateCorrelation(data[i], data[j], dataLength);
|
||||
double spearman = calculateSpearmanCorrelation(data[i], data[j], dataLength);
|
||||
|
||||
// Use the one with higher absolute value
|
||||
double corr = (std::abs(pearson) > std::abs(spearman)) ? pearson : spearman;
|
||||
|
||||
// Store in both positions (symmetric matrix)
|
||||
corrMatrix[i][j] = corr;
|
||||
corrMatrix[j][i] = corr;
|
||||
}
|
||||
}
|
||||
|
||||
// Find pairwise correlations
|
||||
for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) {
|
||||
for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) {
|
||||
double corr = corrMatrix[i][j];
|
||||
|
||||
// Only store significant correlations
|
||||
if (std::abs(corr) >= MIN_CORRELATION_THRESHOLD) {
|
||||
MultivariateCorrelation& result = results[resultCount];
|
||||
|
||||
result.factorCount = 2;
|
||||
result.correlationStrength = corr;
|
||||
result.primaryFactorIndex = i;
|
||||
result.secondaryFactorIndex = j;
|
||||
result.tertiaryFactorIndex = -1;
|
||||
|
||||
// Calculate confidence based on sample size and correlation strength
|
||||
double t_stat = std::abs(corr) * std::sqrt((dataLength - 2) / (1 - corr * corr));
|
||||
double p_value = 2 * (1 - std::min(1.0, std::exp(-0.717 * t_stat - 0.416 * t_stat * t_stat)));
|
||||
result.confidence = std::abs(corr) * (1.0 - p_value) * (1.0 - 1.0 / std::sqrt(dataLength));
|
||||
|
||||
// Copy factor names
|
||||
strncpy(result.factorNames[0], factorNames[i], MAX_STRING_SIZE - 1);
|
||||
result.factorNames[0][MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
strncpy(result.factorNames[1], factorNames[j], MAX_STRING_SIZE - 1);
|
||||
result.factorNames[1][MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
// Set factor weights based on correlation directionality
|
||||
if (corr > 0) {
|
||||
result.factorWeights[0] = 0.5;
|
||||
result.factorWeights[1] = 0.5;
|
||||
} else {
|
||||
result.factorWeights[0] = 0.5;
|
||||
result.factorWeights[1] = -0.5;
|
||||
}
|
||||
|
||||
// Determine relationship type
|
||||
result.relationshipType = RELATIONSHIP_CORRELATION;
|
||||
|
||||
// Generate description
|
||||
const char* strength_text =
|
||||
(std::abs(corr) > 0.7) ? "strong" :
|
||||
(std::abs(corr) > 0.5) ? "moderate" : "weak";
|
||||
|
||||
const char* direction_text = (corr > 0) ? "positive" : "negative";
|
||||
|
||||
snprintf(result.description, MAX_STRING_SIZE,
|
||||
"There is a %s %s correlation between %s and %s (r=%.2f, p=%.3f)",
|
||||
strength_text, direction_text,
|
||||
factorNames[i], factorNames[j], corr,
|
||||
(result.confidence > 0.99) ? 0.001 : 1.0 - result.confidence);
|
||||
|
||||
resultCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find partial correlations and three-way relationships
|
||||
if (factorCount >= 3) {
|
||||
// Calculate partial correlations
|
||||
std::vector<std::vector<std::vector<double>>> partialCorr(
|
||||
factorCount, std::vector<std::vector<double>>(
|
||||
factorCount, std::vector<double>(factorCount, 0.0)));
|
||||
|
||||
for (int i = 0; i < factorCount; i++) {
|
||||
for (int j = i + 1; j < factorCount; j++) {
|
||||
for (int k = 0; k < factorCount; k++) {
|
||||
if (k == i || k == j) continue;
|
||||
|
||||
// Calculate partial correlation between i and j controlling for k
|
||||
double r_ij = corrMatrix[i][j];
|
||||
double r_ik = corrMatrix[i][k];
|
||||
double r_jk = corrMatrix[j][k];
|
||||
|
||||
double denominator = std::sqrt((1 - r_ik * r_ik) * (1 - r_jk * r_jk));
|
||||
|
||||
if (denominator > 1e-10) {
|
||||
double partial = (r_ij - r_ik * r_jk) / denominator;
|
||||
partialCorr[i][j][k] = partial;
|
||||
partialCorr[j][i][k] = partial;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find three-way relationships
|
||||
for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) {
|
||||
for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) {
|
||||
for (int k = j + 1; k < factorCount && resultCount < MAX_RESULTS; k++) {
|
||||
// Get pairwise correlations
|
||||
double r_ij = corrMatrix[i][j];
|
||||
double r_ik = corrMatrix[i][k];
|
||||
double r_jk = corrMatrix[j][k];
|
||||
|
||||
// Check if all pairs are correlated
|
||||
if (std::abs(r_ij) >= MIN_CORRELATION_THRESHOLD &&
|
||||
std::abs(r_ik) >= MIN_CORRELATION_THRESHOLD &&
|
||||
std::abs(r_jk) >= MIN_CORRELATION_THRESHOLD) {
|
||||
|
||||
// Get partial correlations
|
||||
double p_ij_k = partialCorr[i][j][k]; // i,j controlling for k
|
||||
double p_ik_j = partialCorr[i][k][j]; // i,k controlling for j
|
||||
double p_jk_i = partialCorr[j][k][i]; // j,k controlling for i
|
||||
|
||||
// Determine if mediation or confounding is present
|
||||
bool is_mediation = false;
|
||||
int mediator = -1;
|
||||
|
||||
// Check if k mediates i->j
|
||||
if (std::abs(p_ij_k) < std::abs(r_ij) * 0.5) {
|
||||
is_mediation = true;
|
||||
mediator = k;
|
||||
}
|
||||
// Check if j mediates i->k
|
||||
else if (std::abs(p_ik_j) < std::abs(r_ik) * 0.5) {
|
||||
is_mediation = true;
|
||||
mediator = j;
|
||||
}
|
||||
// Check if i mediates j->k
|
||||
else if (std::abs(p_jk_i) < std::abs(r_jk) * 0.5) {
|
||||
is_mediation = true;
|
||||
mediator = i;
|
||||
}
|
||||
|
||||
MultivariateCorrelation& result = results[resultCount];
|
||||
|
||||
result.factorCount = 3;
|
||||
// Use average correlation as strength
|
||||
result.correlationStrength = (std::abs(r_ij) + std::abs(r_ik) + std::abs(r_jk)) / 3.0;
|
||||
result.primaryFactorIndex = i;
|
||||
result.secondaryFactorIndex = j;
|
||||
result.tertiaryFactorIndex = k;
|
||||
|
||||
// Lower confidence for three-way relationships
|
||||
result.confidence = result.correlationStrength *
|
||||
(1.0 - 2.0 / std::sqrt(dataLength));
|
||||
|
||||
// Copy factor names
|
||||
strncpy(result.factorNames[0], factorNames[i], MAX_STRING_SIZE - 1);
|
||||
result.factorNames[0][MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
strncpy(result.factorNames[1], factorNames[j], MAX_STRING_SIZE - 1);
|
||||
result.factorNames[1][MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
strncpy(result.factorNames[2], factorNames[k], MAX_STRING_SIZE - 1);
|
||||
result.factorNames[2][MAX_STRING_SIZE - 1] = '\0';
|
||||
|
||||
// Set relationship type
|
||||
if (is_mediation) {
|
||||
result.relationshipType = RELATIONSHIP_MEDIATION;
|
||||
|
||||
// Set weights based on mediation path
|
||||
if (mediator == i) {
|
||||
result.factorWeights[0] = 0.5; // Mediator
|
||||
result.factorWeights[1] = 0.3; // Source
|
||||
result.factorWeights[2] = 0.3; // Target
|
||||
} else if (mediator == j) {
|
||||
result.factorWeights[0] = 0.3; // Source
|
||||
result.factorWeights[1] = 0.5; // Mediator
|
||||
result.factorWeights[2] = 0.3; // Target
|
||||
} else {
|
||||
result.factorWeights[0] = 0.3; // Source
|
||||
result.factorWeights[1] = 0.3; // Target
|
||||
result.factorWeights[2] = 0.5; // Mediator
|
||||
}
|
||||
|
||||
                        // Generate description for mediation, naming the two
                        // factors whose relationship the mediator may explain
                        int sourceIdx = (mediator == i) ? j : i;
                        int targetIdx = (mediator == k) ? j : k;
                        snprintf(result.description, MAX_STRING_SIZE,
                            "Potential mediation detected: %s may mediate the relationship between %s and %s",
                            factorNames[mediator],
                            factorNames[sourceIdx],
                            factorNames[targetIdx]);
|
||||
} else {
|
||||
result.relationshipType = RELATIONSHIP_NETWORK;
|
||||
|
||||
// Equal weights for network relationship
|
||||
result.factorWeights[0] = 0.33;
|
||||
result.factorWeights[1] = 0.33;
|
||||
result.factorWeights[2] = 0.33;
|
||||
|
||||
// Generate description for network
|
||||
snprintf(result.description, MAX_STRING_SIZE,
|
||||
"Network relationship detected between %s, %s, and %s (average correlation: %.2f)",
|
||||
factorNames[i], factorNames[j], factorNames[k],
|
||||
result.correlationStrength);
|
||||
}
|
||||
|
||||
resultCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Mark the end of valid results
|
||||
if (resultCount < MAX_RESULTS) {
|
||||
results[resultCount].factorCount = -1;
|
||||
} else {
|
||||
results[MAX_RESULTS].factorCount = -1;
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Free memory for multivariate correlation results
|
||||
*
|
||||
* @param results Pointer to multivariate correlation results array
|
||||
*/
|
||||
void free_multivariate_correlations(MultivariateCorrelation* results) {
|
||||
delete[] results;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Direct API access to correlation calculation
|
||||
*
|
||||
* @param x First data series
|
||||
* @param y Second data series
|
||||
* @param length Length of data series
|
||||
* @return double Correlation coefficient
|
||||
*/
|
||||
double calculate_correlation(const double* x, const double* y, int length) {
|
||||
if (length <= 1) return 0;
|
||||
|
||||
double sum_x = 0, sum_y = 0, sum_xy = 0;
|
||||
double sum_x2 = 0, sum_y2 = 0;
|
||||
|
||||
for (int i = 0; i < length; i++) {
|
||||
sum_x += x[i];
|
||||
sum_y += y[i];
|
||||
sum_xy += x[i] * y[i];
|
||||
sum_x2 += x[i] * x[i];
|
||||
sum_y2 += y[i] * y[i];
|
||||
}
|
||||
|
||||
double denominator = sqrt((length * sum_x2 - sum_x * sum_x) *
|
||||
(length * sum_y2 - sum_y * sum_y));
|
||||
|
||||
if (denominator < 1e-10) return 0; // Avoid division by zero
|
||||
|
||||
return (length * sum_xy - sum_x * sum_y) / denominator;
|
||||
}
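// --- Illustrative check (not part of the original commit) ---
// A small sanity check of calculate_correlation(): a perfectly linear
// relationship gives r = 1, and reversing the second series gives r = -1.
// The sample arrays are assumptions used only for this check.
static void example_correlation_check() {
    double x[] = {1, 2, 3};
    double y_up[] = {2, 4, 6};
    double y_down[] = {6, 4, 2};
    double r1 = calculate_correlation(x, y_up, 3);    // == 1.0
    double r2 = calculate_correlation(x, y_down, 3);  // == -1.0
    (void)r1; (void)r2;
}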
|
native/statistics/health_analytics.cpp (new file, 30 lines)
@@ -0,0 +1,30 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// health_analytics.cpp
|
||||
// Comprehensive C++ analytics engine for health data processing
|
||||
// Provides statistical analysis, pattern detection, and predictive modeling for health metrics
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <numeric>
|
||||
#include <random>
|
||||
#include <limits>
|
||||
|
||||
// Include all module files
|
||||
#include "utils.cpp"
|
||||
#include "basic_stats.cpp"
|
||||
#include "correlation.cpp"
|
||||
#include "time_series.cpp"
|
||||
#include "clustering.cpp"
|
||||
#include "anomaly_detection.cpp"
|
||||
#include "impact_analysis.cpp"
|
native/statistics/health_analytics_engine.h (new file, 316 lines)
@@ -0,0 +1,316 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// HealthAnalyticsEngine.h
|
||||
// Core C++ header for health analytics calculations
|
||||
//
|
||||
#ifndef HEALTH_ANALYTICS_ENGINE_H
|
||||
#define HEALTH_ANALYTICS_ENGINE_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <memory>
|
||||
#include <cstdint>
|
||||
|
||||
// Constants
|
||||
constexpr int MAX_STRING_SIZE = 200;
|
||||
|
||||
// Forward declarations
|
||||
struct DateStruct;
|
||||
struct BasicStats;
|
||||
struct TrendResult;
|
||||
struct CorrelationResult;
|
||||
struct MultivariateCorrelation;
|
||||
struct ClusterResult;
|
||||
struct TimeSeriesForecast;
|
||||
struct AnomalyResult;
|
||||
struct FactorImpactResult;
|
||||
struct DatePatternResult;
|
||||
struct CycleAnalysisResult;
|
||||
struct MedicationImpactAnalysis;
|
||||
struct HormoneImpactAnalysis;
|
||||
|
||||
// Data structures for FFI communication
|
||||
struct DateStruct {
|
||||
int year;
|
||||
int month;
|
||||
int day;
|
||||
};
|
||||
|
||||
struct BasicStats {
|
||||
double mean;
|
||||
double median;
|
||||
double min;
|
||||
double max;
|
||||
double stdDev;
|
||||
double variance;
|
||||
double skewness;
|
||||
double kurtosis;
|
||||
double q1;
|
||||
double q3;
|
||||
double iqr;
|
||||
};
|
||||
|
||||
enum TrendType {
|
||||
TREND_NONE = 0,
|
||||
TREND_INCREASING = 1,
|
||||
TREND_DECREASING = 2,
|
||||
TREND_CYCLIC = 3,
|
||||
TREND_VARIABLE = 4
|
||||
};
|
||||
|
||||
struct TrendResult {
|
||||
TrendType trendType;
|
||||
double strength;
|
||||
char description[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
struct CorrelationResult {
|
||||
int factorIndex;
|
||||
double correlation;
|
||||
double pValue;
|
||||
double confidence;
|
||||
char factorName[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
enum RelationshipType {
|
||||
RELATIONSHIP_CORRELATION = 0,
|
||||
RELATIONSHIP_CAUSATION = 1,
|
||||
RELATIONSHIP_COINCIDENTAL = 2,
|
||||
RELATIONSHIP_MEDIATION = 3,
|
||||
RELATIONSHIP_NETWORK = 4
|
||||
};
|
||||
|
||||
struct MultivariateCorrelation {
|
||||
int factorCount;
|
||||
char factorNames[MAX_STRING_SIZE][MAX_STRING_SIZE];
|
||||
double factorWeights[50]; // Using a constant size
|
||||
double correlationStrength;
|
||||
char description[MAX_STRING_SIZE];
|
||||
double confidence;
|
||||
RelationshipType relationshipType;
|
||||
int primaryFactorIndex;
|
||||
int secondaryFactorIndex;
|
||||
int tertiaryFactorIndex;
|
||||
};
|
||||
|
||||
struct ClusterResult {
|
||||
int clusterId;
|
||||
char clusterName[MAX_STRING_SIZE];
|
||||
char description[MAX_STRING_SIZE];
|
||||
int dataPointCount;
|
||||
double significance;
|
||||
int factorIndices[50]; // Using a constant size
|
||||
double factorWeights[50]; // Using a constant size
|
||||
double centeroid[50]; // Using a constant size
|
||||
double radius;
|
||||
};
|
||||
|
||||
enum TimeUnit {
|
||||
TIME_UNIT_DAYS = 0,
|
||||
TIME_UNIT_WEEKS = 1,
|
||||
TIME_UNIT_MONTHS = 2
|
||||
};
|
||||
|
||||
struct TimeSeriesForecast {
|
||||
double predictions[30];
|
||||
double confidenceIntervals[30][2];
|
||||
double overallConfidence;
|
||||
int seasonalityPeriod;
|
||||
TimeUnit timeUnit;
|
||||
char factorName[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
enum AnomalyType {
|
||||
ANOMALY_OUTLIER = 0,
|
||||
ANOMALY_TREND_CHANGE = 1,
|
||||
ANOMALY_SEASONALITY_CHANGE = 2,
|
||||
ANOMALY_CONTEXTUAL = 3
|
||||
};
|
||||
|
||||
struct AnomalyResult {
|
||||
int dataPointIndex;
|
||||
double anomalyScore;
|
||||
char description[MAX_STRING_SIZE];
|
||||
double originalValue;
|
||||
double expectedValue;
|
||||
DateStruct date;
|
||||
double confidence;
|
||||
AnomalyType anomalyType;
|
||||
char factorName[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
struct FactorImpactResult {
|
||||
int factorIndex;
|
||||
char factorName[MAX_STRING_SIZE];
|
||||
double impactScore;
|
||||
double directEffect;
|
||||
double indirectEffect;
|
||||
double confidence;
|
||||
char mechanism[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
enum PatternType {
|
||||
PATTERN_NONE = 0,
|
||||
PATTERN_DAILY = 1,
|
||||
PATTERN_WEEKLY = 2,
|
||||
PATTERN_MONTHLY = 3,
|
||||
PATTERN_CUSTOM = 4
|
||||
};
|
||||
|
||||
struct DatePatternResult {
|
||||
PatternType patternType;
|
||||
int periodicity;
|
||||
double strength;
|
||||
char description[MAX_STRING_SIZE];
|
||||
double peakValues[7]; // For weekly patterns
|
||||
int peakDayOfWeek; // 0-6, where 0 is Sunday
|
||||
int peakDayOfMonth; // 1-31
|
||||
int peakMonth; // 1-12
|
||||
};
|
||||
|
||||
struct CycleAnalysisResult {
|
||||
double cycleLength;
|
||||
double cycleLengthVariance;
|
||||
double amplitude;
|
||||
double phaseShift;
|
||||
double confidence;
|
||||
char description[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
struct MedicationImpactAnalysis {
|
||||
char medicationName[MAX_STRING_SIZE];
|
||||
double beforeMean;
|
||||
double afterMean;
|
||||
double changeMagnitude;
|
||||
double changeSignificance;
|
||||
double overallImpact;
|
||||
int daysToEffect;
|
||||
char description[MAX_STRING_SIZE];
|
||||
char factorName[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
struct HormoneImpactAnalysis {
|
||||
char hormoneName[MAX_STRING_SIZE];
|
||||
double currentLevel;
|
||||
double optimalLevel;
|
||||
double optimalRangeLower;
|
||||
double optimalRangeUpper;
|
||||
double deviation;
|
||||
double impactOnMood;
|
||||
double impactOnEnergy;
|
||||
double impactOnOtherFactors[50]; // Using a constant size
|
||||
char factorNames[50][MAX_STRING_SIZE]; // Using a constant size
|
||||
char description[MAX_STRING_SIZE];
|
||||
};
|
||||
|
||||
// Removed the HealthAnalyticsEngine class since it's not implemented
|
||||
|
||||
// C-style API for FFI
|
||||
extern "C" {
|
||||
// Basic statistics
|
||||
BasicStats calculate_basic_stats(const double* values, int length);
|
||||
|
||||
// Trend analysis
|
||||
TrendType detect_trend(const double* values, int length, double* strength_out);
|
||||
|
||||
// Correlation analysis
|
||||
double calculate_correlation(const double* x, const double* y, int length);
|
||||
CorrelationResult* find_strongest_correlations(
|
||||
const double* target_values,
|
||||
const double** factor_values,
|
||||
const char** factor_names,
|
||||
int data_length,
|
||||
int factor_count);
|
||||
void free_correlation_results(CorrelationResult* results);
|
||||
|
||||
// Multivariate analysis
|
||||
MultivariateCorrelation* find_multivariate_correlations(
|
||||
const double** factor_data,
|
||||
const char** factor_names,
|
||||
int factor_count,
|
||||
int data_length);
|
||||
void free_multivariate_correlations(MultivariateCorrelation* correlations);
|
||||
|
||||
// Cluster analysis
|
||||
ClusterResult* perform_cluster_analysis(
|
||||
const double** factor_data,
|
||||
int factor_count,
|
||||
int data_length,
|
||||
int max_clusters);
|
||||
void free_cluster_results(ClusterResult* results);
|
||||
|
||||
// Time series forecasting
|
||||
TimeSeriesForecast predict_time_series(
|
||||
const double* time_series_data,
|
||||
int data_length,
|
||||
int steps_ahead,
|
||||
const char* factor_name);
|
||||
|
||||
// Anomaly detection
|
||||
AnomalyResult* detect_anomalies(
|
||||
const double* time_series_data,
|
||||
int data_length,
|
||||
double threshold,
|
||||
const DateStruct* dates,
|
||||
const char* factor_name);
|
||||
void free_anomaly_results(AnomalyResult* results);
|
||||
|
||||
// Factor impact ranking
|
||||
FactorImpactResult* rank_factor_impacts(
|
||||
const double** factor_data,
|
||||
const double* target_data,
|
||||
const char** factor_names,
|
||||
int factor_count,
|
||||
int data_length);
|
||||
void free_factor_impact_results(FactorImpactResult* results);
|
||||
|
||||
// Date pattern analysis
|
||||
DatePatternResult* analyze_date_patterns(
|
||||
const double* values,
|
||||
const DateStruct* dates,
|
||||
int data_length,
|
||||
const char* factor_name);
|
||||
void free_date_pattern_results(DatePatternResult* results);
|
||||
|
||||
// Cycle analysis
|
||||
CycleAnalysisResult analyze_cycles(
|
||||
const double* values,
|
||||
const DateStruct* dates,
|
||||
int data_length,
|
||||
const char* factor_name);
|
||||
|
||||
// Medication impact analysis
|
||||
MedicationImpactAnalysis* analyze_medication_impact(
|
||||
const double* before_data,
|
||||
int before_length,
|
||||
const double* after_data,
|
||||
int after_length,
|
||||
const char* medication_name,
|
||||
const char* factor_name);
|
||||
void free_medication_impact_analysis(MedicationImpactAnalysis* analysis);
|
||||
|
||||
// Hormone impact analysis
|
||||
HormoneImpactAnalysis* analyze_hormone_impact(
|
||||
const double* hormone_levels,
|
||||
int data_length,
|
||||
const double** factor_data,
|
||||
const char** factor_names,
|
||||
int factor_count,
|
||||
const char* hormone_name,
|
||||
double min_optimal_level,
|
||||
double max_optimal_level);
|
||||
void free_hormone_impact_analysis(HormoneImpactAnalysis* analysis);
|
||||
}
|
||||
// Utility functions
|
||||
void normalize_data(const double* data, int length, double minValue, double maxValue, double* normalizedData);
|
||||
int detect_change_points(const double* data, int length, double threshold, int* changePoints, int maxChangePoints);
|
||||
void optimize_svr_parameters(const double** x_data, const double* y_data, int length,
|
||||
double& bestC, double& bestEpsilon, double& bestGamma, double& bestScore);
|
||||
|
||||
#endif // HEALTH_ANALYTICS_ENGINE_H
|
native/statistics/impact_analysis.cpp (new file, 361 lines)
@@ -0,0 +1,361 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
||||
// SPDX-License-Identifier: AGPL-3.0
|
||||
//
|
||||
// impact_analysis.cpp
|
||||
// Implementation of factor impact and medication analysis functions
|
||||
//
|
||||
#include "health_analytics_engine.h"
|
||||
#include "utils.h"
|
||||
/**
|
||||
* @brief Rank factors by their impact on a target variable
|
||||
*
|
||||
* @param factors Array of factor time series
|
||||
* @param target Target variable time series
|
||||
* @param factorNames Array of factor names
|
||||
* @param factorCount Number of factors
|
||||
* @param dataLength Length of time series
|
||||
* @return FactorImpactResult* Array of factor impact results
|
||||
*/
|
||||
FactorImpactResult* rank_factor_impacts(const double** factors,
|
||||
const double* target,
|
||||
const char** factorNames,
|
||||
int factorCount,
|
||||
int dataLength) {
|
||||
  if (factorCount <= 0 || dataLength <= 2) {
    FactorImpactResult* dummy = new FactorImpactResult[1];
    memset(dummy, 0, sizeof(FactorImpactResult));
    dummy[0].factorIndex = -1; // Mark as invalid
    return dummy;
  }

  // Allocate space for results (plus one for terminator)
  FactorImpactResult* results = new FactorImpactResult[factorCount + 1];

  // Calculate correlation matrix for all factors + target
  std::vector<std::vector<double>> corrMatrix(factorCount + 1, std::vector<double>(factorCount + 1, 0));

  for (int i = 0; i < factorCount; i++) {
    // Correlation between factor i and target
    corrMatrix[i][factorCount] = calculateCorrelation(factors[i], target, dataLength);
    corrMatrix[factorCount][i] = corrMatrix[i][factorCount];

    // Correlations between factors
    for (int j = i + 1; j < factorCount; j++) {
      corrMatrix[i][j] = calculateCorrelation(factors[i], factors[j], dataLength);
      corrMatrix[j][i] = corrMatrix[i][j];
    }

    // Self-correlation is 1
    corrMatrix[i][i] = 1.0;
  }
  corrMatrix[factorCount][factorCount] = 1.0;

  // Calculate the impact of each factor
  for (int i = 0; i < factorCount; i++) {
    FactorImpactResult& impact = results[i];

    // Direct effect is correlation with target
    double directEffect = corrMatrix[i][factorCount];

    // Calculate indirect effects through other factors
    double indirectEffect = 0;
    for (int j = 0; j < factorCount; j++) {
      if (j != i) {
        // Indirect effect through factor j
        indirectEffect += corrMatrix[i][j] * corrMatrix[j][factorCount];
      }
    }
    // Normalize indirect effect
    indirectEffect /= std::max(1, factorCount - 1);

    // Calculate partial correlation (direct effect controlling for other factors)
    // This is a simplified approach - real implementation would use matrix operations
    double partialCorr = directEffect;
    if (factorCount > 1) {
      double sumControlledVar = 0;
      for (int j = 0; j < factorCount; j++) {
        if (j != i) {
          // Remove effect of factor j from both target and factor i
          double controlEffect = corrMatrix[i][j] * corrMatrix[j][factorCount];
          partialCorr -= controlEffect / (factorCount - 1);
          sumControlledVar += corrMatrix[i][j] * corrMatrix[i][j];
        }
      }
      sumControlledVar /= (factorCount - 1);
      // Normalize partial correlation
      if (sumControlledVar < 0.98) { // Avoid division by near-zero
        partialCorr /= sqrt((1 - sumControlledVar));
      }
    }

    // Calculate total impact score
    impact.factorIndex = i;
    impact.directEffect = directEffect;
    impact.indirectEffect = indirectEffect;

    // Total impact is weighted sum of direct and partial correlation
    double partialWeight = 0.7; // Weight more toward direct unique contribution
    impact.impactScore = partialWeight * std::abs(partialCorr) + (1 - partialWeight) * std::abs(directEffect);

    // Calculate confidence based on correlation strength and sample size
    double t_stat = std::abs(directEffect) * std::sqrt((dataLength - 2) / (1 - directEffect * directEffect));
    double p_value = 2 * (1 - std::min(1.0, std::exp(-0.717 * t_stat - 0.416 * t_stat * t_stat)));
    impact.confidence = std::min(0.95, (1.0 - p_value) * (1.0 - 1.0 / std::sqrt(dataLength)));

    // Copy factor name
    strncpy(impact.factorName, factorNames[i], MAX_STRING_SIZE - 1);
    impact.factorName[MAX_STRING_SIZE - 1] = '\0';

    // Generate mechanism description based on direct and indirect effects
    const char* direction = (directEffect > 0) ? "positive" : "negative";
    const char* strength =
        (std::abs(directEffect) > 0.7) ? "strong" :
        (std::abs(directEffect) > 0.4) ? "moderate" : "weak";

    // Check for mediation effects
    bool hasMediationEffect = std::abs(indirectEffect) > 0.2 &&
                              std::abs(indirectEffect) > std::abs(directEffect) * 0.5;

    if (hasMediationEffect) {
      snprintf(impact.mechanism, MAX_STRING_SIZE,
               "%s %s impact: %s affects the target both directly (%.2f) and through other factors (%.2f)",
               strength, direction, factorNames[i], directEffect, indirectEffect);
    } else {
      snprintf(impact.mechanism, MAX_STRING_SIZE,
               "%s %s impact: changes in %s are associated with changes in the target (r=%.2f)",
               strength, direction, factorNames[i], directEffect);
    }
  }

  // Sort by impact score (descending)
  std::sort(results, results + factorCount,
            [](const FactorImpactResult& a, const FactorImpactResult& b) {
              return a.impactScore > b.impactScore;
            });

  // Mark the end of valid results
  results[factorCount].factorIndex = -1;

  return results;
}

/**
 * @brief Free memory for factor impact results
 *
 * @param results Pointer to factor impact results array
 */
void free_factor_impact_results(FactorImpactResult* results) {
  delete[] results;
}

/**
 * @brief Analyze the impact of medication on a health metric
 *
 * @param before_data Values before medication
 * @param before_length Length of before data
 * @param after_data Values after medication
 * @param after_length Length of after data
 * @param medication_name Name of the medication
 * @param factor_name Name of the health factor
 * @return MedicationImpactAnalysis* Pointer to impact analysis result
 */
MedicationImpactAnalysis* analyze_medication_impact(
    const double* before_data,
    int before_length,
    const double* after_data,
    int after_length,
    const char* medication_name,
    const char* factor_name) {

  MedicationImpactAnalysis* result = new MedicationImpactAnalysis();
  memset(result, 0, sizeof(MedicationImpactAnalysis));

  // Copy names
  strncpy(result->medicationName, medication_name, MAX_STRING_SIZE-1);
  strncpy(result->factorName, factor_name, MAX_STRING_SIZE-1);

  // Return if insufficient data
  if (before_length < 5 || after_length < 5) {
    strncpy(result->description, "Insufficient data for analysis", MAX_STRING_SIZE-1);
    return result;
  }

  // Calculate means
  double beforeSum = 0, afterSum = 0;
  for (int i = 0; i < before_length; i++) {
    beforeSum += before_data[i];
  }
  for (int i = 0; i < after_length; i++) {
    afterSum += after_data[i];
  }

  result->beforeMean = beforeSum / before_length;
  result->afterMean = afterSum / after_length;

  // Calculate change magnitude
  result->changeMagnitude = result->afterMean - result->beforeMean;

  // Calculate significance with simple t-test
  double beforeVar = 0, afterVar = 0;
  for (int i = 0; i < before_length; i++) {
    double diff = before_data[i] - result->beforeMean;
    beforeVar += diff * diff;
  }
  for (int i = 0; i < after_length; i++) {
    double diff = after_data[i] - result->afterMean;
    afterVar += diff * diff;
  }

  beforeVar /= (before_length - 1);
  afterVar /= (after_length - 1);

  double se = sqrt(beforeVar/before_length + afterVar/after_length);
  double tStat = fabs(result->changeMagnitude) / (se + 0.0001);

  // Simple significance estimation (0-1)
  result->changeSignificance = std::min(1.0, tStat / 5.0);

  // Overall impact combines magnitude and significance
  result->overallImpact = fabs(result->changeMagnitude) * result->changeSignificance;

  // Estimate days to effect (placeholder implementation)
  result->daysToEffect = 7; // Assumed 1 week

  // Generate description
  const char* direction = (result->changeMagnitude > 0) ? "increased" : "decreased";
  const char* significance = (result->changeSignificance > 0.7) ? "significant" :
                             (result->changeSignificance > 0.3) ? "moderate" : "slight";

  snprintf(result->description, MAX_STRING_SIZE,
           "%s shows a %s %s effect on %s (%.1f → %.1f)",
           medication_name, significance, direction, factor_name,
           result->beforeMean, result->afterMean);

  return result;
}
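
// Illustrative usage sketch (not part of the library): how a caller might
// compare a metric before and after starting a medication. The sample arrays
// and names below are hypothetical; only the signature above is assumed.
//
//   double before[7] = {6.1, 5.8, 6.0, 6.3, 5.9, 6.2, 6.0};
//   double after[7]  = {7.0, 7.2, 6.9, 7.4, 7.1, 7.3, 7.2};
//   MedicationImpactAnalysis* mi =
//       analyze_medication_impact(before, 7, after, 7, "Sertraline", "Sleep quality");
//   printf("%s\n", mi->description);
//   free_medication_impact_analysis(mi);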

/**
 * @brief Free memory for medication impact analysis
 *
 * @param analysis Pointer to medication impact analysis
 */
void free_medication_impact_analysis(MedicationImpactAnalysis* analysis) {
  delete analysis;
}

/**
 * @brief Analyze hormone impact on health metrics
 *
 * @param hormone_levels Array of hormone levels
 * @param data_length Length of hormone data
 * @param factor_data Array of factor data arrays
 * @param factor_names Array of factor names
 * @param factor_count Number of factors
 * @param hormone_name Name of the hormone
 * @param min_optimal_level Lower bound of optimal range
 * @param max_optimal_level Upper bound of optimal range
 * @return HormoneImpactAnalysis* Pointer to impact analysis result
 */
HormoneImpactAnalysis* analyze_hormone_impact(
    const double* hormone_levels,
    int data_length,
    const double** factor_data,
    const char** factor_names,
    int factor_count,
    const char* hormone_name,
    double min_optimal_level,
    double max_optimal_level) {

  HormoneImpactAnalysis* result = new HormoneImpactAnalysis();
  memset(result, 0, sizeof(HormoneImpactAnalysis));

  // Copy hormone name
  strncpy(result->hormoneName, hormone_name, MAX_STRING_SIZE-1);

  // Return if insufficient data
  if (data_length < 3 || factor_count <= 0) {
    strncpy(result->description, "Insufficient data for analysis", MAX_STRING_SIZE-1);
    return result;
  }

  // Calculate current hormone level (average of recent readings)
  double sum = 0;
  for (int i = 0; i < data_length; i++) {
    sum += hormone_levels[i];
  }
  result->currentLevel = sum / data_length;

  // Set optimal levels
  result->optimalRangeLower = min_optimal_level;
  result->optimalRangeUpper = max_optimal_level;
  result->optimalLevel = (min_optimal_level + max_optimal_level) / 2;

  // Calculate deviation from optimal range
  if (result->currentLevel < min_optimal_level) {
    result->deviation = (result->currentLevel - min_optimal_level) / min_optimal_level;
  } else if (result->currentLevel > max_optimal_level) {
    result->deviation = (result->currentLevel - max_optimal_level) / max_optimal_level;
  } else {
    result->deviation = 0; // Within optimal range
  }

  // Calculate correlations with factors
  for (int i = 0; i < factor_count && i < 50; i++) {
    // Calculate correlation
    double correlation = 0;
    double sum_xy = 0, sum_x2 = 0, sum_y2 = 0;
    double sum_x = 0, sum_y = 0;

    for (int j = 0; j < data_length; j++) {
      sum_x += hormone_levels[j];
      sum_y += factor_data[i][j];
      sum_xy += hormone_levels[j] * factor_data[i][j];
      sum_x2 += hormone_levels[j] * hormone_levels[j];
      sum_y2 += factor_data[i][j] * factor_data[i][j];
    }

    double n = data_length;
    double denominator = sqrt((n * sum_x2 - sum_x * sum_x) * (n * sum_y2 - sum_y * sum_y));

    if (denominator > 0) {
      correlation = (n * sum_xy - sum_x * sum_y) / denominator;
    }

    // Store impact and factor name
    result->impactOnOtherFactors[i] = correlation;
    strncpy(result->factorNames[i], factor_names[i], MAX_STRING_SIZE-1);

    // Set impact on mood and energy if found
    if (strcmp(factor_names[i], "Mood") == 0) {
      result->impactOnMood = correlation;
    } else if (strcmp(factor_names[i], "Energy") == 0) {
      result->impactOnEnergy = correlation;
    }
  }

  // Generate description
  const char* status;
  if (fabs(result->deviation) < 0.1) {
    status = "within optimal range";
  } else if (result->deviation < 0) {
    status = "below optimal range";
  } else {
    status = "above optimal range";
  }

  snprintf(result->description, MAX_STRING_SIZE,
           "%s level is %s (%.1f, range: %.1f-%.1f)",
           hormone_name, status, result->currentLevel,
           min_optimal_level, max_optimal_level);

  return result;
}

/**
 * @brief Free memory for hormone impact analysis
 *
 * @param analysis Pointer to hormone impact analysis
 */
void free_hormone_impact_analysis(HormoneImpactAnalysis* analysis) {
  delete analysis;
}
272
native/statistics/time_series.cpp
Normal file
@@ -0,0 +1,272 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// time_series.cpp
// Implementation of time series analysis functions
//
#include "health_analytics_engine.h"
#include "utils.h"

/**
 * @brief Detect trends in time series data
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param strength Output parameter for trend strength
 * @return TrendType Enum indicating trend direction and type
 */
TrendType detect_trend(const double* values, int length, double* strength) {
  if (length < 3) {
    *strength = 0;
    return TREND_NONE;
  }

  // Generate time vector (0, 1, 2, ...)
  std::vector<double> time(length);
  for (int i = 0; i < length; i++) {
    time[i] = i;
  }

  // Calculate linear regression
  double slope, intercept, r_squared;
  if (!calculateLinearRegression(time.data(), values, length, slope, intercept, r_squared)) {
    *strength = 0;
    return TREND_NONE;
  }

  // Detrend the data for further analysis
  std::vector<double> detrended(length);
  for (int i = 0; i < length; i++) {
    detrended[i] = values[i] - (intercept + slope * i);
  }

  // Check for cyclical patterns using autocorrelation
  bool has_cycle = false;
  int cycle_length = 0;
  double max_autocorr = 0;

  // Check autocorrelation for various lags
  const int MIN_LAG = 2;
  const int MAX_LAG = length / 3; // Look for cycles up to 1/3 of series length

  for (int lag = MIN_LAG; lag < MAX_LAG; lag++) {
    double autocorr = calculateAutocorrelation(detrended.data(), length, lag);

    // If strong positive autocorrelation found
    if (autocorr > 0.3 && autocorr > max_autocorr) {
      max_autocorr = autocorr;
      cycle_length = lag;
      has_cycle = true;
    }
  }

  // Check if cycle pattern is stronger than linear trend
  if (has_cycle && max_autocorr > std::abs(r_squared)) {
    *strength = max_autocorr;
    return TREND_CYCLIC;
  }

  // Determine trend direction based on slope and strength
  *strength = std::abs(r_squared);

  // Require minimum strength to declare a trend
  if (*strength < 0.2) {
    return TREND_NONE;
  } else if (slope > 0) {
    return TREND_INCREASING;
  } else {
    return TREND_DECREASING;
  }
}
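
// Illustrative usage sketch (not part of the library): reading the trend of a
// short daily series. The data values below are hypothetical.
//
//   double hr[10] = {62, 63, 61, 64, 65, 66, 65, 67, 68, 69};
//   double strength = 0;
//   TrendType t = detect_trend(hr, 10, &strength);
//   // t == TREND_INCREASING here, since the fitted slope is positive and R² >= 0.2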

/**
 * @brief Predict future values of a time series using ARIMA-like approach
 *
 * @param timeSeries Time series data
 * @param dataLength Length of time series
 * @param stepsAhead Number of future steps to predict
 * @param factorName Name of the factor being predicted
 * @return TimeSeriesForecast Structure containing predictions and confidence intervals
 */
TimeSeriesForecast predict_time_series(const double* timeSeries,
                                       int dataLength,
                                       int stepsAhead,
                                       const char* factorName) {
  TimeSeriesForecast forecast;
  memset(&forecast, 0, sizeof(TimeSeriesForecast));

  if (dataLength < 5 || stepsAhead <= 0) {
    forecast.overallConfidence = 0;
    return forecast;
  }

  // Copy factor name
  strncpy(forecast.factorName, factorName, MAX_STRING_SIZE - 1);
  forecast.factorName[MAX_STRING_SIZE - 1] = '\0';

  // First, check for seasonality
  int potentialSeasonality = 0;
  double maxAutocorr = 0;

  // Look for seasonality in range 2 to dataLength/3
  for (int lag = 2; lag <= dataLength/3; lag++) {
    double acf = calculateAutocorrelation(timeSeries, dataLength, lag);
    if (acf > 0.3 && acf > maxAutocorr) {
      maxAutocorr = acf;
      potentialSeasonality = lag;
    }
  }

  // Set seasonality period if detected
  forecast.seasonalityPeriod = potentialSeasonality;

  // Decompose time series if seasonality detected
  std::vector<double> trend(dataLength);
  std::vector<double> seasonal(dataLength);
  std::vector<double> residual(dataLength);

  bool hasSeasonality = potentialSeasonality > 0 && maxAutocorr > 0.3;

  if (hasSeasonality) {
    // Decompose the time series
    decomposeTimeSeries(timeSeries, dataLength, potentialSeasonality,
                        trend.data(), seasonal.data(), residual.data());
  } else {
    // No seasonality, just use simple moving average for trend
    calculateMovingAverage(timeSeries, dataLength, std::min(7, dataLength/3), trend.data());

    // No seasonal component
    for (int i = 0; i < dataLength; i++) {
      seasonal[i] = 0;
      residual[i] = timeSeries[i] - trend[i];
    }
  }

  // Fit AR model to residuals for short-term dynamics
  // Determine optimal AR order using PACF
  int maxLag = std::min(10, dataLength/5);
  std::vector<double> pacf(maxLag + 1);
  calculatePACF(residual.data(), dataLength, maxLag, pacf.data());

  // Find significant AR terms (PACF > 0.2)
  std::vector<int> significantLags;
  for (int i = 1; i <= maxLag; i++) {
    if (std::abs(pacf[i]) > 0.2) {
      significantLags.push_back(i);
    }
  }

  // Limit to 3 most significant terms
  if (significantLags.size() > 3) {
    std::sort(significantLags.begin(), significantLags.end(),
              [&pacf](int a, int b) {
                return std::abs(pacf[a]) > std::abs(pacf[b]);
              });
    significantLags.resize(3);
  }

  // Fit AR coefficients using linear regression
  int arOrder = significantLags.size();
  std::vector<double> arCoefficients(arOrder, 0);

  if (arOrder > 0) {
    // Prepare training data for AR model
    int trainingSize = dataLength - significantLags.back();
    std::vector<std::vector<double>> X(trainingSize, std::vector<double>(arOrder));
    std::vector<double> y(trainingSize);

    for (int i = 0; i < trainingSize; i++) {
      int t = i + significantLags.back();
      y[i] = residual[t];

      for (int j = 0; j < arOrder; j++) {
        X[i][j] = residual[t - significantLags[j]];
      }
    }

    // Very simplified AR coefficient estimation
    // Real implementation would use matrix operations
    for (int j = 0; j < arOrder; j++) {
      double sumXY = 0, sumX2 = 0;
      for (int i = 0; i < trainingSize; i++) {
        sumXY += X[i][j] * y[i];
        sumX2 += X[i][j] * X[i][j];
      }
      if (sumX2 > 0) {
        arCoefficients[j] = sumXY / sumX2;
      }
    }
  }

  // Set time unit (days by default)
  forecast.timeUnit = TIME_UNIT_DAYS;

  // Generate forecasts
  double trendGrowth = 0;
  if (dataLength > 10) {
    // Calculate average trend growth over last 10 points
    trendGrowth = (trend[dataLength-1] - trend[dataLength-11]) / 10.0;
  }

  // Last observed values
  std::vector<double> lastResiduals(dataLength);
  for (int i = 0; i < dataLength; i++) {
    lastResiduals[i] = residual[i];
  }

  // Generate predictions
  for (int i = 0; i < stepsAhead && i < 30; i++) {
    // Forecast trend component
    double trendForecast = trend[dataLength-1] + trendGrowth * (i + 1);

    // Forecast seasonal component (if any)
    double seasonalForecast = 0;
    if (hasSeasonality && potentialSeasonality > 0) {
      seasonalForecast = seasonal[dataLength - potentialSeasonality + (i % potentialSeasonality)];
    }

    // Forecast residual component using AR model
    double residualForecast = 0;
    for (int j = 0; j < arOrder; j++) {
      int lag = significantLags[j];
      if (i >= lag) {
        // Use previously forecasted residuals
        residualForecast += arCoefficients[j] * lastResiduals[dataLength + i - lag];
      } else {
        // Use observed residuals
        residualForecast += arCoefficients[j] * residual[dataLength - lag + i];
      }
    }

    // Store forecasted residual
    lastResiduals.push_back(residualForecast);

    // Combine components for final forecast
    forecast.predictions[i] = trendForecast + seasonalForecast + residualForecast;

    // Calculate confidence intervals (widen with forecast horizon)
    double stdError = 0;
    for (int j = 0; j < dataLength; j++) {
      stdError += residual[j] * residual[j];
    }
    stdError = sqrt(stdError / dataLength);

    // Wider intervals for longer forecasts
    double multiplier = 1.96 * sqrt(1.0 + 0.25 * i); // Roughly 95% CI with growing uncertainty

    forecast.confidenceIntervals[i][0] = forecast.predictions[i] - multiplier * stdError;
    forecast.confidenceIntervals[i][1] = forecast.predictions[i] + multiplier * stdError;
  }

  // Set overall confidence based on model quality and forecast distance
  double modelAccuracy = 0.8; // Would be calculated from validation in real model
  if (hasSeasonality) modelAccuracy += 0.1;
  if (arOrder > 0) modelAccuracy += 0.1 * std::min(arOrder, 2);

  forecast.overallConfidence = modelAccuracy * exp(-0.05 * stepsAhead);
  if (forecast.overallConfidence > 0.95) forecast.overallConfidence = 0.95;
  if (forecast.overallConfidence < 0.2) forecast.overallConfidence = 0.2;

  return forecast;
}
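
// Illustrative usage sketch (not part of the library): forecasting one week
// ahead. The input array name is hypothetical; predictions per call are capped
// at 30 steps in the loop above.
//
//   TimeSeriesForecast f = predict_time_series(dailyMood, 60, 7, "Mood");
//   for (int i = 0; i < 7; i++) {
//     printf("day +%d: %.2f [%.2f, %.2f]\n", i + 1, f.predictions[i],
//            f.confidenceIntervals[i][0], f.confidenceIntervals[i][1]);
//   }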
863
native/statistics/utils.cpp
Normal file
@@ -0,0 +1,863 @@
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// utils.cpp
// Core utility functions used across the health analytics library
//
#include "health_analytics_engine.h"
#include "utils.h"
#include <random>
#include <limits>
#include <numeric>

/**
 * @brief Calculate the mean (average) of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @return double The arithmetic mean
 */
double calculateMean(const double* values, int length) {
  if (length <= 0) return 0;

  double sum = 0;
  for (int i = 0; i < length; i++) {
    sum += values[i];
  }
  return sum / length;
}

/**
 * @brief Calculate the weighted mean of a data series
 *
 * @param values Pointer to array of values
 * @param weights Pointer to array of weights for each value
 * @param length Number of elements in the arrays
 * @return double The weighted arithmetic mean
 */
double calculateWeightedMean(const double* values, const double* weights, int length) {
  if (length <= 0) return 0;

  double sum = 0;
  double weightSum = 0;

  for (int i = 0; i < length; i++) {
    sum += values[i] * weights[i];
    weightSum += weights[i];
  }

  return weightSum > 0 ? sum / weightSum : 0;
}

/**
 * @brief Calculate the variance of a data series
 * Uses a two-pass algorithm for numerical stability
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @return double The sample variance (n-1 denominator)
 */
double calculateVariance(const double* values, int length, double mean = 0) {
  if (length <= 1) return 0;

  // Use pre-calculated mean if provided, otherwise calculate it
  if (mean == 0) {
    mean = calculateMean(values, length);
  }

  // Use two-pass algorithm for better numerical stability
  double sumSquaredDiff = 0;
  for (int i = 0; i < length; i++) {
    double diff = values[i] - mean;
    sumSquaredDiff += diff * diff;
  }

  // Return sample variance (n-1 denominator for unbiased estimation)
  return sumSquaredDiff / (length - 1);
}

/**
 * @brief Calculate the standard deviation of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @return double The standard deviation
 */
double calculateStdDev(const double* values, int length, double mean = 0) {
  return std::sqrt(calculateVariance(values, length, mean));
}

/**
 * @brief Calculate the median of a data series
 *
 * @param values Pointer to array of values (a sorted copy is used; the input is not modified)
 * @param length Number of elements in the array
 * @return double The median value
 */
double calculateMedian(double* values, int length) {
  if (length == 0) return 0;
  if (length == 1) return values[0];

  // Create a copy and sort it
  std::vector<double> sorted(values, values + length);
  std::sort(sorted.begin(), sorted.end());

  if (length % 2 == 0) {
    // Even number of elements
    return (sorted[length/2 - 1] + sorted[length/2]) / 2.0;
  } else {
    // Odd number of elements
    return sorted[length/2];
  }
}

/**
 * @brief Calculate the Pearson correlation coefficient between two data series
 *
 * @param x First data series
 * @param y Second data series (must be same length as x)
 * @param length Number of elements in both arrays
 * @return double Correlation coefficient (-1 to 1)
 */
double calculateCorrelation(const double* x, const double* y, int length) {
  if (length <= 1) return 0;

  double sum_x = 0, sum_y = 0, sum_xy = 0;
  double sum_x2 = 0, sum_y2 = 0;

  for (int i = 0; i < length; i++) {
    sum_x += x[i];
    sum_y += y[i];
    sum_xy += x[i] * y[i];
    sum_x2 += x[i] * x[i];
    sum_y2 += y[i] * y[i];
  }

  double denominator = std::sqrt((length * sum_x2 - sum_x * sum_x) *
                                 (length * sum_y2 - sum_y * sum_y));

  if (denominator < 1e-10) return 0; // Avoid division by zero

  return (length * sum_xy - sum_x * sum_y) / denominator;
}

/**
 * @brief Calculate Spearman's rank correlation coefficient
 * More robust to outliers than Pearson correlation
 *
 * @param x First data series
 * @param y Second data series (must be same length as x)
 * @param length Number of elements in both arrays
 * @return double Spearman's rank correlation coefficient (-1 to 1)
 */
double calculateSpearmanCorrelation(const double* x, const double* y, int length) {
  if (length <= 1) return 0;

  // Create vectors with indices to perform ranking
  std::vector<std::pair<double, int>> x_indexed(length);
  std::vector<std::pair<double, int>> y_indexed(length);

  for (int i = 0; i < length; i++) {
    x_indexed[i] = std::make_pair(x[i], i);
    y_indexed[i] = std::make_pair(y[i], i);
  }

  // Sort by values to determine ranks
  std::sort(x_indexed.begin(), x_indexed.end());
  std::sort(y_indexed.begin(), y_indexed.end());

  // Assign ranks (handling ties with average rank)
  std::vector<double> x_ranks(length), y_ranks(length);

  for (int i = 0; i < length; i++) {
    int j = i;
    while (j < length - 1 && x_indexed[j].first == x_indexed[j + 1].first) j++;
    double rank = 1.0 * (i + j) / 2 + 1;
    for (int k = i; k <= j; k++) {
      x_ranks[x_indexed[k].second] = rank;
    }
    i = j;
  }

  for (int i = 0; i < length; i++) {
    int j = i;
    while (j < length - 1 && y_indexed[j].first == y_indexed[j + 1].first) j++;
    double rank = 1.0 * (i + j) / 2 + 1;
    for (int k = i; k <= j; k++) {
      y_ranks[y_indexed[k].second] = rank;
    }
    i = j;
  }

  // Calculate Pearson correlation on the ranks
  double* x_ranks_ptr = x_ranks.data();
  double* y_ranks_ptr = y_ranks.data();
  return calculateCorrelation(x_ranks_ptr, y_ranks_ptr, length);
}

/**
 * @brief Calculate a specific quantile value of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param q Quantile to calculate (0-1, e.g., 0.25 for first quartile)
 * @return double The value at the specified quantile
 */
double calculateQuantile(const double* values, int length, double q) {
  if (length == 0) return 0;
  if (length == 1) return values[0];
  if (q < 0) q = 0;
  if (q > 1) q = 1;

  std::vector<double> sorted(values, values + length);
  std::sort(sorted.begin(), sorted.end());

  // Linear interpolation between closest ranks
  double pos = (length - 1) * q;
  int idx_lower = static_cast<int>(pos);
  double frac = pos - idx_lower;

  if (idx_lower + 1 < length) {
    return sorted[idx_lower] * (1 - frac) + sorted[idx_lower + 1] * frac;
  } else {
    return sorted[idx_lower];
  }
}

/**
 * @brief Calculate the interquartile range (IQR) of a data series
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @return double The IQR (Q3-Q1)
 */
double calculateIQR(const double* values, int length) {
  if (length < 4) return 0;

  double q1 = calculateQuantile(values, length, 0.25);
  double q3 = calculateQuantile(values, length, 0.75);

  return q3 - q1;
}

/**
 * @brief Calculate the skewness of a data distribution
 * Measures the asymmetry of the probability distribution
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
 * @return double The skewness value (0 for normal distribution)
 */
double calculateSkewness(const double* values, int length, double mean = 0, double stdDev = 0) {
  if (length <= 2) return 0;

  // Calculate mean and stdDev if not provided
  if (mean == 0) {
    mean = calculateMean(values, length);
  }

  if (stdDev == 0) {
    stdDev = calculateStdDev(values, length, mean);
  }

  if (stdDev < 1e-10) return 0; // Avoid division by zero

  // Calculate third moment (cube of differences)
  double sum = 0;
  for (int i = 0; i < length; i++) {
    double diff = values[i] - mean;
    sum += diff * diff * diff;
  }

  // Return Fisher-Pearson coefficient of skewness
  // Includes adjustment for sample bias
  double n = length;
  double adjustment = std::sqrt(n * (n - 1)) / (n - 2);
  return adjustment * sum / (length * stdDev * stdDev * stdDev);
}

/**
 * @brief Calculate the kurtosis of a data distribution
 * Measures the "tailedness" of the probability distribution
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param mean Pre-calculated mean (if available, otherwise pass 0)
 * @param stdDev Pre-calculated standard deviation (if available, otherwise pass 0)
 * @return double The excess kurtosis (0 for normal distribution)
 */
double calculateKurtosis(const double* values, int length, double mean = 0, double stdDev = 0) {
  if (length <= 3) return 0;

  // Calculate mean and stdDev if not provided
  if (mean == 0) {
    mean = calculateMean(values, length);
  }

  if (stdDev == 0) {
    stdDev = calculateStdDev(values, length, mean);
  }

  if (stdDev < 1e-10) return 0; // Avoid division by zero

  // Calculate fourth moment
  double sum = 0;
  for (int i = 0; i < length; i++) {
    double diff = values[i] - mean;
    sum += diff * diff * diff * diff;
  }

  // Return excess kurtosis with sample adjustment
  double n = length;
  double adjustment = ((n + 1) * n) / ((n - 1) * (n - 2) * (n - 3));
  double second_term = 3 * (n - 1) * (n - 1) / ((n - 2) * (n - 3));
  return adjustment * sum / (stdDev * stdDev * stdDev * stdDev) - second_term;
}

/**
 * @brief Perform linear regression on two data series
 *
 * @param x Independent variable values
 * @param y Dependent variable values (must be same length as x)
 * @param length Number of elements in both arrays
 * @param slope Output parameter for slope
 * @param intercept Output parameter for y-intercept
 * @param r_squared Output parameter for R² coefficient of determination
 * @return bool True if successful, false if error occurred
 */
bool calculateLinearRegression(const double* x, const double* y, int length,
                               double& slope, double& intercept, double& r_squared) {
  if (length < 2) return false;

  double sum_x = 0, sum_y = 0, sum_xy = 0, sum_x2 = 0, sum_y2 = 0;

  for (int i = 0; i < length; i++) {
    sum_x += x[i];
    sum_y += y[i];
    sum_xy += x[i] * y[i];
    sum_x2 += x[i] * x[i];
    sum_y2 += y[i] * y[i];
  }

  double n = static_cast<double>(length);
  double denominator = n * sum_x2 - sum_x * sum_x;

  if (std::abs(denominator) < 1e-10) return false; // Vertical line, undefined slope

  // Calculate slope and intercept
  slope = (n * sum_xy - sum_x * sum_y) / denominator;
  intercept = (sum_y - slope * sum_x) / n;

  // Calculate R² coefficient of determination
  double mean_y = sum_y / n;
  double ss_total = 0, ss_residual = 0;

  for (int i = 0; i < length; i++) {
    double predicted = intercept + slope * x[i];
    ss_total += (y[i] - mean_y) * (y[i] - mean_y);
    ss_residual += (y[i] - predicted) * (y[i] - predicted);
  }

  if (ss_total < 1e-10) {
    r_squared = 1.0; // All points are on the same horizontal line
  } else {
    r_squared = 1.0 - (ss_residual / ss_total);
  }

  return true;
}
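
// Worked check (illustrative only): for x = {0,1,2,3} and y = {1,3,5,7} the sums are
// Σx = 6, Σy = 16, Σxy = 34, Σx² = 14, so slope = (4·34 − 6·16)/(4·14 − 36) = 40/20 = 2
// and intercept = (16 − 2·6)/4 = 1, with R² = 1 because the fit is exact.
//
//   double x[4] = {0, 1, 2, 3}, y[4] = {1, 3, 5, 7};
//   double m, b, r2;
//   calculateLinearRegression(x, y, 4, m, b, r2);  // m == 2, b == 1, r2 == 1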

/**
 * @brief Calculate the autocorrelation of a time series at specified lag
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param lag The lag to calculate autocorrelation for
 * @return double Autocorrelation coefficient at specified lag (-1 to 1)
 */
double calculateAutocorrelation(const double* values, int length, int lag) {
  if (length <= lag || lag <= 0) return 0;

  double mean = calculateMean(values, length);
  double numerator = 0;
  double denominator = 0;

  for (int i = 0; i < length - lag; i++) {
    numerator += (values[i] - mean) * (values[i + lag] - mean);
  }

  for (int i = 0; i < length; i++) {
    denominator += (values[i] - mean) * (values[i] - mean);
  }

  if (denominator < 1e-10) return 0;

  return numerator / denominator;
}

/**
 * @brief Detect outliers in a data series using modified Z-score method
 *
 * @param values Pointer to array of values
 * @param length Number of elements in the array
 * @param outlierIndices Output vector to store indices of detected outliers
 * @param threshold Z-score threshold to consider a point an outlier (typically 3.5)
 * @return int Number of outliers detected
 */
int detectOutliers(const double* values, int length, std::vector<int>& outlierIndices, double threshold = 3.5) {
  if (length < 3) return 0;

  outlierIndices.clear();

  // Use median and MAD instead of mean and std dev for robustness
  std::vector<double> sorted(values, values + length);
  std::sort(sorted.begin(), sorted.end());

  double median = (length % 2 == 0) ?
      (sorted[length/2 - 1] + sorted[length/2]) / 2.0 : sorted[length/2];

  // Calculate MAD (Median Absolute Deviation)
  std::vector<double> deviations(length);
  for (int i = 0; i < length; i++) {
    deviations[i] = std::abs(values[i] - median);
  }
  std::sort(deviations.begin(), deviations.end());

  double mad = (length % 2 == 0) ?
      (deviations[length/2 - 1] + deviations[length/2]) / 2.0 : deviations[length/2];

  // Scale factor relating MAD to the standard deviation of normal data
  const double k = 1.4826;

  // Find outliers using modified Z-score
  for (int i = 0; i < length; i++) {
    if (mad < 1e-10) { // If MAD is too small, use simple difference
      if (std::abs(values[i] - median) > threshold) {
        outlierIndices.push_back(i);
      }
    } else {
      // Estimated z-score: |x - median| divided by the MAD-based sigma estimate (k * MAD)
      double modified_z = std::abs(values[i] - median) / (k * mad);
      if (modified_z > threshold) {
        outlierIndices.push_back(i);
      }
    }
  }

  return outlierIndices.size();
}
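
// Illustrative usage sketch (not part of the library): flagging a single spike.
// The sample values below are hypothetical.
//
//   double readings[8] = {5.0, 5.1, 4.9, 5.2, 5.0, 12.0, 5.1, 4.8};
//   std::vector<int> idx;
//   int n = detectOutliers(readings, 8, idx, 3.5);  // expect n == 1, idx == {5}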

/**
 * @brief Perform simple moving average on a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param window The window size for the moving average
 * @param result Pre-allocated array to store results (size = length)
 */
void calculateMovingAverage(const double* values, int length, int window, double* result) {
  if (length <= 0 || window <= 0) return;

  // Adjust window if it's larger than the data length
  window = std::min(window, length);

  for (int i = 0; i < length; i++) {
    int start = std::max(0, i - window + 1);
    int end = i + 1;
    int count = end - start;

    double sum = 0;
    for (int j = start; j < end; j++) {
      sum += values[j];
    }

    result[i] = sum / count;
  }
}

/**
 * @brief Calculate exponential moving average (EMA) of a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param alpha Smoothing factor (0-1)
 * @param result Pre-allocated array to store results (size = length)
 */
void calculateExponentialMovingAverage(const double* values, int length, double alpha, double* result) {
  if (length <= 0 || alpha < 0 || alpha > 1) return;

  // Initialize with first value
  result[0] = values[0];

  // Apply EMA formula: EMA_t = α × value_t + (1 - α) × EMA_{t-1}
  for (int i = 1; i < length; i++) {
    result[i] = alpha * values[i] + (1 - alpha) * result[i - 1];
  }
}

/**
 * @brief Decompose a time series into trend, seasonal, and residual components
 * Implementation of STL (Seasonal and Trend decomposition using Loess)
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param seasonality Length of seasonal cycle (e.g., 7 for weekly, 12 for monthly)
 * @param trend Output array for trend component (size = length)
 * @param seasonal Output array for seasonal component (size = length)
 * @param residual Output array for residual component (size = length)
 * @return bool True if successful, false if error occurred
 */
bool decomposeTimeSeries(const double* values, int length, int seasonality,
                         double* trend, double* seasonal, double* residual) {
  if (length <= 2 * seasonality || seasonality <= 1) return false;

  // Calculate trend with centered moving average
  for (int i = 0; i < length; i++) {
    trend[i] = 0;
  }

  int halfSeason = seasonality / 2;
  // Centered moving average for trend
  for (int i = halfSeason; i < length - halfSeason; i++) {
    double sum = 0;
    for (int j = i - halfSeason; j <= i + halfSeason; j++) {
      sum += values[j];
    }
    trend[i] = sum / seasonality;
  }

  // Extrapolate trend at boundaries
  // Left boundary
  double slope = (trend[halfSeason + 5] - trend[halfSeason]) / 5;
  for (int i = 0; i < halfSeason; i++) {
    trend[i] = trend[halfSeason] - (halfSeason - i) * slope;
  }

  // Right boundary
  slope = (trend[length - halfSeason - 1] - trend[length - halfSeason - 6]) / 5;
  for (int i = length - halfSeason; i < length; i++) {
    trend[i] = trend[length - halfSeason - 1] + (i - (length - halfSeason - 1)) * slope;
  }

  // Calculate detrended series
  std::vector<double> detrended(length);
  for (int i = 0; i < length; i++) {
    detrended[i] = values[i] - trend[i];
  }

  // Calculate seasonal component by averaging the detrended values across seasons
  std::vector<double> seasonalAvg(seasonality, 0);
  std::vector<int> seasonalCounts(seasonality, 0);

  for (int i = 0; i < length; i++) {
    int seasonalIndex = i % seasonality;
    seasonalAvg[seasonalIndex] += detrended[i];
    seasonalCounts[seasonalIndex]++;
  }

  for (int i = 0; i < seasonality; i++) {
    if (seasonalCounts[i] > 0) {
      seasonalAvg[i] /= seasonalCounts[i];
    }
  }

  // Normalize seasonal component to sum to zero
  double avgSeasonal = 0;
  for (int i = 0; i < seasonality; i++) {
    avgSeasonal += seasonalAvg[i];
  }
  avgSeasonal /= seasonality;

  for (int i = 0; i < seasonality; i++) {
    seasonalAvg[i] -= avgSeasonal;
  }

  // Apply seasonal component to entire series
  for (int i = 0; i < length; i++) {
    seasonal[i] = seasonalAvg[i % seasonality];
  }

  // Calculate residual component
  for (int i = 0; i < length; i++) {
    residual[i] = values[i] - trend[i] - seasonal[i];
  }

  return true;
}
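
// Illustrative usage sketch (not part of the library): splitting a daily series
// with a weekly cycle into components. The array names below are hypothetical.
//
//   std::vector<double> tr(n), se(n), re(n);
//   if (decomposeTimeSeries(series, n, 7, tr.data(), se.data(), re.data())) {
//     // series[i] == tr[i] + se[i] + re[i] for every i, by construction
//   }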

/**
 * @brief Calculate partial autocorrelation function for a time series
 *
 * @param values Time series data
 * @param length Number of elements in the array
 * @param maxLag Maximum lag to calculate
 * @param pacf Pre-allocated array to store results (size = maxLag + 1)
 * @return int Number of valid PACF values calculated
 */
int calculatePACF(const double* values, int length, int maxLag, double* pacf) {
  if (length <= 1 || maxLag <= 0 || maxLag >= length) return 0;

  // Allocate Yule-Walker matrices
  std::vector<std::vector<double>> phi(maxLag + 1, std::vector<double>(maxLag + 1, 0));

  // Calculate autocorrelations
  std::vector<double> acf(maxLag + 1, 0);
  acf[0] = 1.0; // ACF at lag 0 is always 1

  for (int k = 1; k <= maxLag; k++) {
    acf[k] = calculateAutocorrelation(values, length, k);
  }

  // Set PACF at lag 0 to 1
  pacf[0] = 1.0;

  // Calculate PACF using Levinson-Durbin recursion
  for (int k = 1; k <= maxLag; k++) {
    // Initialize for this order
    double numerator = acf[k];
    for (int j = 1; j < k; j++) {
      numerator -= phi[k-1][j] * acf[k-j];
    }

    double denominator = 1.0;
    for (int j = 1; j < k; j++) {
      denominator -= phi[k-1][j] * acf[j];
    }

    if (std::abs(denominator) < 1e-10) {
      // If denominator is close to zero, set PACF to 0
      phi[k][k] = 0;
    } else {
      phi[k][k] = numerator / denominator;
    }

    // Update remaining coefficients
    for (int j = 1; j < k; j++) {
      phi[k][j] = phi[k-1][j] - phi[k][k] * phi[k-1][k-j];
    }

    // Store the PACF value
    pacf[k] = phi[k][k];
  }

  return maxLag + 1;
}

/**
 * @brief Perform k-means clustering on multivariate data
 *
 * @param data 2D array of data points [n_samples x n_features]
 * @param nSamples Number of data points
 * @param nFeatures Number of features per data point
 * @param k Number of clusters
 * @param maxIter Maximum number of iterations
 * @param centroids Output array for cluster centroids [k x n_features]
 * @param assignments Output array for cluster assignments [n_samples]
 * @return int Number of iterations performed
 */
int kMeansClustering(const double** data, int nSamples, int nFeatures, int k,
                     int maxIter, double** centroids, int* assignments) {
  if (nSamples < k || k <= 0 || nFeatures <= 0) return 0;

  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> distrib(0, nSamples - 1);

  // Initialize centroids using k-means++ initialization
  std::vector<int> centroidIndices;
  std::vector<double> minDistances(nSamples, std::numeric_limits<double>::max());

  // Choose first centroid randomly
  int firstCentroid = distrib(gen);
  centroidIndices.push_back(firstCentroid);

  // Choose remaining centroids
  for (int c = 1; c < k; c++) {
    // Update distances to nearest centroid
    for (int i = 0; i < nSamples; i++) {
      double dist = 0;
      for (int j = 0; j < nFeatures; j++) {
        double diff = data[i][j] - data[centroidIndices.back()][j];
        dist += diff * diff;
      }
      minDistances[i] = std::min(minDistances[i], dist);
    }

    // Calculate sum of squared distances
    double sumSquaredDist = 0;
    for (int i = 0; i < nSamples; i++) {
      sumSquaredDist += minDistances[i];
    }

    // Choose next centroid with probability proportional to D²
    double threshold = (sumSquaredDist * static_cast<double>(rand()) / RAND_MAX);
    double cumulativeProb = 0;
    int nextCentroid = 0;

    for (int i = 0; i < nSamples; i++) {
      cumulativeProb += minDistances[i];
      if (cumulativeProb >= threshold) {
        nextCentroid = i;
        break;
      }
    }

    centroidIndices.push_back(nextCentroid);
  }

  // Copy initial centroids
  for (int i = 0; i < k; i++) {
    for (int j = 0; j < nFeatures; j++) {
      centroids[i][j] = data[centroidIndices[i]][j];
    }
  }

  // Perform k-means iterations
  int iterations = 0;
  bool converged = false;

  while (!converged && iterations < maxIter) {
    // Assign points to nearest centroid
    converged = true;

    for (int i = 0; i < nSamples; i++) {
      double minDist = std::numeric_limits<double>::max();
      int bestCluster = 0;

      for (int c = 0; c < k; c++) {
        double dist = 0;
        for (int j = 0; j < nFeatures; j++) {
          double diff = data[i][j] - centroids[c][j];
          dist += diff * diff;
        }

        if (dist < minDist) {
          minDist = dist;
          bestCluster = c;
        }
      }

      if (assignments[i] != bestCluster) {
        assignments[i] = bestCluster;
        converged = false;
      }
    }

    // Update centroids
    std::vector<std::vector<double>> newCentroids(k, std::vector<double>(nFeatures, 0));
    std::vector<int> clusterSizes(k, 0);

    for (int i = 0; i < nSamples; i++) {
      int cluster = assignments[i];
      clusterSizes[cluster]++;

      for (int j = 0; j < nFeatures; j++) {
        newCentroids[cluster][j] += data[i][j];
      }
    }

    for (int c = 0; c < k; c++) {
      if (clusterSizes[c] > 0) {
        for (int j = 0; j < nFeatures; j++) {
          centroids[c][j] = newCentroids[c][j] / clusterSizes[c];
        }
      }
    }

    iterations++;
  }

  return iterations;
}

/**
 * @brief Calculate the silhouette coefficient for clustering validation
 *
 * @param data 2D array of data points [n_samples x n_features]
 * @param nSamples Number of data points
 * @param nFeatures Number of features per data point
 * @param assignments Cluster assignments for each point
 * @param k Number of clusters
 * @return double Average silhouette coefficient (-1 to 1)
 */
double calculateSilhouetteCoefficient(const double** data, int nSamples, int nFeatures,
                                      const int* assignments, int k) {
  if (nSamples <= k || k <= 1) return 0;

  std::vector<double> silhouettes(nSamples);

  // For each point
  for (int i = 0; i < nSamples; i++) {
    int cluster_i = assignments[i];

    // Calculate a(i) - average distance to points in same cluster
    double a_i = 0;
    int count_same_cluster = 0;

    for (int j = 0; j < nSamples; j++) {
      if (j != i && assignments[j] == cluster_i) {
        double dist = 0;
        for (int f = 0; f < nFeatures; f++) {
          double diff = data[i][f] - data[j][f];
          dist += diff * diff;
        }
        dist = std::sqrt(dist);

        a_i += dist;
        count_same_cluster++;
      }
    }

    if (count_same_cluster > 0) {
      a_i /= count_same_cluster;
    } else {
      a_i = 0; // Singleton cluster
    }

    // Calculate b(i) - minimum average distance to points in different clusters
    double b_i = std::numeric_limits<double>::max();

    for (int c = 0; c < k; c++) {
      if (c == cluster_i) continue;

      double avg_dist = 0;
      int count_diff_cluster = 0;

      for (int j = 0; j < nSamples; j++) {
        if (assignments[j] == c) {
          double dist = 0;
          for (int f = 0; f < nFeatures; f++) {
            double diff = data[i][f] - data[j][f];
            dist += diff * diff;
          }
          dist = std::sqrt(dist);

          avg_dist += dist;
          count_diff_cluster++;
        }
      }

      if (count_diff_cluster > 0) {
        avg_dist /= count_diff_cluster;
        b_i = std::min(b_i, avg_dist);
      }
    }

    // Calculate silhouette
    if (count_same_cluster > 0 && b_i < std::numeric_limits<double>::max()) {
      silhouettes[i] = (b_i - a_i) / std::max(a_i, b_i);
    } else {
      silhouettes[i] = 0; // Handle edge cases
    }
  }

  // Calculate average silhouette
  double avg_silhouette = 0;
  for (int i = 0; i < nSamples; i++) {
    avg_silhouette += silhouettes[i];
  }

  return avg_silhouette / nSamples;
}
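
// Illustrative usage sketch (not part of the library): clustering nSamples points
// with nFeatures features into k groups and scoring the split. Buffer setup is the
// caller's responsibility; the names below are hypothetical.
//
//   std::vector<int> assignments(nSamples, 0);
//   // centroids must be a pre-allocated k x nFeatures array of rows
//   int iters = kMeansClustering(data, nSamples, nFeatures, k, 100,
//                                centroids, assignments.data());
//   double quality = calculateSilhouetteCoefficient(data, nSamples, nFeatures,
//                                                   assignments.data(), k);
//   // quality near 1 means tight, well-separated clusters; near 0 means overlap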
32
native/statistics/utils.h
Normal file
@@ -0,0 +1,32 @@
// utils.h
#ifndef UTILS_H
#define UTILS_H

#include "health_analytics_engine.h"
#include <random>
#include <limits>
#include <numeric>

// Declare all utility functions from utils.cpp
double calculateMean(const double* values, int length);
double calculateWeightedMean(const double* values, const double* weights, int length);
double calculateVariance(const double* values, int length, double mean);
double calculateStdDev(const double* values, int length, double mean);
double calculateMedian(double* values, int length);
double calculateCorrelation(const double* x, const double* y, int length);
double calculateSpearmanCorrelation(const double* x, const double* y, int length);
double calculateQuantile(const double* values, int length, double q);
double calculateIQR(const double* values, int length);
double calculateSkewness(const double* values, int length, double mean, double stdDev);
double calculateKurtosis(const double* values, int length, double mean, double stdDev);
bool calculateLinearRegression(const double* x, const double* y, int length, double& slope, double& intercept, double& r_squared);
double calculateAutocorrelation(const double* values, int length, int lag);
int detectOutliers(const double* values, int length, std::vector<int>& outlierIndices, double threshold);
void calculateMovingAverage(const double* values, int length, int window, double* result);
void calculateExponentialMovingAverage(const double* values, int length, double alpha, double* result);
bool decomposeTimeSeries(const double* values, int length, int seasonality, double* trend, double* seasonal, double* residual);
int calculatePACF(const double* values, int length, int maxLag, double* pacf);
int kMeansClustering(const double** data, int nSamples, int nFeatures, int k, int maxIter, double** centroids, int* assignments);
double calculateSilhouetteCoefficient(const double** data, int nSamples, int nFeatures, const int* assignments, int k);

#endif // UTILS_H