// SPDX-FileCopyrightText: © 2025 Nøkken.io // SPDX-License-Identifier: AGPL-3.0 // // anomaly_detection.cpp // Implementation of anomaly and outlier detection functions // #include "health_analytics_engine.h" /** * @brief Detect anomalies in time series data * * @param timeSeries Time series data * @param dataLength Length of time series * @param threshold Z-score threshold to consider a point an anomaly * @param dates Array of dates corresponding to time series points * @param factorName Name of the factor being analyzed * @return AnomalyResult* Array of detected anomalies */ AnomalyResult* detect_anomalies(const double* timeSeries, int dataLength, double threshold, const DateStruct* dates, const char* factorName) { if (dataLength < 5) { AnomalyResult* dummy = new AnomalyResult[1]; memset(dummy, 0, sizeof(AnomalyResult)); dummy[0].dataPointIndex = -1; // Mark as invalid return dummy; } // Constants const int MAX_RESULTS = 100; // Allocate space for results AnomalyResult* results = new AnomalyResult[MAX_RESULTS + 1]; int resultCount = 0; // Decompose time series if enough data std::vector trend(dataLength); std::vector seasonal(dataLength); std::vector residual(dataLength); bool hasDecomposition = false; int seasonality = 0; // Try to detect seasonality for decomposition double maxAutocorr = 0; for (int lag = 2; lag <= dataLength/3; lag++) { double acf = calculateAutocorrelation(timeSeries, dataLength, lag); if (acf > 0.3 && acf > maxAutocorr) { maxAutocorr = acf; seasonality = lag; } } if (seasonality >= 2) { hasDecomposition = decomposeTimeSeries(timeSeries, dataLength, seasonality, trend.data(), seasonal.data(), residual.data()); } if (!hasDecomposition) { // Simple moving average for trend if decomposition failed int windowSize = std::min(7, dataLength/3); if (windowSize < 2) windowSize = 2; calculateMovingAverage(timeSeries, dataLength, windowSize, trend.data()); // Residuals = original - trend for (int i = 0; i < dataLength; i++) { seasonal[i] = 0; // No seasonal component residual[i] = timeSeries[i] - trend[i]; } } // Calculate residual statistics for outlier detection double mean = 0, sumSquared = 0; for (int i = 0; i < dataLength; i++) { mean += residual[i]; } mean /= dataLength; for (int i = 0; i < dataLength; i++) { double diff = residual[i] - mean; sumSquared += diff * diff; } double stdDev = sqrt(sumSquared / dataLength); if (stdDev <= 0) { results[0].dataPointIndex = -1; return results; } // Detect global outliers using z-score for (int i = 0; i < dataLength && resultCount < MAX_RESULTS; i++) { double zScore = (residual[i] - mean) / stdDev; if (std::abs(zScore) > threshold) { AnomalyResult& anomaly = results[resultCount++]; anomaly.dataPointIndex = i; anomaly.anomalyScore = std::abs(zScore); anomaly.originalValue = timeSeries[i]; anomaly.expectedValue = trend[i] + seasonal[i] + mean; // Higher confidence for more extreme anomalies anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(zScore) - threshold) / 5.0); // Copy date if available if (dates != nullptr) { anomaly.date = dates[i]; } else { memset(&anomaly.date, 0, sizeof(DateStruct)); } // Copy factor name if (factorName != nullptr) { strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1); anomaly.factorName[MAX_STRING_SIZE - 1] = '\0'; } else { anomaly.factorName[0] = '\0'; } // Set anomaly type anomaly.anomalyType = ANOMALY_OUTLIER; // Generate description if (zScore > 0) { snprintf(anomaly.description, MAX_STRING_SIZE, "Unusually high value (%.2f standard deviations above expected)", zScore); } else { snprintf(anomaly.description, MAX_STRING_SIZE, "Unusually low value (%.2f standard deviations below expected)", -zScore); } } } // Detect context-based anomalies using local statistics const int LOCAL_WINDOW = std::min(7, dataLength/5); if (LOCAL_WINDOW >= 3) { for (int i = LOCAL_WINDOW; i < dataLength - LOCAL_WINDOW && resultCount < MAX_RESULTS; i++) { // Calculate local statistics double localSum = 0, localSumSquared = 0; for (int j = i - LOCAL_WINDOW; j <= i + LOCAL_WINDOW; j++) { if (j != i) { // Exclude the point itself localSum += timeSeries[j]; localSumSquared += timeSeries[j] * timeSeries[j]; } } double localMean = localSum / (2 * LOCAL_WINDOW); double localVar = localSumSquared / (2 * LOCAL_WINDOW) - localMean * localMean; double localStdDev = sqrt(std::max(localVar, 1e-6)); // Prevent division by zero // Calculate local z-score double localZScore = (timeSeries[i] - localMean) / localStdDev; // Check if it's a local anomaly but not already a global anomaly if (std::abs(localZScore) > threshold * 1.2) { // Check if this point was already detected as a global anomaly bool alreadyDetected = false; for (int j = 0; j < resultCount; j++) { if (results[j].dataPointIndex == i) { alreadyDetected = true; break; } } if (!alreadyDetected) { AnomalyResult& anomaly = results[resultCount++]; anomaly.dataPointIndex = i; anomaly.anomalyScore = std::abs(localZScore); anomaly.originalValue = timeSeries[i]; anomaly.expectedValue = localMean; anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(localZScore) - threshold) / 5.0); // Copy date if available if (dates != nullptr) { anomaly.date = dates[i]; } else { memset(&anomaly.date, 0, sizeof(DateStruct)); } // Copy factor name if (factorName != nullptr) { strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1); anomaly.factorName[MAX_STRING_SIZE - 1] = '\0'; } else { anomaly.factorName[0] = '\0'; } // Set anomaly type anomaly.anomalyType = ANOMALY_CONTEXTUAL; // Generate description if (localZScore > 0) { snprintf(anomaly.description, MAX_STRING_SIZE, "Context anomaly: value is high compared to local neighborhood (%.2f local std dev)", localZScore); } else { snprintf(anomaly.description, MAX_STRING_SIZE, "Context anomaly: value is low compared to local neighborhood (%.2f local std dev)", -localZScore); } } } } } // Detect trend changes if (dataLength >= 10) { for (int i = 5; i < dataLength - 5 && resultCount < MAX_RESULTS; i++) { // Calculate slope before and after double slopeBefore = 0, slopeAfter = 0; double interceptBefore = 0, interceptAfter = 0; double r2Before = 0, r2After = 0; // Create time vectors std::vector time1(5), time2(5); std::vector values1(5), values2(5); for (int j = 0; j < 5; j++) { time1[j] = j; time2[j] = j; values1[j] = timeSeries[i - 5 + j]; values2[j] = timeSeries[i + j]; } bool validBefore = calculateLinearRegression(time1.data(), values1.data(), 5, slopeBefore, interceptBefore, r2Before); bool validAfter = calculateLinearRegression(time2.data(), values2.data(), 5, slopeAfter, interceptAfter, r2After); if (validBefore && validAfter) { // Check for significant change in slope double slopeChange = slopeAfter - slopeBefore; double meanSlope = (std::abs(slopeBefore) + std::abs(slopeAfter)) / 2; if (meanSlope > 0 && std::abs(slopeChange) / meanSlope > 0.5) { AnomalyResult& anomaly = results[resultCount++]; anomaly.dataPointIndex = i; anomaly.anomalyScore = std::abs(slopeChange) / (meanSlope + 1e-6); anomaly.originalValue = timeSeries[i]; anomaly.expectedValue = timeSeries[i]; // Same value, change is in the trend anomaly.confidence = 0.5 + 0.5 * std::min(1.0, anomaly.anomalyScore / 2.0); // Copy date if available if (dates != nullptr) { anomaly.date = dates[i]; } else { memset(&anomaly.date, 0, sizeof(DateStruct)); } // Copy factor name if (factorName != nullptr) { strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1); anomaly.factorName[MAX_STRING_SIZE - 1] = '\0'; } else { anomaly.factorName[0] = '\0'; } // Set anomaly type anomaly.anomalyType = ANOMALY_TREND_CHANGE; // Generate description if (slopeBefore < 0 && slopeAfter > 0) { snprintf(anomaly.description, MAX_STRING_SIZE, "Trend reversal: changed from decreasing (%.2f/day) to increasing (%.2f/day)", -slopeBefore, slopeAfter); } else if (slopeBefore > 0 && slopeAfter < 0) { snprintf(anomaly.description, MAX_STRING_SIZE, "Trend reversal: changed from increasing (%.2f/day) to decreasing (%.2f/day)", slopeBefore, -slopeAfter); } else if (slopeAfter > slopeBefore) { snprintf(anomaly.description, MAX_STRING_SIZE, "Trend acceleration: rate of change increased from %.2f/day to %.2f/day", slopeBefore, slopeAfter); } else { snprintf(anomaly.description, MAX_STRING_SIZE, "Trend deceleration: rate of change decreased from %.2f/day to %.2f/day", slopeBefore, slopeAfter); } } } } } // Sort anomalies by score (most significant first) std::sort(results, results + resultCount, [](const AnomalyResult& a, const AnomalyResult& b) { return a.anomalyScore > b.anomalyScore; }); // Mark the end of valid results results[resultCount].dataPointIndex = -1; return results; } /** * @brief Free memory for anomaly detection results * * @param results Pointer to anomaly results array */ void free_anomaly_results(AnomalyResult* results) { delete[] results; } /** * @brief Analyze patterns related to dates (e.g., day of week effects) * * @param values Array of values * @param dates Array of corresponding dates * @param data_length Length of the arrays * @param factor_name Name of the factor being analyzed * @return DatePatternResult* Array of detected patterns */ DatePatternResult* analyze_date_patterns( const double* values, const DateStruct* dates, int data_length, const char* factor_name) { // Allocate space for results (3 patterns max + terminator) DatePatternResult* results = new DatePatternResult[4]; memset(results, 0, 4 * sizeof(DatePatternResult)); // Initialize terminator results[0].patternType = PATTERN_NONE; // Return empty result for insufficient data if (data_length < 14) { return results; } // Count occurrences by day of week double dayOfWeekSums[7] = {0}; int dayOfWeekCounts[7] = {0}; for (int i = 0; i < data_length; i++) { // Convert date to day of week (0 = Sunday, 6 = Saturday) // This is a simplified calculation and might need adjustment int year = dates[i].year; int month = dates[i].month; int day = dates[i].day; // Zeller's congruence for finding day of week if (month < 3) { month += 12; year--; } int h = (day + (13 * (month + 1)) / 5 + year + year / 4 - year / 100 + year / 400) % 7; // Convert to 0-based where 0 is Sunday int dayOfWeek = (h + 6) % 7; dayOfWeekSums[dayOfWeek] += values[i]; dayOfWeekCounts[dayOfWeek]++; } // Calculate average by day of week double dayOfWeekAvgs[7] = {0}; for (int i = 0; i < 7; i++) { if (dayOfWeekCounts[i] > 0) { dayOfWeekAvgs[i] = dayOfWeekSums[i] / dayOfWeekCounts[i]; } } // Find peak day int peakDay = 0; double peakValue = dayOfWeekAvgs[0]; for (int i = 1; i < 7; i++) { if (dayOfWeekAvgs[i] > peakValue) { peakValue = dayOfWeekAvgs[i]; peakDay = i; } } // Calculate variance to determine if there's a weekly pattern double mean = 0; for (int i = 0; i < 7; i++) { if (dayOfWeekCounts[i] > 0) { mean += dayOfWeekAvgs[i]; } } mean /= 7; double variance = 0; for (int i = 0; i < 7; i++) { if (dayOfWeekCounts[i] > 0) { double diff = dayOfWeekAvgs[i] - mean; variance += diff * diff; } } variance /= 7; // Calculate pattern strength double strength = std::min(1.0, variance / (mean * mean + 0.001)); // Only report if pattern is significant if (strength > 0.1) { results[0].patternType = PATTERN_WEEKLY; results[0].periodicity = 7; results[0].strength = strength; results[0].peakDayOfWeek = peakDay; // Copy peak values for (int i = 0; i < 7; i++) { results[0].peakValues[i] = dayOfWeekAvgs[i]; } // Generate description const char* dayNames[] = {"Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"}; snprintf(results[0].description, MAX_STRING_SIZE, "Weekly pattern detected with peak on %s (strength: %.2f)", dayNames[peakDay], strength); } return results; } /** * @brief Free memory for date pattern results * * @param results Pointer to date pattern results array */ void free_date_pattern_results(DatePatternResult* results) { delete[] results; } /** * @brief Analyze cyclical patterns in time series data * * @param values Array of values * @param dates Array of corresponding dates * @param data_length Length of the arrays * @param factor_name Name of the factor being analyzed * @return CycleAnalysisResult Structure containing cycle analysis results */ CycleAnalysisResult analyze_cycles( const double* values, const DateStruct* dates, int data_length, const char* factor_name) { CycleAnalysisResult result; memset(&result, 0, sizeof(CycleAnalysisResult)); // Minimum data points required for cycle analysis if (data_length < 20) { strncpy(result.description, "Insufficient data for cycle analysis", MAX_STRING_SIZE-1); return result; } // Simple autocorrelation-based cycle detection // Find the peak in autocorrelation function after lag 0 int maxLag = data_length / 3; // Look for cycles up to 1/3 of data length double maxCorr = 0; int bestLag = 0; for (int lag = 2; lag < maxLag; lag++) { double sum = 0; double count = 0; for (int i = 0; i < data_length - lag; i++) { sum += values[i] * values[i + lag]; count++; } double corr = sum / count; if (corr > maxCorr) { maxCorr = corr; bestLag = lag; } } // Calculate average cycle length in days if (bestLag > 0 && maxCorr > 0.2) { result.cycleLength = bestLag; result.amplitude = maxCorr; result.confidence = maxCorr; result.cycleLengthVariance = bestLag * 0.2; // Simple estimation snprintf(result.description, MAX_STRING_SIZE, "Cycle detected with approximate length of %d days (confidence: %.2f)", bestLag, maxCorr); } else { strncpy(result.description, "No significant cycle detected", MAX_STRING_SIZE-1); } return result; }