nokken/native/statistics/anomaly_detection.cpp
2025-04-20 11:17:03 -04:00

494 lines
No EOL
13 KiB
C++

// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// anomaly_detection.cpp
// Implementation of anomaly and outlier detection functions
//
#include "health_analytics_engine.h"
/**
* @brief Detect anomalies in time series data
*
* @param timeSeries Time series data
* @param dataLength Length of time series
* @param threshold Z-score threshold to consider a point an anomaly
* @param dates Array of dates corresponding to time series points
* @param factorName Name of the factor being analyzed
* @return AnomalyResult* Array of detected anomalies
*/
AnomalyResult* detect_anomalies(const double* timeSeries,
int dataLength,
double threshold,
const DateStruct* dates,
const char* factorName) {
if (dataLength < 5) {
AnomalyResult* dummy = new AnomalyResult[1];
memset(dummy, 0, sizeof(AnomalyResult));
dummy[0].dataPointIndex = -1; // Mark as invalid
return dummy;
}
// Constants
const int MAX_RESULTS = 100;
// Allocate space for results
AnomalyResult* results = new AnomalyResult[MAX_RESULTS + 1];
int resultCount = 0;
// Decompose time series if enough data
std::vector<double> trend(dataLength);
std::vector<double> seasonal(dataLength);
std::vector<double> residual(dataLength);
bool hasDecomposition = false;
int seasonality = 0;
// Try to detect seasonality for decomposition
double maxAutocorr = 0;
for (int lag = 2; lag <= dataLength/3; lag++) {
double acf = calculateAutocorrelation(timeSeries, dataLength, lag);
if (acf > 0.3 && acf > maxAutocorr) {
maxAutocorr = acf;
seasonality = lag;
}
}
if (seasonality >= 2) {
hasDecomposition = decomposeTimeSeries(timeSeries, dataLength, seasonality,
trend.data(), seasonal.data(), residual.data());
}
if (!hasDecomposition) {
// Simple moving average for trend if decomposition failed
int windowSize = std::min(7, dataLength/3);
if (windowSize < 2) windowSize = 2;
calculateMovingAverage(timeSeries, dataLength, windowSize, trend.data());
// Residuals = original - trend
for (int i = 0; i < dataLength; i++) {
seasonal[i] = 0; // No seasonal component
residual[i] = timeSeries[i] - trend[i];
}
}
// Calculate residual statistics for outlier detection
double mean = 0, sumSquared = 0;
for (int i = 0; i < dataLength; i++) {
mean += residual[i];
}
mean /= dataLength;
for (int i = 0; i < dataLength; i++) {
double diff = residual[i] - mean;
sumSquared += diff * diff;
}
double stdDev = sqrt(sumSquared / dataLength);
if (stdDev <= 0) {
results[0].dataPointIndex = -1;
return results;
}
// Detect global outliers using z-score
for (int i = 0; i < dataLength && resultCount < MAX_RESULTS; i++) {
double zScore = (residual[i] - mean) / stdDev;
if (std::abs(zScore) > threshold) {
AnomalyResult& anomaly = results[resultCount++];
anomaly.dataPointIndex = i;
anomaly.anomalyScore = std::abs(zScore);
anomaly.originalValue = timeSeries[i];
anomaly.expectedValue = trend[i] + seasonal[i] + mean;
// Higher confidence for more extreme anomalies
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(zScore) - threshold) / 5.0);
// Copy date if available
if (dates != nullptr) {
anomaly.date = dates[i];
} else {
memset(&anomaly.date, 0, sizeof(DateStruct));
}
// Copy factor name
if (factorName != nullptr) {
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
} else {
anomaly.factorName[0] = '\0';
}
// Set anomaly type
anomaly.anomalyType = ANOMALY_OUTLIER;
// Generate description
if (zScore > 0) {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Unusually high value (%.2f standard deviations above expected)",
zScore);
} else {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Unusually low value (%.2f standard deviations below expected)",
-zScore);
}
}
}
// Detect context-based anomalies using local statistics
const int LOCAL_WINDOW = std::min(7, dataLength/5);
if (LOCAL_WINDOW >= 3) {
for (int i = LOCAL_WINDOW; i < dataLength - LOCAL_WINDOW && resultCount < MAX_RESULTS; i++) {
// Calculate local statistics
double localSum = 0, localSumSquared = 0;
for (int j = i - LOCAL_WINDOW; j <= i + LOCAL_WINDOW; j++) {
if (j != i) { // Exclude the point itself
localSum += timeSeries[j];
localSumSquared += timeSeries[j] * timeSeries[j];
}
}
double localMean = localSum / (2 * LOCAL_WINDOW);
double localVar = localSumSquared / (2 * LOCAL_WINDOW) - localMean * localMean;
double localStdDev = sqrt(std::max(localVar, 1e-6)); // Prevent division by zero
// Calculate local z-score
double localZScore = (timeSeries[i] - localMean) / localStdDev;
// Check if it's a local anomaly but not already a global anomaly
if (std::abs(localZScore) > threshold * 1.2) {
// Check if this point was already detected as a global anomaly
bool alreadyDetected = false;
for (int j = 0; j < resultCount; j++) {
if (results[j].dataPointIndex == i) {
alreadyDetected = true;
break;
}
}
if (!alreadyDetected) {
AnomalyResult& anomaly = results[resultCount++];
anomaly.dataPointIndex = i;
anomaly.anomalyScore = std::abs(localZScore);
anomaly.originalValue = timeSeries[i];
anomaly.expectedValue = localMean;
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(localZScore) - threshold) / 5.0);
// Copy date if available
if (dates != nullptr) {
anomaly.date = dates[i];
} else {
memset(&anomaly.date, 0, sizeof(DateStruct));
}
// Copy factor name
if (factorName != nullptr) {
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
} else {
anomaly.factorName[0] = '\0';
}
// Set anomaly type
anomaly.anomalyType = ANOMALY_CONTEXTUAL;
// Generate description
if (localZScore > 0) {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Context anomaly: value is high compared to local neighborhood (%.2f local std dev)",
localZScore);
} else {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Context anomaly: value is low compared to local neighborhood (%.2f local std dev)",
-localZScore);
}
}
}
}
}
// Detect trend changes
if (dataLength >= 10) {
for (int i = 5; i < dataLength - 5 && resultCount < MAX_RESULTS; i++) {
// Calculate slope before and after
double slopeBefore = 0, slopeAfter = 0;
double interceptBefore = 0, interceptAfter = 0;
double r2Before = 0, r2After = 0;
// Create time vectors
std::vector<double> time1(5), time2(5);
std::vector<double> values1(5), values2(5);
for (int j = 0; j < 5; j++) {
time1[j] = j;
time2[j] = j;
values1[j] = timeSeries[i - 5 + j];
values2[j] = timeSeries[i + j];
}
bool validBefore = calculateLinearRegression(time1.data(), values1.data(), 5,
slopeBefore, interceptBefore, r2Before);
bool validAfter = calculateLinearRegression(time2.data(), values2.data(), 5,
slopeAfter, interceptAfter, r2After);
if (validBefore && validAfter) {
// Check for significant change in slope
double slopeChange = slopeAfter - slopeBefore;
double meanSlope = (std::abs(slopeBefore) + std::abs(slopeAfter)) / 2;
if (meanSlope > 0 && std::abs(slopeChange) / meanSlope > 0.5) {
AnomalyResult& anomaly = results[resultCount++];
anomaly.dataPointIndex = i;
anomaly.anomalyScore = std::abs(slopeChange) / (meanSlope + 1e-6);
anomaly.originalValue = timeSeries[i];
anomaly.expectedValue = timeSeries[i]; // Same value, change is in the trend
anomaly.confidence = 0.5 + 0.5 * std::min(1.0, anomaly.anomalyScore / 2.0);
// Copy date if available
if (dates != nullptr) {
anomaly.date = dates[i];
} else {
memset(&anomaly.date, 0, sizeof(DateStruct));
}
// Copy factor name
if (factorName != nullptr) {
strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
} else {
anomaly.factorName[0] = '\0';
}
// Set anomaly type
anomaly.anomalyType = ANOMALY_TREND_CHANGE;
// Generate description
if (slopeBefore < 0 && slopeAfter > 0) {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Trend reversal: changed from decreasing (%.2f/day) to increasing (%.2f/day)",
-slopeBefore, slopeAfter);
} else if (slopeBefore > 0 && slopeAfter < 0) {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Trend reversal: changed from increasing (%.2f/day) to decreasing (%.2f/day)",
slopeBefore, -slopeAfter);
} else if (slopeAfter > slopeBefore) {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Trend acceleration: rate of change increased from %.2f/day to %.2f/day",
slopeBefore, slopeAfter);
} else {
snprintf(anomaly.description, MAX_STRING_SIZE,
"Trend deceleration: rate of change decreased from %.2f/day to %.2f/day",
slopeBefore, slopeAfter);
}
}
}
}
}
// Sort anomalies by score (most significant first)
std::sort(results, results + resultCount,
[](const AnomalyResult& a, const AnomalyResult& b) {
return a.anomalyScore > b.anomalyScore;
});
// Mark the end of valid results
results[resultCount].dataPointIndex = -1;
return results;
}
/**
 * @brief Release an anomaly-result array returned by detect_anomalies().
 *
 * Safe to call with nullptr (no-op), matching delete[] semantics.
 *
 * @param results Pointer to anomaly results array (ownership transferred here).
 */
void free_anomaly_results(AnomalyResult* results) {
    if (results == nullptr) {
        return; // nothing to release
    }
    delete[] results;
}
/**
 * @brief Analyze patterns related to dates (e.g., day of week effects).
 *
 * Buckets values by day of week (via Zeller's congruence), then reports a
 * weekly pattern when the between-day variance of the averages is large
 * relative to the overall mean.
 *
 * Fixes vs previous version: the mean/variance of the per-day averages are
 * now divided by the number of weekdays that actually have observations
 * (previously a hard-coded 7, which skewed the strength when fewer than
 * seven weekdays were represented), and the peak-day search only considers
 * weekdays with data (previously an empty day's 0.0 average could win when
 * all observed averages were negative).
 *
 * @param values      Array of values.
 * @param dates       Array of corresponding dates.
 * @param data_length Length of the arrays (< 14 returns an empty result).
 * @param factor_name Name of the factor being analyzed (currently unused).
 * @return DatePatternResult* Array of detected patterns, terminated by an
 *         entry with patternType == PATTERN_NONE. Caller frees with
 *         free_date_pattern_results().
 */
DatePatternResult* analyze_date_patterns(
    const double* values,
    const DateStruct* dates,
    int data_length,
    const char* factor_name) {
    // Allocate space for results (3 patterns max + terminator)
    DatePatternResult* results = new DatePatternResult[4];
    memset(results, 0, 4 * sizeof(DatePatternResult));
    // Initialize terminator
    results[0].patternType = PATTERN_NONE;
    // Return empty result for insufficient data
    if (data_length < 14) {
        return results;
    }
    // Accumulate sums and counts by day of week (0 = Sunday, 6 = Saturday).
    double dayOfWeekSums[7] = {0};
    int dayOfWeekCounts[7] = {0};
    for (int i = 0; i < data_length; i++) {
        int year = dates[i].year;
        int month = dates[i].month;
        int day = dates[i].day;
        // Zeller's congruence (Gregorian): January/February are treated as
        // months 13/14 of the previous year.
        if (month < 3) {
            month += 12;
            year--;
        }
        int h = (day + (13 * (month + 1)) / 5 + year + year / 4 - year / 100 + year / 400) % 7;
        // Zeller yields 0 = Saturday; shift so 0 = Sunday.
        int dayOfWeek = (h + 6) % 7;
        dayOfWeekSums[dayOfWeek] += values[i];
        dayOfWeekCounts[dayOfWeek]++;
    }
    // Calculate average by day of week
    double dayOfWeekAvgs[7] = {0};
    for (int i = 0; i < 7; i++) {
        if (dayOfWeekCounts[i] > 0) {
            dayOfWeekAvgs[i] = dayOfWeekSums[i] / dayOfWeekCounts[i];
        }
    }
    // Find peak day, considering only weekdays that have observations.
    int peakDay = -1;
    double peakValue = 0;
    for (int i = 0; i < 7; i++) {
        if (dayOfWeekCounts[i] > 0 && (peakDay < 0 || dayOfWeekAvgs[i] > peakValue)) {
            peakValue = dayOfWeekAvgs[i];
            peakDay = i;
        }
    }
    // Compute mean and variance of the per-day averages over the weekdays
    // that actually have data (not a hard-coded 7).
    int activeDays = 0;
    double mean = 0;
    for (int i = 0; i < 7; i++) {
        if (dayOfWeekCounts[i] > 0) {
            mean += dayOfWeekAvgs[i];
            activeDays++;
        }
    }
    if (activeDays == 0 || peakDay < 0) {
        // Defensive: unreachable when data_length >= 14, but keeps the
        // divisions and dayNames[peakDay] below safe.
        return results;
    }
    mean /= activeDays;
    double variance = 0;
    for (int i = 0; i < 7; i++) {
        if (dayOfWeekCounts[i] > 0) {
            double diff = dayOfWeekAvgs[i] - mean;
            variance += diff * diff;
        }
    }
    variance /= activeDays;
    // Pattern strength: normalized variance, clamped to [0, 1].
    double strength = std::min(1.0, variance / (mean * mean + 0.001));
    // Only report if pattern is significant
    if (strength > 0.1) {
        results[0].patternType = PATTERN_WEEKLY;
        results[0].periodicity = 7;
        results[0].strength = strength;
        results[0].peakDayOfWeek = peakDay;
        // Copy peak values
        for (int i = 0; i < 7; i++) {
            results[0].peakValues[i] = dayOfWeekAvgs[i];
        }
        // Generate description
        const char* dayNames[] = {"Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"};
        snprintf(results[0].description, MAX_STRING_SIZE,
                 "Weekly pattern detected with peak on %s (strength: %.2f)",
                 dayNames[peakDay], strength);
    }
    return results;
}
/**
 * @brief Release a date-pattern array returned by analyze_date_patterns().
 *
 * Safe to call with nullptr (no-op), matching delete[] semantics.
 *
 * @param results Pointer to date pattern results array (ownership transferred here).
 */
void free_date_pattern_results(DatePatternResult* results) {
    if (results == nullptr) {
        return; // nothing to release
    }
    delete[] results;
}
/**
 * @brief Analyze cyclical patterns in time series data.
 *
 * Scans candidate lags (2 .. data_length/3) for the strongest peak of the
 * normalized autocorrelation function and reports it as the dominant cycle
 * when it exceeds 0.2.
 *
 * Fix vs previous version: the lag score is now the proper normalized
 * autocorrelation (via calculateAutocorrelation, the same helper used for
 * seasonality detection in detect_anomalies), not a raw mean of lagged
 * products. The raw product sum was not mean-centered or normalized, so for
 * any positive-valued series it scaled with the data's magnitude, making
 * the 0.2 significance threshold meaningless and pushing amplitude/
 * confidence outside [0, 1].
 *
 * @param values      Array of values.
 * @param dates       Array of corresponding dates (currently unused).
 * @param data_length Length of the arrays (< 20 returns an empty result).
 * @param factor_name Name of the factor being analyzed (currently unused).
 * @return CycleAnalysisResult Structure containing cycle analysis results;
 *         description explains the detected cycle or why none was found.
 */
CycleAnalysisResult analyze_cycles(
    const double* values,
    const DateStruct* dates,
    int data_length,
    const char* factor_name) {
    CycleAnalysisResult result;
    memset(&result, 0, sizeof(CycleAnalysisResult));
    // Minimum data points required for cycle analysis
    if (data_length < 20) {
        strncpy(result.description, "Insufficient data for cycle analysis", MAX_STRING_SIZE - 1);
        return result;
    }
    // Autocorrelation-based cycle detection: find the peak in the normalized
    // ACF after lag 0, over cycles up to 1/3 of the data length.
    int maxLag = data_length / 3;
    double maxCorr = 0;
    int bestLag = 0;
    for (int lag = 2; lag < maxLag; lag++) {
        // Normalized autocorrelation in [-1, 1] (mean-centered, variance-scaled).
        double corr = calculateAutocorrelation(values, data_length, lag);
        if (corr > maxCorr) {
            maxCorr = corr;
            bestLag = lag;
        }
    }
    // Report the cycle only when the ACF peak is meaningfully positive.
    if (bestLag > 0 && maxCorr > 0.2) {
        result.cycleLength = bestLag;
        result.amplitude = maxCorr;
        result.confidence = maxCorr;
        result.cycleLengthVariance = bestLag * 0.2; // Simple estimation
        snprintf(result.description, MAX_STRING_SIZE,
                 "Cycle detected with approximate length of %d days (confidence: %.2f)",
                 bestLag, maxCorr);
    } else {
        strncpy(result.description, "No significant cycle detected", MAX_STRING_SIZE - 1);
    }
    return result;
}