// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// anomaly_detection.cpp
// Implementation of anomaly and outlier detection functions
//
#include "health_analytics_engine.h"

// Standard headers used directly in this file; harmless if the engine
// header already pulls them in.
#include <algorithm>  // std::min, std::max, std::sort
#include <cmath>      // std::abs, std::sqrt
#include <cstdio>     // snprintf
#include <cstring>    // memset, strncpy
#include <vector>
/**
 * @brief Detect anomalies in time series data
 *
 * @param timeSeries Time series data
 * @param dataLength Length of time series
 * @param threshold Z-score threshold to consider a point an anomaly
 * @param dates Array of dates corresponding to time series points
 * @param factorName Name of the factor being analyzed
 * @return AnomalyResult* Array of detected anomalies, terminated by an
 *         entry with dataPointIndex == -1; free with free_anomaly_results()
 */
AnomalyResult* detect_anomalies(const double* timeSeries,
                                int dataLength,
                                double threshold,
                                const DateStruct* dates,
                                const char* factorName) {
    if (dataLength < 5) {
        // Not enough data: return a single sentinel entry
        AnomalyResult* dummy = new AnomalyResult[1];
        memset(dummy, 0, sizeof(AnomalyResult));
        dummy[0].dataPointIndex = -1; // Mark as invalid
        return dummy;
    }

    // Constants
    const int MAX_RESULTS = 100;

    // Allocate space for results (+1 for the sentinel terminator)
    AnomalyResult* results = new AnomalyResult[MAX_RESULTS + 1];
    int resultCount = 0;

    // Decompose time series if enough data
    std::vector<double> trend(dataLength);
    std::vector<double> seasonal(dataLength);
    std::vector<double> residual(dataLength);

    bool hasDecomposition = false;
    int seasonality = 0;

    // Try to detect seasonality for decomposition
    double maxAutocorr = 0;
    for (int lag = 2; lag <= dataLength / 3; lag++) {
        double acf = calculateAutocorrelation(timeSeries, dataLength, lag);
        if (acf > 0.3 && acf > maxAutocorr) {
            maxAutocorr = acf;
            seasonality = lag;
        }
    }

    if (seasonality >= 2) {
        hasDecomposition = decomposeTimeSeries(timeSeries, dataLength, seasonality,
                                               trend.data(), seasonal.data(), residual.data());
    }

    if (!hasDecomposition) {
        // Simple moving average for trend if decomposition failed
        int windowSize = std::min(7, dataLength / 3);
        if (windowSize < 2) windowSize = 2;

        calculateMovingAverage(timeSeries, dataLength, windowSize, trend.data());

        // Residuals = original - trend
        for (int i = 0; i < dataLength; i++) {
            seasonal[i] = 0; // No seasonal component
            residual[i] = timeSeries[i] - trend[i];
        }
    }

    // Calculate residual statistics for outlier detection
    double mean = 0, sumSquared = 0;
    for (int i = 0; i < dataLength; i++) {
        mean += residual[i];
    }
    mean /= dataLength;

    for (int i = 0; i < dataLength; i++) {
        double diff = residual[i] - mean;
        sumSquared += diff * diff;
    }
    double stdDev = std::sqrt(sumSquared / dataLength);

    if (stdDev <= 0) {
        // Constant residuals: nothing to flag
        results[0].dataPointIndex = -1;
        return results;
    }

    // Detect global outliers using z-score of the residuals
    for (int i = 0; i < dataLength && resultCount < MAX_RESULTS; i++) {
        double zScore = (residual[i] - mean) / stdDev;

        if (std::abs(zScore) > threshold) {
            AnomalyResult& anomaly = results[resultCount++];

            anomaly.dataPointIndex = i;
            anomaly.anomalyScore = std::abs(zScore);
            anomaly.originalValue = timeSeries[i];
            anomaly.expectedValue = trend[i] + seasonal[i] + mean;

            // Higher confidence for more extreme anomalies
            anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(zScore) - threshold) / 5.0);
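            // Worked example of the mapping above: at |z| == threshold the
            // confidence is 0.5; it grows linearly and saturates at 1.0 once
            // |z| exceeds the threshold by 5 standard deviations
            // (e.g. threshold 2.5, z = 5.0 -> 0.5 + 0.5 * (2.5 / 5.0) = 0.75).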

            // Copy date if available
            if (dates != nullptr) {
                anomaly.date = dates[i];
            } else {
                memset(&anomaly.date, 0, sizeof(DateStruct));
            }

            // Copy factor name
            if (factorName != nullptr) {
                strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
                anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
            } else {
                anomaly.factorName[0] = '\0';
            }

            // Set anomaly type
            anomaly.anomalyType = ANOMALY_OUTLIER;

            // Generate description
            if (zScore > 0) {
                snprintf(anomaly.description, MAX_STRING_SIZE,
                         "Unusually high value (%.2f standard deviations above expected)",
                         zScore);
            } else {
                snprintf(anomaly.description, MAX_STRING_SIZE,
                         "Unusually low value (%.2f standard deviations below expected)",
                         -zScore);
            }
        }
    }

    // Detect context-based anomalies using local statistics
    const int LOCAL_WINDOW = std::min(7, dataLength / 5);
    if (LOCAL_WINDOW >= 3) {
        for (int i = LOCAL_WINDOW; i < dataLength - LOCAL_WINDOW && resultCount < MAX_RESULTS; i++) {
            // Calculate local statistics
            double localSum = 0, localSumSquared = 0;
            for (int j = i - LOCAL_WINDOW; j <= i + LOCAL_WINDOW; j++) {
                if (j != i) { // Exclude the point itself
                    localSum += timeSeries[j];
                    localSumSquared += timeSeries[j] * timeSeries[j];
                }
            }

            double localMean = localSum / (2 * LOCAL_WINDOW);
            double localVar = localSumSquared / (2 * LOCAL_WINDOW) - localMean * localMean;
            double localStdDev = std::sqrt(std::max(localVar, 1e-6)); // Prevent division by zero
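            // Note: localVar uses the one-pass identity Var(X) = E[X^2] - (E[X])^2,
            // which can dip slightly below zero from floating-point rounding when
            // the window is nearly constant; the 1e-6 floor absorbs that as well
            // as guarding the division below.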

            // Calculate local z-score
            double localZScore = (timeSeries[i] - localMean) / localStdDev;

            // Check if it's a local anomaly but not already a global anomaly
            if (std::abs(localZScore) > threshold * 1.2) {
                // Check if this point was already detected as a global anomaly
                bool alreadyDetected = false;
                for (int j = 0; j < resultCount; j++) {
                    if (results[j].dataPointIndex == i) {
                        alreadyDetected = true;
                        break;
                    }
                }

                if (!alreadyDetected) {
                    AnomalyResult& anomaly = results[resultCount++];

                    anomaly.dataPointIndex = i;
                    anomaly.anomalyScore = std::abs(localZScore);
                    anomaly.originalValue = timeSeries[i];
                    anomaly.expectedValue = localMean;
                    anomaly.confidence = 0.5 + 0.5 * std::min(1.0, (std::abs(localZScore) - threshold) / 5.0);

                    // Copy date if available
                    if (dates != nullptr) {
                        anomaly.date = dates[i];
                    } else {
                        memset(&anomaly.date, 0, sizeof(DateStruct));
                    }

                    // Copy factor name
                    if (factorName != nullptr) {
                        strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
                        anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
                    } else {
                        anomaly.factorName[0] = '\0';
                    }

                    // Set anomaly type
                    anomaly.anomalyType = ANOMALY_CONTEXTUAL;

                    // Generate description
                    if (localZScore > 0) {
                        snprintf(anomaly.description, MAX_STRING_SIZE,
                                 "Context anomaly: value is high compared to local neighborhood (%.2f local std dev)",
                                 localZScore);
                    } else {
                        snprintf(anomaly.description, MAX_STRING_SIZE,
                                 "Context anomaly: value is low compared to local neighborhood (%.2f local std dev)",
                                 -localZScore);
                    }
                }
            }
        }
    }

    // Detect trend changes
    if (dataLength >= 10) {
        for (int i = 5; i < dataLength - 5 && resultCount < MAX_RESULTS; i++) {
            // Calculate slope before and after this point
            double slopeBefore = 0, slopeAfter = 0;
            double interceptBefore = 0, interceptAfter = 0;
            double r2Before = 0, r2After = 0;

            // Create time vectors for the two 5-point windows
            std::vector<double> time1(5), time2(5);
            std::vector<double> values1(5), values2(5);

            for (int j = 0; j < 5; j++) {
                time1[j] = j;
                time2[j] = j;
                values1[j] = timeSeries[i - 5 + j];
                values2[j] = timeSeries[i + j];
            }

            bool validBefore = calculateLinearRegression(time1.data(), values1.data(), 5,
                                                         slopeBefore, interceptBefore, r2Before);
            bool validAfter = calculateLinearRegression(time2.data(), values2.data(), 5,
                                                        slopeAfter, interceptAfter, r2After);

            if (validBefore && validAfter) {
                // Check for significant change in slope
                double slopeChange = slopeAfter - slopeBefore;
                double meanSlope = (std::abs(slopeBefore) + std::abs(slopeAfter)) / 2;

                if (meanSlope > 0 && std::abs(slopeChange) / meanSlope > 0.5) {
                    AnomalyResult& anomaly = results[resultCount++];

                    anomaly.dataPointIndex = i;
                    anomaly.anomalyScore = std::abs(slopeChange) / (meanSlope + 1e-6);
                    anomaly.originalValue = timeSeries[i];
                    anomaly.expectedValue = timeSeries[i]; // Same value; the change is in the trend
                    anomaly.confidence = 0.5 + 0.5 * std::min(1.0, anomaly.anomalyScore / 2.0);

                    // Copy date if available
                    if (dates != nullptr) {
                        anomaly.date = dates[i];
                    } else {
                        memset(&anomaly.date, 0, sizeof(DateStruct));
                    }

                    // Copy factor name
                    if (factorName != nullptr) {
                        strncpy(anomaly.factorName, factorName, MAX_STRING_SIZE - 1);
                        anomaly.factorName[MAX_STRING_SIZE - 1] = '\0';
                    } else {
                        anomaly.factorName[0] = '\0';
                    }

                    // Set anomaly type
                    anomaly.anomalyType = ANOMALY_TREND_CHANGE;

                    // Generate description
                    if (slopeBefore < 0 && slopeAfter > 0) {
                        snprintf(anomaly.description, MAX_STRING_SIZE,
                                 "Trend reversal: changed from decreasing (%.2f/day) to increasing (%.2f/day)",
                                 -slopeBefore, slopeAfter);
                    } else if (slopeBefore > 0 && slopeAfter < 0) {
                        snprintf(anomaly.description, MAX_STRING_SIZE,
                                 "Trend reversal: changed from increasing (%.2f/day) to decreasing (%.2f/day)",
                                 slopeBefore, -slopeAfter);
                    } else if (slopeAfter > slopeBefore) {
                        snprintf(anomaly.description, MAX_STRING_SIZE,
                                 "Trend acceleration: rate of change increased from %.2f/day to %.2f/day",
                                 slopeBefore, slopeAfter);
                    } else {
                        snprintf(anomaly.description, MAX_STRING_SIZE,
                                 "Trend deceleration: rate of change decreased from %.2f/day to %.2f/day",
                                 slopeBefore, slopeAfter);
                    }
                }
            }
        }
    }

    // Sort anomalies by score (most significant first)
    std::sort(results, results + resultCount,
              [](const AnomalyResult& a, const AnomalyResult& b) {
                  return a.anomalyScore > b.anomalyScore;
              });

    // Mark the end of valid results with a sentinel entry
    results[resultCount].dataPointIndex = -1;

    return results;
}

/**
 * @brief Free memory for anomaly detection results
 *
 * @param results Pointer to anomaly results array
 */
void free_anomaly_results(AnomalyResult* results) {
    delete[] results;
}
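
// Minimal usage sketch (illustrative only; `series`, `dates`, their length
// `n`, and the "heart_rate" label are assumed to come from the caller):
//
//     AnomalyResult* anomalies = detect_anomalies(series, n, 3.0, dates, "heart_rate");
//     for (int i = 0; anomalies[i].dataPointIndex != -1; i++) {
//         printf("[%d] %s (score %.2f)\n",
//                anomalies[i].dataPointIndex,
//                anomalies[i].description,
//                anomalies[i].anomalyScore);
//     }
//     free_anomaly_results(anomalies);  // Always release the sentinel-terminated array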

/**
 * @brief Analyze patterns related to dates (e.g., day of week effects)
 *
 * @param values Array of values
 * @param dates Array of corresponding dates
 * @param data_length Length of the arrays
 * @param factor_name Name of the factor being analyzed
 * @return DatePatternResult* Array of detected patterns, terminated by an
 *         entry with patternType == PATTERN_NONE; free with
 *         free_date_pattern_results()
 */
DatePatternResult* analyze_date_patterns(
    const double* values,
    const DateStruct* dates,
    int data_length,
    const char* factor_name) {

    // Allocate space for results (3 patterns max + terminator)
    DatePatternResult* results = new DatePatternResult[4];
    memset(results, 0, 4 * sizeof(DatePatternResult));

    // Initialize terminator
    results[0].patternType = PATTERN_NONE;

    // Return empty result for insufficient data (need at least two weeks)
    if (data_length < 14) {
        return results;
    }

    // Count occurrences by day of week
    double dayOfWeekSums[7] = {0};
    int dayOfWeekCounts[7] = {0};

    for (int i = 0; i < data_length; i++) {
        // Convert date to day of week (0 = Sunday, 6 = Saturday)
        int year = dates[i].year;
        int month = dates[i].month;
        int day = dates[i].day;

        // Zeller's congruence (Gregorian calendar): January and February
        // are treated as months 13 and 14 of the previous year
        if (month < 3) {
            month += 12;
            year--;
        }
        int h = (day + (13 * (month + 1)) / 5 + year + year / 4 - year / 100 + year / 400) % 7;
        // Zeller's h is 0 for Saturday; shift so that 0 is Sunday
        int dayOfWeek = (h + 6) % 7;
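        // Worked example: 2000-01-01 maps to month 13 of 1999, so
        // h = (1 + 36 + 1999 + 499 - 19 + 4) % 7 = 2520 % 7 = 0 (Saturday),
        // and dayOfWeek = (0 + 6) % 7 = 6, i.e. Saturday, as expected.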

        dayOfWeekSums[dayOfWeek] += values[i];
        dayOfWeekCounts[dayOfWeek]++;
    }

    // Calculate average by day of week, tracking how many days have data
    double dayOfWeekAvgs[7] = {0};
    int daysWithData = 0;
    for (int i = 0; i < 7; i++) {
        if (dayOfWeekCounts[i] > 0) {
            dayOfWeekAvgs[i] = dayOfWeekSums[i] / dayOfWeekCounts[i];
            daysWithData++;
        }
    }
    if (daysWithData == 0) {
        return results;
    }

    // Find peak day
    int peakDay = 0;
    double peakValue = dayOfWeekAvgs[0];
    for (int i = 1; i < 7; i++) {
        if (dayOfWeekAvgs[i] > peakValue) {
            peakValue = dayOfWeekAvgs[i];
            peakDay = i;
        }
    }

    // Calculate variance of the daily averages to determine whether
    // there is a weekly pattern; only days with data contribute
    double mean = 0;
    for (int i = 0; i < 7; i++) {
        if (dayOfWeekCounts[i] > 0) {
            mean += dayOfWeekAvgs[i];
        }
    }
    mean /= daysWithData;

    double variance = 0;
    for (int i = 0; i < 7; i++) {
        if (dayOfWeekCounts[i] > 0) {
            double diff = dayOfWeekAvgs[i] - mean;
            variance += diff * diff;
        }
    }
    variance /= daysWithData;

    // Pattern strength: normalized dispersion of the daily means, capped at 1
    double strength = std::min(1.0, variance / (mean * mean + 0.001));
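    // For intuition: daily means with overall mean 5.0 and variance 2.5 give
    // strength = min(1.0, 2.5 / 25.001) ~= 0.10, just at the reporting
    // threshold used below; a flat day-of-week profile scores near 0.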

    // Only report if the pattern is significant
    if (strength > 0.1) {
        results[0].patternType = PATTERN_WEEKLY;
        results[0].periodicity = 7;
        results[0].strength = strength;
        results[0].peakDayOfWeek = peakDay;

        // Copy the per-day averages
        for (int i = 0; i < 7; i++) {
            results[0].peakValues[i] = dayOfWeekAvgs[i];
        }

        // Generate description
        const char* dayNames[] = {"Sunday", "Monday", "Tuesday", "Wednesday",
                                  "Thursday", "Friday", "Saturday"};
        snprintf(results[0].description, MAX_STRING_SIZE,
                 "Weekly pattern detected with peak on %s (strength: %.2f)",
                 dayNames[peakDay], strength);
    }

    return results;
}

/**
 * @brief Free memory for date pattern results
 *
 * @param results Pointer to date pattern results array
 */
void free_date_pattern_results(DatePatternResult* results) {
    delete[] results;
}
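
// Sketch of the intended call pattern (illustrative; the inputs and the
// "sleep_quality" label are assumptions, not part of the engine API):
//
//     DatePatternResult* patterns = analyze_date_patterns(values, dates, n, "sleep_quality");
//     for (int i = 0; patterns[i].patternType != PATTERN_NONE; i++) {
//         printf("%s\n", patterns[i].description);
//     }
//     free_date_pattern_results(patterns);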

/**
 * @brief Analyze cyclical patterns in time series data
 *
 * @param values Array of values
 * @param dates Array of corresponding dates
 * @param data_length Length of the arrays
 * @param factor_name Name of the factor being analyzed
 * @return CycleAnalysisResult Structure containing cycle analysis results
 */
CycleAnalysisResult analyze_cycles(
    const double* values,
    const DateStruct* dates,
    int data_length,
    const char* factor_name) {

    CycleAnalysisResult result;
    memset(&result, 0, sizeof(CycleAnalysisResult));

    // Minimum data points required for cycle analysis
    if (data_length < 20) {
        strncpy(result.description, "Insufficient data for cycle analysis", MAX_STRING_SIZE - 1);
        return result;
    }

    // Simple autocorrelation-based cycle detection:
    // find the peak in the autocorrelation function after lag 0
    int maxLag = data_length / 3; // Look for cycles up to 1/3 of data length
    double maxCorr = 0;
    int bestLag = 0;

    for (int lag = 2; lag < maxLag; lag++) {
        // Normalized autocorrelation (same helper used in detect_anomalies);
        // values near 1 indicate a strong repeat at this lag, which keeps the
        // 0.2 significance threshold below independent of the data scale
        double corr = calculateAutocorrelation(values, data_length, lag);

        if (corr > maxCorr) {
            maxCorr = corr;
            bestLag = lag;
        }
    }

    // Report the cycle if the peak correlation is strong enough
    if (bestLag > 0 && maxCorr > 0.2) {
        result.cycleLength = bestLag;
        result.amplitude = maxCorr;
        result.confidence = maxCorr;
        result.cycleLengthVariance = bestLag * 0.2; // Simple estimation

        snprintf(result.description, MAX_STRING_SIZE,
                 "Cycle detected with approximate length of %d days (confidence: %.2f)",
                 bestLag, maxCorr);
    } else {
        strncpy(result.description, "No significant cycle detected", MAX_STRING_SIZE - 1);
    }

    return result;
}
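
// Minimal usage sketch (illustrative; `values`, `dates`, `n`, and the "mood"
// label are assumed to be supplied by the caller). The result is returned by
// value, so unlike the functions above there is nothing to free:
//
//     CycleAnalysisResult cycle = analyze_cycles(values, dates, n, "mood");
//     printf("%s\n", cycle.description);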