369 lines
No EOL
11 KiB
C++
369 lines
No EOL
11 KiB
C++
// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
|
|
// SPDX-License-Identifier: AGPL-3.0
|
|
//
|
|
// correlation.cpp
|
|
// Implementation of correlation analysis functions
|
|
//
|
|
#include "health_analytics_engine.h"
|
|
#include "utils.h"
|
|
/**
|
|
* @brief Find factors with strongest correlation to target variable
|
|
*
|
|
* @param target Target variable time series
|
|
* @param factors Array of factor time series
|
|
* @param factorNames Array of factor names
|
|
* @param targetLength Length of target time series
|
|
* @param factorCount Number of factors
|
|
* @return CorrelationResult* Array of correlation results (sorted by strength)
|
|
*/
|
|
CorrelationResult* find_strongest_correlations(const double* target,
|
|
const double** factors,
|
|
const char** factorNames,
|
|
int targetLength,
|
|
int factorCount) {
|
|
// Allocate results array (one more than needed to mark the end)
|
|
CorrelationResult* results = new CorrelationResult[factorCount + 1];
|
|
|
|
// Calculate correlations for each factor
|
|
for (int i = 0; i < factorCount; i++) {
|
|
// Calculate both Pearson and Spearman correlations
|
|
double pearson_corr = calculateCorrelation(target, factors[i], targetLength);
|
|
double spearman_corr = calculateSpearmanCorrelation(target, factors[i], targetLength);
|
|
|
|
// Use the correlation with higher absolute value
|
|
double corr = (std::abs(pearson_corr) > std::abs(spearman_corr)) ?
|
|
pearson_corr : spearman_corr;
|
|
|
|
results[i].factorIndex = i;
|
|
results[i].correlation = corr;
|
|
|
|
// Estimate p-value based on correlation and sample size
|
|
// This is an approximation; a real implementation would use t-distribution
|
|
double t = corr * std::sqrt((targetLength - 2) / (1 - corr * corr));
|
|
// Simplified 2-tailed p-value approximation
|
|
results[i].pValue = 2 * (1 - std::min(1.0, std::exp(-0.717 * std::abs(t) - 0.416 * t * t)));
|
|
|
|
// Copy factor name
|
|
strncpy(results[i].factorName, factorNames[i], MAX_STRING_SIZE - 1);
|
|
results[i].factorName[MAX_STRING_SIZE - 1] = '\0';
|
|
|
|
// Calculate confidence based on sample size, correlation strength, and p-value
|
|
double sample_size_factor = 1.0 - 1.0 / std::sqrt(targetLength);
|
|
double p_value_factor = 1.0 - results[i].pValue;
|
|
results[i].confidence = std::abs(corr) * sample_size_factor * p_value_factor;
|
|
}
|
|
|
|
// Sort by absolute correlation value
|
|
std::sort(results, results + factorCount,
|
|
[](const CorrelationResult& a, const CorrelationResult& b) {
|
|
return std::abs(a.correlation) > std::abs(b.correlation);
|
|
});
|
|
|
|
// Mark the end of valid results
|
|
results[factorCount].factorIndex = -1;
|
|
results[factorCount].correlation = 0;
|
|
results[factorCount].confidence = 0;
|
|
results[factorCount].factorName[0] = '\0';
|
|
|
|
return results;
|
|
}
|
|
|
|
/**
|
|
* @brief Free memory for correlation results
|
|
*
|
|
* @param results Pointer to correlation results array
|
|
*/
|
|
void free_correlation_results(CorrelationResult* results) {
|
|
delete[] results;
|
|
}
|
|
|
|
/**
|
|
* @brief Find multivariate correlations between factors
|
|
*
|
|
* @param data Array of factor time series
|
|
* @param factorNames Array of factor names
|
|
* @param factorCount Number of factors
|
|
* @param dataLength Length of time series
|
|
* @return MultivariateCorrelation* Array of multivariate correlation results
|
|
*/
|
|
MultivariateCorrelation* find_multivariate_correlations(const double** data,
|
|
const char** factorNames,
|
|
int factorCount,
|
|
int dataLength) {
|
|
if (factorCount < 2 || dataLength < 3) {
|
|
MultivariateCorrelation* dummy = new MultivariateCorrelation[1];
|
|
memset(dummy, 0, sizeof(MultivariateCorrelation));
|
|
dummy[0].factorCount = -1; // Mark as invalid
|
|
return dummy;
|
|
}
|
|
|
|
// Constants
|
|
const int MAX_RESULTS = 100;
|
|
const double MIN_CORRELATION_THRESHOLD = 0.3;
|
|
|
|
// Allocate space for up to MAX_RESULTS correlations + 1 terminator
|
|
MultivariateCorrelation* results = new MultivariateCorrelation[MAX_RESULTS + 1];
|
|
int resultCount = 0;
|
|
|
|
// Create correlation matrix for all pairwise correlations
|
|
std::vector<std::vector<double>> corrMatrix(factorCount, std::vector<double>(factorCount, 0));
|
|
|
|
for (int i = 0; i < factorCount; i++) {
|
|
corrMatrix[i][i] = 1.0; // Self-correlation is 1
|
|
|
|
for (int j = i + 1; j < factorCount; j++) {
|
|
// Calculate correlation using both Pearson and Spearman
|
|
double pearson = calculateCorrelation(data[i], data[j], dataLength);
|
|
double spearman = calculateSpearmanCorrelation(data[i], data[j], dataLength);
|
|
|
|
// Use the one with higher absolute value
|
|
double corr = (std::abs(pearson) > std::abs(spearman)) ? pearson : spearman;
|
|
|
|
// Store in both positions (symmetric matrix)
|
|
corrMatrix[i][j] = corr;
|
|
corrMatrix[j][i] = corr;
|
|
}
|
|
}
|
|
|
|
// Find pairwise correlations
|
|
for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) {
|
|
for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) {
|
|
double corr = corrMatrix[i][j];
|
|
|
|
// Only store significant correlations
|
|
if (std::abs(corr) >= MIN_CORRELATION_THRESHOLD) {
|
|
MultivariateCorrelation& result = results[resultCount];
|
|
|
|
result.factorCount = 2;
|
|
result.correlationStrength = corr;
|
|
result.primaryFactorIndex = i;
|
|
result.secondaryFactorIndex = j;
|
|
result.tertiaryFactorIndex = -1;
|
|
|
|
// Calculate confidence based on sample size and correlation strength
|
|
double t_stat = std::abs(corr) * std::sqrt((dataLength - 2) / (1 - corr * corr));
|
|
double p_value = 2 * (1 - std::min(1.0, std::exp(-0.717 * t_stat - 0.416 * t_stat * t_stat)));
|
|
result.confidence = std::abs(corr) * (1.0 - p_value) * (1.0 - 1.0 / std::sqrt(dataLength));
|
|
|
|
// Copy factor names
|
|
strncpy(result.factorNames[0], factorNames[i], MAX_STRING_SIZE - 1);
|
|
result.factorNames[0][MAX_STRING_SIZE - 1] = '\0';
|
|
|
|
strncpy(result.factorNames[1], factorNames[j], MAX_STRING_SIZE - 1);
|
|
result.factorNames[1][MAX_STRING_SIZE - 1] = '\0';
|
|
|
|
// Set factor weights based on correlation directionality
|
|
if (corr > 0) {
|
|
result.factorWeights[0] = 0.5;
|
|
result.factorWeights[1] = 0.5;
|
|
} else {
|
|
result.factorWeights[0] = 0.5;
|
|
result.factorWeights[1] = -0.5;
|
|
}
|
|
|
|
// Determine relationship type
|
|
result.relationshipType = RELATIONSHIP_CORRELATION;
|
|
|
|
// Generate description
|
|
const char* strength_text =
|
|
(std::abs(corr) > 0.7) ? "strong" :
|
|
(std::abs(corr) > 0.5) ? "moderate" : "weak";
|
|
|
|
const char* direction_text = (corr > 0) ? "positive" : "negative";
|
|
|
|
snprintf(result.description, MAX_STRING_SIZE,
|
|
"There is a %s %s correlation between %s and %s (r=%.2f, p=%.3f)",
|
|
strength_text, direction_text,
|
|
factorNames[i], factorNames[j], corr,
|
|
(result.confidence > 0.99) ? 0.001 : 1.0 - result.confidence);
|
|
|
|
resultCount++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Find partial correlations and three-way relationships
|
|
if (factorCount >= 3) {
|
|
// Calculate partial correlations
|
|
std::vector<std::vector<std::vector<double>>> partialCorr(
|
|
factorCount, std::vector<std::vector<double>>(
|
|
factorCount, std::vector<double>(factorCount, 0.0)));
|
|
|
|
for (int i = 0; i < factorCount; i++) {
|
|
for (int j = i + 1; j < factorCount; j++) {
|
|
for (int k = 0; k < factorCount; k++) {
|
|
if (k == i || k == j) continue;
|
|
|
|
// Calculate partial correlation between i and j controlling for k
|
|
double r_ij = corrMatrix[i][j];
|
|
double r_ik = corrMatrix[i][k];
|
|
double r_jk = corrMatrix[j][k];
|
|
|
|
double denominator = std::sqrt((1 - r_ik * r_ik) * (1 - r_jk * r_jk));
|
|
|
|
if (denominator > 1e-10) {
|
|
double partial = (r_ij - r_ik * r_jk) / denominator;
|
|
partialCorr[i][j][k] = partial;
|
|
partialCorr[j][i][k] = partial;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Find three-way relationships
|
|
for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) {
|
|
for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) {
|
|
for (int k = j + 1; k < factorCount && resultCount < MAX_RESULTS; k++) {
|
|
// Get pairwise correlations
|
|
double r_ij = corrMatrix[i][j];
|
|
double r_ik = corrMatrix[i][k];
|
|
double r_jk = corrMatrix[j][k];
|
|
|
|
// Check if all pairs are correlated
|
|
if (std::abs(r_ij) >= MIN_CORRELATION_THRESHOLD &&
|
|
std::abs(r_ik) >= MIN_CORRELATION_THRESHOLD &&
|
|
std::abs(r_jk) >= MIN_CORRELATION_THRESHOLD) {
|
|
|
|
// Get partial correlations
|
|
double p_ij_k = partialCorr[i][j][k]; // i,j controlling for k
|
|
double p_ik_j = partialCorr[i][k][j]; // i,k controlling for j
|
|
double p_jk_i = partialCorr[j][k][i]; // j,k controlling for i
|
|
|
|
// Determine if mediation or confounding is present
|
|
bool is_mediation = false;
|
|
int mediator = -1;
|
|
|
|
// Check if k mediates i->j
|
|
if (std::abs(p_ij_k) < std::abs(r_ij) * 0.5) {
|
|
is_mediation = true;
|
|
mediator = k;
|
|
}
|
|
// Check if j mediates i->k
|
|
else if (std::abs(p_ik_j) < std::abs(r_ik) * 0.5) {
|
|
is_mediation = true;
|
|
mediator = j;
|
|
}
|
|
// Check if i mediates j->k
|
|
else if (std::abs(p_jk_i) < std::abs(r_jk) * 0.5) {
|
|
is_mediation = true;
|
|
mediator = i;
|
|
}
|
|
|
|
MultivariateCorrelation& result = results[resultCount];
|
|
|
|
result.factorCount = 3;
|
|
// Use average correlation as strength
|
|
result.correlationStrength = (std::abs(r_ij) + std::abs(r_ik) + std::abs(r_jk)) / 3.0;
|
|
result.primaryFactorIndex = i;
|
|
result.secondaryFactorIndex = j;
|
|
result.tertiaryFactorIndex = k;
|
|
|
|
// Lower confidence for three-way relationships
|
|
result.confidence = result.correlationStrength *
|
|
(1.0 - 2.0 / std::sqrt(dataLength));
|
|
|
|
// Copy factor names
|
|
strncpy(result.factorNames[0], factorNames[i], MAX_STRING_SIZE - 1);
|
|
result.factorNames[0][MAX_STRING_SIZE - 1] = '\0';
|
|
|
|
strncpy(result.factorNames[1], factorNames[j], MAX_STRING_SIZE - 1);
|
|
result.factorNames[1][MAX_STRING_SIZE - 1] = '\0';
|
|
|
|
strncpy(result.factorNames[2], factorNames[k], MAX_STRING_SIZE - 1);
|
|
result.factorNames[2][MAX_STRING_SIZE - 1] = '\0';
|
|
|
|
// Set relationship type
|
|
if (is_mediation) {
|
|
result.relationshipType = RELATIONSHIP_MEDIATION;
|
|
|
|
// Set weights based on mediation path
|
|
if (mediator == i) {
|
|
result.factorWeights[0] = 0.5; // Mediator
|
|
result.factorWeights[1] = 0.3; // Source
|
|
result.factorWeights[2] = 0.3; // Target
|
|
} else if (mediator == j) {
|
|
result.factorWeights[0] = 0.3; // Source
|
|
result.factorWeights[1] = 0.5; // Mediator
|
|
result.factorWeights[2] = 0.3; // Target
|
|
} else {
|
|
result.factorWeights[0] = 0.3; // Source
|
|
result.factorWeights[1] = 0.3; // Target
|
|
result.factorWeights[2] = 0.5; // Mediator
|
|
}
|
|
|
|
// Generate description for mediation
|
|
snprintf(result.description, MAX_STRING_SIZE,
|
|
"Potential mediation detected: %s may mediate the relationship between %s and %s",
|
|
factorNames[mediator],
|
|
factorNames[(mediator+1) % 3],
|
|
factorNames[(mediator+2) % 3]);
|
|
} else {
|
|
result.relationshipType = RELATIONSHIP_NETWORK;
|
|
|
|
// Equal weights for network relationship
|
|
result.factorWeights[0] = 0.33;
|
|
result.factorWeights[1] = 0.33;
|
|
result.factorWeights[2] = 0.33;
|
|
|
|
// Generate description for network
|
|
snprintf(result.description, MAX_STRING_SIZE,
|
|
"Network relationship detected between %s, %s, and %s (average correlation: %.2f)",
|
|
factorNames[i], factorNames[j], factorNames[k],
|
|
result.correlationStrength);
|
|
}
|
|
|
|
resultCount++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Mark the end of valid results
|
|
if (resultCount < MAX_RESULTS) {
|
|
results[resultCount].factorCount = -1;
|
|
} else {
|
|
results[MAX_RESULTS].factorCount = -1;
|
|
}
|
|
|
|
return results;
|
|
}
|
|
|
|
/**
|
|
* @brief Free memory for multivariate correlation results
|
|
*
|
|
* @param results Pointer to multivariate correlation results array
|
|
*/
|
|
void free_multivariate_correlations(MultivariateCorrelation* results) {
|
|
delete[] results;
|
|
}
|
|
|
|
/**
|
|
* @brief Direct API access to correlation calculation
|
|
*
|
|
* @param x First data series
|
|
* @param y Second data series
|
|
* @param length Length of data series
|
|
* @return double Correlation coefficient
|
|
*/
|
|
double calculate_correlation(const double* x, const double* y, int length) {
|
|
if (length <= 1) return 0;
|
|
|
|
double sum_x = 0, sum_y = 0, sum_xy = 0;
|
|
double sum_x2 = 0, sum_y2 = 0;
|
|
|
|
for (int i = 0; i < length; i++) {
|
|
sum_x += x[i];
|
|
sum_y += y[i];
|
|
sum_xy += x[i] * y[i];
|
|
sum_x2 += x[i] * x[i];
|
|
sum_y2 += y[i] * y[i];
|
|
}
|
|
|
|
double denominator = sqrt((length * sum_x2 - sum_x * sum_x) *
|
|
(length * sum_y2 - sum_y * sum_y));
|
|
|
|
if (denominator < 1e-10) return 0; // Avoid division by zero
|
|
|
|
return (length * sum_xy - sum_x * sum_y) / denominator;
|
|
} |