nokken/native/statistics/correlation.cpp
2025-04-20 11:17:03 -04:00

369 lines
No EOL
11 KiB
C++

// SPDX-FileCopyrightText: © 2025 Nøkken.io <nokken.io@proton.me>
// SPDX-License-Identifier: AGPL-3.0
//
// correlation.cpp
// Implementation of correlation analysis functions
//
#include "health_analytics_engine.h"
#include "utils.h"

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <vector>
/**
 * @brief Find factors with strongest correlation to target variable
 *
 * For every factor both the Pearson and the Spearman coefficient against the
 * target are computed and the one with the larger magnitude is kept. Each
 * result also carries an approximate two-tailed p-value and a confidence
 * score in [0, 1] that combines correlation strength, sample size and
 * p-value.
 *
 * The returned array holds factorCount entries sorted by descending
 * |correlation|, followed by one terminator entry whose factorIndex is -1.
 * It must be released with free_correlation_results().
 *
 * @param target Target variable time series
 * @param factors Array of factor time series
 * @param factorNames Array of factor names (a null entry yields an empty name)
 * @param targetLength Length of target time series
 * @param factorCount Number of factors (negative values are treated as 0)
 * @return CorrelationResult* Array of correlation results (sorted by strength)
 */
CorrelationResult* find_strongest_correlations(const double* target,
                                               const double** factors,
                                               const char** factorNames,
                                               int targetLength,
                                               int factorCount) {
    // A negative count would turn into an invalid allocation size below.
    const int count = std::max(factorCount, 0);

    // One extra, value-initialised slot marks the end of the array.
    CorrelationResult* results = new CorrelationResult[count + 1]();

    // Sample-size damping term; undefined (division by zero) for n <= 0.
    const double sample_size_factor =
        (targetLength > 0)
            ? 1.0 - 1.0 / std::sqrt(static_cast<double>(targetLength))
            : 0.0;

    for (int i = 0; i < count; i++) {
        // Calculate both Pearson and Spearman correlations
        const double pearson_corr = calculateCorrelation(target, factors[i], targetLength);
        const double spearman_corr = calculateSpearmanCorrelation(target, factors[i], targetLength);

        // Use the correlation with higher absolute value
        double corr = (std::abs(pearson_corr) > std::abs(spearman_corr))
                          ? pearson_corr
                          : spearman_corr;
        // A NaN here would break the strict weak ordering std::sort requires.
        if (!std::isfinite(corr)) {
            corr = 0.0;
        }

        results[i].factorIndex = i;
        results[i].correlation = corr;

        // Approximate two-tailed p-value from the t statistic using the
        // normal-tail approximation p ~= exp(-0.717|t| - 0.416 t^2).
        // With fewer than 3 samples the statistic is undefined, so report
        // "not significant" (p = 1).
        double p_value = 1.0;
        if (targetLength > 2) {
            const double r_squared = corr * corr;
            if (r_squared >= 1.0) {
                p_value = 0.0;  // perfect correlation: t is unbounded
            } else {
                const double t = std::abs(corr) *
                                 std::sqrt((targetLength - 2) / (1.0 - r_squared));
                p_value = std::min(1.0, std::exp(-0.717 * t - 0.416 * t * t));
            }
        }
        results[i].pValue = p_value;

        // Copy factor name (always NUL-terminated)
        const char* name =
            (factorNames != nullptr && factorNames[i] != nullptr) ? factorNames[i] : "";
        std::strncpy(results[i].factorName, name, MAX_STRING_SIZE - 1);
        results[i].factorName[MAX_STRING_SIZE - 1] = '\0';

        // Confidence grows with correlation strength, sample size and
        // significance (low p-value).
        results[i].confidence = std::abs(corr) * sample_size_factor * (1.0 - p_value);
    }

    // Sort by absolute correlation value, strongest first
    std::sort(results, results + count,
              [](const CorrelationResult& a, const CorrelationResult& b) {
                  return std::abs(a.correlation) > std::abs(b.correlation);
              });

    // Mark the end of valid results
    results[count].factorIndex = -1;
    results[count].correlation = 0;
    results[count].pValue = 1.0;
    results[count].confidence = 0;
    results[count].factorName[0] = '\0';
    return results;
}
/**
 * @brief Release an array returned by find_strongest_correlations
 *
 * Passing a null pointer is a no-op.
 *
 * @param results Pointer to the first element of the results array
 */
void free_correlation_results(CorrelationResult* results) {
    if (results == nullptr) {
        return;
    }
    delete[] results;
}
/**
 * @brief Find multivariate correlations between factors
 *
 * Builds the pairwise correlation matrix (for each pair, whichever of the
 * Pearson and Spearman coefficients has the larger magnitude) and reports:
 *  - every pair whose |r| is at least 0.3, and
 *  - every triple whose three pairwise correlations all reach 0.3. A triple
 *    is labelled as mediation when controlling for one factor leaves less
 *    than half of the correlation between the other two; otherwise it is
 *    labelled as a network relationship.
 *
 * At most 100 results are produced. The returned array is terminated by an
 * entry whose factorCount is -1; when fewer than 2 factors or fewer than 3
 * samples are supplied, only that terminator is returned. Release the array
 * with free_multivariate_correlations().
 *
 * @param data Array of factor time series
 * @param factorNames Array of factor names (a null entry yields an empty name)
 * @param factorCount Number of factors
 * @param dataLength Length of time series
 * @return MultivariateCorrelation* Array of multivariate correlation results
 */
MultivariateCorrelation* find_multivariate_correlations(const double** data,
                                                        const char** factorNames,
                                                        int factorCount,
                                                        int dataLength) {
    if (factorCount < 2 || dataLength < 3) {
        MultivariateCorrelation* dummy = new MultivariateCorrelation[1];
        std::memset(dummy, 0, sizeof(MultivariateCorrelation));
        dummy[0].factorCount = -1; // Mark as invalid
        return dummy;
    }

    // Constants
    const int MAX_RESULTS = 100;
    const double MIN_CORRELATION_THRESHOLD = 0.3;

    // Create correlation matrix for all pairwise correlations. This is done
    // before the result array is allocated so that a failed vector allocation
    // cannot leak the results.
    std::vector<std::vector<double>> corrMatrix(factorCount,
                                                std::vector<double>(factorCount, 0));
    for (int i = 0; i < factorCount; i++) {
        corrMatrix[i][i] = 1.0; // Self-correlation is 1
        for (int j = i + 1; j < factorCount; j++) {
            const double pearson = calculateCorrelation(data[i], data[j], dataLength);
            const double spearman = calculateSpearmanCorrelation(data[i], data[j], dataLength);
            // Use the one with higher absolute value
            const double corr = (std::abs(pearson) > std::abs(spearman)) ? pearson : spearman;
            // Store in both positions (symmetric matrix)
            corrMatrix[i][j] = corr;
            corrMatrix[j][i] = corr;
        }
    }

    // Partial correlation of a and b controlling for c, computed on demand
    // instead of materialising a factorCount^3 table. Yields 0 when the
    // denominator is degenerate.
    auto partialCorrelation = [&corrMatrix](int a, int b, int c) -> double {
        const double r_ab = corrMatrix[a][b];
        const double r_ac = corrMatrix[a][c];
        const double r_bc = corrMatrix[b][c];
        const double denominator = std::sqrt((1 - r_ac * r_ac) * (1 - r_bc * r_bc));
        if (denominator > 1e-10) {
            return (r_ab - r_ac * r_bc) / denominator;
        }
        return 0.0;
    };

    auto nameOf = [factorNames](int index) -> const char* {
        return (factorNames != nullptr && factorNames[index] != nullptr)
                   ? factorNames[index]
                   : "";
    };

    auto copyName = [&nameOf](char* destination, int index) {
        std::strncpy(destination, nameOf(index), MAX_STRING_SIZE - 1);
        destination[MAX_STRING_SIZE - 1] = '\0';
    };

    // Space for up to MAX_RESULTS correlations + 1 terminator. Value
    // initialisation keeps unused name/weight slots well defined.
    MultivariateCorrelation* results = new MultivariateCorrelation[MAX_RESULTS + 1]();
    int resultCount = 0;

    const double sqrt_length = std::sqrt(static_cast<double>(dataLength));

    // Find pairwise correlations
    for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) {
        for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) {
            const double corr = corrMatrix[i][j];
            // Only store significant correlations (also skips NaN)
            if (!(std::abs(corr) >= MIN_CORRELATION_THRESHOLD)) {
                continue;
            }

            MultivariateCorrelation& result = results[resultCount];
            result.factorCount = 2;
            result.correlationStrength = corr;
            result.primaryFactorIndex = i;
            result.secondaryFactorIndex = j;
            result.tertiaryFactorIndex = -1;

            // Approximate two-tailed p-value from the t statistic using the
            // normal-tail approximation p ~= exp(-0.717 t - 0.416 t^2).
            const double r_squared = corr * corr;
            double p_value = 0.0; // |r| >= 1: t is unbounded
            if (r_squared < 1.0) {
                const double t_stat =
                    std::abs(corr) * std::sqrt((dataLength - 2) / (1.0 - r_squared));
                p_value = std::min(1.0, std::exp(-0.717 * t_stat - 0.416 * t_stat * t_stat));
            }
            result.confidence =
                std::abs(corr) * (1.0 - p_value) * (1.0 - 1.0 / sqrt_length);

            // Copy factor names
            copyName(result.factorNames[0], i);
            copyName(result.factorNames[1], j);

            // Set factor weights based on correlation directionality
            result.factorWeights[0] = 0.5;
            result.factorWeights[1] = (corr > 0) ? 0.5 : -0.5;

            // Determine relationship type
            result.relationshipType = RELATIONSHIP_CORRELATION;

            // Generate description
            const char* strength_text =
                (std::abs(corr) > 0.7) ? "strong" :
                (std::abs(corr) > 0.5) ? "moderate" : "weak";
            const char* direction_text = (corr > 0) ? "positive" : "negative";
            // Report the actual p-value, floored so tiny values do not print as 0.000.
            std::snprintf(result.description, MAX_STRING_SIZE,
                          "There is a %s %s correlation between %s and %s (r=%.2f, p=%.3f)",
                          strength_text, direction_text,
                          nameOf(i), nameOf(j), corr,
                          std::max(p_value, 0.001));
            resultCount++;
        }
    }

    // Find three-way relationships
    if (factorCount >= 3) {
        // Lower confidence for three-way relationships; never negative.
        const double triple_size_factor = std::max(0.0, 1.0 - 2.0 / sqrt_length);

        for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) {
            for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) {
                for (int k = j + 1; k < factorCount && resultCount < MAX_RESULTS; k++) {
                    // Get pairwise correlations
                    const double r_ij = corrMatrix[i][j];
                    const double r_ik = corrMatrix[i][k];
                    const double r_jk = corrMatrix[j][k];

                    // All three pairs must be correlated
                    if (!(std::abs(r_ij) >= MIN_CORRELATION_THRESHOLD &&
                          std::abs(r_ik) >= MIN_CORRELATION_THRESHOLD &&
                          std::abs(r_jk) >= MIN_CORRELATION_THRESHOLD)) {
                        continue;
                    }

                    // Mediation: controlling for one factor leaves less than
                    // half of the correlation between the other two.
                    int mediator = -1;
                    int endpointA = -1;
                    int endpointB = -1;
                    if (std::abs(partialCorrelation(i, j, k)) < std::abs(r_ij) * 0.5) {
                        mediator = k; endpointA = i; endpointB = j; // k mediates i-j
                    } else if (std::abs(partialCorrelation(i, k, j)) < std::abs(r_ik) * 0.5) {
                        mediator = j; endpointA = i; endpointB = k; // j mediates i-k
                    } else if (std::abs(partialCorrelation(j, k, i)) < std::abs(r_jk) * 0.5) {
                        mediator = i; endpointA = j; endpointB = k; // i mediates j-k
                    }

                    MultivariateCorrelation& result = results[resultCount];
                    result.factorCount = 3;
                    // Use average correlation as strength
                    result.correlationStrength =
                        (std::abs(r_ij) + std::abs(r_ik) + std::abs(r_jk)) / 3.0;
                    result.primaryFactorIndex = i;
                    result.secondaryFactorIndex = j;
                    result.tertiaryFactorIndex = k;
                    result.confidence = result.correlationStrength * triple_size_factor;

                    // Copy factor names
                    copyName(result.factorNames[0], i);
                    copyName(result.factorNames[1], j);
                    copyName(result.factorNames[2], k);

                    if (mediator >= 0) {
                        result.relationshipType = RELATIONSHIP_MEDIATION;
                        // The mediator gets weight 0.5, both endpoints 0.3.
                        result.factorWeights[0] = (mediator == i) ? 0.5 : 0.3;
                        result.factorWeights[1] = (mediator == j) ? 0.5 : 0.3;
                        result.factorWeights[2] = (mediator == k) ? 0.5 : 0.3;
                        // Name the mediator and the two factors it links.
                        std::snprintf(result.description, MAX_STRING_SIZE,
                                      "Potential mediation detected: %s may mediate the relationship between %s and %s",
                                      nameOf(mediator),
                                      nameOf(endpointA),
                                      nameOf(endpointB));
                    } else {
                        result.relationshipType = RELATIONSHIP_NETWORK;
                        // Equal weights for network relationship
                        result.factorWeights[0] = 0.33;
                        result.factorWeights[1] = 0.33;
                        result.factorWeights[2] = 0.33;
                        std::snprintf(result.description, MAX_STRING_SIZE,
                                      "Network relationship detected between %s, %s, and %s (average correlation: %.2f)",
                                      nameOf(i), nameOf(j), nameOf(k),
                                      result.correlationStrength);
                    }
                    resultCount++;
                }
            }
        }
    }

    // Mark the end of valid results (resultCount never exceeds MAX_RESULTS)
    results[resultCount].factorCount = -1;
    return results;
}
/**
 * @brief Release an array returned by find_multivariate_correlations
 *
 * Passing a null pointer is a no-op.
 *
 * @param results Pointer to the first element of the results array
 */
void free_multivariate_correlations(MultivariateCorrelation* results) {
    if (results == nullptr) {
        return;
    }
    delete[] results;
}
/**
 * @brief Direct API access to correlation calculation
 *
 * Computes the Pearson correlation coefficient of two equally long series
 * using a two-pass, mean-centred formulation, which avoids the cancellation
 * error of the raw-sum formula on data with a large mean or tiny variance.
 *
 * Returns 0 when the correlation is undefined: a null input, fewer than two
 * samples, a (near-)constant series, or non-finite intermediate values.
 *
 * @param x First data series
 * @param y Second data series
 * @param length Length of data series
 * @return double Correlation coefficient in [-1, 1]
 */
double calculate_correlation(const double* x, const double* y, int length) {
    if (x == nullptr || y == nullptr || length <= 1) return 0;

    // Pass 1: means
    double sum_x = 0, sum_y = 0;
    for (int i = 0; i < length; i++) {
        sum_x += x[i];
        sum_y += y[i];
    }
    const double mean_x = sum_x / length;
    const double mean_y = sum_y / length;

    // Pass 2: centred sums of squares and cross products
    double sxx = 0, syy = 0, sxy = 0;
    for (int i = 0; i < length; i++) {
        const double dx = x[i] - mean_x;
        const double dy = y[i] - mean_y;
        sxx += dx * dx;
        syy += dy * dy;
        sxy += dx * dy;
    }

    // Scaled by length so the degeneracy threshold is the same quantity as in
    // the raw-sum form sqrt((n*Sxx - Sx^2) * (n*Syy - Sy^2)).
    const double denominator = length * std::sqrt(sxx * syy);
    if (!std::isfinite(denominator) || denominator < 1e-10) return 0; // Avoid division by zero

    // Clamp away rounding overshoot so callers can rely on |r| <= 1.
    const double r = (length * sxy) / denominator;
    return std::max(-1.0, std::min(1.0, r));
}