// SPDX-FileCopyrightText: © 2025 Nøkken.io // SPDX-License-Identifier: AGPL-3.0 // // correlation.cpp // Implementation of correlation analysis functions // #include "health_analytics_engine.h" #include "utils.h" /** * @brief Find factors with strongest correlation to target variable * * @param target Target variable time series * @param factors Array of factor time series * @param factorNames Array of factor names * @param targetLength Length of target time series * @param factorCount Number of factors * @return CorrelationResult* Array of correlation results (sorted by strength) */ CorrelationResult* find_strongest_correlations(const double* target, const double** factors, const char** factorNames, int targetLength, int factorCount) { // Allocate results array (one more than needed to mark the end) CorrelationResult* results = new CorrelationResult[factorCount + 1]; // Calculate correlations for each factor for (int i = 0; i < factorCount; i++) { // Calculate both Pearson and Spearman correlations double pearson_corr = calculateCorrelation(target, factors[i], targetLength); double spearman_corr = calculateSpearmanCorrelation(target, factors[i], targetLength); // Use the correlation with higher absolute value double corr = (std::abs(pearson_corr) > std::abs(spearman_corr)) ? pearson_corr : spearman_corr; results[i].factorIndex = i; results[i].correlation = corr; // Estimate p-value based on correlation and sample size // This is an approximation; a real implementation would use t-distribution double t = corr * std::sqrt((targetLength - 2) / (1 - corr * corr)); // Simplified 2-tailed p-value approximation results[i].pValue = 2 * (1 - std::min(1.0, std::exp(-0.717 * std::abs(t) - 0.416 * t * t))); // Copy factor name strncpy(results[i].factorName, factorNames[i], MAX_STRING_SIZE - 1); results[i].factorName[MAX_STRING_SIZE - 1] = '\0'; // Calculate confidence based on sample size, correlation strength, and p-value double sample_size_factor = 1.0 - 1.0 / std::sqrt(targetLength); double p_value_factor = 1.0 - results[i].pValue; results[i].confidence = std::abs(corr) * sample_size_factor * p_value_factor; } // Sort by absolute correlation value std::sort(results, results + factorCount, [](const CorrelationResult& a, const CorrelationResult& b) { return std::abs(a.correlation) > std::abs(b.correlation); }); // Mark the end of valid results results[factorCount].factorIndex = -1; results[factorCount].correlation = 0; results[factorCount].confidence = 0; results[factorCount].factorName[0] = '\0'; return results; } /** * @brief Free memory for correlation results * * @param results Pointer to correlation results array */ void free_correlation_results(CorrelationResult* results) { delete[] results; } /** * @brief Find multivariate correlations between factors * * @param data Array of factor time series * @param factorNames Array of factor names * @param factorCount Number of factors * @param dataLength Length of time series * @return MultivariateCorrelation* Array of multivariate correlation results */ MultivariateCorrelation* find_multivariate_correlations(const double** data, const char** factorNames, int factorCount, int dataLength) { if (factorCount < 2 || dataLength < 3) { MultivariateCorrelation* dummy = new MultivariateCorrelation[1]; memset(dummy, 0, sizeof(MultivariateCorrelation)); dummy[0].factorCount = -1; // Mark as invalid return dummy; } // Constants const int MAX_RESULTS = 100; const double MIN_CORRELATION_THRESHOLD = 0.3; // Allocate space for up to MAX_RESULTS correlations + 1 terminator MultivariateCorrelation* results = new MultivariateCorrelation[MAX_RESULTS + 1]; int resultCount = 0; // Create correlation matrix for all pairwise correlations std::vector> corrMatrix(factorCount, std::vector(factorCount, 0)); for (int i = 0; i < factorCount; i++) { corrMatrix[i][i] = 1.0; // Self-correlation is 1 for (int j = i + 1; j < factorCount; j++) { // Calculate correlation using both Pearson and Spearman double pearson = calculateCorrelation(data[i], data[j], dataLength); double spearman = calculateSpearmanCorrelation(data[i], data[j], dataLength); // Use the one with higher absolute value double corr = (std::abs(pearson) > std::abs(spearman)) ? pearson : spearman; // Store in both positions (symmetric matrix) corrMatrix[i][j] = corr; corrMatrix[j][i] = corr; } } // Find pairwise correlations for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) { for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) { double corr = corrMatrix[i][j]; // Only store significant correlations if (std::abs(corr) >= MIN_CORRELATION_THRESHOLD) { MultivariateCorrelation& result = results[resultCount]; result.factorCount = 2; result.correlationStrength = corr; result.primaryFactorIndex = i; result.secondaryFactorIndex = j; result.tertiaryFactorIndex = -1; // Calculate confidence based on sample size and correlation strength double t_stat = std::abs(corr) * std::sqrt((dataLength - 2) / (1 - corr * corr)); double p_value = 2 * (1 - std::min(1.0, std::exp(-0.717 * t_stat - 0.416 * t_stat * t_stat))); result.confidence = std::abs(corr) * (1.0 - p_value) * (1.0 - 1.0 / std::sqrt(dataLength)); // Copy factor names strncpy(result.factorNames[0], factorNames[i], MAX_STRING_SIZE - 1); result.factorNames[0][MAX_STRING_SIZE - 1] = '\0'; strncpy(result.factorNames[1], factorNames[j], MAX_STRING_SIZE - 1); result.factorNames[1][MAX_STRING_SIZE - 1] = '\0'; // Set factor weights based on correlation directionality if (corr > 0) { result.factorWeights[0] = 0.5; result.factorWeights[1] = 0.5; } else { result.factorWeights[0] = 0.5; result.factorWeights[1] = -0.5; } // Determine relationship type result.relationshipType = RELATIONSHIP_CORRELATION; // Generate description const char* strength_text = (std::abs(corr) > 0.7) ? "strong" : (std::abs(corr) > 0.5) ? "moderate" : "weak"; const char* direction_text = (corr > 0) ? "positive" : "negative"; snprintf(result.description, MAX_STRING_SIZE, "There is a %s %s correlation between %s and %s (r=%.2f, p=%.3f)", strength_text, direction_text, factorNames[i], factorNames[j], corr, (result.confidence > 0.99) ? 0.001 : 1.0 - result.confidence); resultCount++; } } } // Find partial correlations and three-way relationships if (factorCount >= 3) { // Calculate partial correlations std::vector>> partialCorr( factorCount, std::vector>( factorCount, std::vector(factorCount, 0.0))); for (int i = 0; i < factorCount; i++) { for (int j = i + 1; j < factorCount; j++) { for (int k = 0; k < factorCount; k++) { if (k == i || k == j) continue; // Calculate partial correlation between i and j controlling for k double r_ij = corrMatrix[i][j]; double r_ik = corrMatrix[i][k]; double r_jk = corrMatrix[j][k]; double denominator = std::sqrt((1 - r_ik * r_ik) * (1 - r_jk * r_jk)); if (denominator > 1e-10) { double partial = (r_ij - r_ik * r_jk) / denominator; partialCorr[i][j][k] = partial; partialCorr[j][i][k] = partial; } } } } // Find three-way relationships for (int i = 0; i < factorCount && resultCount < MAX_RESULTS; i++) { for (int j = i + 1; j < factorCount && resultCount < MAX_RESULTS; j++) { for (int k = j + 1; k < factorCount && resultCount < MAX_RESULTS; k++) { // Get pairwise correlations double r_ij = corrMatrix[i][j]; double r_ik = corrMatrix[i][k]; double r_jk = corrMatrix[j][k]; // Check if all pairs are correlated if (std::abs(r_ij) >= MIN_CORRELATION_THRESHOLD && std::abs(r_ik) >= MIN_CORRELATION_THRESHOLD && std::abs(r_jk) >= MIN_CORRELATION_THRESHOLD) { // Get partial correlations double p_ij_k = partialCorr[i][j][k]; // i,j controlling for k double p_ik_j = partialCorr[i][k][j]; // i,k controlling for j double p_jk_i = partialCorr[j][k][i]; // j,k controlling for i // Determine if mediation or confounding is present bool is_mediation = false; int mediator = -1; // Check if k mediates i->j if (std::abs(p_ij_k) < std::abs(r_ij) * 0.5) { is_mediation = true; mediator = k; } // Check if j mediates i->k else if (std::abs(p_ik_j) < std::abs(r_ik) * 0.5) { is_mediation = true; mediator = j; } // Check if i mediates j->k else if (std::abs(p_jk_i) < std::abs(r_jk) * 0.5) { is_mediation = true; mediator = i; } MultivariateCorrelation& result = results[resultCount]; result.factorCount = 3; // Use average correlation as strength result.correlationStrength = (std::abs(r_ij) + std::abs(r_ik) + std::abs(r_jk)) / 3.0; result.primaryFactorIndex = i; result.secondaryFactorIndex = j; result.tertiaryFactorIndex = k; // Lower confidence for three-way relationships result.confidence = result.correlationStrength * (1.0 - 2.0 / std::sqrt(dataLength)); // Copy factor names strncpy(result.factorNames[0], factorNames[i], MAX_STRING_SIZE - 1); result.factorNames[0][MAX_STRING_SIZE - 1] = '\0'; strncpy(result.factorNames[1], factorNames[j], MAX_STRING_SIZE - 1); result.factorNames[1][MAX_STRING_SIZE - 1] = '\0'; strncpy(result.factorNames[2], factorNames[k], MAX_STRING_SIZE - 1); result.factorNames[2][MAX_STRING_SIZE - 1] = '\0'; // Set relationship type if (is_mediation) { result.relationshipType = RELATIONSHIP_MEDIATION; // Set weights based on mediation path if (mediator == i) { result.factorWeights[0] = 0.5; // Mediator result.factorWeights[1] = 0.3; // Source result.factorWeights[2] = 0.3; // Target } else if (mediator == j) { result.factorWeights[0] = 0.3; // Source result.factorWeights[1] = 0.5; // Mediator result.factorWeights[2] = 0.3; // Target } else { result.factorWeights[0] = 0.3; // Source result.factorWeights[1] = 0.3; // Target result.factorWeights[2] = 0.5; // Mediator } // Generate description for mediation snprintf(result.description, MAX_STRING_SIZE, "Potential mediation detected: %s may mediate the relationship between %s and %s", factorNames[mediator], factorNames[(mediator+1) % 3], factorNames[(mediator+2) % 3]); } else { result.relationshipType = RELATIONSHIP_NETWORK; // Equal weights for network relationship result.factorWeights[0] = 0.33; result.factorWeights[1] = 0.33; result.factorWeights[2] = 0.33; // Generate description for network snprintf(result.description, MAX_STRING_SIZE, "Network relationship detected between %s, %s, and %s (average correlation: %.2f)", factorNames[i], factorNames[j], factorNames[k], result.correlationStrength); } resultCount++; } } } } } // Mark the end of valid results if (resultCount < MAX_RESULTS) { results[resultCount].factorCount = -1; } else { results[MAX_RESULTS].factorCount = -1; } return results; } /** * @brief Free memory for multivariate correlation results * * @param results Pointer to multivariate correlation results array */ void free_multivariate_correlations(MultivariateCorrelation* results) { delete[] results; } /** * @brief Direct API access to correlation calculation * * @param x First data series * @param y Second data series * @param length Length of data series * @return double Correlation coefficient */ double calculate_correlation(const double* x, const double* y, int length) { if (length <= 1) return 0; double sum_x = 0, sum_y = 0, sum_xy = 0; double sum_x2 = 0, sum_y2 = 0; for (int i = 0; i < length; i++) { sum_x += x[i]; sum_y += y[i]; sum_xy += x[i] * y[i]; sum_x2 += x[i] * x[i]; sum_y2 += y[i] * y[i]; } double denominator = sqrt((length * sum_x2 - sum_x * sum_x) * (length * sum_y2 - sum_y * sum_y)); if (denominator < 1e-10) return 0; // Avoid division by zero return (length * sum_xy - sum_x * sum_y) / denominator; }