From fc121863ff76ed7e157762dd9a692ba7a8356542 Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Sun, 3 Oct 2021 01:31:14 -0400
Subject: [PATCH] Add drop highly correlated features module

---
 example_profile/example_config.yaml           |  2 ++
 renv.lock                                     | 14 +++++++++++++
 rules/models.smk                              |  4 ++++
 .../workflow_example/clean_sensor_features.R  | 21 +++++++++++++++++++
 4 files changed, 41 insertions(+)

diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml
index f21a137c..f080c65e 100644
--- a/example_profile/example_config.yaml
+++ b/example_profile/example_config.yaml
@@ -557,6 +557,8 @@ PARAMS_FOR_ANALYSIS:
   COLS_VAR_THRESHOLD: True
   ROWS_NAN_THRESHOLD: 0.3
   DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
+  CORR_VALID_PAIRS_THRESHOLD: 0.5
+  CORR_THRESHOLD: 0.95
   
   MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM]
   CV_METHODS: [LeaveOneOut]
diff --git a/renv.lock b/renv.lock
index 054a9662..ec769edf 100644
--- a/renv.lock
+++ b/renv.lock
@@ -205,6 +205,13 @@
       "Repository": "CRAN",
       "Hash": "b7d7f1e926dfcd57c74ce93f5c048e80"
     },
+    "caret": {
+      "Package": "caret",
+      "Version": "6.0-89",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "95cdd7da1e51ab0451c27666f15db891"
+    },
     "cellranger": {
       "Package": "cellranger",
       "Version": "1.1.0",
@@ -275,6 +282,13 @@
       "Repository": "CRAN",
       "Hash": "ae01381679f4511ca7a72d55fe175213"
     },
+    "corrr": {
+      "Package": "corrr",
+      "Version": "0.4.3",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "dbd1387c025b07f62da3334942176e14"
+    },
     "cpp11": {
       "Package": "cpp11",
       "Version": "0.2.4",
diff --git a/rules/models.smk b/rules/models.smk
index 7fe6aa6e..6af6e31e 100644
--- a/rules/models.smk
+++ b/rules/models.smk
@@ -61,6 +61,8 @@ rule clean_sensor_features_for_individual_participants:
         cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
         rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
         data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
+        corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
+        corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
     output:
         "data/processed/features/{pid}/all_sensor_features_cleaned.csv"
     script:
@@ -74,6 +76,8 @@ rule clean_sensor_features_for_all_participants:
         cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
         rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
         data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
+        corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
+        corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
     output:
         "data/processed/features/all_participants/all_sensor_features_cleaned.csv"
     script:
diff --git a/src/models/workflow_example/clean_sensor_features.R b/src/models/workflow_example/clean_sensor_features.R
index 57b9b285..af4d2a81 100644
--- a/src/models/workflow_example/clean_sensor_features.R
+++ b/src/models/workflow_example/clean_sensor_features.R
@@ -1,6 +1,9 @@
 source("renv/activate.R")
 library(tidyr)
 library("dplyr", warn.conflicts = F)
+library(tidyverse)
+library(caret)
+library(corrr)
 
 
 clean_features <- read.csv(snakemake@input[[1]])
@@ -8,6 +11,8 @@ cols_nan_threshold <- as.numeric(snakemake@params[["cols_nan_threshold"]])
 drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]])
 rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
 data_yielded_hours_ratio_threshold <- as.numeric(snakemake@params[["data_yielded_hours_ratio_threshold"]])
+corr_valid_pairs_threshold <- as.numeric(snakemake@params[["corr_valid_pairs_threshold"]]
+corr_threshold <- as.numeric(snakemake@params[["corr_threshold"]]
 
 # drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
 clean_features <- clean_features %>% 
@@ -26,4 +31,20 @@ clean_features <- clean_features %>%
   filter(percentage_na < rows_nan_threshold) %>% 
   select(-percentage_na)
 
+# drop highly correlated features
+features_for_corr <- clean_features %>% 
+  select_if(is.numeric) %>% 
+  select_if(sapply(., n_distinct, na.rm = T) > 1)
+
+valid_pairs <- crossprod(!is.na(features_for_corr)) >= corr_valid_pairs_threshold * nrow(features_for_corr)
+
+highly_correlated_features <- features_for_corr %>% 
+  correlate(use = "pairwise.complete.obs", method = "spearman") %>% 
+  column_to_rownames(., var = "term") %>% 
+  as.matrix() %>% 
+  replace(!valid_pairs | is.na(.), 0) %>% 
+  findCorrelation(., cutoff = corr_threshold, verbose = F, names = T)
+
+clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
+
 write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)