From fc121863ff76ed7e157762dd9a692ba7a8356542 Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Sun, 3 Oct 2021 01:31:14 -0400 Subject: [PATCH] Add drop highly correlated features module --- example_profile/example_config.yaml | 2 ++ renv.lock | 14 +++++++++++++ rules/models.smk | 4 ++++ .../workflow_example/clean_sensor_features.R | 21 +++++++++++++++++++ 4 files changed, 41 insertions(+) diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml index f21a137c..f080c65e 100644 --- a/example_profile/example_config.yaml +++ b/example_profile/example_config.yaml @@ -557,6 +557,8 @@ PARAMS_FOR_ANALYSIS: COLS_VAR_THRESHOLD: True ROWS_NAN_THRESHOLD: 0.3 DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75 + CORR_VALID_PAIRS_THRESHOLD: 0.5 + CORR_THRESHOLD: 0.95 MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM] CV_METHODS: [LeaveOneOut] diff --git a/renv.lock b/renv.lock index 054a9662..ec769edf 100644 --- a/renv.lock +++ b/renv.lock @@ -205,6 +205,13 @@ "Repository": "CRAN", "Hash": "b7d7f1e926dfcd57c74ce93f5c048e80" }, + "caret": { + "Package": "caret", + "Version": "6.0-89", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "95cdd7da1e51ab0451c27666f15db891" + }, "cellranger": { "Package": "cellranger", "Version": "1.1.0", @@ -275,6 +282,13 @@ "Repository": "CRAN", "Hash": "ae01381679f4511ca7a72d55fe175213" }, + "corrr": { + "Package": "corrr", + "Version": "0.4.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "dbd1387c025b07f62da3334942176e14" + }, "cpp11": { "Package": "cpp11", "Version": "0.2.4", diff --git a/rules/models.smk b/rules/models.smk index 7fe6aa6e..6af6e31e 100644 --- a/rules/models.smk +++ b/rules/models.smk @@ -61,6 +61,8 @@ rule clean_sensor_features_for_individual_participants: cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"], + corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"], + corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"] output: "data/processed/features/{pid}/all_sensor_features_cleaned.csv" script: @@ -74,6 +76,8 @@ rule clean_sensor_features_for_all_participants: cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"], + corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"], + corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"] output: "data/processed/features/all_participants/all_sensor_features_cleaned.csv" script: diff --git a/src/models/workflow_example/clean_sensor_features.R b/src/models/workflow_example/clean_sensor_features.R index 57b9b285..af4d2a81 100644 --- a/src/models/workflow_example/clean_sensor_features.R +++ b/src/models/workflow_example/clean_sensor_features.R @@ -1,6 +1,9 @@ source("renv/activate.R") library(tidyr) library("dplyr", warn.conflicts = F) +library(tidyverse) +library(caret) +library(corrr) clean_features <- read.csv(snakemake@input[[1]]) @@ -8,6 +11,8 @@ cols_nan_threshold <- as.numeric(snakemake@params[["cols_nan_threshold"]]) drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]]) rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]]) data_yielded_hours_ratio_threshold <- as.numeric(snakemake@params[["data_yielded_hours_ratio_threshold"]]) +corr_valid_pairs_threshold <- as.numeric(snakemake@params[["corr_valid_pairs_threshold"]] +corr_threshold <- as.numeric(snakemake@params[["corr_threshold"]] # drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold clean_features <- clean_features %>% @@ -26,4 +31,20 @@ clean_features <- clean_features %>% filter(percentage_na < rows_nan_threshold) %>% select(-percentage_na) +# drop highly correlated features +features_for_corr <- clean_features %>% + select_if(is.numeric) %>% + select_if(sapply(., n_distinct, na.rm = T) > 1) + +valid_pairs <- crossprod(!is.na(features_for_corr)) >= corr_valid_pairs_threshold * nrow(features_for_corr) + +highly_correlated_features <- features_for_corr %>% + correlate(use = "pairwise.complete.obs", method = "spearman") %>% + column_to_rownames(., var = "term") %>% + as.matrix() %>% + replace(!valid_pairs | is.na(.), 0) %>% + findCorrelation(., cutoff = corr_threshold, verbose = F, names = T) + +clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features] + write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)