Add drop highly correlated features module
parent
dfa11acf87
commit
fc121863ff
|
@ -557,6 +557,8 @@ PARAMS_FOR_ANALYSIS:
|
||||||
COLS_VAR_THRESHOLD: True
|
COLS_VAR_THRESHOLD: True
|
||||||
ROWS_NAN_THRESHOLD: 0.3
|
ROWS_NAN_THRESHOLD: 0.3
|
||||||
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
|
DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
|
||||||
|
CORR_VALID_PAIRS_THRESHOLD: 0.5
|
||||||
|
CORR_THRESHOLD: 0.95
|
||||||
|
|
||||||
MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM]
|
MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM]
|
||||||
CV_METHODS: [LeaveOneOut]
|
CV_METHODS: [LeaveOneOut]
|
||||||
|
|
14
renv.lock
14
renv.lock
|
@ -205,6 +205,13 @@
|
||||||
"Repository": "CRAN",
|
"Repository": "CRAN",
|
||||||
"Hash": "b7d7f1e926dfcd57c74ce93f5c048e80"
|
"Hash": "b7d7f1e926dfcd57c74ce93f5c048e80"
|
||||||
},
|
},
|
||||||
|
"caret": {
|
||||||
|
"Package": "caret",
|
||||||
|
"Version": "6.0-89",
|
||||||
|
"Source": "Repository",
|
||||||
|
"Repository": "CRAN",
|
||||||
|
"Hash": "95cdd7da1e51ab0451c27666f15db891"
|
||||||
|
},
|
||||||
"cellranger": {
|
"cellranger": {
|
||||||
"Package": "cellranger",
|
"Package": "cellranger",
|
||||||
"Version": "1.1.0",
|
"Version": "1.1.0",
|
||||||
|
@ -275,6 +282,13 @@
|
||||||
"Repository": "CRAN",
|
"Repository": "CRAN",
|
||||||
"Hash": "ae01381679f4511ca7a72d55fe175213"
|
"Hash": "ae01381679f4511ca7a72d55fe175213"
|
||||||
},
|
},
|
||||||
|
"corrr": {
|
||||||
|
"Package": "corrr",
|
||||||
|
"Version": "0.4.3",
|
||||||
|
"Source": "Repository",
|
||||||
|
"Repository": "CRAN",
|
||||||
|
"Hash": "dbd1387c025b07f62da3334942176e14"
|
||||||
|
},
|
||||||
"cpp11": {
|
"cpp11": {
|
||||||
"Package": "cpp11",
|
"Package": "cpp11",
|
||||||
"Version": "0.2.4",
|
"Version": "0.2.4",
|
||||||
|
|
|
@ -61,6 +61,8 @@ rule clean_sensor_features_for_individual_participants:
|
||||||
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
||||||
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
||||||
data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
||||||
|
corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
|
||||||
|
corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
|
||||||
output:
|
output:
|
||||||
"data/processed/features/{pid}/all_sensor_features_cleaned.csv"
|
"data/processed/features/{pid}/all_sensor_features_cleaned.csv"
|
||||||
script:
|
script:
|
||||||
|
@ -74,6 +76,8 @@ rule clean_sensor_features_for_all_participants:
|
||||||
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
||||||
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
||||||
data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
|
||||||
|
corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
|
||||||
|
corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
|
||||||
output:
|
output:
|
||||||
"data/processed/features/all_participants/all_sensor_features_cleaned.csv"
|
"data/processed/features/all_participants/all_sensor_features_cleaned.csv"
|
||||||
script:
|
script:
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
source("renv/activate.R")
|
source("renv/activate.R")
|
||||||
library(tidyr)
|
library(tidyr)
|
||||||
library("dplyr", warn.conflicts = F)
|
library("dplyr", warn.conflicts = F)
|
||||||
|
library(tidyverse)
|
||||||
|
library(caret)
|
||||||
|
library(corrr)
|
||||||
|
|
||||||
|
|
||||||
clean_features <- read.csv(snakemake@input[[1]])
|
clean_features <- read.csv(snakemake@input[[1]])
|
||||||
|
@ -8,6 +11,8 @@ cols_nan_threshold <- as.numeric(snakemake@params[["cols_nan_threshold"]])
|
||||||
drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]])
|
drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]])
|
||||||
rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
|
rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
|
||||||
data_yielded_hours_ratio_threshold <- as.numeric(snakemake@params[["data_yielded_hours_ratio_threshold"]])
|
data_yielded_hours_ratio_threshold <- as.numeric(snakemake@params[["data_yielded_hours_ratio_threshold"]])
|
||||||
|
corr_valid_pairs_threshold <- as.numeric(snakemake@params[["corr_valid_pairs_threshold"]]
|
||||||
|
corr_threshold <- as.numeric(snakemake@params[["corr_threshold"]]
|
||||||
|
|
||||||
# drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
|
# drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
|
||||||
clean_features <- clean_features %>%
|
clean_features <- clean_features %>%
|
||||||
|
@ -26,4 +31,20 @@ clean_features <- clean_features %>%
|
||||||
filter(percentage_na < rows_nan_threshold) %>%
|
filter(percentage_na < rows_nan_threshold) %>%
|
||||||
select(-percentage_na)
|
select(-percentage_na)
|
||||||
|
|
||||||
|
# drop highly correlated features
|
||||||
|
features_for_corr <- clean_features %>%
|
||||||
|
select_if(is.numeric) %>%
|
||||||
|
select_if(sapply(., n_distinct, na.rm = T) > 1)
|
||||||
|
|
||||||
|
valid_pairs <- crossprod(!is.na(features_for_corr)) >= corr_valid_pairs_threshold * nrow(features_for_corr)
|
||||||
|
|
||||||
|
highly_correlated_features <- features_for_corr %>%
|
||||||
|
correlate(use = "pairwise.complete.obs", method = "spearman") %>%
|
||||||
|
column_to_rownames(., var = "term") %>%
|
||||||
|
as.matrix() %>%
|
||||||
|
replace(!valid_pairs | is.na(.), 0) %>%
|
||||||
|
findCorrelation(., cutoff = corr_threshold, verbose = F, names = T)
|
||||||
|
|
||||||
|
clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
|
||||||
|
|
||||||
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
|
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
|
||||||
|
|
Loading…
Reference in New Issue