From 3e7b9260d28395504d6d3abc0ba821747bccb4b5 Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Tue, 19 Oct 2021 14:32:40 +0000
Subject: [PATCH] Fix the bug for dropping highly correlated features

---
 src/features/utils/clean_sensor_features.R | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/features/utils/clean_sensor_features.R b/src/features/utils/clean_sensor_features.R
index 10f00766..66cc1d1f 100644
--- a/src/features/utils/clean_sensor_features.R
+++ b/src/features/utils/clean_sensor_features.R
@@ -14,7 +14,7 @@ data_yielded_hours_ratio_threshold <- as.numeric(snakemake@params[["data_yielded
 corr_valid_pairs_threshold <- as.numeric(snakemake@params[["corr_valid_pairs_threshold"]])
 corr_threshold <- as.numeric(snakemake@params[["corr_threshold"]])
 
-# drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold
+# drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less or equal than data_yielded_hours_ratio_threshold
 clean_features <- clean_features %>% 
   filter(phone_data_yield_rapids_ratiovalidyieldedhours > data_yielded_hours_ratio_threshold)
 
@@ -32,19 +32,23 @@ features_for_corr <- clean_features %>%
 
 valid_pairs <- crossprod(!is.na(features_for_corr)) >= corr_valid_pairs_threshold * nrow(features_for_corr)
 
-highly_correlated_features <- features_for_corr %>% 
-  correlate(use = "pairwise.complete.obs", method = "spearman") %>% 
-  column_to_rownames(., var = "term") %>% 
-  as.matrix() %>% 
-  replace(!valid_pairs | is.na(.), 0) %>% 
-  findCorrelation(., cutoff = corr_threshold, verbose = F, names = T)
+if((dim(features_for_corr)[1] != 0) & (dim(features_for_corr)[2] != 0)){
 
-clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
+  highly_correlated_features <- features_for_corr %>% 
+    correlate(use = "pairwise.complete.obs", method = "spearman") %>% 
+    column_to_rownames(., var = "term") %>% 
+    as.matrix() %>% 
+    replace(!valid_pairs | is.na(.), 0) %>% 
+    findCorrelation(., cutoff = corr_threshold, verbose = F, names = T)
+
+  clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
+  
+}
 
 # drop rows with a percentage of NA values above rows_nan_threshold
 clean_features <- clean_features %>% 
   mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>% 
-  filter(percentage_na < rows_nan_threshold) %>% 
+  filter(percentage_na <= rows_nan_threshold) %>% 
   select(-percentage_na)
 
 write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)