Move the data cleaning module from example workflow to main directory

2021-10-03 11:16:47 -04:00 · 2021-10-03 11:16:47 -04:00 · 4a7989c058
parent 8e3d5eb98c
commit 4a7989c058
8 changed files with 118 additions and 62 deletions
--- a/Snakefile
+++ b/Snakefile
@@ -394,6 +394,10 @@ if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]:
 if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
    files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")

+# Data Cleaning: when DATA_CLEANING.COMPUTE is true, add the cleaned feature file of every participant in PIDS and the cleaned all-participants file to files_to_compute
+if config["DATA_CLEANING"]["COMPUTE"]:
+    files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
+    files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")

 rule all:
    input:
--- a/config.yaml
+++ b/config.yaml
@@ -564,3 +564,17 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
  CORR_THRESHOLD: 0.1
  CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}

+
+########################################################################################################################
+#                                                    Data Cleaning                                                     #
+########################################################################################################################
+
+DATA_CLEANING:
+  COMPUTE: False # when True, the workflow adds the cleaned feature files (per participant and all participants) to its targets
+  COLS_NAN_THRESHOLD: 0.3 # number in [0, 1]; presumably the NA ratio used to drop columns - confirm in clean_sensor_features.R
+  COLS_VAR_THRESHOLD: True # boolean flag, not a numeric threshold; presumably toggles dropping zero-variance columns - confirm
+  ROWS_NAN_THRESHOLD: 0.3 # number in [0, 1]; clean_sensor_features.R keeps rows with percentage_na < rows_nan_threshold
+  DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75 # number in [0, 1]; NOTE(review): filtering rule not visible here - confirm in clean_sensor_features.R
+  CORR_VALID_PAIRS_THRESHOLD: 0.5 # number in [0, 1]; presumably the minimum ratio of valid pairs needed to compute a correlation - confirm
+  CORR_THRESHOLD: 0.95 # number in [0, 1]; presumably the correlation above which features count as highly correlated - confirm
+
--- a/example_profile/Snakefile
+++ b/example_profile/Snakefile
@@ -384,6 +384,11 @@ if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]:
 if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
    files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")

+# Data Cleaning: when DATA_CLEANING.COMPUTE is true, add the cleaned feature file of every participant in PIDS and the cleaned all-participants file to files_to_compute
+if config["DATA_CLEANING"]["COMPUTE"]:
+    files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
+    files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
+
 # Analysis Workflow Example
 models, scalers = [], []
 for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]:
@@ -401,7 +406,6 @@ files_to_compute.extend(expand("data/raw/{pid}/participant_target_with_datetime.
 files_to_compute.extend(expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]))

 # Individual model
-files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
 files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
 files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
 files_to_compute.extend(expand(
@@ -414,7 +418,6 @@ files_to_compute.extend(expand(
                            scaler=scalers))

 # Population model
-files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
 files_to_compute.append("data/processed/models/population_model/input.csv")
 files_to_compute.extend(expand("data/processed/models/population_model/output_{cv_method}/baselines.csv", cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
 files_to_compute.extend(expand(
--- a/example_profile/example_config.yaml
+++ b/example_profile/example_config.yaml
@@ -534,6 +534,19 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
  CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}


+########################################################################################################################
+#                                                    Data Cleaning                                                     #
+########################################################################################################################
+
+DATA_CLEANING:
+  COMPUTE: True # when True, the workflow adds the cleaned feature files (per participant and all participants) to its targets
+  COLS_NAN_THRESHOLD: 0.3 # number in [0, 1]; presumably the NA ratio used to drop columns - confirm in clean_sensor_features.R
+  COLS_VAR_THRESHOLD: True # boolean flag, not a numeric threshold; presumably toggles dropping zero-variance columns - confirm
+  ROWS_NAN_THRESHOLD: 0.3 # number in [0, 1]; clean_sensor_features.R keeps rows with percentage_na < rows_nan_threshold
+  DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75 # number in [0, 1]; NOTE(review): filtering rule not visible here - confirm in clean_sensor_features.R
+  CORR_VALID_PAIRS_THRESHOLD: 0.5 # number in [0, 1]; presumably the minimum ratio of valid pairs needed to compute a correlation - confirm
+  CORR_THRESHOLD: 0.95 # number in [0, 1]; presumably the correlation above which features count as highly correlated - confirm
+

 ########################################################################################################################
 #                                              Analysis Workflow Example                                               #
@@ -551,14 +564,6 @@ PARAMS_FOR_ANALYSIS:
  TARGET:
    FOLDER: data/external/example_workflow
    CONTAINER: participant_target.csv
-
-  # Cleaning Parameters
-  COLS_NAN_THRESHOLD: 0.3
-  COLS_VAR_THRESHOLD: True
-  ROWS_NAN_THRESHOLD: 0.3
-  DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
-  CORR_VALID_PAIRS_THRESHOLD: 0.5
-  CORR_THRESHOLD: 0.95
  
  MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM]
  CV_METHODS: [LeaveOneOut]
--- a/rules/features.smk
+++ b/rules/features.smk
@@ -761,22 +761,6 @@ rule fitbit_sleep_intraday_r_features:
    script:
        "../src/features/entry.R"

-rule merge_sensor_features_for_individual_participants:
-    input:
-        feature_files = input_merge_sensor_features_for_individual_participants
-    output:
-        "data/processed/features/{pid}/all_sensor_features.csv"
-    script:
-        "../src/features/utils/merge_sensor_features_for_individual_participants.R"
-
-rule merge_sensor_features_for_all_participants:
-    input:
-        feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])
-    output:
-        "data/processed/features/all_participants/all_sensor_features.csv"
-    script:
-        "../src/features/utils/merge_sensor_features_for_all_participants.R"
-
 rule empatica_accelerometer_python_features:
    input:
        sensor_data = "data/raw/{pid}/empatica_accelerometer_with_datetime.csv",
@@ -958,3 +942,49 @@ rule empatica_tags_r_features:
        "data/interim/{pid}/empatica_tags_features/empatica_tags_r_{provider_key}.csv"
    script:
        "../src/features/entry.R"
+
+rule merge_sensor_features_for_individual_participants: # runs the merge script to produce one all_sensor_features.csv per participant ({pid})
+    input:
+        feature_files = input_merge_sensor_features_for_individual_participants # input function defined outside this view; presumably lists the participant's sensor feature files - confirm
+    output:
+        "data/processed/features/{pid}/all_sensor_features.csv" # consumed by merge_sensor_features_for_all_participants and clean_sensor_features_for_individual_participants
+    script:
+        "../src/features/utils/merge_sensor_features_for_individual_participants.R"
+
+rule merge_sensor_features_for_all_participants: # runs the merge script over the per-participant files to produce the all-participants feature file
+    input:
+        feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]) # one merged file per participant listed in PIDS
+    output:
+        "data/processed/features/all_participants/all_sensor_features.csv" # consumed by clean_sensor_features_for_all_participants
+    script:
+        "../src/features/utils/merge_sensor_features_for_all_participants.R"
+
+rule clean_sensor_features_for_individual_participants: # runs clean_sensor_features.R on one participant's merged feature file
+    input:
+        rules.merge_sensor_features_for_individual_participants.output # i.e. data/processed/features/{pid}/all_sensor_features.csv
+    params: # every cleaning parameter is read from the DATA_CLEANING section of the config
+        cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
+        cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"], # boolean per tools/config.schema.yaml, despite the THRESHOLD suffix
+        rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
+        data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
+        corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
+        corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
+    output:
+        "data/processed/features/{pid}/all_sensor_features_cleaned.csv" # requested by the Snakefile when DATA_CLEANING.COMPUTE is true
+    script:
+        "../src/features/utils/clean_sensor_features.R" # same script as clean_sensor_features_for_all_participants
+
+rule clean_sensor_features_for_all_participants: # runs clean_sensor_features.R on the merged all-participants feature file
+    input:
+        rules.merge_sensor_features_for_all_participants.output # i.e. data/processed/features/all_participants/all_sensor_features.csv
+    params: # every cleaning parameter is read from the DATA_CLEANING section of the config
+        cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
+        cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"], # boolean per tools/config.schema.yaml, despite the THRESHOLD suffix
+        rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
+        data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
+        corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
+        corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
+    output:
+        "data/processed/features/all_participants/all_sensor_features_cleaned.csv" # requested by the Snakefile when DATA_CLEANING.COMPUTE is true
+    script:
+        "../src/features/utils/clean_sensor_features.R" # same script as clean_sensor_features_for_individual_participants
--- a/rules/models.smk
+++ b/rules/models.smk
@@ -53,36 +53,6 @@ rule parse_targets:
    script:
        "../src/models/workflow_example/parse_targets.py"

-rule clean_sensor_features_for_individual_participants:
-    input:
-        rules.merge_sensor_features_for_individual_participants.output
-    params:
-        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
-        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
-        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
-        data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
-        corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
-        corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
-    output:
-        "data/processed/features/{pid}/all_sensor_features_cleaned.csv"
-    script:
-        "../src/models/workflow_example/clean_sensor_features.R"
-
-rule clean_sensor_features_for_all_participants:
-    input:
-        rules.merge_sensor_features_for_all_participants.output
-    params:
-        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
-        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
-        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
-        data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
-        corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
-        corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
-    output:
-        "data/processed/features/all_participants/all_sensor_features_cleaned.csv"
-    script:
-        "../src/models/workflow_example/clean_sensor_features.R"
-
 rule merge_features_and_targets_for_individual_model:
    input:
        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv",
--- a/src/models/workflow_example/clean_sensor_features.R
+++ b/src/features/utils/clean_sensor_features.R
@@ -25,12 +25,6 @@ if(nrow(clean_features))
 if(drop_zero_variance_columns)
  clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)

-# drop rows with a percentage of NA values above rows_nan_threshold
-clean_features <- clean_features %>% 
-  mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>% 
-  filter(percentage_na < rows_nan_threshold) %>% 
-  select(-percentage_na)
-
 # drop highly correlated features
 features_for_corr <- clean_features %>% 
  select_if(is.numeric) %>% 
@@ -47,4 +41,10 @@ highly_correlated_features <- features_for_corr %>%

 clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]

+# drop rows whose fraction of NA values (over all columns) is greater than or equal to rows_nan_threshold
+clean_features <- clean_features %>% 
+  mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>% 
+  filter(percentage_na < rows_nan_threshold) %>% 
+  select(-percentage_na)
+
 write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
--- a/tools/config.schema.yaml
+++ b/tools/config.schema.yaml
@@ -36,6 +36,7 @@ required:
  - HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
  - HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
  - HEATMAP_FEATURE_CORRELATION_MATRIX
+  - DATA_CLEANING

 definitions:
  PROVIDER:
@@ -1256,3 +1257,32 @@ properties:
      CORR_METHOD:
        type: string
        enum: ["pearson", "kendall", "spearman"]
+
+  DATA_CLEANING: # validates the DATA_CLEANING section of the config
+    type: object
+    required: [COMPUTE, COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD, DATA_YIELDED_HOURS_RATIO_THRESHOLD, CORR_VALID_PAIRS_THRESHOLD, CORR_THRESHOLD] # every key of the section is mandatory
+    properties:
+      COMPUTE: 
+        type: boolean
+      COLS_NAN_THRESHOLD: # all numeric thresholds below are restricted to the closed interval [0, 1]
+        type: number
+        minimum: 0
+        maximum: 1
+      COLS_VAR_THRESHOLD: # a boolean switch, not a numeric threshold, despite its name
+        type: boolean
+      ROWS_NAN_THRESHOLD: 
+        type: number
+        minimum: 0
+        maximum: 1
+      DATA_YIELDED_HOURS_RATIO_THRESHOLD:
+        type: number
+        minimum: 0
+        maximum: 1
+      CORR_VALID_PAIRS_THRESHOLD:
+        type: number
+        minimum: 0
+        maximum: 1
+      CORR_THRESHOLD:
+        type: number
+        minimum: 0
+        maximum: 1