Move the data cleaning module from the example workflow to the main directory

2021-10-03 11:16:47 -04:00 · 2021-10-03 11:16:47 -04:00 · 4a7989c058
parent 8e3d5eb98c
commit 4a7989c058
8 changed files with 118 additions and 62 deletions
--- a/4
+++ b/4
@ -394,6 +394,10 @@ if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]:
 # Request the feature correlation heatmap report only when its PLOT flag is set.
 if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
    files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
 # Data Cleaning
 # When DATA_CLEANING.COMPUTE is set, request one cleaned feature file per pid in
 # config["PIDS"] plus the cleaned file for all participants.
 if config["DATA_CLEANING"]["COMPUTE"]:
    files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
    files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
 rule all:
    input:
--- a/config.yaml
+++ b/config.yaml
@ -564,3 +564,17 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
  CORR_THRESHOLD: 0.1
  CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
 ########################################################################################################################
 #                                                    Data Cleaning                                                     #
 ########################################################################################################################
 DATA_CLEANING:
  # Toggle read by the Snakefile: when True, the *_cleaned.csv feature files are added to the files to compute
  COMPUTE: False
  # The keys below are passed as params to the clean_sensor_features_* rules in rules/features.smk.
  # NOTE(review): presumably the NaN fraction at which a feature column is dropped - confirm in clean_sensor_features.R
  COLS_NAN_THRESHOLD: 0.3
  # Boolean flag despite the THRESHOLD suffix.
  # NOTE(review): presumably enables dropping zero-variance columns - confirm in clean_sensor_features.R
  COLS_VAR_THRESHOLD: True
  # Rows are kept only when their fraction of NA values is below this value (see the row filter in clean_sensor_features.R)
  ROWS_NAN_THRESHOLD: 0.3
  # NOTE(review): presumably a minimum ratio of hours with data required to keep a row - confirm in clean_sensor_features.R
  DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
  # NOTE(review): presumably the minimum share of valid value pairs needed to compute a correlation - confirm
  CORR_VALID_PAIRS_THRESHOLD: 0.5
  # NOTE(review): presumably the correlation level at which features count as highly correlated and are dropped - confirm
  CORR_THRESHOLD: 0.95
--- a/example_profile/Snakefile
+++ b/example_profile/Snakefile
@ -384,6 +384,11 @@ if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]:
 # Request the feature correlation heatmap report only when its PLOT flag is set.
 if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
    files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
 # Data Cleaning
 # When DATA_CLEANING.COMPUTE is set, request one cleaned feature file per pid in
 # config["PIDS"] plus the cleaned file for all participants.
 if config["DATA_CLEANING"]["COMPUTE"]:
    files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
    files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
 # Analysis Workflow Example
 models, scalers = [], []
 for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]:
@ -401,7 +406,6 @@ files_to_compute.extend(expand("data/raw/{pid}/participant_target_with_datetime.
 # One parsed targets file per participant in config["PIDS"].
 files_to_compute.extend(expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]))
 # Individual model
 # Per participant: the cleaned features, the model input, and one baselines file
 # for every CV method listed in PARAMS_FOR_ANALYSIS.CV_METHODS.
 files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
 files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
 files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
 files_to_compute.extend(expand(
@ -414,7 +418,6 @@ files_to_compute.extend(expand(
                            scaler=scalers))
 # Population model
 # Across all participants: the cleaned features, the model input, and one
 # baselines file for every CV method listed in PARAMS_FOR_ANALYSIS.CV_METHODS.
 files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
 files_to_compute.append("data/processed/models/population_model/input.csv")
 files_to_compute.extend(expand("data/processed/models/population_model/output_{cv_method}/baselines.csv", cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
 files_to_compute.extend(expand(
--- a/example_profile/example_config.yaml
+++ b/example_profile/example_config.yaml
@ -534,6 +534,19 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
  CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
 ########################################################################################################################
 #                                                    Data Cleaning                                                     #
 ########################################################################################################################
 DATA_CLEANING:
  # Toggle read by the Snakefile: when True, the *_cleaned.csv feature files are added to the files to compute
  COMPUTE: True
  # The keys below are passed as params to the clean_sensor_features_* rules in rules/features.smk.
  # NOTE(review): presumably the NaN fraction at which a feature column is dropped - confirm in clean_sensor_features.R
  COLS_NAN_THRESHOLD: 0.3
  # Boolean flag despite the THRESHOLD suffix.
  # NOTE(review): presumably enables dropping zero-variance columns - confirm in clean_sensor_features.R
  COLS_VAR_THRESHOLD: True
  # Rows are kept only when their fraction of NA values is below this value (see the row filter in clean_sensor_features.R)
  ROWS_NAN_THRESHOLD: 0.3
  # NOTE(review): presumably a minimum ratio of hours with data required to keep a row - confirm in clean_sensor_features.R
  DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
  # NOTE(review): presumably the minimum share of valid value pairs needed to compute a correlation - confirm
  CORR_VALID_PAIRS_THRESHOLD: 0.5
  # NOTE(review): presumably the correlation level at which features count as highly correlated and are dropped - confirm
  CORR_THRESHOLD: 0.95
 ########################################################################################################################
 #                                              Analysis Workflow Example                                               #
@ -551,14 +564,6 @@ PARAMS_FOR_ANALYSIS:
  TARGET:
    FOLDER: data/external/example_workflow
    CONTAINER: participant_target.csv
  # Cleaning Parameters
  COLS_NAN_THRESHOLD: 0.3
  COLS_VAR_THRESHOLD: True
  ROWS_NAN_THRESHOLD: 0.3
  DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
  CORR_VALID_PAIRS_THRESHOLD: 0.5
  CORR_THRESHOLD: 0.95
  MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM]
  CV_METHODS: [LeaveOneOut]
--- a/rules/features.smk
+++ b/rules/features.smk
@ -761,22 +761,6 @@ rule fitbit_sleep_intraday_r_features:
    script:
        "../src/features/entry.R"
 # Produces data/processed/features/{pid}/all_sensor_features.csv for one participant
 # by running the R script below on the feature files supplied by
 # input_merge_sensor_features_for_individual_participants (defined outside this view).
 rule merge_sensor_features_for_individual_participants:
    input:
        feature_files = input_merge_sensor_features_for_individual_participants
    output:
        "data/processed/features/{pid}/all_sensor_features.csv"
    script:
        "../src/features/utils/merge_sensor_features_for_individual_participants.R"
 # Produces the all-participants feature file by running the R script below on the
 # per-participant all_sensor_features.csv files of every pid in config["PIDS"].
 rule merge_sensor_features_for_all_participants:
    input:
        feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])
    output:
        "data/processed/features/all_participants/all_sensor_features.csv"
    script:
        "../src/features/utils/merge_sensor_features_for_all_participants.R"
 rule empatica_accelerometer_python_features:
    input:
        sensor_data = "data/raw/{pid}/empatica_accelerometer_with_datetime.csv",
@ -958,3 +942,49 @@ rule empatica_tags_r_features:
        "data/interim/{pid}/empatica_tags_features/empatica_tags_r_{provider_key}.csv"
    script:
        "../src/features/entry.R"
 # Produces data/processed/features/{pid}/all_sensor_features.csv for one participant
 # by running the R script below on the feature files supplied by
 # input_merge_sensor_features_for_individual_participants (defined outside this view).
 rule merge_sensor_features_for_individual_participants:
    input:
        feature_files = input_merge_sensor_features_for_individual_participants
    output:
        "data/processed/features/{pid}/all_sensor_features.csv"
    script:
        "../src/features/utils/merge_sensor_features_for_individual_participants.R"
 # Produces the all-participants feature file by running the R script below on the
 # per-participant all_sensor_features.csv files of every pid in config["PIDS"].
 rule merge_sensor_features_for_all_participants:
    input:
        feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])
    output:
        "data/processed/features/all_participants/all_sensor_features.csv"
    script:
        "../src/features/utils/merge_sensor_features_for_all_participants.R"
 # Produces the cleaned per-participant feature file by running clean_sensor_features.R
 # on the output of merge_sensor_features_for_individual_participants.
 # Every param is read from the DATA_CLEANING section of the config.
 rule clean_sensor_features_for_individual_participants:
    input:
        rules.merge_sensor_features_for_individual_participants.output
    params:
        cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
        data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
        corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
        corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
    output:
        "data/processed/features/{pid}/all_sensor_features_cleaned.csv"
    script:
        "../src/features/utils/clean_sensor_features.R"
 # Produces the cleaned all-participants feature file by running clean_sensor_features.R
 # on the output of merge_sensor_features_for_all_participants.
 # Uses the same script and DATA_CLEANING params as the per-participant cleaning rule.
 rule clean_sensor_features_for_all_participants:
    input:
        rules.merge_sensor_features_for_all_participants.output
    params:
        cols_nan_threshold = config["DATA_CLEANING"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["DATA_CLEANING"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["DATA_CLEANING"]["ROWS_NAN_THRESHOLD"],
        data_yielded_hours_ratio_threshold = config["DATA_CLEANING"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
        corr_valid_pairs_threshold = config["DATA_CLEANING"]["CORR_VALID_PAIRS_THRESHOLD"],
        corr_threshold = config["DATA_CLEANING"]["CORR_THRESHOLD"]
    output:
        "data/processed/features/all_participants/all_sensor_features_cleaned.csv"
    script:
        "../src/features/utils/clean_sensor_features.R"
--- a/rules/models.smk
+++ b/rules/models.smk
@ -53,36 +53,6 @@ rule parse_targets:
    script:
        "../src/models/workflow_example/parse_targets.py"
 # Produces the cleaned per-participant feature file by running the workflow_example
 # copy of clean_sensor_features.R on the output of
 # merge_sensor_features_for_individual_participants.
 # Every param is read from the PARAMS_FOR_ANALYSIS section of the config.
 rule clean_sensor_features_for_individual_participants:
    input:
        rules.merge_sensor_features_for_individual_participants.output
    params:
        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
        data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
        corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
        corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
    output:
        "data/processed/features/{pid}/all_sensor_features_cleaned.csv"
    script:
        "../src/models/workflow_example/clean_sensor_features.R"
 # Produces the cleaned all-participants feature file by running the workflow_example
 # copy of clean_sensor_features.R on the output of
 # merge_sensor_features_for_all_participants.
 # Every param is read from the PARAMS_FOR_ANALYSIS section of the config.
 rule clean_sensor_features_for_all_participants:
    input:
        rules.merge_sensor_features_for_all_participants.output
    params:
        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
        data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
        corr_valid_pairs_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_VALID_PAIRS_THRESHOLD"],
        corr_threshold = config["PARAMS_FOR_ANALYSIS"]["CORR_THRESHOLD"]
    output:
        "data/processed/features/all_participants/all_sensor_features_cleaned.csv"
    script:
        "../src/models/workflow_example/clean_sensor_features.R"
 rule merge_features_and_targets_for_individual_model:
    input:
        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv",
--- a/src/models/workflow_example/clean_sensor_features.R
+++ b/src/models/workflow_example/clean_sensor_features.R
@ -25,12 +25,6 @@ if(nrow(clean_features))
 # when drop_zero_variance_columns is set, keep the identifier/segment columns (names matching
 # the pattern below) plus every column with more than one distinct non-NA value
 if(drop_zero_variance_columns)
  clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
 # drop rows whose fraction of NA values is at or above rows_nan_threshold
 # (only rows strictly below the threshold are kept; the fraction is taken over all columns)
 clean_features <- clean_features %>% 
  mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>% 
  filter(percentage_na < rows_nan_threshold) %>% 
  select(-percentage_na)
 # drop highly correlated features
 features_for_corr <- clean_features %>% 
  select_if(is.numeric) %>% 
@ -47,4 +41,10 @@ highly_correlated_features <- features_for_corr %>%
 # remove every column whose name is listed in highly_correlated_features
 clean_features <- clean_features[, !names(clean_features) %in% highly_correlated_features]
 # drop rows whose fraction of NA values is at or above rows_nan_threshold
 # (only rows strictly below the threshold are kept; the fraction is taken over the
 # columns that remain at this point)
 clean_features <- clean_features %>% 
  mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>% 
  filter(percentage_na < rows_nan_threshold) %>% 
  select(-percentage_na)
 # write the cleaned features to the first Snakemake output, without row names
 write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
--- a/tools/config.schema.yaml
+++ b/tools/config.schema.yaml
@ -36,6 +36,7 @@ required:
  - HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT
  - HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT
  - HEATMAP_FEATURE_CORRELATION_MATRIX
  - DATA_CLEANING
 definitions:
  PROVIDER:
@ -1256,3 +1257,32 @@ properties:
      CORR_METHOD:
        type: string
        enum: ["pearson", "kendall", "spearman"]
  # Schema for the DATA_CLEANING config section; all seven keys are required.
  DATA_CLEANING:
    type: object
    required: [COMPUTE, COLS_NAN_THRESHOLD, COLS_VAR_THRESHOLD, ROWS_NAN_THRESHOLD, DATA_YIELDED_HOURS_RATIO_THRESHOLD, CORR_VALID_PAIRS_THRESHOLD, CORR_THRESHOLD]
    properties:
      COMPUTE: 
        type: boolean
      # number between 0 and 1 (bounds included)
      COLS_NAN_THRESHOLD:
        type: number
        minimum: 0
        maximum: 1
      # a boolean flag, not a numeric threshold, despite the key name
      COLS_VAR_THRESHOLD:
        type: boolean
      # number between 0 and 1 (bounds included)
      ROWS_NAN_THRESHOLD: 
        type: number
        minimum: 0
        maximum: 1
      # number between 0 and 1 (bounds included)
      DATA_YIELDED_HOURS_RATIO_THRESHOLD:
        type: number
        minimum: 0
        maximum: 1
      # number between 0 and 1 (bounds included)
      CORR_VALID_PAIRS_THRESHOLD:
        type: number
        minimum: 0
        maximum: 1
      # number between 0 and 1 (bounds included)
      CORR_THRESHOLD:
        type: number
        minimum: 0
        maximum: 1