From 8aa25144f0568401305f0a63d57115981df827a1 Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Tue, 23 Jun 2020 20:46:42 -0400 Subject: [PATCH] Refactor analysis part of snakefile --- Snakefile | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++ config.yaml | 5 +-- 2 files changed, 96 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index d2bfe578..1df92c16 100644 --- a/Snakefile +++ b/Snakefile @@ -6,6 +6,8 @@ include: "rules/models.snakefile" include: "rules/reports.snakefile" include: "rules/mystudy.snakefile" # You can add snakfiles with rules tailored to your project +import itertools + files_to_compute = [] if len(config["PIDS"]) == 0: @@ -94,6 +96,97 @@ if config["CONVERSATION"]["COMPUTE"]: # TODO add files_to_compute.extend(optional_conversation_input(None)), the Android or iOS table gets processed depending on each participant files_to_compute.extend(expand("data/processed/{pid}/conversation_{segment}.csv",pid=config["PIDS"], segment = config["CONVERSATION"]["DAY_SEGMENTS"])) +if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]: + rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"] + cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"] + models, scalers, rows_nan_thresholds, cols_nan_thresholds = [], [], [], [] + for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]: + models = models + [model_name] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) * len(rows_nan_threshold) + scalers = scalers + config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name] * len(rows_nan_threshold) + rows_nan_thresholds = rows_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in rows_nan_threshold)) + cols_nan_thresholds = cols_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in cols_nan_threshold)) + results = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"] + ["merged_population_model_results"] + + files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv", + pid = config["PIDS"], + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand("data/processed/data_for_population_model/{source}_{day_segment}_original.csv", + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"])) + files_to_compute.extend(expand( + expand("data/processed/{pid}/data_for_individual_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv", + pid = config["PIDS"], + days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], + days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], + cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), + zip, + rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], + cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) + files_to_compute.extend(expand( + expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv", + days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], + days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], + cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), + zip, + rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], + cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) + files_to_compute.extend(expand("data/processed/data_for_population_model/demographic_features.csv")) + files_to_compute.extend(expand("data/processed/data_for_population_model/targets_{summarised}.csv", + summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"])) + files_to_compute.extend(expand( + expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv", + days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], + days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], + cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), + zip, + rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], + cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) + files_to_compute.extend(expand( + expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv", + days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], + days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], + cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"], + summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]), + zip, + rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], + cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) + files_to_compute.extend(expand( + expand("data/processed/output_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}_{cv_method}_baseline.csv", + days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], + days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], + cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], + cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"], + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"], + summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]), + zip, + rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], + cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) + files_to_compute.extend(expand( + expand("data/processed/output_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{{model}}/{cv_method}/{source}_{day_segment}_{summarised}_{{scaler}}/{result}.csv", + days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], + days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], + cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], + cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"], + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"], + summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"], + result = results), + zip, + rows_nan_threshold = rows_nan_thresholds, + cols_nan_threshold = cols_nan_thresholds, + model = models, + scaler = scalers)) + rule all: input: files_to_compute diff --git a/config.yaml b/config.yaml index 1b7dd30c..93c7acf5 100644 --- a/config.yaml +++ b/config.yaml @@ -183,6 +183,7 @@ CONVERSATION: ### Analysis ################################################################ PARAMS_FOR_ANALYSIS: + COMPUTE: False GROUNDTRUTH_TABLE: participant_info SOURCES: &sources ["phone_features", "fitbit_features", "phone_fitbit_features"] DAY_SEGMENTS: *day_segments @@ -206,9 +207,9 @@ PARAMS_FOR_ANALYSIS: DAYS_AFTER_DISCHARGE: 7 # Cleaning Parameters - COLS_NAN_THRESHOLD: 0.5 + COLS_NAN_THRESHOLD: [0.1, 0.3, 0.5] COLS_VAR_THRESHOLD: True - ROWS_NAN_THRESHOLD: 0.5 + ROWS_NAN_THRESHOLD: [0.1, 0.3, 0.5] PARTICIPANT_DAYS_BEFORE_THRESHOLD: 7 PARTICIPANT_DAYS_AFTER_THRESHOLD: 4