From 8aa25144f0568401305f0a63d57115981df827a1 Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Tue, 23 Jun 2020 20:46:42 -0400
Subject: [PATCH] Refactor analysis part of snakefile

---
 Snakefile   | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 config.yaml |  5 +--
 2 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index d2bfe578..1df92c16 100644
--- a/Snakefile
+++ b/Snakefile
@@ -6,6 +6,8 @@ include: "rules/models.snakefile"
 include: "rules/reports.snakefile"
 include: "rules/mystudy.snakefile" # You can add snakfiles with rules tailored to your project
 
+import itertools
+
 files_to_compute = []
 
 if len(config["PIDS"]) == 0:
@@ -94,6 +96,97 @@ if config["CONVERSATION"]["COMPUTE"]:
     # TODO add files_to_compute.extend(optional_conversation_input(None)), the Android or iOS table gets processed depending on each participant
     files_to_compute.extend(expand("data/processed/{pid}/conversation_{segment}.csv",pid=config["PIDS"], segment = config["CONVERSATION"]["DAY_SEGMENTS"]))
 
+if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]: # Analysis outputs are only added to files_to_compute when this flag is set
+    rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"] # List of thresholds; len() and iteration are applied to it below
+    cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"] # List of thresholds; paired by position with the rows list in the zip-expands below
+    models, scalers, rows_nan_thresholds, cols_nan_thresholds = [], [], [], [] # Parallel lists consumed together by the final zip-expand
+    for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]: # Each model appends one aligned run of entries to the four lists
+        models = models + [model_name] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) * len(rows_nan_threshold) # Model name repeated once per (scaler, rows threshold) pair
+        scalers = scalers + config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name] * len(rows_nan_threshold) # This model's scaler list repeated once per rows threshold
+        rows_nan_thresholds = rows_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in rows_nan_threshold)) # Each rows threshold repeated once per scaler
+        cols_nan_thresholds = cols_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in cols_nan_threshold)) # Same for cols thresholds; NOTE(review): only stays aligned with the lists above if both threshold lists have the same length
+    results = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"] + ["merged_population_model_results"] # merged_population_model_results is always requested in addition to the configured components
+
+    files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv", # Per-participant *_original.csv for every pid/source/day_segment combination
+                                pid = config["PIDS"],
+                                source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
+    files_to_compute.extend(expand("data/processed/data_for_population_model/{source}_{day_segment}_original.csv", # Population-level *_original.csv for every source/day_segment combination
+                                source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
+    files_to_compute.extend(expand( # Per-participant *_clean.csv; two-stage expand: the inner call fills the single-brace wildcards, the outer one the double-brace ones
+                                expand("data/processed/{pid}/data_for_individual_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
+                                    pid = config["PIDS"],
+                                    days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
+                                    days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
+                                    cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+                                    source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                    day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
+                                zip, # Pair rows/cols thresholds by position instead of taking every combination
+                                rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+                                cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]))
+    files_to_compute.extend(expand( # Population-level *_clean.csv, same two-stage expand as above
+                                expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
+                                    days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
+                                    days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
+                                    cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+                                    source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                    day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
+                                zip, # Pair rows/cols thresholds by position
+                                rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+                                cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]))
+    files_to_compute.extend(expand("data/processed/data_for_population_model/demographic_features.csv")) # No wildcards: a single fixed target
+    files_to_compute.extend(expand("data/processed/data_for_population_model/targets_{summarised}.csv", # One targets file per SUMMARISED value
+                                summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]))
+    files_to_compute.extend(expand( # Population-level *_nancellsratio.csv per cleaning-parameter folder
+                                expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv",
+                                    days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
+                                    days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
+                                    cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+                                    source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                    day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
+                                zip, # Pair rows/cols thresholds by position
+                                rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+                                cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]))
+    files_to_compute.extend(expand( # Population-level {source}_{day_segment}_{summarised}.csv per cleaning-parameter folder
+                                expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv",
+                                    days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
+                                    days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
+                                    cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+                                    source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                    day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"],
+                                    summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
+                                zip, # Pair rows/cols thresholds by position
+                                rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+                                cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]))
+    files_to_compute.extend(expand( # *_baseline.csv under output_population_model, one per cv_method/source/day_segment/summarised combination
+                                expand("data/processed/output_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}_{cv_method}_baseline.csv",
+                                    days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
+                                    days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
+                                    cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+                                    cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"],
+                                    source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                    day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"],
+                                    summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
+                                zip, # Pair rows/cols thresholds by position
+                                rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+                                cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]))
+    files_to_compute.extend(expand( # Per-model result files; {{model}} and {{scaler}} are also deferred to the outer zip-expand
+                                expand("data/processed/output_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{{model}}/{cv_method}/{source}_{day_segment}_{summarised}_{{scaler}}/{result}.csv",
+                                    days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
+                                    days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
+                                    cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+                                    cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"],
+                                    source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                    day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"],
+                                    summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"],
+                                    result = results), 
+                                zip, # Thresholds, model and scaler advance together, using the parallel lists built in the loop above
+                                rows_nan_threshold = rows_nan_thresholds,
+                                cols_nan_threshold = cols_nan_thresholds,
+                                model = models,
+                                scaler = scalers))
+
 rule all:
     input:
         files_to_compute
diff --git a/config.yaml b/config.yaml
index 1b7dd30c..93c7acf5 100644
--- a/config.yaml
+++ b/config.yaml
@@ -183,6 +183,7 @@ CONVERSATION:
 
 ### Analysis ################################################################
 PARAMS_FOR_ANALYSIS:
+  COMPUTE: False # When True, the Snakefile adds the analysis outputs to the targets of rule all
   GROUNDTRUTH_TABLE: participant_info
   SOURCES: &sources ["phone_features", "fitbit_features", "phone_fitbit_features"]
   DAY_SEGMENTS: *day_segments
@@ -206,9 +207,9 @@ PARAMS_FOR_ANALYSIS:
     DAYS_AFTER_DISCHARGE: 7
 
   # Cleaning Parameters
-  COLS_NAN_THRESHOLD: 0.5
+  COLS_NAN_THRESHOLD: [0.1, 0.3, 0.5] # List; zipped by position with ROWS_NAN_THRESHOLD in the Snakefile, so keep both lists the same length
   COLS_VAR_THRESHOLD: True
-  ROWS_NAN_THRESHOLD: 0.5
+  ROWS_NAN_THRESHOLD: [0.1, 0.3, 0.5] # List; zipped by position with COLS_NAN_THRESHOLD in the Snakefile, so keep both lists the same length
   PARTICIPANT_DAYS_BEFORE_THRESHOLD: 7
   PARTICIPANT_DAYS_AFTER_THRESHOLD: 4