From 93157db210a1e78b79bc1ef47b5652a26bbffd5e Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Mon, 27 Jul 2020 18:27:36 -0400
Subject: [PATCH] Data cleaning section: replace "day_type" with "day_idx"

---
 Snakefile                                  | 16 ++++++++++++----
 config.yaml                                |  1 +
 rules/models.snakefile                     | 18 ++++++++++--------
 src/models/clean_features_for_model.R      | 14 +++++++++-----
 .../merge_features_for_individual_model.R  |  4 +++-
 src/models/select_days_to_analyse.py       | 14 ++++++++++++--
 6 files changed, 47 insertions(+), 20 deletions(-)

diff --git a/Snakefile b/Snakefile
index 6cd97e2b..ff348736 100644
--- a/Snakefile
+++ b/Snakefile
@@ -166,16 +166,22 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]:
         cols_nan_thresholds = cols_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in cols_nan_threshold))
     results = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"] + ["merged_population_model_results"]
 
-    files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv",
+    files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv",
                             pid = config["PIDS"],
+                            min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
+                            min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
                             source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
                             day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
-    files_to_compute.extend(expand("data/processed/data_for_population_model/{source}_{day_segment}_original.csv",
+    files_to_compute.extend(expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv",
+                            min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
+                            min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
                             source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
                             day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
     files_to_compute.extend(expand(
-                            expand("data/processed/{pid}/data_for_individual_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
+                            expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
                                 pid = config["PIDS"],
+                                min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
+                                min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
                                 days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
                                 days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
                                 cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
@@ -185,7 +191,9 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]:
                             rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
                             cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]))
     files_to_compute.extend(expand(
expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv", + expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv", + min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], + min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"], days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], diff --git a/config.yaml b/config.yaml index ac08a8a6..558af929 100644 --- a/config.yaml +++ b/config.yaml @@ -251,6 +251,7 @@ PARAMS_FOR_ANALYSIS: PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays] CATEGORICAL_DEMOGRAPHIC_FEATURES: ["gender"] + FEATURES_EXCLUDE_DAY_IDX: False # Whether or not to include only days with enough valid sensed hours # logic can be found in rule phone_valid_sensed_days of rules/preprocessing.snakefile diff --git a/rules/models.snakefile b/rules/models.snakefile index d360b8b1..69b89aa0 100644 --- a/rules/models.snakefile +++ b/rules/models.snakefile @@ -19,7 +19,7 @@ def optional_input_days_to_include(wildcards): def optional_input_valid_sensed_days(wildcards): if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]: # This input automatically trigers the rule phone_valid_sensed_days in preprocessing.snakefile - return ["data/interim/{pid}/phone_valid_sensed_days.csv"] + return ["data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"] else: return [] @@ -31,15 +31,15 @@ rule merge_features_for_individual_model: params: source = "{source}" output: - "data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv" + "data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv" script: "../src/models/merge_features_for_individual_model.R" rule merge_features_for_population_model: input: - feature_files = expand("data/processed/{pid}/data_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"]) + feature_files = expand("data/processed/{pid}/data_for_individual_model/{{min_valid_hours_per_day}}hours_{{min_valid_bins_per_hour}}bins/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"]) output: - "data/processed/data_for_population_model/{source}_{day_segment}_original.csv" + "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv" script: "../src/models/merge_features_for_population_model.R" @@ -63,13 +63,14 @@ rule clean_features_for_individual_model: input: rules.merge_features_for_individual_model.output params: + features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"], cols_nan_threshold = "{cols_nan_threshold}", cols_var_threshold = "{cols_var_threshold}", days_before_threshold = "{days_before_threshold}", days_after_threshold = 
"{days_after_threshold}", rows_nan_threshold = "{rows_nan_threshold}", output: - "data/processed/{pid}/data_for_individual_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" + "data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" script: "../src/models/clean_features_for_model.R" @@ -77,21 +78,22 @@ rule clean_features_for_population_model: input: rules.merge_features_for_population_model.output params: + features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"], cols_nan_threshold = "{cols_nan_threshold}", cols_var_threshold = "{cols_var_threshold}", days_before_threshold = "{days_before_threshold}", days_after_threshold = "{days_after_threshold}", rows_nan_threshold = "{rows_nan_threshold}", output: - "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" + "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" script: "../src/models/clean_features_for_model.R" rule nan_cells_ratio_of_cleaned_features: input: - cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" + cleaned_features = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" output: - "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv" + "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv" script: "../src/models/nan_cells_ratio_of_cleaned_features.py" diff --git a/src/models/clean_features_for_model.R b/src/models/clean_features_for_model.R index d766258c..ae4fade9 100644 --- a/src/models/clean_features_for_model.R +++ b/src/models/clean_features_for_model.R @@ -6,15 +6,15 @@ filter_participant_without_enough_days <- function(clean_features, days_before_t if("pid" %in% colnames(clean_features)){ clean_features <- clean_features %>% group_by(pid) %>% - add_count(pid, day_type) # this adds a new column "n" + add_count(pid, day_idx) # this adds a new column "n" } else { - clean_features <- clean_features %>% add_count(day_type) + clean_features <- clean_features %>% add_count(day_idx) } # Only keep participants with enough days before surgery and after discharge clean_features <- clean_features %>% - mutate(count_before = ifelse(day_type == -1, n, NA), # before surgery - count_after = ifelse(day_type == 1, n, NA)) %>% # after discharge + mutate(count_before = ifelse(day_idx < 0, n, NA), # before surgery + count_after 
+               count_after = ifelse(day_idx > 0, n, NA)) %>% # after discharge
         fill(count_before, .direction = "downup") %>%
         fill(count_after, .direction = "downup") %>%
         filter(count_before >= days_before_threshold & count_after >= days_after_threshold) %>%
@@ -30,6 +30,7 @@ drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]
 rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
 days_before_threshold <- as.numeric(snakemake@params[["days_before_threshold"]])
 days_after_threshold <- as.numeric(snakemake@params[["days_after_threshold"]])
+features_exclude_day_idx <- as.logical(snakemake@params[["features_exclude_day_idx"]])
 
 # We have to do this before and after dropping rows, that's why is duplicated
 
@@ -50,7 +51,10 @@ clean_features <- clean_features %>%
 
 if(nrow(clean_features) != 0){
     clean_features <- filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold)
-    clean_features <- clean_features %>% select(-day_type)
+
+    # include "day_idx" as features or not
+    if(features_exclude_day_idx)
+        clean_features <- clean_features %>% select(-day_idx)
 }
 
 write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
diff --git a/src/models/merge_features_for_individual_model.R b/src/models/merge_features_for_individual_model.R
index 9c9e91d0..ea99055a 100644
--- a/src/models/merge_features_for_individual_model.R
+++ b/src/models/merge_features_for_individual_model.R
@@ -23,7 +23,9 @@ features_for_individual_model <- feature_files %>%
     reduce(full_join, by="local_date")
 
 if(!is.null(phone_valid_sensed_days) && source %in% c("phone_features", "phone_fitbit_features")){
-    features_for_individual_model <- merge(features_for_individual_model, read.csv(phone_valid_sensed_days), by="local_date") %>% select(-valid_hours)
+    valid_days <- read.csv(phone_valid_sensed_days)
+    valid_days <- valid_days[valid_days$is_valid_sensed_day == TRUE, ]
+    features_for_individual_model <- merge(features_for_individual_model, valid_days, by="local_date") %>% select(-valid_sensed_hours, -is_valid_sensed_day)
 }
 
 if(!is.null(days_to_include)){
diff --git a/src/models/select_days_to_analyse.py b/src/models/select_days_to_analyse.py
index e10e2ed3..5a1370b0 100644
--- a/src/models/select_days_to_analyse.py
+++ b/src/models/select_days_to_analyse.py
@@ -6,15 +6,25 @@ def appendDaysInRange(days_to_analyse, start_date, end_date, day_type):
     num_of_days = (end_date - start_date).days
     if np.isnan(num_of_days):
         return days_to_analyse
+
     for day in range(num_of_days + 1):
-        days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_type": day_type}, ignore_index=True)
+
+        if day_type == -1:
+            day_idx = (num_of_days - day + 1) * day_type
+        elif day_type == 1:
+            day_idx = day + 1
+        else:
+            day_idx = 0
+
+        days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_idx": day_idx}, ignore_index=True)
+
     return days_to_analyse
 
 days_before_surgery = int(snakemake.params["days_before_surgery"])
 days_in_hospital = str(snakemake.params["days_in_hospital"])
 days_after_discharge = int(snakemake.params["days_after_discharge"])
 participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"])
-days_to_analyse = pd.DataFrame(columns = ["local_date", "day_type"])
+days_to_analyse = pd.DataFrame(columns = ["local_date", "day_idx"])
 
 try:
     surgery_date, discharge_date = participant_info["surgery_date"].iloc[0].date(), participant_info["discharge_date"].iloc[0].date()
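
Note (not part of the patch): a minimal standalone sketch of the day indexing scheme that appendDaysInRange now produces. Days before surgery (day_type == -1) get negative indices that count up to -1, days after discharge (day_type == 1) get positive indices starting at 1, and any other day_type maps to 0. The helper name day_indices and the list-based DataFrame construction below are illustrative only, not code from the repository.

from datetime import date, timedelta
import pandas as pd

def day_indices(start_date, end_date, day_type):
    # Mirror the day_idx numbering introduced by the patch:
    #   day_type == -1 (before surgery)  -> -(num_of_days + 1) .. -1
    #   day_type ==  1 (after discharge) ->   1 .. num_of_days + 1
    #   anything else                    ->   0
    num_of_days = (end_date - start_date).days
    rows = []
    for day in range(num_of_days + 1):
        if day_type == -1:
            day_idx = (num_of_days - day + 1) * day_type  # counts up toward -1
        elif day_type == 1:
            day_idx = day + 1                             # counts 1, 2, 3, ...
        else:
            day_idx = 0
        rows.append({"local_date": start_date + timedelta(days=day), "day_idx": day_idx})
    return pd.DataFrame(rows, columns=["local_date", "day_idx"])

# e.g. three days leading up to a surgery get day_idx -3, -2, -1
print(day_indices(date(2020, 7, 24), date(2020, 7, 26), -1))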