Data cleaning section: replace "day_type" with "day_idx"

2020-07-27 18:27:36 -04:00 · 2020-07-27 18:27:36 -04:00 · 93157db210
parent 08ae04df41
commit 93157db210
6 changed files with 47 additions and 20 deletions
--- a/16
+++ b/16
@ -166,16 +166,22 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]:
        cols_nan_thresholds = cols_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in cols_nan_threshold))
    results = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"] + ["merged_population_model_results"]
-    files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv",
+    files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv",
                                pid = config["PIDS"],
                                min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
                                min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
                                source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
                                day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
-    files_to_compute.extend(expand("data/processed/data_for_population_model/{source}_{day_segment}_original.csv",
+    files_to_compute.extend(expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv",
                                min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
                                min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
                                source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
                                day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
    files_to_compute.extend(expand(
-                                expand("data/processed/{pid}/data_for_individual_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
+                                expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
                                    pid = config["PIDS"],
                                    min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
                                    min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
                                    days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
                                    days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
                                    cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
@ -185,7 +191,9 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]:
                                rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
                                cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]))
    files_to_compute.extend(expand(
-                                expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
+                                expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
                                    min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
                                    min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
                                    days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
                                    days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
                                    cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
--- a/config.yaml
+++ b/config.yaml
@ -251,6 +251,7 @@ PARAMS_FOR_ANALYSIS:
  PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile
  DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays]
  CATEGORICAL_DEMOGRAPHIC_FEATURES: ["gender"]
  FEATURES_EXCLUDE_DAY_IDX: False
  # Whether or not to include only days with enough valid sensed hours
  # logic can be found in rule phone_valid_sensed_days of rules/preprocessing.snakefile
--- a/rules/models.snakefile
+++ b/rules/models.snakefile
@ -19,7 +19,7 @@ def optional_input_days_to_include(wildcards):
 def optional_input_valid_sensed_days(wildcards):
    if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:
        # This input automatically triggers the rule phone_valid_sensed_days in preprocessing.snakefile
-        return ["data/interim/{pid}/phone_valid_sensed_days.csv"]
+        return ["data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"]
    else:
        return []
@ -31,15 +31,15 @@ rule merge_features_for_individual_model:
    params:
        source = "{source}"
    output:
-        "data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv"
+        "data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv"
    script:
        "../src/models/merge_features_for_individual_model.R"
 rule merge_features_for_population_model:
    input:
-        feature_files = expand("data/processed/{pid}/data_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
+        feature_files = expand("data/processed/{pid}/data_for_individual_model/{{min_valid_hours_per_day}}hours_{{min_valid_bins_per_hour}}bins/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
    output:
-        "data/processed/data_for_population_model/{source}_{day_segment}_original.csv"
+        "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv"
    script:
        "../src/models/merge_features_for_population_model.R"
@ -63,13 +63,14 @@ rule clean_features_for_individual_model:
    input:
        rules.merge_features_for_individual_model.output
    params:
        features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"],
        cols_nan_threshold = "{cols_nan_threshold}",
        cols_var_threshold = "{cols_var_threshold}",
        days_before_threshold = "{days_before_threshold}",
        days_after_threshold = "{days_after_threshold}",
        rows_nan_threshold = "{rows_nan_threshold}",
    output:
-        "data/processed/{pid}/data_for_individual_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
+        "data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
    script:
        "../src/models/clean_features_for_model.R"
@ -77,21 +78,22 @@ rule clean_features_for_population_model:
    input:
        rules.merge_features_for_population_model.output
    params:
        features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"],
        cols_nan_threshold = "{cols_nan_threshold}",
        cols_var_threshold = "{cols_var_threshold}",
        days_before_threshold = "{days_before_threshold}",
        days_after_threshold = "{days_after_threshold}",
        rows_nan_threshold = "{rows_nan_threshold}",
    output:
-        "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
+        "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
    script:
        "../src/models/clean_features_for_model.R"
 rule nan_cells_ratio_of_cleaned_features:
    input:
-        cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
+        cleaned_features = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
    output:
-        "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
+        "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
    script:
        "../src/models/nan_cells_ratio_of_cleaned_features.py"
--- a/src/models/clean_features_for_model.R
+++ b/src/models/clean_features_for_model.R
@ -6,15 +6,15 @@ filter_participant_without_enough_days <- function(clean_features, days_before_t
  if("pid" %in% colnames(clean_features)){
    clean_features <- clean_features %>% 
      group_by(pid) %>% 
-      add_count(pid, day_type) # this adds a new column "n"
+      add_count(pid, day_idx) # this adds a new column "n"
  } else {
-    clean_features <- clean_features %>% add_count(day_type)
+    clean_features <- clean_features %>% add_count(day_idx)
  }
  # Only keep participants with enough days before surgery and after discharge
  clean_features <- clean_features %>% 
-    mutate(count_before = ifelse(day_type == -1, n, NA), # before surgery
+    mutate(count_before = ifelse(day_idx < 0, n, NA), # before surgery
-          count_after = ifelse(day_type == 1, n, NA)) %>%  # after discharge
+          count_after = ifelse(day_idx > 0, n, NA)) %>%  # after discharge
    fill(count_before, .direction = "downup") %>% 
    fill(count_after, .direction = "downup") %>% 
    filter(count_before >= days_before_threshold & count_after >= days_after_threshold) %>% 
@ -30,6 +30,7 @@ drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]
 rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
 days_before_threshold <- as.numeric(snakemake@params[["days_before_threshold"]])
 days_after_threshold <- as.numeric(snakemake@params[["days_after_threshold"]])
 features_exclude_day_idx <- as.logical(snakemake@params[["features_exclude_day_idx"]])
 # We have to do this before and after dropping rows, which is why it is duplicated
@ -50,7 +51,10 @@ clean_features <- clean_features %>%
 if(nrow(clean_features) != 0){
  clean_features <- filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold)
-  clean_features <- clean_features %>% select(-day_type)
+  
  # Drop "day_idx" when it should not be used as a feature (controlled by features_exclude_day_idx)
  if(features_exclude_day_idx)
    clean_features <- clean_features %>% select(-day_idx)
 }
 write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
--- a/src/models/merge_features_for_individual_model.R
+++ b/src/models/merge_features_for_individual_model.R
@ -23,7 +23,9 @@ features_for_individual_model <- feature_files %>%
  reduce(full_join, by="local_date")
 if(!is.null(phone_valid_sensed_days) && source %in% c("phone_features", "phone_fitbit_features")){
-    features_for_individual_model <- merge(features_for_individual_model, read.csv(phone_valid_sensed_days), by="local_date") %>% select(-valid_hours)
+    valid_days <- read.csv(phone_valid_sensed_days)
    valid_days <- valid_days[valid_days$is_valid_sensed_day == TRUE, ]
    features_for_individual_model <- merge(features_for_individual_model, valid_days, by="local_date") %>% select(-valid_sensed_hours, -is_valid_sensed_day)
 }
 if(!is.null(days_to_include)){
--- a/src/models/select_days_to_analyse.py
+++ b/src/models/select_days_to_analyse.py
@ -6,15 +6,25 @@ def appendDaysInRange(days_to_analyse, start_date, end_date, day_type):
    num_of_days = (end_date - start_date).days
    if np.isnan(num_of_days):
        return days_to_analyse
    for day in range(num_of_days + 1):
-        days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_type": day_type}, ignore_index=True)
+
        if day_type == -1:
            day_idx = (num_of_days - day + 1) * day_type
        elif day_type == 1:
            day_idx = day + 1
        else:
            day_idx = 0
        days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_idx": day_idx}, ignore_index=True)
    return days_to_analyse
 days_before_surgery = int(snakemake.params["days_before_surgery"])
 days_in_hospital = str(snakemake.params["days_in_hospital"])
 days_after_discharge = int(snakemake.params["days_after_discharge"])
 participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"])
-days_to_analyse = pd.DataFrame(columns = ["local_date", "day_type"])
+days_to_analyse = pd.DataFrame(columns = ["local_date", "day_idx"])
 try:
    surgery_date, discharge_date = participant_info["surgery_date"].iloc[0].date(), participant_info["discharge_date"].iloc[0].date()