From 93157db210a1e78b79bc1ef47b5652a26bbffd5e Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Mon, 27 Jul 2020 18:27:36 -0400
Subject: [PATCH] Data cleaning section: replace "day_type" with "day_idx"

---
 Snakefile                                  | 16 ++++++++++++----
 config.yaml                                |  1 +
 rules/models.snakefile                     | 18 ++++++++++--------
 src/models/clean_features_for_model.R      | 14 +++++++++-----
 .../merge_features_for_individual_model.R  |  4 +++-
 src/models/select_days_to_analyse.py       | 14 ++++++++++++--
 6 files changed, 47 insertions(+), 20 deletions(-)

diff --git a/Snakefile b/Snakefile
index 6cd97e2b..ff348736 100644
--- a/Snakefile
+++ b/Snakefile
@@ -166,16 +166,22 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]:
         cols_nan_thresholds = cols_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in cols_nan_threshold))
     results = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"] + ["merged_population_model_results"]
 
-    files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv",
+    files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv",
                             pid = config["PIDS"],
+                            min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
+                            min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
                             source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
                             day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
-    files_to_compute.extend(expand("data/processed/data_for_population_model/{source}_{day_segment}_original.csv",
+    files_to_compute.extend(expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv",
+                            min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
+                            min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
                             source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
                             day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
     files_to_compute.extend(expand(
-                            expand("data/processed/{pid}/data_for_individual_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
+                            expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
                                 pid = config["PIDS"],
+                                min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
+                                min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
                                 days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
                                 days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
                                 cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
@@ -185,7 +191,9 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]:
                             rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
                             cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]))
     files_to_compute.extend(expand(
expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv", + expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv", + min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], + min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"], days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], diff --git a/config.yaml b/config.yaml index ac08a8a6..558af929 100644 --- a/config.yaml +++ b/config.yaml @@ -251,6 +251,7 @@ PARAMS_FOR_ANALYSIS: PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays] CATEGORICAL_DEMOGRAPHIC_FEATURES: ["gender"] + FEATURES_EXCLUDE_DAY_IDX: False # Whether or not to include only days with enough valid sensed hours # logic can be found in rule phone_valid_sensed_days of rules/preprocessing.snakefile diff --git a/rules/models.snakefile b/rules/models.snakefile index d360b8b1..69b89aa0 100644 --- a/rules/models.snakefile +++ b/rules/models.snakefile @@ -19,7 +19,7 @@ def optional_input_days_to_include(wildcards): def optional_input_valid_sensed_days(wildcards): if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]: # This input automatically trigers the rule phone_valid_sensed_days in preprocessing.snakefile - return ["data/interim/{pid}/phone_valid_sensed_days.csv"] + return ["data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"] else: return [] @@ -31,15 +31,15 @@ rule merge_features_for_individual_model: params: source = "{source}" output: - "data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv" + "data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv" script: "../src/models/merge_features_for_individual_model.R" rule merge_features_for_population_model: input: - feature_files = expand("data/processed/{pid}/data_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"]) + feature_files = expand("data/processed/{pid}/data_for_individual_model/{{min_valid_hours_per_day}}hours_{{min_valid_bins_per_hour}}bins/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"]) output: - "data/processed/data_for_population_model/{source}_{day_segment}_original.csv" + "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv" script: "../src/models/merge_features_for_population_model.R" @@ -63,13 +63,14 @@ rule clean_features_for_individual_model: input: rules.merge_features_for_individual_model.output params: + features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"], cols_nan_threshold = "{cols_nan_threshold}", cols_var_threshold = "{cols_var_threshold}", days_before_threshold = "{days_before_threshold}", days_after_threshold = 
"{days_after_threshold}", rows_nan_threshold = "{rows_nan_threshold}", output: - "data/processed/{pid}/data_for_individual_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" + "data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" script: "../src/models/clean_features_for_model.R" @@ -77,21 +78,22 @@ rule clean_features_for_population_model: input: rules.merge_features_for_population_model.output params: + features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"], cols_nan_threshold = "{cols_nan_threshold}", cols_var_threshold = "{cols_var_threshold}", days_before_threshold = "{days_before_threshold}", days_after_threshold = "{days_after_threshold}", rows_nan_threshold = "{rows_nan_threshold}", output: - "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" + "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" script: "../src/models/clean_features_for_model.R" rule nan_cells_ratio_of_cleaned_features: input: - cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" + cleaned_features = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" output: - "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv" + "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv" script: "../src/models/nan_cells_ratio_of_cleaned_features.py" diff --git a/src/models/clean_features_for_model.R b/src/models/clean_features_for_model.R index d766258c..ae4fade9 100644 --- a/src/models/clean_features_for_model.R +++ b/src/models/clean_features_for_model.R @@ -6,15 +6,15 @@ filter_participant_without_enough_days <- function(clean_features, days_before_t if("pid" %in% colnames(clean_features)){ clean_features <- clean_features %>% group_by(pid) %>% - add_count(pid, day_type) # this adds a new column "n" + add_count(pid, day_idx) # this adds a new column "n" } else { - clean_features <- clean_features %>% add_count(day_type) + clean_features <- clean_features %>% add_count(day_idx) } # Only keep participants with enough days before surgery and after discharge clean_features <- clean_features %>% - mutate(count_before = ifelse(day_type == -1, n, NA), # before surgery - count_after = ifelse(day_type == 1, n, NA)) %>% # after discharge + mutate(count_before = ifelse(day_idx < 0, n, NA), # before surgery + count_after 
+               count_after = ifelse(day_idx > 0, n, NA)) %>% # after discharge
         fill(count_before, .direction = "downup") %>%
         fill(count_after, .direction = "downup") %>%
         filter(count_before >= days_before_threshold & count_after >= days_after_threshold) %>%
@@ -30,6 +30,7 @@ drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]
 rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
 days_before_threshold <- as.numeric(snakemake@params[["days_before_threshold"]])
 days_after_threshold <- as.numeric(snakemake@params[["days_after_threshold"]])
+features_exclude_day_idx <- as.logical(snakemake@params[["features_exclude_day_idx"]])
 
 # We have to do this before and after dropping rows, that's why is duplicated
 
@@ -50,7 +51,10 @@ clean_features <- clean_features %>%
 
 if(nrow(clean_features) != 0){
     clean_features <- filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold)
-    clean_features <- clean_features %>% select(-day_type)
+
+    # include "day_idx" as features or not
+    if(features_exclude_day_idx)
+        clean_features <- clean_features %>% select(-day_idx)
 }
 
 write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
diff --git a/src/models/merge_features_for_individual_model.R b/src/models/merge_features_for_individual_model.R
index 9c9e91d0..ea99055a 100644
--- a/src/models/merge_features_for_individual_model.R
+++ b/src/models/merge_features_for_individual_model.R
@@ -23,7 +23,9 @@ features_for_individual_model <- feature_files %>%
     reduce(full_join, by="local_date")
 
 if(!is.null(phone_valid_sensed_days) && source %in% c("phone_features", "phone_fitbit_features")){
-    features_for_individual_model <- merge(features_for_individual_model, read.csv(phone_valid_sensed_days), by="local_date") %>% select(-valid_hours)
+    valid_days <- read.csv(phone_valid_sensed_days)
+    valid_days <- valid_days[valid_days$is_valid_sensed_day == TRUE, ]
+    features_for_individual_model <- merge(features_for_individual_model, valid_days, by="local_date") %>% select(-valid_sensed_hours, -is_valid_sensed_day)
 }
 
 if(!is.null(days_to_include)){
diff --git a/src/models/select_days_to_analyse.py b/src/models/select_days_to_analyse.py
index e10e2ed3..5a1370b0 100644
--- a/src/models/select_days_to_analyse.py
+++ b/src/models/select_days_to_analyse.py
@@ -6,15 +6,25 @@ def appendDaysInRange(days_to_analyse, start_date, end_date, day_type):
     num_of_days = (end_date - start_date).days
     if np.isnan(num_of_days):
         return days_to_analyse
+
     for day in range(num_of_days + 1):
-        days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_type": day_type}, ignore_index=True)
+
+        if day_type == -1:
+            day_idx = (num_of_days - day + 1) * day_type
+        elif day_type == 1:
+            day_idx = day + 1
+        else:
+            day_idx = 0
+
+        days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_idx": day_idx}, ignore_index=True)
+
     return days_to_analyse
 
 days_before_surgery = int(snakemake.params["days_before_surgery"])
 days_in_hospital = str(snakemake.params["days_in_hospital"])
 days_after_discharge = int(snakemake.params["days_after_discharge"])
 participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"])
-days_to_analyse = pd.DataFrame(columns = ["local_date", "day_type"])
+days_to_analyse = pd.DataFrame(columns = ["local_date", "day_idx"])
 
 try:
     surgery_date, discharge_date = participant_info["surgery_date"].iloc[0].date(), participant_info["discharge_date"].iloc[0].date()
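
Note (not part of the patch): a minimal standalone sketch of the day indexing scheme that appendDaysInRange now produces. Days before surgery (day_type == -1) get negative indices that count up to -1, days after discharge (day_type == 1) get positive indices starting at 1, and any other day_type maps to 0. The helper name day_indices and the list-based DataFrame construction below are illustrative only, not code from the repository.

from datetime import date, timedelta
import pandas as pd

def day_indices(start_date, end_date, day_type):
    # Mirror the day_idx numbering introduced by the patch:
    #   day_type == -1 (before surgery)  -> -(num_of_days + 1) .. -1
    #   day_type ==  1 (after discharge) ->   1 .. num_of_days + 1
    #   anything else                    ->   0
    num_of_days = (end_date - start_date).days
    rows = []
    for day in range(num_of_days + 1):
        if day_type == -1:
            day_idx = (num_of_days - day + 1) * day_type  # counts up toward -1
        elif day_type == 1:
            day_idx = day + 1                             # counts 1, 2, 3, ...
        else:
            day_idx = 0
        rows.append({"local_date": start_date + timedelta(days=day), "day_idx": day_idx})
    return pd.DataFrame(rows, columns=["local_date", "day_idx"])

# e.g. three days leading up to a surgery get day_idx -3, -2, -1
print(day_indices(date(2020, 7, 24), date(2020, 7, 26), -1))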