# Patch summary (4 files): the single per-participant day threshold is replaced
# by two thresholds, one for days before surgery and one for days after
# discharge.
#
# config.yaml
#   Under PARAMS_FOR_ANALYSIS, PARTICIPANTS_DAY_THRESHOLD: 7 is replaced by
#   PARTICIPANT_DAYS_BEFORE_THRESHOLD: 7 and PARTICIPANT_DAYS_AFTER_THRESHOLD: 4.
#
# rules/models.snakefile
#   Rules clean_features_for_individual_model and
#   clean_features_for_population_model no longer pass
#   participants_day_threshold; they pass days_before_threshold and
#   days_after_threshold, read from the two new config keys.
#
# src/models/clean_features_for_model.R
#   filter_participant_without_enough_days(clean_features,
#   days_before_threshold, days_after_threshold) now:
#     1. adds a temporary column "n" with add_count(): rows per (pid, day_type)
#        when a "pid" column exists, rows per day_type otherwise;
#     2. copies "n" into count_before on day_type == -1 rows and into
#        count_after on day_type == 1 rows (NA elsewhere), then fill()s both
#        columns with .direction = "downup" so the counts reach the other rows
#        (the pid grouping is presumably still active here, making the fill
#        per participant -- confirm);
#     3. keeps rows where count_before >= days_before_threshold and
#        count_after >= days_after_threshold, drops n / count_before /
#        count_after, and ungroups.
#   The script body reads both thresholds from snakemake@params. After the
#   row-NA filter it re-applies the participant filter and drops the day_type
#   column, but only when the table still has rows.
#
# src/models/select_days_to_analyse.py
#   appendDaysInRange() takes a new day_type argument and stores it next to
#   local_date in every appended row. The callers tag days before surgery
#   with -1, days in hospital with 0 (only when days_in_hospital == "T") and
#   days after discharge with 1. The written frame gains a "day_type" column.
#
# NOTE(review): a participant with no day_type == -1 rows (or no == 1 rows) has
#   nothing to fill from, so that count stays NA and the >= comparison is NA.
#   dplyr::filter() drops rows whose condition is NA, so such participants
#   appear to be removed even with a threshold of 0 -- confirm this is intended.
# NOTE(review): when the table is empty after the row-NA filter, the
#   select(-day_type) branch is skipped, so the CSV written in that case may
#   still carry a day_type column -- confirm downstream readers tolerate it.
# NOTE(review): appendDaysInRange grows the frame with DataFrame.append inside
#   a loop; DataFrame.append was removed in pandas 2.0 -- TODO check the pinned
#   pandas version.
diff --git a/config.yaml b/config.yaml index 655bc06d..b31a131d 100644 --- a/config.yaml +++ b/config.yaml @@ -154,7 +154,8 @@ PARAMS_FOR_ANALYSIS: COLS_NAN_THRESHOLD: 0.5 COLS_VAR_THRESHOLD: True ROWS_NAN_THRESHOLD: 0.5 - PARTICIPANTS_DAY_THRESHOLD: 7 + PARTICIPANT_DAYS_BEFORE_THRESHOLD: 7 + PARTICIPANT_DAYS_AFTER_THRESHOLD: 4 SUMMARISED: ["summarised"] # "summarised" or "notsummarised" diff --git a/rules/models.snakefile b/rules/models.snakefile index 3108e7ab..a4219806 100644 --- a/rules/models.snakefile +++ b/rules/models.snakefile @@ -64,7 +64,8 @@ rule clean_features_for_individual_model: cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], - participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"] + days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], + days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"] output: "data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_clean.csv" script: @@ -77,7 +78,8 @@ rule clean_features_for_population_model: cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], - participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"] + days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], + days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"] output: "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv" script: diff --git a/src/models/clean_features_for_model.R b/src/models/clean_features_for_model.R index afef17ce..1dd8ee9d 100644 --- 
a/src/models/clean_features_for_model.R +++ b/src/models/clean_features_for_model.R @@ -2,14 +2,25 @@ source("packrat/init.R") library(tidyr) library(dplyr) -filter_participant_without_enough_days <- function(clean_features, participants_day_threshold){ - if("pid" %in% colnames(clean_features)) - clean_features <- clean_features %>% group_by(pid) - +filter_participant_without_enough_days <- function(clean_features, days_before_threshold, days_after_threshold){ + if("pid" %in% colnames(clean_features)){ + clean_features <- clean_features %>% + group_by(pid) %>% + add_count(pid, day_type) # this adds a new column "n" + } else { + clean_features <- clean_features %>% add_count(day_type) + } + + # Only keep participants with enough days before surgery and after discharge clean_features <- clean_features %>% - filter(n() >= participants_day_threshold) %>% + mutate(count_before = ifelse(day_type == -1, n, NA), # before surgery + count_after = ifelse(day_type == 1, n, NA)) %>% # after discharge + fill(count_before, .direction = "downup") %>% + fill(count_after, .direction = "downup") %>% + filter(count_before >= days_before_threshold & count_after >= days_after_threshold) %>% + select(-n, -count_before, -count_after) %>% ungroup() - + return(clean_features) } @@ -17,10 +28,12 @@ clean_features <- read.csv(snakemake@input[[1]]) cols_nan_threshold <- snakemake@params[["cols_nan_threshold"]] drop_zero_variance_columns <- snakemake@params[["cols_var_threshold"]] rows_nan_threshold <- snakemake@params[["rows_nan_threshold"]] -participants_day_threshold <- snakemake@params[["participants_day_threshold"]] +days_before_threshold <- snakemake@params[["days_before_threshold"]] +days_after_threshold <- snakemake@params[["days_after_threshold"]] + # We have to do this before and after dropping rows, that's why is duplicated -clean_features <- filter_participant_without_enough_days(clean_features, participants_day_threshold) +clean_features <- 
filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold) # drop columns with a percentage of NA values above cols_nan_threshold if(nrow(clean_features)) @@ -35,6 +48,9 @@ clean_features <- clean_features %>% filter(percentage_na < rows_nan_threshold) %>% select(-percentage_na) -clean_features <- filter_participant_without_enough_days(clean_features, participants_day_threshold) +if(nrow(clean_features) != 0){ + clean_features <- filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold) + clean_features <- clean_features %>% select(-day_type) +} write.csv(clean_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/models/select_days_to_analyse.py b/src/models/select_days_to_analyse.py index c74d1736..e10e2ed3 100644 --- a/src/models/select_days_to_analyse.py +++ b/src/models/select_days_to_analyse.py @@ -2,19 +2,19 @@ import numpy as np import pandas as pd from datetime import timedelta -def appendDaysInRange(days_to_analyse, start_date, end_date): +def appendDaysInRange(days_to_analyse, start_date, end_date, day_type): num_of_days = (end_date - start_date).days if np.isnan(num_of_days): return days_to_analyse for day in range(num_of_days + 1): - days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day)}, ignore_index=True) + days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_type": day_type}, ignore_index=True) return days_to_analyse days_before_surgery = int(snakemake.params["days_before_surgery"]) days_in_hospital = str(snakemake.params["days_in_hospital"]) days_after_discharge = int(snakemake.params["days_after_discharge"]) participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"]) -days_to_analyse = pd.DataFrame(columns = ["local_date"]) +days_to_analyse = pd.DataFrame(columns = ["local_date", "day_type"]) try: surgery_date, 
discharge_date = participant_info["surgery_date"].iloc[0].date(), participant_info["discharge_date"].iloc[0].date() @@ -24,9 +24,10 @@ else: start_date = surgery_date - timedelta(days = days_before_surgery) end_date = discharge_date + timedelta(days = days_after_discharge) - days_to_analyse = appendDaysInRange(days_to_analyse, start_date, surgery_date - timedelta(days = 1)) + # days before surgery: -1; in hospital: 0; after discharge: 1 + days_to_analyse = appendDaysInRange(days_to_analyse, start_date, surgery_date - timedelta(days = 1), -1) if days_in_hospital == "T": - days_to_analyse = appendDaysInRange(days_to_analyse, surgery_date, discharge_date) - days_to_analyse = appendDaysInRange(days_to_analyse, discharge_date + timedelta(days = 1), end_date) + days_to_analyse = appendDaysInRange(days_to_analyse, surgery_date, discharge_date, 0) + days_to_analyse = appendDaysInRange(days_to_analyse, discharge_date + timedelta(days = 1), end_date, 1) days_to_analyse.to_csv(snakemake.output[0], index=False)