Split days threshold of data cleaning into days_before_surgery and days_after_discharge

Co-authored-by: JulioV <juliovhz@gmail.com>
pull/95/head
Meng Li 2020-04-29 14:37:40 -04:00
parent 3314040912
commit 9ddb50ed59
4 changed files with 38 additions and 18 deletions

View File

@ -154,7 +154,8 @@ PARAMS_FOR_ANALYSIS:
COLS_NAN_THRESHOLD: 0.5 COLS_NAN_THRESHOLD: 0.5
COLS_VAR_THRESHOLD: True COLS_VAR_THRESHOLD: True
ROWS_NAN_THRESHOLD: 0.5 ROWS_NAN_THRESHOLD: 0.5
PARTICIPANTS_DAY_THRESHOLD: 7 PARTICIPANT_DAYS_BEFORE_THRESHOLD: 7
PARTICIPANT_DAYS_AFTER_THRESHOLD: 4
SUMMARISED: ["summarised"] # "summarised" or "notsummarised" SUMMARISED: ["summarised"] # "summarised" or "notsummarised"

View File

@ -64,7 +64,8 @@ rule clean_features_for_individual_model:
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"] days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"]
output: output:
"data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_clean.csv" "data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_clean.csv"
script: script:
@ -77,7 +78,8 @@ rule clean_features_for_population_model:
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"] days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"]
output: output:
"data/processed/data_for_population_model/{source}_{day_segment}_clean.csv" "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv"
script: script:

View File

@ -2,14 +2,25 @@ source("packrat/init.R")
library(tidyr) library(tidyr)
library(dplyr) library(dplyr)
filter_participant_without_enough_days <- function(clean_features, participants_day_threshold){ filter_participant_without_enough_days <- function(clean_features, days_before_threshold, days_after_threshold){
if("pid" %in% colnames(clean_features)) if("pid" %in% colnames(clean_features)){
clean_features <- clean_features %>% group_by(pid) clean_features <- clean_features %>%
group_by(pid) %>%
add_count(pid, day_type) # this adds a new column "n"
} else {
clean_features <- clean_features %>% add_count(day_type)
}
# Only keep participants with enough days before surgery and after discharge
clean_features <- clean_features %>% clean_features <- clean_features %>%
filter(n() >= participants_day_threshold) %>% mutate(count_before = ifelse(day_type == -1, n, NA), # before surgery
count_after = ifelse(day_type == 1, n, NA)) %>% # after discharge
fill(count_before, .direction = "downup") %>%
fill(count_after, .direction = "downup") %>%
filter(count_before >= days_before_threshold & count_after >= days_after_threshold) %>%
select(-n, -count_before, -count_after) %>%
ungroup() ungroup()
return(clean_features) return(clean_features)
} }
@ -17,10 +28,12 @@ clean_features <- read.csv(snakemake@input[[1]])
cols_nan_threshold <- snakemake@params[["cols_nan_threshold"]] cols_nan_threshold <- snakemake@params[["cols_nan_threshold"]]
drop_zero_variance_columns <- snakemake@params[["cols_var_threshold"]] drop_zero_variance_columns <- snakemake@params[["cols_var_threshold"]]
rows_nan_threshold <- snakemake@params[["rows_nan_threshold"]] rows_nan_threshold <- snakemake@params[["rows_nan_threshold"]]
participants_day_threshold <- snakemake@params[["participants_day_threshold"]] days_before_threshold <- snakemake@params[["days_before_threshold"]]
days_after_threshold <- snakemake@params[["days_after_threshold"]]
# We have to do this before and after dropping rows, that's why it is duplicated # We have to do this before and after dropping rows, that's why it is duplicated
clean_features <- filter_participant_without_enough_days(clean_features, participants_day_threshold) clean_features <- filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold)
# drop columns with a percentage of NA values above cols_nan_threshold # drop columns with a percentage of NA values above cols_nan_threshold
if(nrow(clean_features)) if(nrow(clean_features))
@ -35,6 +48,9 @@ clean_features <- clean_features %>%
filter(percentage_na < rows_nan_threshold) %>% filter(percentage_na < rows_nan_threshold) %>%
select(-percentage_na) select(-percentage_na)
clean_features <- filter_participant_without_enough_days(clean_features, participants_day_threshold) if(nrow(clean_features) != 0){
clean_features <- filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold)
clean_features <- clean_features %>% select(-day_type)
}
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE) write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -2,19 +2,19 @@ import numpy as np
import pandas as pd import pandas as pd
from datetime import timedelta from datetime import timedelta
def appendDaysInRange(days_to_analyse, start_date, end_date): def appendDaysInRange(days_to_analyse, start_date, end_date, day_type):
num_of_days = (end_date - start_date).days num_of_days = (end_date - start_date).days
if np.isnan(num_of_days): if np.isnan(num_of_days):
return days_to_analyse return days_to_analyse
for day in range(num_of_days + 1): for day in range(num_of_days + 1):
days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day)}, ignore_index=True) days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_type": day_type}, ignore_index=True)
return days_to_analyse return days_to_analyse
days_before_surgery = int(snakemake.params["days_before_surgery"]) days_before_surgery = int(snakemake.params["days_before_surgery"])
days_in_hospital = str(snakemake.params["days_in_hospital"]) days_in_hospital = str(snakemake.params["days_in_hospital"])
days_after_discharge = int(snakemake.params["days_after_discharge"]) days_after_discharge = int(snakemake.params["days_after_discharge"])
participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"]) participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"])
days_to_analyse = pd.DataFrame(columns = ["local_date"]) days_to_analyse = pd.DataFrame(columns = ["local_date", "day_type"])
try: try:
surgery_date, discharge_date = participant_info["surgery_date"].iloc[0].date(), participant_info["discharge_date"].iloc[0].date() surgery_date, discharge_date = participant_info["surgery_date"].iloc[0].date(), participant_info["discharge_date"].iloc[0].date()
@ -24,9 +24,10 @@ else:
start_date = surgery_date - timedelta(days = days_before_surgery) start_date = surgery_date - timedelta(days = days_before_surgery)
end_date = discharge_date + timedelta(days = days_after_discharge) end_date = discharge_date + timedelta(days = days_after_discharge)
days_to_analyse = appendDaysInRange(days_to_analyse, start_date, surgery_date - timedelta(days = 1)) # days before surgery: -1; in hospital: 0; after discharge: 1
days_to_analyse = appendDaysInRange(days_to_analyse, start_date, surgery_date - timedelta(days = 1), -1)
if days_in_hospital == "T": if days_in_hospital == "T":
days_to_analyse = appendDaysInRange(days_to_analyse, surgery_date, discharge_date) days_to_analyse = appendDaysInRange(days_to_analyse, surgery_date, discharge_date, 0)
days_to_analyse = appendDaysInRange(days_to_analyse, discharge_date + timedelta(days = 1), end_date) days_to_analyse = appendDaysInRange(days_to_analyse, discharge_date + timedelta(days = 1), end_date, 1)
days_to_analyse.to_csv(snakemake.output[0], index=False) days_to_analyse.to_csv(snakemake.output[0], index=False)