Data cleaning section: replace "day_type" with "day_idx"
parent
08ae04df41
commit
93157db210
16
Snakefile
16
Snakefile
|
@ -166,16 +166,22 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]:
|
||||||
cols_nan_thresholds = cols_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in cols_nan_threshold))
|
cols_nan_thresholds = cols_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in cols_nan_threshold))
|
||||||
results = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"] + ["merged_population_model_results"]
|
results = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"] + ["merged_population_model_results"]
|
||||||
|
|
||||||
files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv",
|
files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv",
|
||||||
pid = config["PIDS"],
|
pid = config["PIDS"],
|
||||||
|
min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
|
||||||
|
min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
|
||||||
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
|
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
|
||||||
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
|
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
|
||||||
files_to_compute.extend(expand("data/processed/data_for_population_model/{source}_{day_segment}_original.csv",
|
files_to_compute.extend(expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv",
|
||||||
|
min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
|
||||||
|
min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
|
||||||
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
|
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
|
||||||
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
|
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
|
||||||
files_to_compute.extend(expand(
|
files_to_compute.extend(expand(
|
||||||
expand("data/processed/{pid}/data_for_individual_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
|
expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
|
||||||
pid = config["PIDS"],
|
pid = config["PIDS"],
|
||||||
|
min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
|
||||||
|
min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
|
||||||
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
|
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
|
||||||
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
|
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
|
||||||
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
||||||
|
@ -185,7 +191,9 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]:
|
||||||
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
||||||
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]))
|
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]))
|
||||||
files_to_compute.extend(expand(
|
files_to_compute.extend(expand(
|
||||||
expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
|
expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
|
||||||
|
min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
|
||||||
|
min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
|
||||||
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
|
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
|
||||||
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
|
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
|
||||||
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
||||||
|
|
|
@ -251,6 +251,7 @@ PARAMS_FOR_ANALYSIS:
|
||||||
PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile
|
PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile
|
||||||
DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays]
|
DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays]
|
||||||
CATEGORICAL_DEMOGRAPHIC_FEATURES: ["gender"]
|
CATEGORICAL_DEMOGRAPHIC_FEATURES: ["gender"]
|
||||||
|
FEATURES_EXCLUDE_DAY_IDX: False
|
||||||
|
|
||||||
# Whether or not to include only days with enough valid sensed hours
|
# Whether or not to include only days with enough valid sensed hours
|
||||||
# logic can be found in rule phone_valid_sensed_days of rules/preprocessing.snakefile
|
# logic can be found in rule phone_valid_sensed_days of rules/preprocessing.snakefile
|
||||||
|
|
|
@ -19,7 +19,7 @@ def optional_input_days_to_include(wildcards):
|
||||||
def optional_input_valid_sensed_days(wildcards):
|
def optional_input_valid_sensed_days(wildcards):
|
||||||
if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:
|
if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:
|
||||||
# This input automatically triggers the rule phone_valid_sensed_days in preprocessing.snakefile
|
# This input automatically triggers the rule phone_valid_sensed_days in preprocessing.snakefile
|
||||||
return ["data/interim/{pid}/phone_valid_sensed_days.csv"]
|
return ["data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"]
|
||||||
else:
|
else:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
@ -31,15 +31,15 @@ rule merge_features_for_individual_model:
|
||||||
params:
|
params:
|
||||||
source = "{source}"
|
source = "{source}"
|
||||||
output:
|
output:
|
||||||
"data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv"
|
"data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv"
|
||||||
script:
|
script:
|
||||||
"../src/models/merge_features_for_individual_model.R"
|
"../src/models/merge_features_for_individual_model.R"
|
||||||
|
|
||||||
rule merge_features_for_population_model:
|
rule merge_features_for_population_model:
|
||||||
input:
|
input:
|
||||||
feature_files = expand("data/processed/{pid}/data_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
|
feature_files = expand("data/processed/{pid}/data_for_individual_model/{{min_valid_hours_per_day}}hours_{{min_valid_bins_per_hour}}bins/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
|
||||||
output:
|
output:
|
||||||
"data/processed/data_for_population_model/{source}_{day_segment}_original.csv"
|
"data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv"
|
||||||
script:
|
script:
|
||||||
"../src/models/merge_features_for_population_model.R"
|
"../src/models/merge_features_for_population_model.R"
|
||||||
|
|
||||||
|
@ -63,13 +63,14 @@ rule clean_features_for_individual_model:
|
||||||
input:
|
input:
|
||||||
rules.merge_features_for_individual_model.output
|
rules.merge_features_for_individual_model.output
|
||||||
params:
|
params:
|
||||||
|
features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"],
|
||||||
cols_nan_threshold = "{cols_nan_threshold}",
|
cols_nan_threshold = "{cols_nan_threshold}",
|
||||||
cols_var_threshold = "{cols_var_threshold}",
|
cols_var_threshold = "{cols_var_threshold}",
|
||||||
days_before_threshold = "{days_before_threshold}",
|
days_before_threshold = "{days_before_threshold}",
|
||||||
days_after_threshold = "{days_after_threshold}",
|
days_after_threshold = "{days_after_threshold}",
|
||||||
rows_nan_threshold = "{rows_nan_threshold}",
|
rows_nan_threshold = "{rows_nan_threshold}",
|
||||||
output:
|
output:
|
||||||
"data/processed/{pid}/data_for_individual_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
|
"data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
|
||||||
script:
|
script:
|
||||||
"../src/models/clean_features_for_model.R"
|
"../src/models/clean_features_for_model.R"
|
||||||
|
|
||||||
|
@ -77,21 +78,22 @@ rule clean_features_for_population_model:
|
||||||
input:
|
input:
|
||||||
rules.merge_features_for_population_model.output
|
rules.merge_features_for_population_model.output
|
||||||
params:
|
params:
|
||||||
|
features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"],
|
||||||
cols_nan_threshold = "{cols_nan_threshold}",
|
cols_nan_threshold = "{cols_nan_threshold}",
|
||||||
cols_var_threshold = "{cols_var_threshold}",
|
cols_var_threshold = "{cols_var_threshold}",
|
||||||
days_before_threshold = "{days_before_threshold}",
|
days_before_threshold = "{days_before_threshold}",
|
||||||
days_after_threshold = "{days_after_threshold}",
|
days_after_threshold = "{days_after_threshold}",
|
||||||
rows_nan_threshold = "{rows_nan_threshold}",
|
rows_nan_threshold = "{rows_nan_threshold}",
|
||||||
output:
|
output:
|
||||||
"data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
|
"data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
|
||||||
script:
|
script:
|
||||||
"../src/models/clean_features_for_model.R"
|
"../src/models/clean_features_for_model.R"
|
||||||
|
|
||||||
rule nan_cells_ratio_of_cleaned_features:
|
rule nan_cells_ratio_of_cleaned_features:
|
||||||
input:
|
input:
|
||||||
cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
|
cleaned_features = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
|
||||||
output:
|
output:
|
||||||
"data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
|
"data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
|
||||||
script:
|
script:
|
||||||
"../src/models/nan_cells_ratio_of_cleaned_features.py"
|
"../src/models/nan_cells_ratio_of_cleaned_features.py"
|
||||||
|
|
||||||
|
|
|
@ -6,15 +6,15 @@ filter_participant_without_enough_days <- function(clean_features, days_before_t
|
||||||
if("pid" %in% colnames(clean_features)){
|
if("pid" %in% colnames(clean_features)){
|
||||||
clean_features <- clean_features %>%
|
clean_features <- clean_features %>%
|
||||||
group_by(pid) %>%
|
group_by(pid) %>%
|
||||||
add_count(pid, day_type) # this adds a new column "n"
|
add_count(pid, day_idx) # this adds a new column "n"
|
||||||
} else {
|
} else {
|
||||||
clean_features <- clean_features %>% add_count(day_type)
|
clean_features <- clean_features %>% add_count(day_idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Only keep participants with enough days before surgery and after discharge
|
# Only keep participants with enough days before surgery and after discharge
|
||||||
clean_features <- clean_features %>%
|
clean_features <- clean_features %>%
|
||||||
mutate(count_before = ifelse(day_type == -1, n, NA), # before surgery
|
mutate(count_before = ifelse(day_idx < 0, n, NA), # before surgery
|
||||||
count_after = ifelse(day_type == 1, n, NA)) %>% # after discharge
|
count_after = ifelse(day_idx > 0, n, NA)) %>% # after discharge
|
||||||
fill(count_before, .direction = "downup") %>%
|
fill(count_before, .direction = "downup") %>%
|
||||||
fill(count_after, .direction = "downup") %>%
|
fill(count_after, .direction = "downup") %>%
|
||||||
filter(count_before >= days_before_threshold & count_after >= days_after_threshold) %>%
|
filter(count_before >= days_before_threshold & count_after >= days_after_threshold) %>%
|
||||||
|
@ -30,6 +30,7 @@ drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]
|
||||||
rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
|
rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
|
||||||
days_before_threshold <- as.numeric(snakemake@params[["days_before_threshold"]])
|
days_before_threshold <- as.numeric(snakemake@params[["days_before_threshold"]])
|
||||||
days_after_threshold <- as.numeric(snakemake@params[["days_after_threshold"]])
|
days_after_threshold <- as.numeric(snakemake@params[["days_after_threshold"]])
|
||||||
|
features_exclude_day_idx <- as.logical(snakemake@params[["features_exclude_day_idx"]])
|
||||||
|
|
||||||
|
|
||||||
# We have to do this before and after dropping rows, that's why it is duplicated
|
# We have to do this before and after dropping rows, that's why it is duplicated
|
||||||
|
@ -50,7 +51,10 @@ clean_features <- clean_features %>%
|
||||||
|
|
||||||
if(nrow(clean_features) != 0){
|
if(nrow(clean_features) != 0){
|
||||||
clean_features <- filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold)
|
clean_features <- filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold)
|
||||||
clean_features <- clean_features %>% select(-day_type)
|
|
||||||
|
# drop "day_idx" from the features when features_exclude_day_idx is TRUE; otherwise keep it
|
||||||
|
if(features_exclude_day_idx)
|
||||||
|
clean_features <- clean_features %>% select(-day_idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
|
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
|
||||||
|
|
|
@ -23,7 +23,9 @@ features_for_individual_model <- feature_files %>%
|
||||||
reduce(full_join, by="local_date")
|
reduce(full_join, by="local_date")
|
||||||
|
|
||||||
if(!is.null(phone_valid_sensed_days) && source %in% c("phone_features", "phone_fitbit_features")){
|
if(!is.null(phone_valid_sensed_days) && source %in% c("phone_features", "phone_fitbit_features")){
|
||||||
features_for_individual_model <- merge(features_for_individual_model, read.csv(phone_valid_sensed_days), by="local_date") %>% select(-valid_hours)
|
valid_days <- read.csv(phone_valid_sensed_days)
|
||||||
|
valid_days <- valid_days[valid_days$is_valid_sensed_day == TRUE, ]
|
||||||
|
features_for_individual_model <- merge(features_for_individual_model, valid_days, by="local_date") %>% select(-valid_sensed_hours, -is_valid_sensed_day)
|
||||||
}
|
}
|
||||||
|
|
||||||
if(!is.null(days_to_include)){
|
if(!is.null(days_to_include)){
|
||||||
|
|
|
@ -6,15 +6,25 @@ def appendDaysInRange(days_to_analyse, start_date, end_date, day_type):
|
||||||
num_of_days = (end_date - start_date).days
|
num_of_days = (end_date - start_date).days
|
||||||
if np.isnan(num_of_days):
|
if np.isnan(num_of_days):
|
||||||
return days_to_analyse
|
return days_to_analyse
|
||||||
|
|
||||||
for day in range(num_of_days + 1):
|
for day in range(num_of_days + 1):
|
||||||
days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_type": day_type}, ignore_index=True)
|
|
||||||
|
if day_type == -1:
|
||||||
|
day_idx = (num_of_days - day + 1) * day_type
|
||||||
|
elif day_type == 1:
|
||||||
|
day_idx = day + 1
|
||||||
|
else:
|
||||||
|
day_idx = 0
|
||||||
|
|
||||||
|
days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_idx": day_idx}, ignore_index=True)
|
||||||
|
|
||||||
return days_to_analyse
|
return days_to_analyse
|
||||||
|
|
||||||
days_before_surgery = int(snakemake.params["days_before_surgery"])
|
days_before_surgery = int(snakemake.params["days_before_surgery"])
|
||||||
days_in_hospital = str(snakemake.params["days_in_hospital"])
|
days_in_hospital = str(snakemake.params["days_in_hospital"])
|
||||||
days_after_discharge = int(snakemake.params["days_after_discharge"])
|
days_after_discharge = int(snakemake.params["days_after_discharge"])
|
||||||
participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"])
|
participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"])
|
||||||
days_to_analyse = pd.DataFrame(columns = ["local_date", "day_type"])
|
days_to_analyse = pd.DataFrame(columns = ["local_date", "day_idx"])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
surgery_date, discharge_date = participant_info["surgery_date"].iloc[0].date(), participant_info["discharge_date"].iloc[0].date()
|
surgery_date, discharge_date = participant_info["surgery_date"].iloc[0].date(), participant_info["discharge_date"].iloc[0].date()
|
||||||
|
|
Loading…
Reference in New Issue