Data cleaning section: replace "day_type" with "day_idx"

pull/95/head
Meng Li 2020-07-27 18:27:36 -04:00
parent 08ae04df41
commit 93157db210
6 changed files with 47 additions and 20 deletions

View File

@@ -166,16 +166,22 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]:
cols_nan_thresholds = cols_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in cols_nan_threshold))
results = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"] + ["merged_population_model_results"]
files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv",
files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv",
pid = config["PIDS"],
min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
files_to_compute.extend(expand("data/processed/data_for_population_model/{source}_{day_segment}_original.csv",
files_to_compute.extend(expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv",
min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]))
files_to_compute.extend(expand(
expand("data/processed/{pid}/data_for_individual_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
pid = config["PIDS"],
min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
@@ -185,7 +191,9 @@ if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]:
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]))
files_to_compute.extend(expand(
expand("data/processed/data_for_population_model/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"],
min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"],
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],

View File

@@ -251,6 +251,7 @@ PARAMS_FOR_ANALYSIS:
PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile
DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays]
CATEGORICAL_DEMOGRAPHIC_FEATURES: ["gender"]
FEATURES_EXCLUDE_DAY_IDX: False
# Whether or not to include only days with enough valid sensed hours
# logic can be found in rule phone_valid_sensed_days of rules/preprocessing.snakefile

View File

@@ -19,7 +19,7 @@ def optional_input_days_to_include(wildcards):
def optional_input_valid_sensed_days(wildcards):
if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:
# This input automatically triggers the rule phone_valid_sensed_days in preprocessing.snakefile
return ["data/interim/{pid}/phone_valid_sensed_days.csv"]
return ["data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"]
else:
return []
@@ -31,15 +31,15 @@ rule merge_features_for_individual_model:
params:
source = "{source}"
output:
"data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv"
"data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv"
script:
"../src/models/merge_features_for_individual_model.R"
rule merge_features_for_population_model:
input:
feature_files = expand("data/processed/{pid}/data_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
feature_files = expand("data/processed/{pid}/data_for_individual_model/{{min_valid_hours_per_day}}hours_{{min_valid_bins_per_hour}}bins/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
output:
"data/processed/data_for_population_model/{source}_{day_segment}_original.csv"
"data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv"
script:
"../src/models/merge_features_for_population_model.R"
@@ -63,13 +63,14 @@ rule clean_features_for_individual_model:
input:
rules.merge_features_for_individual_model.output
params:
features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"],
cols_nan_threshold = "{cols_nan_threshold}",
cols_var_threshold = "{cols_var_threshold}",
days_before_threshold = "{days_before_threshold}",
days_after_threshold = "{days_after_threshold}",
rows_nan_threshold = "{rows_nan_threshold}",
output:
"data/processed/{pid}/data_for_individual_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
"data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
script:
"../src/models/clean_features_for_model.R"
@@ -77,21 +78,22 @@ rule clean_features_for_population_model:
input:
rules.merge_features_for_population_model.output
params:
features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"],
cols_nan_threshold = "{cols_nan_threshold}",
cols_var_threshold = "{cols_var_threshold}",
days_before_threshold = "{days_before_threshold}",
days_after_threshold = "{days_after_threshold}",
rows_nan_threshold = "{rows_nan_threshold}",
output:
"data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
"data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
script:
"../src/models/clean_features_for_model.R"
rule nan_cells_ratio_of_cleaned_features:
input:
cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
cleaned_features = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
output:
"data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
"data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
script:
"../src/models/nan_cells_ratio_of_cleaned_features.py"

View File

@@ -6,15 +6,15 @@ filter_participant_without_enough_days <- function(clean_features, days_before_t
if("pid" %in% colnames(clean_features)){
clean_features <- clean_features %>%
group_by(pid) %>%
add_count(pid, day_type) # this adds a new column "n"
add_count(pid, day_idx) # this adds a new column "n"
} else {
clean_features <- clean_features %>% add_count(day_type)
clean_features <- clean_features %>% add_count(day_idx)
}
# Only keep participants with enough days before surgery and after discharge
clean_features <- clean_features %>%
mutate(count_before = ifelse(day_type == -1, n, NA), # before surgery
count_after = ifelse(day_type == 1, n, NA)) %>% # after discharge
mutate(count_before = ifelse(day_idx < 0, n, NA), # before surgery
count_after = ifelse(day_idx > 0, n, NA)) %>% # after discharge
fill(count_before, .direction = "downup") %>%
fill(count_after, .direction = "downup") %>%
filter(count_before >= days_before_threshold & count_after >= days_after_threshold) %>%
@@ -30,6 +30,7 @@ drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]
rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]])
days_before_threshold <- as.numeric(snakemake@params[["days_before_threshold"]])
days_after_threshold <- as.numeric(snakemake@params[["days_after_threshold"]])
features_exclude_day_idx <- as.logical(snakemake@params[["features_exclude_day_idx"]])
# We have to do this before and after dropping rows, that's why is duplicated
@@ -50,7 +51,10 @@ clean_features <- clean_features %>%
if(nrow(clean_features) != 0){
clean_features <- filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold)
clean_features <- clean_features %>% select(-day_type)
# include "day_idx" as features or not
if(features_exclude_day_idx)
clean_features <- clean_features %>% select(-day_idx)
}
write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)

View File

@@ -23,7 +23,9 @@ features_for_individual_model <- feature_files %>%
reduce(full_join, by="local_date")
if(!is.null(phone_valid_sensed_days) && source %in% c("phone_features", "phone_fitbit_features")){
features_for_individual_model <- merge(features_for_individual_model, read.csv(phone_valid_sensed_days), by="local_date") %>% select(-valid_hours)
valid_days <- read.csv(phone_valid_sensed_days)
valid_days <- valid_days[valid_days$is_valid_sensed_day == TRUE, ]
features_for_individual_model <- merge(features_for_individual_model, valid_days, by="local_date") %>% select(-valid_sensed_hours, -is_valid_sensed_day)
}
if(!is.null(days_to_include)){

View File

@@ -6,15 +6,25 @@ def appendDaysInRange(days_to_analyse, start_date, end_date, day_type):
num_of_days = (end_date - start_date).days
if np.isnan(num_of_days):
return days_to_analyse
for day in range(num_of_days + 1):
days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_type": day_type}, ignore_index=True)
if day_type == -1:
day_idx = (num_of_days - day + 1) * day_type
elif day_type == 1:
day_idx = day + 1
else:
day_idx = 0
days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_idx": day_idx}, ignore_index=True)
return days_to_analyse
days_before_surgery = int(snakemake.params["days_before_surgery"])
days_in_hospital = str(snakemake.params["days_in_hospital"])
days_after_discharge = int(snakemake.params["days_after_discharge"])
participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"])
days_to_analyse = pd.DataFrame(columns = ["local_date", "day_type"])
days_to_analyse = pd.DataFrame(columns = ["local_date", "day_idx"])
try:
surgery_date, discharge_date = participant_info["surgery_date"].iloc[0].date(), participant_info["discharge_date"].iloc[0].date()