From 0e173872dfa5bb3e676145c3f9a33af8d9c689d9 Mon Sep 17 00:00:00 2001
From: JulioV
Date: Tue, 17 Mar 2020 21:15:53 -0400
Subject: [PATCH] Refactor select_days_to_analyse, fix merge bugs, add clean
 metrics for model

---
 .gitignore                                   |  2 +-
 Snakefile                                    | 11 +++-
 config.yaml                                  | 24 ++++++--
 rules/models.snakefile                       | 56 +++++++++++++++++--
 rules/mystudy.snakefile                      |  9 ++-
 src/models/clean_metrics_for_model.R         | 40 +++++++++++++
 .../merge_metrics_for_individual_model.R     | 14 +++--
 .../merge_metrics_for_population_model.R     |  6 +-
 src/models/predict_model.py                  |  0
 src/models/select_days_to_analyse.py         | 15 ++---
 src/models/train_model.py                    |  0
 11 files changed, 141 insertions(+), 36 deletions(-)
 create mode 100644 src/models/clean_metrics_for_model.R
 delete mode 100644 src/models/predict_model.py
 delete mode 100644 src/models/train_model.py

diff --git a/.gitignore b/.gitignore
index bd8660bc..b1e8af77 100644
--- a/.gitignore
+++ b/.gitignore
@@ -107,4 +107,4 @@ reports/
 *.Rproj
 .RData
 .Rhistory
-*_profile/
\ No newline at end of file
+sn_profile_*/
diff --git a/Snakefile b/Snakefile
index 0037cdfc..c504f623 100644
--- a/Snakefile
+++ b/Snakefile
@@ -66,11 +66,18 @@ rule all:
             pid = config["PIDS"],
             day_segment = config["STEP"]["DAY_SEGMENTS"]),
         # Models
-        expand("data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}.csv",
+        expand("data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_original.csv",
                             pid = config["PIDS"],
                             source = config["METRICS_FOR_ANALYSIS"]["SOURCES"],
                             day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
-        expand("data/processed/metrics_for_population_model/{source}_{day_segment}.csv",
+        expand("data/processed/metrics_for_population_model/{source}_{day_segment}_original.csv",
+                            source = config["METRICS_FOR_ANALYSIS"]["SOURCES"],
+                            day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
+        expand("data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_clean.csv",
+                            pid = config["PIDS"],
+                            source = config["METRICS_FOR_ANALYSIS"]["SOURCES"],
+                            day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
+        expand("data/processed/metrics_for_population_model/{source}_{day_segment}_clean.csv",
                             source = config["METRICS_FOR_ANALYSIS"]["SOURCES"],
                             day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
         # Vizualisations
diff --git a/config.yaml b/config.yaml
index 5a1b5227..5b68e0c2 100644
--- a/config.yaml
+++ b/config.yaml
@@ -125,12 +125,28 @@ STEP:
   INCLUDE_ZERO_STEP_ROWS: True
 
 METRICS_FOR_ANALYSIS:
+  GROUNDTRUTH_TABLE: participant_info
   SOURCES: &sources ["phone_metrics", "fitbit_metrics", "phone_fitbit_metrics"]
   DAY_SEGMENTS: *day_segments
   PHONE_METRICS: [accelerometer, applications_foreground, battery, call_incoming, call_missed, call_outgoing, google_activity_recognition, light, location_barnett, screen, sms_received, sms_sent]
   FITBIT_METRICS: [fitbit_heartrate, fitbit_step]
   PHONE_FITBIT_METRICS: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile
-  DROP_VALID_SENSED_DAYS: True
-  DAYS_BEFORE_SURGERY: 15
-  DAYS_AFTER_DISCHARGE: 7
-  DAYS_IN_HOSPITAL: F
+
+  # Whether or not to include only days with enough valid sensed hours;
+  # the logic can be found in rule phone_valid_sensed_days of rules/preprocessing.snakefile
+  DROP_VALID_SENSED_DAYS:
+    ENABLED: True
+
+  # Whether or not to include certain days in the analysis; the logic can be found in rule days_to_analyse of rules/mystudy.snakefile
+  # If you want to include all days downloaded for each participant, set ENABLED to False
+  DAYS_TO_ANALYSE:
+    ENABLED: True
+    DAYS_BEFORE_SURGERY: 15
+    DAYS_IN_HOSPITAL: F # T or F
+    DAYS_AFTER_DISCHARGE: 7
+
+  # Cleaning Parameters
+  COLS_NAN_THRESHOLD: 0.5
+  COLS_VAR_THRESHOLD: True
+  ROWS_NAN_THRESHOLD: 0.5
+  PARTICIPANTS_DAY_THRESHOLD: 7
diff --git a/rules/models.snakefile b/rules/models.snakefile
index 78dc3f60..48848119 100644
--- a/rules/models.snakefile
+++ b/rules/models.snakefile
@@ -4,22 +4,66 @@ def input_merge_metrics_of_single_participant(wildcards):
     else:
         return expand("data/processed/{pid}/{metrics}_{day_segment}.csv", pid=wildcards.pid, metrics=config["METRICS_FOR_ANALYSIS"][wildcards.source.upper()], day_segment=wildcards.day_segment)
 
+def optional_input_days_to_include(wildcards):
+    if config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"]:
+        # This input automatically triggers the rule days_to_analyse in mystudy.snakefile
+        return ["data/interim/{pid}/days_to_analyse" + \
+                    "_" + str(config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"]) + \
+                    "_" + str(config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]) + \
+                    "_" + str(config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"]) + ".csv"]
+    else:
+        return []
+
+def optional_input_valid_sensed_days(wildcards):
+    if config["METRICS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:
+        # This input automatically triggers the rule phone_valid_sensed_days in preprocessing.snakefile
+        return ["data/interim/{pid}/phone_valid_sensed_days.csv"]
+    else:
+        return []
+
 rule merge_metrics_for_individual_model:
     input:
         metric_files = input_merge_metrics_of_single_participant,
-        phone_valid_sensed_days = "data/interim/{pid}/phone_valid_sensed_days.csv"
+        phone_valid_sensed_days = optional_input_valid_sensed_days,
+        days_to_include = optional_input_days_to_include
     params:
-        drop_valid_sensed_days = config["METRICS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"],
        source = "{source}"
     output:
-        "data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}.csv"
+        "data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_original.csv"
     script:
         "../src/models/merge_metrics_for_individual_model.R"
 
 rule merge_metrics_for_population_model:
     input:
-        metric_files = expand("data/processed/{pid}/metrics_for_individual_model/{{source}}_{{day_segment}}.csv", pid=config["PIDS"])
+        metric_files = expand("data/processed/{pid}/metrics_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
     output:
-        "data/processed/metrics_for_population_model/{source}_{day_segment}.csv"
+        "data/processed/metrics_for_population_model/{source}_{day_segment}_original.csv"
     script:
-        "../src/models/merge_metrics_for_population_model.R"
\ No newline at end of file
+        "../src/models/merge_metrics_for_population_model.R"
+
+rule clean_metrics_for_individual_model:
+    input:
+        rules.merge_metrics_for_individual_model.output
+    params:
+        cols_nan_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
+        cols_var_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+        rows_nan_threshold = config["METRICS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+        participants_day_threshold = config["METRICS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"]
+    output:
+        "data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_clean.csv"
+    script:
+        "../src/models/clean_metrics_for_model.R"
+
+rule clean_metrics_for_population_model:
+    input:
+        rules.merge_metrics_for_population_model.output
+    params:
+        cols_nan_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
config["METRICS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], + cols_var_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], + rows_nan_threshold = config["METRICS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], + participants_day_threshold = config["METRICS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"] + output: + "data/processed/metrics_for_population_model/{source}_{day_segment}_clean.csv" + script: + "../src/models/clean_metrics_for_model.R" + diff --git a/rules/mystudy.snakefile b/rules/mystudy.snakefile index 5e9aeec9..6d226335 100644 --- a/rules/mystudy.snakefile +++ b/rules/mystudy.snakefile @@ -1,11 +1,10 @@ rule days_to_analyse: input: - participant_info = "data/external/participant_info.csv", - pid_file = "data/external/{pid}" + participant_info = "data/raw/{pid}/" + config["METRICS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv" params: - days_before_surgery = config["METRICS_FOR_ANALYSIS"]["DAYS_BEFORE_SURGERY"], - days_after_discharge = config["METRICS_FOR_ANALYSIS"]["DAYS_AFTER_DISCHARGE"], - days_in_hospital= config["METRICS_FOR_ANALYSIS"]["DAYS_IN_HOSPITAL"] + days_before_surgery = "{days_before_surgery}", + days_in_hospital = "{days_in_hospital}", + days_after_discharge= "{days_after_discharge}" output: "data/interim/{pid}/days_to_analyse_{days_before_surgery}_{days_in_hospital}_{days_after_discharge}.csv" script: diff --git a/src/models/clean_metrics_for_model.R b/src/models/clean_metrics_for_model.R new file mode 100644 index 00000000..2095922e --- /dev/null +++ b/src/models/clean_metrics_for_model.R @@ -0,0 +1,40 @@ +source("packrat/init.R") +library(tidyr) +library(dplyr) + +filter_participant_without_enough_days <- function(clean_metrics, participants_day_threshold){ + if("pid" %in% colnames(clean_metrics)) + clean_metrics <- clean_metrics %>% group_by(pid) + + clean_metrics <- clean_metrics %>% + filter(n() >= participants_day_threshold) %>% + ungroup() + + return(clean_metrics) +} + +clean_metrics <- read.csv(snakemake@input[[1]]) +cols_nan_threshold <- snakemake@params[["cols_nan_threshold"]] +drop_zero_variance_columns <- snakemake@params[["cols_var_threshold"]] +rows_nan_threshold <- snakemake@params[["rows_nan_threshold"]] +participants_day_threshold <- snakemake@params[["participants_day_threshold"]] + +# We have to do this before and after dropping rows, that's why is duplicated +clean_metrics <- filter_participant_without_enough_days(clean_metrics, participants_day_threshold) + +# drop columns with a percentage of NA values above cols_nan_threshold +if(nrow(clean_metrics)) + clean_metrics <- clean_metrics %>% select_if(~ sum(is.na(.)) / length(.) 
<= cols_nan_threshold ) + +if(drop_zero_variance_columns) + clean_metrics <- clean_metrics %>% select_if(grepl("pid|local_date",names(.)) | sapply(., n_distinct) > 1) + +# drop rows with a percentage of NA values above rows_nan_threshold +clean_metrics <- clean_metrics %>% + mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>% + filter(percentage_na < rows_nan_threshold) %>% + select(-percentage_na) + +clean_metrics <- filter_participant_without_enough_days(clean_metrics, participants_day_threshold) + +write.csv(clean_metrics, snakemake@output[[1]], row.names = FALSE) diff --git a/src/models/merge_metrics_for_individual_model.R b/src/models/merge_metrics_for_individual_model.R index 03730d6c..6fd0fed2 100644 --- a/src/models/merge_metrics_for_individual_model.R +++ b/src/models/merge_metrics_for_individual_model.R @@ -5,16 +5,20 @@ library(purrr) library(dplyr) metric_files <- snakemake@input[["metric_files"]] -phone_valid_sensed_days <- read.csv(snakemake@input[["phone_valid_sensed_days"]]) -drop_valid_sensed_days <- snakemake@params[["drop_valid_sensed_days"]] +phone_valid_sensed_days <- snakemake@input[["phone_valid_sensed_days"]] +days_to_include <- snakemake@input[["days_to_include"]] source <- snakemake@params[["source"]] metrics_for_individual_model <- metric_files %>% map(read.csv, stringsAsFactors = F, colClasses = c(local_date = "character")) %>% reduce(full_join, by="local_date") -if(drop_valid_sensed_days && source == "phone_metrics"){ - metrics_for_individual_model <- merge(metrics_for_individual_model, phone_valid_sensed_days, by="local_date") %>% select(-valid_hours) - } +if(!is.null(phone_valid_sensed_days) && source %in% c("phone_metrics", "phone_fitbit_metrics")){ + metrics_for_individual_model <- merge(metrics_for_individual_model, read.csv(phone_valid_sensed_days), by="local_date") %>% select(-valid_hours) +} + +if(!is.null(days_to_include)){ + metrics_for_individual_model <- merge(metrics_for_individual_model, read.csv(days_to_include), by="local_date") +} write.csv(metrics_for_individual_model, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file diff --git a/src/models/merge_metrics_for_population_model.R b/src/models/merge_metrics_for_population_model.R index c08958dc..84b2d3fe 100644 --- a/src/models/merge_metrics_for_population_model.R +++ b/src/models/merge_metrics_for_population_model.R @@ -7,10 +7,10 @@ library(stringr) metric_files <- snakemake@input[["metric_files"]] -metrics_of_all_participants <- data_frame(filename = metric_files) %>% # create a data frame +metrics_of_all_participants <- tibble(filename = metric_files) %>% # create a data frame mutate(file_contents = map(filename, ~ read.csv(., stringsAsFactors = F, colClasses = c(local_date = "character"))), - pid = str_match(filename, ".*/(p[0-9]{2})/.*")[,2]) %>% - unnest() %>% + pid = str_match(filename, ".*/([a-zA-Z]+?[0-9]+?)/.*")[,2]) %>% + unnest(cols = c(file_contents)) %>% select(-filename) write.csv(metrics_of_all_participants, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file diff --git a/src/models/predict_model.py b/src/models/predict_model.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/models/select_days_to_analyse.py b/src/models/select_days_to_analyse.py index bcb543e4..0e21ae89 100644 --- a/src/models/select_days_to_analyse.py +++ b/src/models/select_days_to_analyse.py @@ -4,20 +4,15 @@ from datetime import timedelta def appendDaysInRange(days_to_analyse, start_date, end_date): num_of_days = (end_date - start_date).days for 
-        days_to_analyse = days_to_analyse.append({"days_to_analyse": start_date + timedelta(days = day)}, ignore_index=True)
+        days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day)}, ignore_index=True)
     return days_to_analyse
 
-days_before_surgery = snakemake.params["days_before_surgery"]
-days_in_hospital = snakemake.params["days_in_hospital"]
-days_after_discharge = snakemake.params["days_after_discharge"]
-
+days_before_surgery = int(snakemake.params["days_before_surgery"])
+days_in_hospital = str(snakemake.params["days_in_hospital"])
+days_after_discharge = int(snakemake.params["days_after_discharge"])
 participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"])
-with open(snakemake.input["pid_file"], encoding="ISO-8859-1") as external_file:
-    pid_file_content = external_file.readlines()
-device_ids = pid_file_content[0].strip().split(",")
+days_to_analyse = pd.DataFrame(columns = ["local_date"])
 
-days_to_analyse = pd.DataFrame(columns = ["days_to_analyse"])
-participant_info = participant_info[participant_info["device_id"].isin(device_ids)]
 try:
     surgery_date, discharge_date = participant_info["surgery_date"].iloc[0].date(), participant_info["discharge_date"].iloc[0].date()
 except:
diff --git a/src/models/train_model.py b/src/models/train_model.py
deleted file mode 100644
index e69de29b..00000000
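
Note on the optional inputs in rules/models.snakefile: the two optional_input_* functions rely on a Snakemake convention where an input function that returns an empty list contributes no dependency, so flipping DROP_VALID_SENSED_DAYS.ENABLED or DAYS_TO_ANALYSE.ENABLED in config.yaml decides whether phone_valid_sensed_days and days_to_analyse are computed and merged at all. The sketch below is not part of the patch; the config dict is a hypothetical stand-in for Snakemake's global config object. It shows how optional_input_days_to_include assembles its path: {pid} is left as a literal wildcard for Snakemake to resolve, while the three day parameters are baked into the file name, which is what binds the wildcards of rule days_to_analyse.

    # Sketch only: a stand-alone rendering of optional_input_days_to_include.
    config = {
        "METRICS_FOR_ANALYSIS": {
            "DAYS_TO_ANALYSE": {
                "ENABLED": True,
                "DAYS_BEFORE_SURGERY": 15,
                "DAYS_IN_HOSPITAL": "F",
                "DAYS_AFTER_DISCHARGE": 7,
            }
        }
    }

    def optional_input_days_to_include(wildcards=None):
        days = config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]
        if not days["ENABLED"]:
            # No files returned means no dependency, so rule days_to_analyse never runs.
            return []
        # {pid} stays a wildcard; the three day parameters become part of the file name.
        return ["data/interim/{pid}/days_to_analyse"
                + "_" + str(days["DAYS_BEFORE_SURGERY"])
                + "_" + str(days["DAYS_IN_HOSPITAL"])
                + "_" + str(days["DAYS_AFTER_DISCHARGE"]) + ".csv"]

    print(optional_input_days_to_include())
    # ['data/interim/{pid}/days_to_analyse_15_F_7.csv']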
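Note on the cleaning thresholds: clean_metrics_for_model.R applies the config thresholds in a fixed order: drop participants with fewer than PARTICIPANTS_DAY_THRESHOLD days, drop columns whose NA fraction exceeds COLS_NAN_THRESHOLD, optionally drop zero-variance columns (always keeping pid and local_date), drop rows whose NA fraction reaches ROWS_NAN_THRESHOLD, and then re-apply the participant threshold because row filtering can push a participant below it. The pandas sketch below only makes that order of operations explicit; it is not the code the pipeline runs (the pipeline uses the R script above).

    # Sketch only: the same cleaning steps expressed with pandas.
    import pandas as pd

    def clean_metrics(df, cols_nan_threshold=0.5, cols_var_threshold=True,
                      rows_nan_threshold=0.5, participants_day_threshold=7):
        def enough_days(frame):
            # Keep participants with at least participants_day_threshold rows; if there is
            # no pid column (individual model), treat the whole frame as one participant.
            if "pid" in frame.columns:
                return frame.groupby("pid").filter(lambda g: len(g) >= participants_day_threshold)
            return frame if len(frame) >= participants_day_threshold else frame.iloc[0:0]

        df = enough_days(df)
        if len(df):
            # Drop columns with too many missing values.
            df = df.loc[:, df.isna().mean() <= cols_nan_threshold]
        if cols_var_threshold:
            # Drop zero-variance columns, but always keep the id/date columns.
            keep = df.columns.str.contains("pid|local_date") | (df.nunique(dropna=False) > 1)
            df = df.loc[:, keep]
        # Drop rows with too many missing values, then re-check the day threshold.
        df = df[df.isna().mean(axis=1) < rows_nan_threshold]
        return enough_days(df)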
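Note on rule days_to_analyse: it now reads each participant's ground-truth table (data/raw/{pid}/participant_info_raw.csv, via GROUNDTRUTH_TABLE) and takes its three parameters from the output-file wildcards instead of from config.yaml, which is why select_days_to_analyse.py casts them back with int() and str(). The helper also writes its dates into a local_date column so the selected days can be joined against the metric files on local_date. The sketch below is not the patch's code: DataFrame.append, which the script uses, is deprecated in recent pandas, so this shows the same inclusive date range built with pd.date_range instead.

    # Sketch only: equivalent of appendDaysInRange without DataFrame.append.
    import pandas as pd

    def days_in_range(start_date, end_date):
        # One row per calendar day, inclusive of both endpoints, named local_date
        # so it merges cleanly with the metric files.
        return pd.DataFrame({"local_date": pd.date_range(start_date, end_date, freq="D").date})

    print(days_in_range("2020-03-01", "2020-03-03")["local_date"].tolist())
    # [datetime.date(2020, 3, 1), datetime.date(2020, 3, 2), datetime.date(2020, 3, 3)]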
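Note on the pid extraction in merge_metrics_for_population_model.R: the participant id is recovered from each metric file's path, and the pattern changes from ".*/(p[0-9]{2})/.*" to ".*/([a-zA-Z]+?[0-9]+?)/.*", so folder names are no longer limited to p01-p99. The R script does this with stringr::str_match; the Python snippet below only illustrates what the relaxed pattern captures, and the paths in it are made-up examples.

    # Sketch only: what the relaxed pid pattern matches.
    import re

    PID_PATTERN = re.compile(r".*/([a-zA-Z]+?[0-9]+?)/.*")

    paths = [
        "data/processed/p01/metrics_for_individual_model/phone_metrics_daily_original.csv",   # old pattern: accepted
        "data/processed/p123/metrics_for_individual_model/phone_metrics_daily_original.csv",  # old pattern: rejected
        "data/processed/test7/metrics_for_individual_model/phone_metrics_daily_original.csv", # old pattern: rejected
    ]
    for path in paths:
        match = PID_PATTERN.search(path)
        print(path.split("/")[2], "->", match.group(1) if match else None)
    # p01 -> p01
    # p123 -> p123
    # test7 -> test7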