diff --git a/Snakefile b/Snakefile index 7379de28..9cea55e8 100644 --- a/Snakefile +++ b/Snakefile @@ -11,12 +11,13 @@ rule all: # My study (this is an example of a rule created specifically for a study) expand("data/interim/{pid}/days_to_analyse_{days_before_surgery}_{days_in_hospital}_{days_after_discharge}.csv", pid=config["PIDS"], - days_before_surgery = config["METRICS_FOR_ANALYSIS"]["DAYS_BEFORE_SURGERY"], - days_after_discharge= config["METRICS_FOR_ANALYSIS"]["DAYS_AFTER_DISCHARGE"], - days_in_hospital= config["METRICS_FOR_ANALYSIS"]["DAYS_IN_HOSPITAL"]), + days_before_surgery = config["PARAMS_FOR_ANALYSIS"]["DAYS_BEFORE_SURGERY"], + days_after_discharge= config["PARAMS_FOR_ANALYSIS"]["DAYS_AFTER_DISCHARGE"], + days_in_hospital= config["PARAMS_FOR_ANALYSIS"]["DAYS_IN_HOSPITAL"]), expand("data/processed/{pid}/targets_{summarised}.csv", pid = config["PIDS"], - summarised = config["METRICS_FOR_ANALYSIS"]["SUMMARISED"]), + summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]), + expand("data/processed/{pid}/demographic_features.csv", pid=config["PIDS"]), # Feature extraction expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]), expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["FITBIT_TABLE"]), @@ -71,20 +72,20 @@ rule all: pid=config["PIDS"], segment = config["WIFI"]["DAY_SEGMENTS"]), # Models - expand("data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_original.csv", + expand("data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_original.csv", pid = config["PIDS"], - source = config["METRICS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), - expand("data/processed/metrics_for_population_model/{source}_{day_segment}_original.csv", - source = config["METRICS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), - 
expand("data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_clean.csv", + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), + expand("data/processed/features_for_population_model/{source}_{day_segment}_original.csv", + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), + expand("data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_clean.csv", pid = config["PIDS"], - source = config["METRICS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), - expand("data/processed/metrics_for_population_model/{source}_{day_segment}_clean.csv", - source = config["METRICS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), + expand("data/processed/features_for_population_model/{source}_{day_segment}_clean.csv", + source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], + day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), # Vizualisations expand("reports/figures/{pid}/{sensor}_heatmap_rows.html", pid=config["PIDS"], sensor=config["SENSORS"]), expand("reports/figures/{pid}/compliance_heatmap.html", pid=config["PIDS"]), diff --git a/config.yaml b/config.yaml index 4e169c54..8be8aaf8 100644 --- a/config.yaml +++ b/config.yaml @@ -128,13 +128,14 @@ WIFI: DAY_SEGMENTS: *day_segments FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] -METRICS_FOR_ANALYSIS: +PARAMS_FOR_ANALYSIS: GROUNDTRUTH_TABLE: participant_info - SOURCES: &sources ["phone_metrics", "fitbit_metrics", "phone_fitbit_metrics"] + SOURCES: &sources ["phone_features", "fitbit_features", "phone_fitbit_features"] DAY_SEGMENTS: *day_segments - PHONE_METRICS: [accelerometer, applications_foreground, battery, call_incoming, call_missed, 
call_outgoing, google_activity_recognition, light, location_barnett, screen, sms_received, sms_sent] - FITBIT_METRICS: [fitbit_heartrate, fitbit_step] - PHONE_FITBIT_METRICS: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile + PHONE_FEATURES: [accelerometer, applications_foreground, battery, call_incoming, call_missed, call_outgoing, google_activity_recognition, light, location_barnett, screen, sms_received, sms_sent] + FITBIT_FEATURES: [fitbit_heartrate, fitbit_step] + PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile + DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays] # Whether or not to include only days with enough valid sensed hours # logic can be found in rule phone_valid_sensed_days of rules/preprocessing.snakefile @@ -154,3 +155,8 @@ METRICS_FOR_ANALYSIS: COLS_VAR_THRESHOLD: True ROWS_NAN_THRESHOLD: 0.5 PARTICIPANTS_DAY_THRESHOLD: 7 + + # Target Settings: + # 1 => TARGETS_RATIO_THRESHOLD (ceiling) or more of available CESD scores were TARGETS_VALUE_THRESHOLD or higher; 0 => otherwise + TARGETS_RATIO_THRESHOLD: 0.5 + TARGETS_VALUE_THRESHOLD: 16 diff --git a/rules/models.snakefile b/rules/models.snakefile index 48848119..7c0b209b 100644 --- a/rules/models.snakefile +++ b/rules/models.snakefile @@ -1,69 +1,69 @@ -def input_merge_metrics_of_single_participant(wildcards): - if wildcards.source == "phone_fitbit_metrics": - return expand("data/processed/{pid}/{metrics}_{day_segment}.csv", pid=wildcards.pid, metrics=config["METRICS_FOR_ANALYSIS"]["PHONE_METRICS"] + config["METRICS_FOR_ANALYSIS"]["FITBIT_METRICS"], day_segment=wildcards.day_segment) +def input_merge_features_of_single_participant(wildcards): + if wildcards.source == "phone_fitbit_features": + return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"]["PHONE_FEATURES"] + 
config["PARAMS_FOR_ANALYSIS"]["FITBIT_FEATURES"], day_segment=wildcards.day_segment) else: - return expand("data/processed/{pid}/{metrics}_{day_segment}.csv", pid=wildcards.pid, metrics=config["METRICS_FOR_ANALYSIS"][wildcards.source.upper()], day_segment=wildcards.day_segment) + return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"][wildcards.source.upper()], day_segment=wildcards.day_segment) def optional_input_days_to_include(wildcards): - if config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"]: + if config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"]: # This input automatically trigers the rule days_to_analyse in mystudy.snakefile return ["data/interim/{pid}/days_to_analyse" + \ - "_" + str(config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"]) + \ - "_" + str(config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]) + \ - "_" + str(config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"]) + ".csv"] + "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"]) + \ + "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]) + \ + "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"]) + ".csv"] else: return [] def optional_input_valid_sensed_days(wildcards): - if config["METRICS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]: + if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]: # This input automatically trigers the rule phone_valid_sensed_days in preprocessing.snakefile return ["data/interim/{pid}/phone_valid_sensed_days.csv"] else: return [] -rule merge_metrics_for_individual_model: +rule merge_features_for_individual_model: input: - metric_files = input_merge_metrics_of_single_participant, + feature_files = input_merge_features_of_single_participant, phone_valid_sensed_days = optional_input_valid_sensed_days, days_to_include = 
optional_input_days_to_include params: source = "{source}" output: - "data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_original.csv" + "data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_original.csv" script: - "../src/models/merge_metrics_for_individual_model.R" + "../src/models/merge_features_for_individual_model.R" -rule merge_metrics_for_population_model: +rule merge_targets_for_population_model: input: - metric_files = expand("data/processed/{pid}/metrics_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"]) + data_files = expand("data/processed/{pid}/targets_{{summarised}}.csv", pid=config["PIDS"]) output: - "data/processed/metrics_for_population_model/{source}_{day_segment}_original.csv" + "data/processed/features_for_population_model/targets_{summarised}.csv" script: - "../src/models/merge_metrics_for_population_model.R" + "../src/models/merge_data_for_population_model.py" -rule clean_metrics_for_individual_model: +rule clean_features_for_individual_model: input: - rules.merge_metrics_for_individual_model.output + rules.merge_features_for_individual_model.output params: - cols_nan_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], - cols_var_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], - rows_nan_threshold = config["METRICS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], - participants_day_threshold = config["METRICS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"] + cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], + cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], + rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], + participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"] output: - "data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_clean.csv" + "data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_clean.csv" 
script: - "../src/models/clean_metrics_for_model.R" + "../src/models/clean_features_for_model.R" -rule clean_metrics_for_population_model: +rule clean_features_for_population_model: input: - rules.merge_metrics_for_population_model.output + rules.merge_features_for_population_model.output params: - cols_nan_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], - cols_var_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], - rows_nan_threshold = config["METRICS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], - participants_day_threshold = config["METRICS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"] + cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], + cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], + rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], + participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"] output: - "data/processed/metrics_for_population_model/{source}_{day_segment}_clean.csv" + "data/processed/features_for_population_model/{source}_{day_segment}_clean.csv" script: - "../src/models/clean_metrics_for_model.R" + "../src/models/clean_features_for_model.R" diff --git a/rules/mystudy.snakefile b/rules/mystudy.snakefile index 466b87ad..30e2a90f 100644 --- a/rules/mystudy.snakefile +++ b/rules/mystudy.snakefile @@ -1,6 +1,6 @@ rule days_to_analyse: input: - participant_info = "data/raw/{pid}/" + config["METRICS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv" + participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv" params: days_before_surgery = "{days_before_surgery}", days_in_hospital = "{days_in_hospital}", @@ -10,12 +10,26 @@ rule days_to_analyse: script: "../src/models/select_days_to_analyse.py" -rule get_targets: +rule targets: input: - participant_info = "data/raw/{pid}/" + config["METRICS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv" + participant_info = "data/raw/{pid}/" + 
config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv" params: - summarised = "{summarised}" + pid = "{pid}", + summarised = "{summarised}", + targets_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["TARGETS_RATIO_THRESHOLD"], + targets_value_threshold = config["PARAMS_FOR_ANALYSIS"]["TARGETS_VALUE_THRESHOLD"] output: "data/processed/{pid}/targets_{summarised}.csv" script: - "../src/models/get_targets.py" + "../src/models/targets.py" + +rule demographic_features: + input: + participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv" + params: + pid = "{pid}", + features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC_FEATURES"] + output: + "data/processed/{pid}/demographic_features.csv" + script: + "../src/features/demographic_features.py" diff --git a/src/features/demographic_features.py b/src/features/demographic_features.py new file mode 100644 index 00000000..63718350 --- /dev/null +++ b/src/features/demographic_features.py @@ -0,0 +1,17 @@ +import pandas as pd + +pid = snakemake.params["pid"] +requested_features = snakemake.params["features"] +demographic_features = pd.DataFrame(columns=["pid"] + requested_features) + +participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"]) +demographic_features.loc[0, "pid"] = pid +if not participant_info.empty: + if "age" in requested_features: + demographic_features.loc[0, "age"] = participant_info.loc[0, "age"] + if "gender" in requested_features: + demographic_features.loc[0, "gender"] = participant_info.loc[0, "gender"] + if "inpatientdays" in requested_features: + demographic_features.loc[0, "inpatientdays"] = (participant_info.loc[0, "discharge_date"] - participant_info.loc[0, "surgery_date"]).days + +demographic_features.to_csv(snakemake.output[0], index=False) diff --git a/src/models/clean_features_for_model.R b/src/models/clean_features_for_model.R new file mode 100644 index 00000000..afef17ce --- /dev/null 
+++ b/src/models/clean_features_for_model.R @@ -0,0 +1,40 @@ +source("packrat/init.R") +library(tidyr) +library(dplyr) + +filter_participant_without_enough_days <- function(clean_features, participants_day_threshold){ + if("pid" %in% colnames(clean_features)) + clean_features <- clean_features %>% group_by(pid) + + clean_features <- clean_features %>% + filter(n() >= participants_day_threshold) %>% + ungroup() + + return(clean_features) +} + +clean_features <- read.csv(snakemake@input[[1]]) +cols_nan_threshold <- snakemake@params[["cols_nan_threshold"]] +drop_zero_variance_columns <- snakemake@params[["cols_var_threshold"]] +rows_nan_threshold <- snakemake@params[["rows_nan_threshold"]] +participants_day_threshold <- snakemake@params[["participants_day_threshold"]] + +# We have to do this before and after dropping rows, that's why it is duplicated +clean_features <- filter_participant_without_enough_days(clean_features, participants_day_threshold) + +# drop columns with a percentage of NA values above cols_nan_threshold +if(nrow(clean_features)) + clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) 
<= cols_nan_threshold ) + +if(drop_zero_variance_columns) + clean_features <- clean_features %>% select_if(grepl("pid|local_date",names(.)) | sapply(., n_distinct, na.rm = T) > 1) + +# drop rows with a percentage of NA values above rows_nan_threshold +clean_features <- clean_features %>% + mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>% + filter(percentage_na < rows_nan_threshold) %>% + select(-percentage_na) + +clean_features <- filter_participant_without_enough_days(clean_features, participants_day_threshold) + +write.csv(clean_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/models/clean_metrics_for_model.R b/src/models/clean_metrics_for_model.R deleted file mode 100644 index ef72ba6e..00000000 --- a/src/models/clean_metrics_for_model.R +++ /dev/null @@ -1,40 +0,0 @@ -source("packrat/init.R") -library(tidyr) -library(dplyr) - -filter_participant_without_enough_days <- function(clean_metrics, participants_day_threshold){ - if("pid" %in% colnames(clean_metrics)) - clean_metrics <- clean_metrics %>% group_by(pid) - - clean_metrics <- clean_metrics %>% - filter(n() >= participants_day_threshold) %>% - ungroup() - - return(clean_metrics) -} - -clean_metrics <- read.csv(snakemake@input[[1]]) -cols_nan_threshold <- snakemake@params[["cols_nan_threshold"]] -drop_zero_variance_columns <- snakemake@params[["cols_var_threshold"]] -rows_nan_threshold <- snakemake@params[["rows_nan_threshold"]] -participants_day_threshold <- snakemake@params[["participants_day_threshold"]] - -# We have to do this before and after dropping rows, that's why is duplicated -clean_metrics <- filter_participant_without_enough_days(clean_metrics, participants_day_threshold) - -# drop columns with a percentage of NA values above cols_nan_threshold -if(nrow(clean_metrics)) - clean_metrics <- clean_metrics %>% select_if(~ sum(is.na(.)) / length(.) 
<= cols_nan_threshold ) - -if(drop_zero_variance_columns) - clean_metrics <- clean_metrics %>% select_if(grepl("pid|local_date",names(.)) | sapply(., n_distinct, na.rm = T) > 1) - -# drop rows with a percentage of NA values above rows_nan_threshold -clean_metrics <- clean_metrics %>% - mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>% - filter(percentage_na < rows_nan_threshold) %>% - select(-percentage_na) - -clean_metrics <- filter_participant_without_enough_days(clean_metrics, participants_day_threshold) - -write.csv(clean_metrics, snakemake@output[[1]], row.names = FALSE) diff --git a/src/models/get_targets.py b/src/models/get_targets.py deleted file mode 100644 index 9a1629b7..00000000 --- a/src/models/get_targets.py +++ /dev/null @@ -1,16 +0,0 @@ -import pandas as pd - -participant_info = pd.read_csv(snakemake.input["participant_info"]) -summarised = snakemake.params["summarised"] -pid = snakemake.input["participant_info"].split("/")[2] - -targets = pd.DataFrame({"pid": [pid], "target": [None]}) -if summarised == "summarised": - if not participant_info.empty: - cesds = participant_info.loc[0, ["preop_cesd_total", "inpatient_cesd_total", "postop_cesd_total", "3month_cesd_total"]] - # targets: 1 => 50% (ceiling) or more of available CESD scores were 16 or higher; 0 => otherwise - threshold_num = (cesds.count() + 1) // 2 - threshold_cesd = 16 - target = 1 if cesds.apply(lambda x : 1 if x >= threshold_cesd else 0).sum() >= threshold_num else 0 - targets.loc[0, "target"] = target -targets.to_csv(snakemake.output[0], index=False) diff --git a/src/models/merge_features_for_individual_model.R b/src/models/merge_features_for_individual_model.R new file mode 100644 index 00000000..713bea03 --- /dev/null +++ b/src/models/merge_features_for_individual_model.R @@ -0,0 +1,24 @@ +source("packrat/init.R") + +library(tidyr) +library(purrr) +library(dplyr) + +feature_files <- snakemake@input[["feature_files"]] +phone_valid_sensed_days <- 
snakemake@input[["phone_valid_sensed_days"]] +days_to_include <- snakemake@input[["days_to_include"]] +source <- snakemake@params[["source"]] + +features_for_individual_model <- feature_files %>% + map(read.csv, stringsAsFactors = F, colClasses = c(local_date = "character")) %>% + reduce(full_join, by="local_date") + +if(!is.null(phone_valid_sensed_days) && source %in% c("phone_features", "phone_fitbit_features")){ + features_for_individual_model <- merge(features_for_individual_model, read.csv(phone_valid_sensed_days), by="local_date") %>% select(-valid_hours) +} + +if(!is.null(days_to_include)){ + features_for_individual_model <- merge(features_for_individual_model, read.csv(days_to_include), by="local_date") +} + +write.csv(features_for_individual_model, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file diff --git a/src/models/merge_metrics_for_population_model.R b/src/models/merge_features_for_population_model.R similarity index 60% rename from src/models/merge_metrics_for_population_model.R rename to src/models/merge_features_for_population_model.R index 84b2d3fe..b40218c1 100644 --- a/src/models/merge_metrics_for_population_model.R +++ b/src/models/merge_features_for_population_model.R @@ -5,12 +5,13 @@ library(purrr) library(dplyr) library(stringr) -metric_files <- snakemake@input[["metric_files"]] +feature_files <- snakemake@input[["feature_files"]] -metrics_of_all_participants <- tibble(filename = metric_files) %>% # create a data frame + +features_of_all_participants <- tibble(filename = feature_files) %>% # create a data frame mutate(file_contents = map(filename, ~ read.csv(., stringsAsFactors = F, colClasses = c(local_date = "character"))), pid = str_match(filename, ".*/([a-zA-Z]+?[0-9]+?)/.*")[,2]) %>% unnest(cols = c(file_contents)) %>% select(-filename) -write.csv(metrics_of_all_participants, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file +write.csv(features_of_all_participants, snakemake@output[[1]], 
row.names = FALSE) \ No newline at end of file diff --git a/src/models/merge_metrics_for_individual_model.R b/src/models/merge_metrics_for_individual_model.R deleted file mode 100644 index 6fd0fed2..00000000 --- a/src/models/merge_metrics_for_individual_model.R +++ /dev/null @@ -1,24 +0,0 @@ -source("packrat/init.R") - -library(tidyr) -library(purrr) -library(dplyr) - -metric_files <- snakemake@input[["metric_files"]] -phone_valid_sensed_days <- snakemake@input[["phone_valid_sensed_days"]] -days_to_include <- snakemake@input[["days_to_include"]] -source <- snakemake@params[["source"]] - -metrics_for_individual_model <- metric_files %>% - map(read.csv, stringsAsFactors = F, colClasses = c(local_date = "character")) %>% - reduce(full_join, by="local_date") - -if(!is.null(phone_valid_sensed_days) && source %in% c("phone_metrics", "phone_fitbit_metrics")){ - metrics_for_individual_model <- merge(metrics_for_individual_model, read.csv(phone_valid_sensed_days), by="local_date") %>% select(-valid_hours) -} - -if(!is.null(days_to_include)){ - metrics_for_individual_model <- merge(metrics_for_individual_model, read.csv(days_to_include), by="local_date") -} - -write.csv(metrics_for_individual_model, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file diff --git a/src/models/targets.py b/src/models/targets.py new file mode 100644 index 00000000..b7c4f771 --- /dev/null +++ b/src/models/targets.py @@ -0,0 +1,20 @@ +import pandas as pd +import numpy as np + +pid = snakemake.params["pid"] +summarised = snakemake.params["summarised"] +targets_ratio_threshold = snakemake.params["targets_ratio_threshold"] +targets_value_threshold = snakemake.params["targets_value_threshold"] + +if summarised == "summarised": + targets = pd.DataFrame(columns=["pid", "target"]) + participant_info = pd.read_csv(snakemake.input["participant_info"]) + + if not participant_info.empty: + cesds = participant_info.loc[0, ["preop_cesd_total", "inpatient_cesd_total", "postop_cesd_total", 
"3month_cesd_total"]] + # targets: 1 => targets_ratio_threshold (ceiling) or more of available CESD scores were targets_value_threshold or higher; 0 => otherwise + num_threshold = int((cesds.count() + 1) * targets_ratio_threshold) + target = 1 if cesds.apply(lambda x : 1 if x >= targets_value_threshold else 0).sum() >= num_threshold else 0 + targets.loc[0, :] = [pid, target] + +targets.to_csv(snakemake.output[0], index=False)