Add demographic_features and targets module; refactor analysis code

Co-authored-by: JulioV <juliovhz@gmail.com>
2020-04-16 12:38:28 -04:00 · 2020-04-16 12:38:28 -04:00 · eac721de84
parent 695984586f
commit eac721de84
12 changed files with 185 additions and 142 deletions
--- a/33
+++ b/33
@ -11,12 +11,13 @@ rule all:
        # My study (this is an example of a rule created specifically for a study)
        expand("data/interim/{pid}/days_to_analyse_{days_before_surgery}_{days_in_hospital}_{days_after_discharge}.csv",
                            pid=config["PIDS"],
-                            days_before_surgery = config["METRICS_FOR_ANALYSIS"]["DAYS_BEFORE_SURGERY"],
-                            days_after_discharge= config["METRICS_FOR_ANALYSIS"]["DAYS_AFTER_DISCHARGE"],
-                            days_in_hospital= config["METRICS_FOR_ANALYSIS"]["DAYS_IN_HOSPITAL"]),
+                            days_before_surgery = config["PARAMS_FOR_ANALYSIS"]["DAYS_BEFORE_SURGERY"],
+                            days_after_discharge= config["PARAMS_FOR_ANALYSIS"]["DAYS_AFTER_DISCHARGE"],
+                            days_in_hospital= config["PARAMS_FOR_ANALYSIS"]["DAYS_IN_HOSPITAL"]),
        expand("data/processed/{pid}/targets_{summarised}.csv", 
                            pid = config["PIDS"],
-                            summarised = config["METRICS_FOR_ANALYSIS"]["SUMMARISED"]),
+                            summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
+        expand("data/processed/{pid}/demographic_features.csv", pid=config["PIDS"]),
        # Feature extraction
        expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
        expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["FITBIT_TABLE"]),
@ -71,20 +72,20 @@ rule all:
                            pid=config["PIDS"], 
                            segment = config["WIFI"]["DAY_SEGMENTS"]),
        # Models
-        expand("data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_original.csv",
+        expand("data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_original.csv",
                                pid = config["PIDS"],
-                                source = config["METRICS_FOR_ANALYSIS"]["SOURCES"],
-                                day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
-        expand("data/processed/metrics_for_population_model/{source}_{day_segment}_original.csv",
-                                source = config["METRICS_FOR_ANALYSIS"]["SOURCES"],
-                                day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
-        expand("data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_clean.csv",
+                                source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
+        expand("data/processed/features_for_population_model/{source}_{day_segment}_original.csv",
+                                source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
+        expand("data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_clean.csv",
                                pid = config["PIDS"],
-                                source = config["METRICS_FOR_ANALYSIS"]["SOURCES"],
-                                day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
-        expand("data/processed/metrics_for_population_model/{source}_{day_segment}_clean.csv",
-                                source = config["METRICS_FOR_ANALYSIS"]["SOURCES"],
-                                day_segment = config["METRICS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
+                                source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
+        expand("data/processed/features_for_population_model/{source}_{day_segment}_clean.csv",
+                                source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                                day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
        # Vizualisations
        expand("reports/figures/{pid}/{sensor}_heatmap_rows.html", pid=config["PIDS"], sensor=config["SENSORS"]),
        expand("reports/figures/{pid}/compliance_heatmap.html", pid=config["PIDS"]),
--- a/config.yaml
+++ b/config.yaml
@ -128,13 +128,14 @@ WIFI:
  DAY_SEGMENTS: *day_segments
  FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]

-METRICS_FOR_ANALYSIS:
+PARAMS_FOR_ANALYSIS:
  GROUNDTRUTH_TABLE: participant_info
-  SOURCES: &sources ["phone_metrics", "fitbit_metrics", "phone_fitbit_metrics"]
+  SOURCES: &sources ["phone_features", "fitbit_features", "phone_fitbit_features"]
  DAY_SEGMENTS: *day_segments
-  PHONE_METRICS: [accelerometer, applications_foreground, battery, call_incoming, call_missed, call_outgoing, google_activity_recognition, light, location_barnett, screen, sms_received, sms_sent]
-  FITBIT_METRICS: [fitbit_heartrate, fitbit_step]
-  PHONE_FITBIT_METRICS: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile
+  PHONE_FEATURES: [accelerometer, applications_foreground, battery, call_incoming, call_missed, call_outgoing, google_activity_recognition, light, location_barnett, screen, sms_received, sms_sent]
+  FITBIT_FEATURES: [fitbit_heartrate, fitbit_step]
+  PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile
+  DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays]
  
  # Whether or not to include only days with enough valid sensed hours
  # logic can be found in rule phone_valid_sensed_days of rules/preprocessing.snakefile
@ -154,3 +155,8 @@ METRICS_FOR_ANALYSIS:
  COLS_VAR_THRESHOLD: True
  ROWS_NAN_THRESHOLD: 0.5
  PARTICIPANTS_DAY_THRESHOLD: 7
+
+  # Target Settings:
+  # target = 1 when at least ceil(TARGETS_RATIO_THRESHOLD * number of available CESD scores) of those scores are TARGETS_VALUE_THRESHOLD or higher; target = 0 otherwise
+  TARGETS_RATIO_THRESHOLD: 0.5
+  TARGETS_VALUE_THRESHOLD: 16
--- a/rules/models.snakefile
+++ b/rules/models.snakefile
@ -1,69 +1,69 @@
-def input_merge_metrics_of_single_participant(wildcards):
-    if wildcards.source == "phone_fitbit_metrics":
-        return expand("data/processed/{pid}/{metrics}_{day_segment}.csv", pid=wildcards.pid, metrics=config["METRICS_FOR_ANALYSIS"]["PHONE_METRICS"] + config["METRICS_FOR_ANALYSIS"]["FITBIT_METRICS"], day_segment=wildcards.day_segment)
+def input_merge_features_of_single_participant(wildcards):
+    if wildcards.source == "phone_fitbit_features":
+        return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"]["PHONE_FEATURES"] + config["PARAMS_FOR_ANALYSIS"]["FITBIT_FEATURES"], day_segment=wildcards.day_segment)
    else:
-        return expand("data/processed/{pid}/{metrics}_{day_segment}.csv", pid=wildcards.pid, metrics=config["METRICS_FOR_ANALYSIS"][wildcards.source.upper()], day_segment=wildcards.day_segment)
+        return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"][wildcards.source.upper()], day_segment=wildcards.day_segment)

 def optional_input_days_to_include(wildcards):
-    if config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"]:
+    if config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"]:
        # This input automatically trigers the rule days_to_analyse in mystudy.snakefile
        return ["data/interim/{pid}/days_to_analyse" + \
-                    "_" + str(config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"]) + \
-                    "_" + str(config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]) + \
-                    "_" + str(config["METRICS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"]) + ".csv"]
+                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"]) + \
+                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]) + \
+                    "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"]) + ".csv"]
    else:
        return []

 def optional_input_valid_sensed_days(wildcards):
-    if config["METRICS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:
+    if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:
        # This input automatically trigers the rule phone_valid_sensed_days in preprocessing.snakefile
        return ["data/interim/{pid}/phone_valid_sensed_days.csv"]
    else:
        return []

-rule merge_metrics_for_individual_model:
+rule merge_features_for_individual_model:
    input:
-        metric_files = input_merge_metrics_of_single_participant,
+        feature_files = input_merge_features_of_single_participant,
        phone_valid_sensed_days = optional_input_valid_sensed_days,
        days_to_include = optional_input_days_to_include
    params:
        source = "{source}"
    output:
-        "data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_original.csv"
+        "data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_original.csv"
    script:
-        "../src/models/merge_metrics_for_individual_model.R"
+        "../src/models/merge_features_for_individual_model.R"

-rule merge_metrics_for_population_model:
+# Stacks the per-participant "original" feature files of every pid in config["PIDS"]
+# into one population-level table. The rule name and output path must stay in sync with
+# clean_features_for_population_model (which reads rules.merge_features_for_population_model.output)
+# and with the features_for_population_model targets requested in rule all.
+rule merge_features_for_population_model:
     input:
-        metric_files = expand("data/processed/{pid}/metrics_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
+        feature_files = expand("data/processed/{pid}/features_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
     output:
-        "data/processed/metrics_for_population_model/{source}_{day_segment}_original.csv" 
+        "data/processed/features_for_population_model/{source}_{day_segment}_original.csv"
     script:
-        "../src/models/merge_metrics_for_population_model.R"
+        "../src/models/merge_features_for_population_model.R"

-rule clean_metrics_for_individual_model:
+rule clean_features_for_individual_model:
    input:
-        rules.merge_metrics_for_individual_model.output
+        rules.merge_features_for_individual_model.output
    params:
-        cols_nan_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
-        cols_var_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
-        rows_nan_threshold = config["METRICS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
-        participants_day_threshold = config["METRICS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"]
+        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
+        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+        participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"]
    output:
-        "data/processed/{pid}/metrics_for_individual_model/{source}_{day_segment}_clean.csv"
+        "data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_clean.csv"
    script:
-        "../src/models/clean_metrics_for_model.R"
+        "../src/models/clean_features_for_model.R"

-rule clean_metrics_for_population_model:
+rule clean_features_for_population_model:
    input:
-        rules.merge_metrics_for_population_model.output
+        rules.merge_features_for_population_model.output
    params:
-        cols_nan_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
-        cols_var_threshold = config["METRICS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
-        rows_nan_threshold = config["METRICS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
-        participants_day_threshold = config["METRICS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"]
+        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
+        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+        participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"]
    output:
-        "data/processed/metrics_for_population_model/{source}_{day_segment}_clean.csv"
+        "data/processed/features_for_population_model/{source}_{day_segment}_clean.csv"
    script:
-        "../src/models/clean_metrics_for_model.R"
+        "../src/models/clean_features_for_model.R"

--- a/rules/mystudy.snakefile
+++ b/rules/mystudy.snakefile
@ -1,6 +1,6 @@
 rule days_to_analyse:
    input:
-        participant_info = "data/raw/{pid}/" + config["METRICS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv"
+        participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv"
    params:
        days_before_surgery = "{days_before_surgery}",
        days_in_hospital = "{days_in_hospital}",
@ -10,12 +10,26 @@ rule days_to_analyse:
    script:
        "../src/models/select_days_to_analyse.py"

-rule get_targets:
+rule targets:
    input:
-        participant_info = "data/raw/{pid}/" + config["METRICS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv"
+        participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv"
    params:
-        summarised = "{summarised}"
+        pid = "{pid}",
+        summarised = "{summarised}",
+        targets_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["TARGETS_RATIO_THRESHOLD"],
+        targets_value_threshold = config["PARAMS_FOR_ANALYSIS"]["TARGETS_VALUE_THRESHOLD"]
    output:
        "data/processed/{pid}/targets_{summarised}.csv"
    script:
-        "../src/models/get_targets.py"
+        "../src/models/targets.py"
+
+# Writes the demographic features of one participant to data/processed/{pid}/demographic_features.csv.
+# The values come from the participant's ground-truth table (GROUNDTRUTH_TABLE) and the
+# columns are the ones listed under PARAMS_FOR_ANALYSIS -> DEMOGRAPHIC_FEATURES.
+rule demographic_features:
+    input:
+        participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv"
+    params:
+        # the pid wildcard is forwarded so the script can label its output row
+        pid = "{pid}",
+        features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC_FEATURES"]
+    output:
+        "data/processed/{pid}/demographic_features.csv"
+    script:
+        "../src/features/demographic_features.py"
--- a/src/features/demographic_features.py
+++ b/src/features/demographic_features.py
@ -0,0 +1,17 @@
+import pandas as pd
+
+# Values handed over by the demographic_features rule.
+pid = snakemake.params["pid"]
+requested_features = snakemake.params["features"]
+
+# Output table: the pid plus one column per requested feature.
+demographic_features = pd.DataFrame(columns=["pid"] + requested_features)
+
+participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"])
+demographic_features.loc[0, "pid"] = pid
+
+# How each supported feature is read from the first row of participant_info.
+feature_getters = {
+    "age": lambda info: info.loc[0, "age"],
+    "gender": lambda info: info.loc[0, "gender"],
+    "inpatientdays": lambda info: (info.loc[0, "discharge_date"] - info.loc[0, "surgery_date"]).days,
+}
+
+# With an empty participant_info only the pid is written; the feature cells stay empty.
+if not participant_info.empty:
+    for feature_name, get_value in feature_getters.items():
+        if feature_name in requested_features:
+            demographic_features.loc[0, feature_name] = get_value(participant_info)
+
+demographic_features.to_csv(snakemake.output[0], index=False)
--- a/src/models/clean_features_for_model.R
+++ b/src/models/clean_features_for_model.R
@ -0,0 +1,40 @@
+source("packrat/init.R")
+library(tidyr)
+library(dplyr)
+
+# Cleaning script for the feature table in snakemake@input[[1]]: drops sparse columns,
+# optionally drops constant columns, drops sparse rows, and removes participants that
+# end up with too few rows. The result is written to snakemake@output[[1]].
+
+# Keep only the participants that have at least participants_day_threshold rows.
+# When the table has no "pid" column the whole table is treated as one group, so
+# either every row is kept or every row is dropped.
+filter_participant_without_enough_days <- function(clean_features, participants_day_threshold){
+  if("pid" %in% colnames(clean_features))
+    clean_features <- clean_features %>% group_by(pid)
+  
+  clean_features <- clean_features %>% 
+    filter(n() >= participants_day_threshold) %>% 
+    ungroup()
+  
+  return(clean_features)
+}
+
+clean_features <- read.csv(snakemake@input[[1]])
+cols_nan_threshold <- snakemake@params[["cols_nan_threshold"]]
+# cols_var_threshold is used as an on/off flag for dropping constant columns
+drop_zero_variance_columns <- snakemake@params[["cols_var_threshold"]]
+rows_nan_threshold <- snakemake@params[["rows_nan_threshold"]]
+participants_day_threshold <- snakemake@params[["participants_day_threshold"]]
+
+# This filter has to run both before and after rows are dropped, which is why the call is duplicated
+clean_features <- filter_participant_without_enough_days(clean_features, participants_day_threshold)
+
+# drop columns with a percentage of NA values above cols_nan_threshold
+# (only attempted when at least one row is left)
+if(nrow(clean_features))
+    clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
+
+# columns whose name contains "pid" or "local_date" are always kept; any other column
+# is kept only when it has more than one distinct non-NA value
+if(drop_zero_variance_columns)
+  clean_features <- clean_features %>% select_if(grepl("pid|local_date",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
+
+# drop rows whose share of NA values is rows_nan_threshold or higher
+# (only rows strictly below the threshold are kept)
+clean_features <- clean_features %>% 
+  mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>% 
+  filter(percentage_na < rows_nan_threshold) %>% 
+  select(-percentage_na)
+
+# second pass: dropping rows may have pushed participants below the threshold
+clean_features <- filter_participant_without_enough_days(clean_features, participants_day_threshold)
+
+write.csv(clean_features, snakemake@output[[1]], row.names = FALSE)
--- a/src/models/clean_metrics_for_model.R
+++ b/src/models/clean_metrics_for_model.R
@ -1,40 +0,0 @@
-source("packrat/init.R")
-library(tidyr)
-library(dplyr)
-
-filter_participant_without_enough_days <- function(clean_metrics, participants_day_threshold){
-  if("pid" %in% colnames(clean_metrics))
-    clean_metrics <- clean_metrics %>% group_by(pid)
-  
-  clean_metrics <- clean_metrics %>% 
-    filter(n() >= participants_day_threshold) %>% 
-    ungroup()
-  
-  return(clean_metrics)
-}
-
-clean_metrics <- read.csv(snakemake@input[[1]])
-cols_nan_threshold <- snakemake@params[["cols_nan_threshold"]]
-drop_zero_variance_columns <- snakemake@params[["cols_var_threshold"]]
-rows_nan_threshold <- snakemake@params[["rows_nan_threshold"]]
-participants_day_threshold <- snakemake@params[["participants_day_threshold"]]
-
-# We have to do this before and after dropping rows, that's why is duplicated
-clean_metrics <- filter_participant_without_enough_days(clean_metrics, participants_day_threshold)
-
-# drop columns with a percentage of NA values above cols_nan_threshold
-if(nrow(clean_metrics))
-    clean_metrics <- clean_metrics %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
-
-if(drop_zero_variance_columns)
-  clean_metrics <- clean_metrics %>% select_if(grepl("pid|local_date",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
-
-# drop rows with a percentage of NA values above rows_nan_threshold
-clean_metrics <- clean_metrics %>% 
-  mutate(percentage_na =  rowSums(is.na(.)) / ncol(.)) %>% 
-  filter(percentage_na < rows_nan_threshold) %>% 
-  select(-percentage_na)
-
-clean_metrics <- filter_participant_without_enough_days(clean_metrics, participants_day_threshold)
-
-write.csv(clean_metrics, snakemake@output[[1]], row.names = FALSE)
--- a/src/models/get_targets.py
+++ b/src/models/get_targets.py
@ -1,16 +0,0 @@
-import pandas as pd
-
-participant_info = pd.read_csv(snakemake.input["participant_info"])
-summarised = snakemake.params["summarised"]
-pid = snakemake.input["participant_info"].split("/")[2]
-
-targets = pd.DataFrame({"pid": [pid], "target": [None]})
-if summarised == "summarised":
-    if not participant_info.empty:
-        cesds = participant_info.loc[0, ["preop_cesd_total", "inpatient_cesd_total", "postop_cesd_total", "3month_cesd_total"]]
-        # targets: 1 => 50% (ceiling) or more of available CESD scores were 16 or higher; 0 => otherwise
-        threshold_num = (cesds.count() + 1) // 2
-        threshold_cesd = 16
-        target = 1 if cesds.apply(lambda x : 1 if x >= threshold_cesd else 0).sum() >= threshold_num else 0
-        targets.loc[0, "target"] = target
-targets.to_csv(snakemake.output[0], index=False)
--- a/src/models/merge_features_for_individual_model.R
+++ b/src/models/merge_features_for_individual_model.R
@ -0,0 +1,24 @@
+source("packrat/init.R")
+
+library(tidyr)
+library(purrr)
+library(dplyr)
+
+# Merges every feature file of one participant into a single table keyed by local_date,
+# then optionally restricts it to valid sensed days and to the days selected for analysis.
+feature_files  <- snakemake@input[["feature_files"]]
+phone_valid_sensed_days  <- snakemake@input[["phone_valid_sensed_days"]]
+days_to_include <- snakemake@input[["days_to_include"]]
+source <- snakemake@params[["source"]]
+
+# local_date is read as character in every file (presumably so all files join on the
+# same key type); the full join keeps a date that appears in any of the files
+features_for_individual_model <- feature_files %>%
+  map(read.csv, stringsAsFactors = F, colClasses = c(local_date = "character")) %>%
+  reduce(full_join, by="local_date")
+
+# The valid-sensed-days filter is applied only to sources that contain phone features.
+# merge() without all= keeps just the dates present in both tables; the valid_hours
+# column brought in by that file is dropped afterwards.
+# NOTE(review): assumes the optional inputs are NULL when their rule input function
+# returns an empty list - confirm how snakemake exposes an empty named input to R.
+if(!is.null(phone_valid_sensed_days) && source %in% c("phone_features", "phone_fitbit_features")){
+    features_for_individual_model <- merge(features_for_individual_model, read.csv(phone_valid_sensed_days), by="local_date") %>% select(-valid_hours)
+}
+
+# keep only the dates listed in the days_to_include file
+if(!is.null(days_to_include)){
+  features_for_individual_model <- merge(features_for_individual_model, read.csv(days_to_include), by="local_date")
+}
+
+write.csv(features_for_individual_model, snakemake@output[[1]], row.names = FALSE)
--- a/src/models/merge_features_for_population_model.R
+++ b/src/models/merge_features_for_population_model.R
@ -5,12 +5,13 @@ library(purrr)
 library(dplyr)
 library(stringr)

-metric_files  <- snakemake@input[["metric_files"]]
+feature_files  <- snakemake@input[["feature_files"]]

-metrics_of_all_participants <- tibble(filename = metric_files) %>% # create a data frame
+
+features_of_all_participants <- tibble(filename = feature_files) %>% # create a data frame
  mutate(file_contents = map(filename, ~ read.csv(., stringsAsFactors = F, colClasses = c(local_date = "character"))),
         pid = str_match(filename, ".*/([a-zA-Z]+?[0-9]+?)/.*")[,2]) %>%
  unnest(cols = c(file_contents)) %>%
  select(-filename)

-write.csv(metrics_of_all_participants, snakemake@output[[1]], row.names = FALSE)
+write.csv(features_of_all_participants, snakemake@output[[1]], row.names = FALSE)
--- a/src/models/merge_metrics_for_individual_model.R
+++ b/src/models/merge_metrics_for_individual_model.R
@ -1,24 +0,0 @@
-source("packrat/init.R")
-
-library(tidyr)
-library(purrr)
-library(dplyr)
-
-metric_files  <- snakemake@input[["metric_files"]]
-phone_valid_sensed_days  <- snakemake@input[["phone_valid_sensed_days"]]
-days_to_include <- snakemake@input[["days_to_include"]]
-source <- snakemake@params[["source"]]
-
-metrics_for_individual_model <- metric_files %>%
-  map(read.csv, stringsAsFactors = F, colClasses = c(local_date = "character")) %>%
-  reduce(full_join, by="local_date")
-
-if(!is.null(phone_valid_sensed_days) && source %in% c("phone_metrics", "phone_fitbit_metrics")){
-    metrics_for_individual_model <- merge(metrics_for_individual_model, read.csv(phone_valid_sensed_days), by="local_date") %>% select(-valid_hours)
-}
-
-if(!is.null(days_to_include)){
-  metrics_for_individual_model <- merge(metrics_for_individual_model, read.csv(days_to_include), by="local_date")
-}
-
-write.csv(metrics_for_individual_model, snakemake@output[[1]], row.names = FALSE)
--- a/src/models/targets.py
+++ b/src/models/targets.py
@ -0,0 +1,20 @@
+import pandas as pd
+import numpy as np
+
+# Builds the binary target of one participant from the CESD totals found in the first
+# row of participant_info and writes it to snakemake.output[0] as a (pid, target) table.
+pid = snakemake.params["pid"]
+summarised = snakemake.params["summarised"]
+targets_ratio_threshold = snakemake.params["targets_ratio_threshold"]
+targets_value_threshold = snakemake.params["targets_value_threshold"]
+
+# Created before the branch so that every value of `summarised` writes at least a
+# header-only file; previously any value other than "summarised" raised a NameError
+# at to_csv() because `targets` was never defined.
+targets = pd.DataFrame(columns=["pid", "target"])
+
+if summarised == "summarised":
+    participant_info = pd.read_csv(snakemake.input["participant_info"])
+
+    if not participant_info.empty:
+        cesds = participant_info.loc[0, ["preop_cesd_total", "inpatient_cesd_total", "postop_cesd_total", "3month_cesd_total"]]
+        # target: 1 => at least ceil(targets_ratio_threshold * number of available scores)
+        # of the available CESD scores are targets_value_threshold or higher; 0 => otherwise.
+        # np.ceil keeps this true for any ratio; the former int((count + 1) * ratio) only
+        # matched it for 0.5 (with a ratio of 1.0 it required count + 1 scores).
+        # NOTE(review): when no CESD score is available num_threshold is 0, so target is 1 - confirm this is intended.
+        num_threshold = int(np.ceil(cesds.count() * targets_ratio_threshold))
+        target = 1 if cesds.apply(lambda x : 1 if x >= targets_value_threshold else 0).sum() >= num_threshold else 0
+        targets.loc[0, :] = [pid, target]
+
+targets.to_csv(snakemake.output[0], index=False)