Refactor script to create participants files

2020-10-27 17:13:16 -04:00 · 2020-10-27 17:13:16 -04:00 · d5931c75d8
parent c8176b2d90
commit d5931c75d8
4 changed files with 100 additions and 61 deletions
--- a/config.yaml
+++ b/config.yaml
@ -2,12 +2,6 @@
 # You must create a file for each participant named pXXX containing their device_id. This can be done manually or automatically
 PIDS: [test01]

-# Global var with common day segments
-DAY_SEGMENTS: &day_segments
-  TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
-  FILE: "data/external/daysegments_periodic.csv"
-  INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, if set to TRUE we consider day segments back enough in the past as to include the first day of data
-
 # Use tz codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones. Double check your code, for example EST is not US Eastern Time.
 TIMEZONE: &timezone
  America/New_York
@ -15,23 +9,31 @@ TIMEZONE: &timezone
 DATABASE_GROUP: &database_group
  MY_GROUP

-# config section for the script that creates participant files automatically
-PARTICIPANT_FILES: # run snakemake -j1 -R parse_participant_files
+# run 'snakemake -j1 create_participants_files'
+CREATE_PARTICIPANT_FILES:
+  SOURCE:
+    TYPE: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE
+    DATABASE_GROUP: *database_group
+    CSV_FILE_PATH: "data/external/example_participants.csv" # must have columns: the phone DEVICE_ID_COLUMN, the Fitbit DEVICE_ID_COLUMN, pid, platform, label, start_date, end_date
+    TIMEZONE: *timezone # only used for AWARE_DEVICE_TABLE
  PHONE_SECTION:
    ADD: TRUE
-    PARSED_FROM: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE
-    PARSED_SOURCE: *database_group # DB credentials group or CSV file path. If CSV file, it should have: device_id, pid (optional), label (optional), start_date (optional), end_date (optional)
+    DEVICE_ID_COLUMN: device_id # column name
    IGNORED_DEVICE_IDS: []
  FITBIT_SECTION:
-    ADD: FALSE
-    SAME_AS_PHONE: FALSE # If TRUE, all config below is ignored
-    PARSED_FROM: CSV_FILE
-    PARSED_SOURCE: "external/my_fitbit_participants.csv" # CSV file should have: device_id, pid (optional), label (optional), start_date (optional), end_date (optional)
+    ADD: TRUE
+    DEVICE_ID_COLUMN: device_id # column name
+    IGNORED_DEVICE_IDS: []
+
+DAY_SEGMENTS: &day_segments
+  TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
+  FILE: "data/external/daysegments_periodic.csv"
+  INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, if set to TRUE we consider day segments back enough in the past as to include the first day of data

 SENSOR_DATA:
  PHONE:
    SOURCE: 
-      TYPE: DATABASE # Phone only supports DATABASE for now
+      TYPE: DATABASE
      DATABASE_GROUP: *database_group
      DEVICE_ID_COLUMN: device_id # column name
    TIMEZONE: 
@ -46,6 +48,9 @@ SENSOR_DATA:
      TYPE: SINGLE # Fitbit only supports SINGLE timezones
      VALUE: *timezone # timezone code (e.g. America/New_York, see attribute TIMEZONE above and https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)

+############## PHONE ###########################################################
+################################################################################
+
 PHONE_VALID_SENSED_BINS:
  COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features
  BIN_SIZE: &bin_size 5 # (in minutes)
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@ -15,14 +15,13 @@ rule create_example_participant_files:
    shell:
        "echo 'a748ee1a-1d0b-4ae9-9074-279a2b6ba524\nandroid\ntest01\n2020/04/23,2020/05/04\n' >> ./data/external/example01 && echo '13dbc8a3-dae3-4834-823a-4bc96a7d459d\nios\ntest02\n2020/04/23,2020/05/04\n' >> ./data/external/example02"

-# rule download_participants:
-#     params:
-#         group = config["DOWNLOAD_PARTICIPANTS"]["GROUP"],
-#         ignored_device_ids = config["DOWNLOAD_PARTICIPANTS"]["IGNORED_DEVICE_IDS"],
-#         timezone = config["TIMEZONE"]
-#     priority: 1
-#     script:
-#         "../src/data/download_participants.R"
+rule create_participants_files: # runs the R script that writes the participant files
+    input: # no input file when SOURCE TYPE is AWARE_DEVICE_TABLE; otherwise the configured CSV_FILE_PATH
+        participants_file = [] if config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["TYPE"] == "AWARE_DEVICE_TABLE" else config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["CSV_FILE_PATH"] 
+    params: # the whole CREATE_PARTICIPANT_FILES config section is handed to the script
+        config = config["CREATE_PARTICIPANT_FILES"]
+    script:
+        "../src/data/create_participants_files.R"

 rule download_phone_data:
    input:
--- a/src/data/create_participants_files.R
+++ b/src/data/create_participants_files.R
@ -0,0 +1,91 @@
+source("renv/activate.R")
+
+library(RMySQL)
+library(stringr)
+library(purrr)
+library(readr)
+library("dplyr", warn.conflicts = FALSE)
+
+# Settings come from the CREATE_PARTICIPANT_FILES section passed by Snakemake.
+config <- snakemake@params[["config"]]
+group <- config$SOURCE$DATABASE_GROUP
+timezone <- config$SOURCE$TIMEZONE
+phone_device_id_column <- config$PHONE_SECTION$DEVICE_ID_COLUMN
+fitbit_device_id_column <- config$FITBIT_SECTION$DEVICE_ID_COLUMN
+# Each ADD flag is read from the section it controls.
+add_phone_section <- isTRUE(as.logical(config$PHONE_SECTION$ADD))
+add_fitbit_section <- isTRUE(as.logical(config$FITBIT_SECTION$ADD))
+phone_ignored <- config$PHONE_SECTION$IGNORED_DEVICE_IDS
+fitbit_ignored <- config$FITBIT_SECTION$IGNORED_DEVICE_IDS
+rmysql.settingsfile <- "./.env"
+output_folder <- "./data/external/participant_files/"
+
+# Replace with NA every id in `column` that is listed in `ignored`.
+mask_ignored <- function(data, column, ignored) {
+  id <- rlang::sym(column)
+  mutate(data, !!column := if_else((!!id) %in% ignored, NA_character_, !!id))
+}
+
+if (config$SOURCE$TYPE == "AWARE_DEVICE_TABLE") {
+  # One participant per aware_device row, oldest first; pids follow that order.
+  id_columns <- phone_device_id_column
+  if (add_fitbit_section) {
+    # Alias the Fitbit id so it can share a source column with the phone id.
+    id_columns <- paste(id_columns, ",", fitbit_device_id_column, "as _temp_fitbit_id")
+    fitbit_device_id_column <- "_temp_fitbit_id"
+  }
+  query <- paste("SELECT", id_columns, ", brand, label, timestamp FROM aware_device order by timestamp asc")
+  database <- dbConnect(MySQL(), default.file = rmysql.settingsfile, group = group)
+  # Always release the connection, even when the query fails.
+  participants <- tryCatch(dbGetQuery(database, query), finally = dbDisconnect(database))
+  participants <- participants %>%
+    mutate(pid = sprintf("p%02d", row_number()),
+           platform = if_else(brand == "iPhone", "ios", "android"), brand = NULL,
+           label = iconv(if_else(label == "", "EMPTY_LABEL", label), from = "UTF-8", to = "UTF-8", sub = ""),
+           start_date = format(as.POSIXct(timestamp / 1000, origin = "1970-01-01", tz = timezone), "%Y-%m-%d"),
+           end_date = format(Sys.Date(), "%Y-%m-%d")) %>%
+    mask_ignored(phone_device_id_column, phone_ignored)
+} else if (config$SOURCE$TYPE == "CSV_FILE") {
+  # Read the id columns under their configured names rather than fixed ones.
+  date_col <- col_date(format = "%Y-%m-%d")
+  col_spec <- list(pid = "c", label = "c", platform = "c", start_date = date_col, end_date = date_col)
+  col_spec[[phone_device_id_column]] <- "c"
+  col_spec[[fitbit_device_id_column]] <- "c"
+  participants <- read_csv(config$SOURCE$CSV_FILE_PATH, col_types = do.call(cols_only, col_spec))
+  if (add_phone_section) {
+    # Presumably ";" separates multiple ids/platforms in the CSV; every one becomes ",".
+    participants <- participants %>%
+      mutate(!!phone_device_id_column := str_replace_all(!!rlang::sym(phone_device_id_column), ";", ","),
+             platform = str_replace_all(platform, ";", ",")) %>%
+      mask_ignored(phone_device_id_column, phone_ignored)
+  }
+} else {
+  stop("Unsupported SOURCE TYPE: ", config$SOURCE$TYPE, call. = FALSE)
+}
+if (add_fitbit_section) {
+  participants <- mask_ignored(participants, fitbit_device_id_column, fitbit_ignored)
+}
+
+# Write <pid>.yaml for one participant (`row` is a one-row table). A section that
+# is disabled or whose device id is NA is written with empty fields.
+write_participant_file <- function(row) {
+  shared <- c(paste("  LABEL:", row$label), paste("  START_DATE:", row$start_date), paste("  END_DATE:", row$end_date))
+  phone_id <- if (add_phone_section) row[[phone_device_id_column]] else NA
+  fitbit_id <- if (add_fitbit_section) row[[fitbit_device_id_column]] else NA
+  phone <- c("PHONE:", "  DEVICE_IDS:", "  PLATFORMS:", "  LABEL:", "  START_DATE:", "  END_DATE:")
+  fitbit <- c("FITBIT:", "  DEVICE_IDS:", "  LABEL:", "  START_DATE:", "  END_DATE:")
+  if (!is.na(phone_id)) {
+    phone <- c("PHONE:", paste0("  DEVICE_IDS: [", phone_id, "]"), paste0("  PLATFORMS: [", row$platform, "]"), shared)
+  }
+  if (!is.na(fitbit_id)) {
+    fitbit <- c("FITBIT:", paste0("  DEVICE_IDS: [", fitbit_id, "]"), shared)
+  }
+  writeLines(c(phone, fitbit), paste0(output_folder, row$pid, ".yaml"))
+}
+
+dir.create(output_folder, recursive = TRUE, showWarnings = FALSE)
+# Row slices keep each column's class, so Date columns are pasted as dates.
+participants <- as_tibble(participants)
+for (i in seq_len(nrow(participants))) {
+  write_participant_file(participants[i, ])
+}
--- a/src/data/download_participants.R
+++ b/src/data/download_participants.R
@ -1,38 +0,0 @@
-source("renv/activate.R")
-
-library(RMySQL)
-
-group <- snakemake@params[["group"]]
-ignored_device_ids <- snakemake@params[["ignored_device_ids"]]
-timezone <- snakemake@params[["timezone"]]
-rmysql.settingsfile <- "./.env"
-
-stopDB <- dbConnect(MySQL(), default.file = rmysql.settingsfile, group = group)
-query <- "SELECT device_id, brand, label, timestamp FROM aware_device order by timestamp asc"
-participants <- dbGetQuery(stopDB, query)
-pids <- c()
-
-end_date <- format(Sys.Date(), "%Y/%m/%d")
-
-for(id in 1:nrow(participants)){
-    device_id <- participants$device_id[[id]]
-    brand <- ifelse(participants$brand[[id]] == "iPhone", "ios", "android")
-    label <- ifelse(participants$label[[id]] == "", "EMPTY_LABEL", participants$label[[id]])
-    label <- iconv(label, from = "UTF-8", to = "UTF-8", sub='')
-    start_date <- format(as.POSIXct(participants$timestamp[[id]] / 1000, origin = "1970-01-01", tz = timezone), "%Y/%m/%d")
-    if(!(device_id %in% ignored_device_ids)){
-        pid <- paste0("p", ifelse(id < 10, paste0("0", id), id))
-        pids <- append(pids, pid)
-        file_connection <- file(paste0("./data/external/", pid))
-        writeLines(c(device_id, brand, label, paste0(start_date, ",", end_date)), file_connection)
-        close(file_connection)
-    }
-}
-
-file_lines <-readLines("./config.yaml")
-for (i in 1:length(file_lines)){
-  if(startsWith(file_lines[i], "PIDS:")){
-    file_lines[i] <- paste0("PIDS: [", paste(pids, collapse = ", "), "]")
-  }
-}
-writeLines(file_lines, con = "./config.yaml")