Refactor script to create participants files

2020-10-27 17:13:16 -04:00 · 2020-10-27 17:13:16 -04:00 · d5931c75d8
parent c8176b2d90
commit d5931c75d8
4 changed files with 100 additions and 61 deletions
--- a/config.yaml
+++ b/config.yaml
@ -2,12 +2,6 @@
 # You must create a file for each participant named pXXX containing their device_id. This can be done manually or automatically
 PIDS: [test01]
 # Global var with common day segments
 DAY_SEGMENTS: &day_segments
  TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
  FILE: "data/external/daysegments_periodic.csv"
  INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, if set to TRUE we consider day segments back enough in the past as to include the first day of data
 # Use tz codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones. Double check your code, for example EST is not US Eastern Time.
 TIMEZONE: &timezone
  America/New_York
@ -15,23 +9,31 @@ TIMEZONE: &timezone
 DATABASE_GROUP: &database_group
  MY_GROUP
-# config section for the script that creates participant files automatically
+# run 'snakemake -j1 create_participants_files'
-PARTICIPANT_FILES: # run snakemake -j1 -R parse_participant_files
+CREATE_PARTICIPANT_FILES:
  SOURCE:
    TYPE: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE
    DATABASE_GROUP: *database_group
    CSV_FILE_PATH: "data/external/example_participants.csv" # must have these columns: the phone DEVICE_ID_COLUMN, the Fitbit DEVICE_ID_COLUMN, pid, label, platform, start_date, end_date
    TIMEZONE: *timezone # only used for AWARE_DEVICE_TABLE
  PHONE_SECTION:
    ADD: TRUE
-    PARSED_FROM: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE
+    DEVICE_ID_COLUMN: device_id # column name
    PARSED_SOURCE: *database_group # DB credentials group or CSV file path. If CSV file, it should have: device_id, pid (optional), label (optional), start_date (optional), end_date (optional)
    IGNORED_DEVICE_IDS: []
  FITBIT_SECTION:
-    ADD: FALSE
+    ADD: TRUE
-    SAME_AS_PHONE: FALSE # If TRUE, all config below is ignored
+    DEVICE_ID_COLUMN: device_id # column name
-    PARSED_FROM: CSV_FILE
+    IGNORED_DEVICE_IDS: []
-    PARSED_SOURCE: "external/my_fitbit_participants.csv" # CSV file should have: device_id, pid (optional), label (optional), start_date (optional), end_date (optional)
+
 DAY_SEGMENTS: &day_segments
  TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
  FILE: "data/external/daysegments_periodic.csv"
  INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, if set to TRUE we consider day segments back enough in the past as to include the first day of data
 SENSOR_DATA:
  PHONE:
    SOURCE: 
-      TYPE: DATABASE # Phone only supports DATABASE for now
+      TYPE: DATABASE
      DATABASE_GROUP: *database_group
      DEVICE_ID_COLUMN: device_id # column name
    TIMEZONE: 
@ -46,6 +48,9 @@ SENSOR_DATA:
      TYPE: SINGLE # Fitbit only supports SINGLE timezones
      VALUE: *timezone # timezone code (e.g. America/New_York, see attribute TIMEZONE above and https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
 ############## PHONE ###########################################################
 ################################################################################
 PHONE_VALID_SENSED_BINS:
  COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features
  BIN_SIZE: &bin_size 5 # (in minutes)
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@ -15,14 +15,13 @@ rule create_example_participant_files:
    shell:
        "echo 'a748ee1a-1d0b-4ae9-9074-279a2b6ba524\nandroid\ntest01\n2020/04/23,2020/05/04\n' >> ./data/external/example01 && echo '13dbc8a3-dae3-4834-823a-4bc96a7d459d\nios\ntest02\n2020/04/23,2020/05/04\n' >> ./data/external/example02"
-# rule download_participants:
+rule create_participants_files:
-#     params:
+    input:
-#         group = config["DOWNLOAD_PARTICIPANTS"]["GROUP"],
+        participants_file = [] if config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["TYPE"] == "AWARE_DEVICE_TABLE" else config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["CSV_FILE_PATH"] 
-#         ignored_device_ids = config["DOWNLOAD_PARTICIPANTS"]["IGNORED_DEVICE_IDS"],
+    params:
-#         timezone = config["TIMEZONE"]
+        config = config["CREATE_PARTICIPANT_FILES"]
-#     priority: 1
+    script:
-#     script:
+        "../src/data/create_participants_files.R"
 #         "../src/data/download_participants.R"
 rule download_phone_data:
    input:
--- a/src/data/create_participants_files.R
+++ b/src/data/create_participants_files.R
@ -0,0 +1,73 @@
 source("renv/activate.R")
 library(RMySQL)
 library(stringr)
 library(purrr)
 library(readr)
 library("dplyr", warn.conflicts = F)
 # Settings for this script: the Snakemake rule passes them in as the
 # `config` param.
 config <- snakemake@params[["config"]]
 # Credentials group looked up in ./.env, and the timezone used to turn the
 # aware_device timestamps into start dates.
 group <- config$SOURCE$DATABASE_GROUP
 timezone <- config$SOURCE$TIMEZONE
 # Names of the columns that hold the phone and the Fitbit device ids.
 phone_device_id_column <- config$PHONE_SECTION$DEVICE_ID_COLUMN
 fitbit_device_id_column <- config$FITBIT_SECTION$DEVICE_ID_COLUMN
 # Whether each section gets filled in. Every flag is read from its own
 # section; reading them crossed would make PHONE_SECTION$ADD control the
 # FITBIT block and vice versa.
 add_phone_section <- config$PHONE_SECTION$ADD
 add_fitbit_section <- config$FITBIT_SECTION$ADD
 # Device ids whose section must be left empty.
 phone_ignored <- config$PHONE_SECTION$IGNORED_DEVICE_IDS
 fitbit_ignored <- config$FITBIT_SECTION$IGNORED_DEVICE_IDS
 # File with the MySQL credentials read by RMySQL.
 rmysql.settingsfile <- "./.env"
 # Set to NA every id in `id_column` that is listed in `ignored_ids`.
 # `data` is returned unchanged when it has no column with that name.
 drop_ignored_ids <- function(data, id_column, ignored_ids) {
  if (!(id_column %in% names(data))) {
    return(data)
  }
  ids <- data[[id_column]]
  data[[id_column]] <- if_else(ids %in% ignored_ids, NA_character_, ids)
  data
 }
 # Build the `participants` table, one row per participant, either from the
 # aware_device table or from a CSV file.
 if (config$SOURCE$TYPE == "AWARE_DEVICE_TABLE") {
  if (config$FITBIT_SECTION$ADD == TRUE) {
    # The Fitbit id is selected under an alias so it never collides with the
    # phone id column, even when both settings name the same column.
    query <- paste("SELECT",phone_device_id_column, ",",fitbit_device_id_column," as _temp_fitbit_id, brand, label, timestamp FROM aware_device order by timestamp asc")
    fitbit_device_id_column <- "_temp_fitbit_id"
  } else {
    query <- paste("SELECT ",phone_device_id_column,", brand, label, timestamp FROM aware_device order by timestamp asc")
  }
  database <- dbConnect(MySQL(), default.file = rmysql.settingsfile, group = group)
  # Close the connection even when the query fails.
  participants <- tryCatch(dbGetQuery(database, query), finally = dbDisconnect(database))
  participants <- participants %>%
    mutate(pid = sprintf("p%02d", row_number()), # p01, p02, ..., p10, ...
           platform = if_else(brand == "iPhone", "ios", "android"),
           brand = NULL,
           # iconv() with sub = "" drops bytes that are not valid UTF-8
           label = iconv(if_else(label == "", "EMPTY_LABEL", label), from = "UTF-8", to = "UTF-8", sub = ""),
           # timestamp is divided by 1000 before conversion (presumably stored in milliseconds)
           start_date = format(as.POSIXct(timestamp / 1000, origin = "1970-01-01", tz = timezone), "%Y-%m-%d"),
           end_date = format(Sys.Date(), "%Y-%m-%d"))
  participants <- drop_ignored_ids(participants, phone_device_id_column, phone_ignored)
  # Without a Fitbit section there is no Fitbit column; applying the Fitbit
  # ignore list anyway would hit the phone ids when both names match.
  if (config$FITBIT_SECTION$ADD == TRUE) {
    participants <- drop_ignored_ids(participants, fitbit_device_id_column, fitbit_ignored)
  }
 } else if (config$SOURCE$TYPE == "CSV_FILE") {
  # Read the id columns under the names given in the configuration.
  id_columns <- unique(c(phone_device_id_column, fitbit_device_id_column))
  column_types <- c(setNames(rep(list("c"), length(id_columns)), id_columns),
                    list(pid = "c", label = "c", platform = "c",
                         start_date = col_date(format = "%Y-%m-%d"),
                         end_date = col_date(format = "%Y-%m-%d")))
  participants <- read_csv(config$SOURCE$CSV_FILE_PATH, col_types = do.call(cols_only, column_types))
  # Multiple devices/platforms are separated by ";" in the CSV and by "," in
  # the generated files; every separator has to be converted, not only the
  # first one.
  if (phone_device_id_column %in% names(participants)) {
    participants[[phone_device_id_column]] <- str_replace_all(participants[[phone_device_id_column]], ";", ",")
  }
  participants <- participants %>%
    mutate(platform = str_replace_all(platform, ";", ","))
  participants <- drop_ignored_ids(participants, phone_device_id_column, phone_ignored)
  if (config$FITBIT_SECTION$ADD == TRUE) {
    participants <- drop_ignored_ids(participants, fitbit_device_id_column, fitbit_ignored)
  }
 } else {
  stop("Unsupported CREATE_PARTICIPANT_FILES SOURCE TYPE: ", config$SOURCE$TYPE, call. = FALSE)
 }
 # Write one file per participant, named after its pid, with a PHONE and a
 # FITBIT section. A section is left empty when it is disabled or when the
 # participant has no device id for it.
 output_folder <- "./data/external/participant_files/"
 # Writing fails when the folder is missing, so make sure it exists first.
 dir.create(output_folder, recursive = TRUE, showWarnings = FALSE)
 # Keep dates as "YYYY-MM-DD" text before iterating row by row.
 # NOTE(review): some purrr releases reportedly hand Date values to the
 # pwalk() callback as bare numbers - confirm against the pinned version.
 for (date_column in intersect(c("start_date", "end_date"), names(participants))) {
  participants[[date_column]] <- as.character(participants[[date_column]])
 }
 empty_phone <- c("PHONE:", "  DEVICE_IDS:", "  PLATFORMS:","  LABEL:", "  START_DATE:", "  END_DATE:")
 empty_fitbit <- c("FITBIT:", "  DEVICE_IDS:", "  LABEL:", "  START_DATE:", "  END_DATE:")
 participants %>%
  pwalk(function(...) {
    # Each call receives the columns of one row as named arguments.
    row <- tibble(...)
    # `[[` extracts the scalar id, which is what is.na() and paste0() expect.
    if (add_phone_section == TRUE && !is.na(row[[phone_device_id_column]])) {
      phone_lines <- c("PHONE:",
                       paste0("  DEVICE_IDS: [", row[[phone_device_id_column]], "]"),
                       paste0("  PLATFORMS: [", row$platform, "]"),
                       paste("  LABEL:", row$label),
                       paste("  START_DATE:", row$start_date),
                       paste("  END_DATE:", row$end_date))
    } else {
      phone_lines <- empty_phone
    }
    if (add_fitbit_section == TRUE && !is.na(row[[fitbit_device_id_column]])) {
      fitbit_lines <- c("FITBIT:",
                        paste0("  DEVICE_IDS: [", row[[fitbit_device_id_column]], "]"),
                        paste("  LABEL:", row$label),
                        paste("  START_DATE:", row$start_date),
                        paste("  END_DATE:", row$end_date))
    } else {
      fitbit_lines <- empty_fitbit
    }
    # writeLines() opens and closes the file itself when given a path.
    writeLines(c(phone_lines, fitbit_lines), paste0(output_folder, row$pid, ".yaml"))
  })
--- a/src/data/download_participants.R
+++ b/src/data/download_participants.R
@ -1,38 +0,0 @@
 source("renv/activate.R")
 library(RMySQL)
 # Parameters passed by the Snakemake rule.
 group <- snakemake@params[["group"]]
 ignored_device_ids <- snakemake@params[["ignored_device_ids"]]
 timezone <- snakemake@params[["timezone"]]
 # File with the MySQL credentials read by RMySQL.
 rmysql.settingsfile <- "./.env"
 stopDB <- dbConnect(MySQL(), default.file = rmysql.settingsfile, group = group)
 query <- "SELECT device_id, brand, label, timestamp FROM aware_device order by timestamp asc"
 # Close the connection whether or not the query succeeds.
 participants <- tryCatch(dbGetQuery(stopDB, query), finally = dbDisconnect(stopDB))
 end_date <- format(Sys.Date(), "%Y/%m/%d")
 # One slot per row; rows of ignored devices keep NA and are dropped below.
 pids <- rep(NA_character_, nrow(participants))
 # seq_len() yields no iterations for an empty table (1:0 would yield two).
 for (id in seq_len(nrow(participants))) {
    device_id <- participants$device_id[[id]]
    brand <- ifelse(participants$brand[[id]] == "iPhone", "ios", "android")
    label <- ifelse(participants$label[[id]] == "", "EMPTY_LABEL", participants$label[[id]])
    # iconv() with sub = "" drops bytes that are not valid UTF-8
    label <- iconv(label, from = "UTF-8", to = "UTF-8", sub = "")
    # timestamp is divided by 1000 before conversion (presumably stored in milliseconds)
    start_date <- format(as.POSIXct(participants$timestamp[[id]] / 1000, origin = "1970-01-01", tz = timezone), "%Y/%m/%d")
    if (!(device_id %in% ignored_device_ids)) {
        # The pid follows the row position, so ignored devices leave gaps.
        pid <- sprintf("p%02d", id)
        pids[[id]] <- pid
        # Participant file: device id, platform, label and "start,end" dates.
        writeLines(c(device_id, brand, label, paste0(start_date, ",", end_date)), paste0("./data/external/", pid))
    }
 }
 pids <- pids[!is.na(pids)]
 # Rewrite the PIDS entry of config.yaml with the participants just created.
 file_lines <- readLines("./config.yaml")
 file_lines[startsWith(file_lines, "PIDS:")] <- paste0("PIDS: [", paste(pids, collapse = ", "), "]")
 writeLines(file_lines, con = "./config.yaml")