Refactor script to create participants files

pull/103/head
JulioV 2020-10-27 17:13:16 -04:00
parent c8176b2d90
commit d5931c75d8
4 changed files with 100 additions and 61 deletions

View File

@ -2,12 +2,6 @@
# You must create a file for each participant named pXXX containing their device_id. This can be done manually or automatically
PIDS: [test01]
# Global var with common day segments
DAY_SEGMENTS: &day_segments
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
FILE: "data/external/daysegments_periodic.csv"
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, if set to TRUE we consider day segments back enough in the past as to include the first day of data
# Use tz codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones. Double check your code, for example EST is not US Eastern Time.
TIMEZONE: &timezone
America/New_York
@ -15,23 +9,31 @@ TIMEZONE: &timezone
DATABASE_GROUP: &database_group
MY_GROUP
# config section for the script that creates participant files automatically
PARTICIPANT_FILES: # run snakemake -j1 -R parse_participant_files
# run 'snakemake -j1 create_participants_files'
CREATE_PARTICIPANT_FILES:
SOURCE:
TYPE: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE
DATABASE_GROUP: *database_group
CSV_FILE_PATH: "data/external/example_participants.csv" # must have columns: the phone DEVICE_ID_COLUMN, the Fitbit DEVICE_ID_COLUMN, pid, label, platform, start_date, end_date
TIMEZONE: *timezone # only used for AWARE_DEVICE_TABLE
PHONE_SECTION:
ADD: TRUE
PARSED_FROM: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE
PARSED_SOURCE: *database_group # DB credentials group or CSV file path. If CSV file, it should have: device_id, pid (optional), label (optional), start_date (optional), end_date (optional)
DEVICE_ID_COLUMN: device_id # column name
IGNORED_DEVICE_IDS: []
FITBIT_SECTION:
ADD: FALSE
SAME_AS_PHONE: FALSE # If TRUE, all config below is ignored
PARSED_FROM: CSV_FILE
PARSED_SOURCE: "external/my_fitbit_participants.csv" # CSV file should have: device_id, pid (optional), label (optional), start_date (optional), end_date (optional)
ADD: TRUE
DEVICE_ID_COLUMN: device_id # column name
IGNORED_DEVICE_IDS: []
DAY_SEGMENTS: &day_segments
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
FILE: "data/external/daysegments_periodic.csv"
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC. If set to TRUE, day segments extend far enough into the past to include the first day of data
SENSOR_DATA:
PHONE:
SOURCE:
TYPE: DATABASE # Phone only supports DATABASE for now
TYPE: DATABASE
DATABASE_GROUP: *database_group
DEVICE_ID_COLUMN: device_id # column name
TIMEZONE:
@ -46,6 +48,9 @@ SENSOR_DATA:
TYPE: SINGLE # Fitbit only supports SINGLE timezones
VALUE: *timezone # timezone code (e.g. America/New_York, see attribute TIMEZONE above and https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
############## PHONE ###########################################################
################################################################################
PHONE_VALID_SENSED_BINS:
COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features
BIN_SIZE: &bin_size 5 # (in minutes)

View File

@ -15,14 +15,13 @@ rule create_example_participant_files:
shell:
"echo 'a748ee1a-1d0b-4ae9-9074-279a2b6ba524\nandroid\ntest01\n2020/04/23,2020/05/04\n' >> ./data/external/example01 && echo '13dbc8a3-dae3-4834-823a-4bc96a7d459d\nios\ntest02\n2020/04/23,2020/05/04\n' >> ./data/external/example02"
# rule download_participants:
# params:
# group = config["DOWNLOAD_PARTICIPANTS"]["GROUP"],
# ignored_device_ids = config["DOWNLOAD_PARTICIPANTS"]["IGNORED_DEVICE_IDS"],
# timezone = config["TIMEZONE"]
# priority: 1
# script:
# "../src/data/download_participants.R"
# Runs the create_participants_files.R script with the CREATE_PARTICIPANT_FILES
# section of the config. The rule declares no output, so it is requested by name.
rule create_participants_files:
input:
# No input file when SOURCE.TYPE is AWARE_DEVICE_TABLE; otherwise the
# configured CSV path is declared as the rule's input.
participants_file = [] if config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["TYPE"] == "AWARE_DEVICE_TABLE" else config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["CSV_FILE_PATH"]
params:
# The whole CREATE_PARTICIPANT_FILES section is handed to the script as one param.
config = config["CREATE_PARTICIPANT_FILES"]
script:
"../src/data/create_participants_files.R"
rule download_phone_data:
input:

View File

@ -0,0 +1,73 @@
source("renv/activate.R")
library(RMySQL)
library(stringr)
library(purrr)
library(readr)
library("dplyr", warn.conflicts = F)
# Create one YAML participant file per participant.
#
# Participants are read either from the AWARE `aware_device` table or from a
# CSV file, depending on config$SOURCE$TYPE. Every participant gets a file named
# <pid>.yaml with a PHONE and a FITBIT section; a section is left empty when it
# is disabled in the config or when the participant's device id is missing or
# listed in that section's IGNORED_DEVICE_IDS.

config <- snakemake@params[["config"]]
group <- config$SOURCE$DATABASE_GROUP
timezone <- config$SOURCE$TIMEZONE
phone_device_id_column <- config$PHONE_SECTION$DEVICE_ID_COLUMN
fitbit_device_id_column <- config$FITBIT_SECTION$DEVICE_ID_COLUMN
# Each flag is read from its own section. They used to be crossed, so enabling
# only one section produced the other one instead.
add_phone_section <- isTRUE(as.logical(config$PHONE_SECTION$ADD))
add_fitbit_section <- isTRUE(as.logical(config$FITBIT_SECTION$ADD))
phone_ignored <- config$PHONE_SECTION$IGNORED_DEVICE_IDS
fitbit_ignored <- config$FITBIT_SECTION$IGNORED_DEVICE_IDS
rmysql.settingsfile <- "./.env"
output_folder <- "./data/external/participant_files/"

# Replace the ids in `id_column` that appear in `ignored_ids` with NA so that
# the matching section is written empty for those participants.
blank_ignored_ids <- function(data, id_column, ignored_ids) {
  data %>%
    mutate(!!id_column := if_else(.data[[id_column]] %in% ignored_ids,
                                  NA_character_,
                                  .data[[id_column]]))
}

if (config$SOURCE$TYPE == "AWARE_DEVICE_TABLE") {
  if (add_fitbit_section) {
    # The Fitbit id is aliased so it stays a separate column even when it comes
    # from the same table column as the phone id.
    query <- paste("SELECT",phone_device_id_column, ",",fitbit_device_id_column," as _temp_fitbit_id, brand, label, timestamp FROM aware_device order by timestamp asc")
    fitbit_device_id_column <- "_temp_fitbit_id"
  } else {
    query <- paste("SELECT ",phone_device_id_column,", brand, label, timestamp FROM aware_device order by timestamp asc")
  }

  database <- dbConnect(MySQL(), default.file = rmysql.settingsfile, group = group)
  # Release the connection even when the query fails.
  participants <- tryCatch(
    dbGetQuery(database, query),
    finally = dbDisconnect(database)
  )

  participants <- participants %>%
    mutate(pid = if_else(row_number() < 10, paste0("p", "0", row_number()), paste0("p", row_number())),
           platform = if_else(brand == "iPhone", "ios", "android"), brand = NULL,
           label = iconv(if_else(label == "", "EMPTY_LABEL", label), from = "UTF-8", to = "UTF-8", sub = ''),
           start_date = format(as.POSIXct(timestamp / 1000, origin = "1970-01-01", tz = timezone), "%Y-%m-%d"),
           end_date = format(Sys.Date(), "%Y-%m-%d")) %>%
    blank_ignored_ids(phone_device_id_column, phone_ignored)
} else if (config$SOURCE$TYPE == "CSV_FILE") {
  date_format <- col_date(format = "%Y-%m-%d")
  csv_columns <- list(device_id = "c", pid = "c", label = "c", platform = "c",
                      start_date = date_format, end_date = date_format, fitbit_id = "c")
  # Also read the configured id columns, which may differ from the default names.
  csv_columns[[phone_device_id_column]] <- "c"
  if (add_fitbit_section) {
    csv_columns[[fitbit_device_id_column]] <- "c"
  }
  participants <- read_csv(config$SOURCE$CSV_FILE_PATH, col_types = do.call(cols_only, csv_columns))

  # Multiple ids/platforms are separated by ";" inside a CSV cell; convert every
  # separator (not only the first one) into the "," used in the YAML list.
  participants <- participants %>%
    mutate(!!phone_device_id_column := str_replace_all(.data[[phone_device_id_column]], ";", ","),
           platform = str_replace_all(platform, ";", ",")) %>%
    blank_ignored_ids(phone_device_id_column, phone_ignored)
} else {
  stop("Unsupported CREATE_PARTICIPANT_FILES SOURCE TYPE: ", config$SOURCE$TYPE,
       ". Use AWARE_DEVICE_TABLE or CSV_FILE.", call. = FALSE)
}

# The Fitbit ignore list is only applied when the Fitbit section is requested:
# otherwise its id column may not have been loaded, or may be the phone column.
if (add_fitbit_section) {
  participants <- blank_ignored_ids(participants, fitbit_device_id_column, fitbit_ignored)
}

empty_phone <- c("PHONE:", " DEVICE_IDS:", " PLATFORMS:"," LABEL:", " START_DATE:", " END_DATE:")
empty_fitbit <- c("FITBIT:", " DEVICE_IDS:", " LABEL:", " START_DATE:", " END_DATE:")

# writeLines() does not create missing folders.
dir.create(output_folder, recursive = TRUE, showWarnings = FALSE)

participants %>%
  pwalk(function(...) {
    row <- tibble(...)

    if (add_phone_section && !is.na(row[[phone_device_id_column]])) {
      phone_lines <- c("PHONE:", paste0(" DEVICE_IDS: [", row[[phone_device_id_column]], "]"), paste0(" PLATFORMS: [", row$platform, "]"),
                       paste(" LABEL:", row$label), paste(" START_DATE:", row$start_date), paste(" END_DATE:", row$end_date))
    } else {
      phone_lines <- empty_phone
    }

    if (add_fitbit_section && !is.na(row[[fitbit_device_id_column]])) {
      fitbit_lines <- c("FITBIT:", paste0(" DEVICE_IDS: [", row[[fitbit_device_id_column]], "]"),
                        paste(" LABEL:", row$label), paste(" START_DATE:", row$start_date), paste(" END_DATE:", row$end_date))
    } else {
      fitbit_lines <- empty_fitbit
    }

    writeLines(c(phone_lines, fitbit_lines), paste0(output_folder, row$pid, ".yaml"))
  })

View File

@ -1,38 +0,0 @@
source("renv/activate.R")
library(RMySQL)
# Download the AWARE device table, write one participant file per non-ignored
# device under ./data/external/ (device id, platform, label and date range, one
# per line) and rewrite the PIDS line of ./config.yaml with the generated pids.

group <- snakemake@params[["group"]]
ignored_device_ids <- snakemake@params[["ignored_device_ids"]]
timezone <- snakemake@params[["timezone"]]
rmysql.settingsfile <- "./.env"

stopDB <- dbConnect(MySQL(), default.file = rmysql.settingsfile, group = group)
query <- "SELECT device_id, brand, label, timestamp FROM aware_device order by timestamp asc"
# Close the connection whether or not the query succeeds.
participants <- tryCatch(
  dbGetQuery(stopDB, query),
  finally = dbDisconnect(stopDB)
)

end_date <- format(Sys.Date(), "%Y/%m/%d")

# A pid is the row position in the timestamp-ordered table, so ignored devices
# leave gaps in the numbering instead of shifting later participants.
kept_rows <- which(!(participants$device_id %in% ignored_device_ids))
pids <- sprintf("p%02d", kept_rows)

# Iterating over kept_rows (possibly empty) is safe for an empty device table.
for (id in kept_rows) {
  device_id <- participants$device_id[[id]]
  brand <- ifelse(participants$brand[[id]] == "iPhone", "ios", "android")
  label <- ifelse(participants$label[[id]] == "", "EMPTY_LABEL", participants$label[[id]])
  # Drop bytes that are not valid UTF-8 from the label.
  label <- iconv(label, from = "UTF-8", to = "UTF-8", sub = '')
  start_date <- format(as.POSIXct(participants$timestamp[[id]] / 1000, origin = "1970-01-01", tz = timezone), "%Y/%m/%d")

  writeLines(c(device_id, brand, label, paste0(start_date, ",", end_date)),
             paste0("./data/external/", sprintf("p%02d", id)))
}

# Replace every line starting with "PIDS:" with the list of generated pids.
file_lines <- readLines("./config.yaml")
file_lines[startsWith(file_lines, "PIDS:")] <- paste0("PIDS: [", paste(pids, collapse = ", "), "]")
writeLines(file_lines, con = "./config.yaml")