Refactor script to create participants files
parent
c8176b2d90
commit
d5931c75d8
35
config.yaml
35
config.yaml
|
@ -2,12 +2,6 @@
|
|||
# You must create a file for each participant named pXXX containing their device_id. This can be done manually or automatically
|
||||
PIDS: [test01]
|
||||
|
||||
# Global var with common day segments
|
||||
DAY_SEGMENTS: &day_segments
|
||||
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
|
||||
FILE: "data/external/daysegments_periodic.csv"
|
||||
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC. If set to TRUE, day segments are computed far enough back in the past to include the first day of data
|
||||
|
||||
# Use tz codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones. Double check your code, for example EST is not US Eastern Time.
|
||||
TIMEZONE: &timezone
|
||||
America/New_York
|
||||
|
@ -15,23 +9,31 @@ TIMEZONE: &timezone
|
|||
DATABASE_GROUP: &database_group
|
||||
MY_GROUP
|
||||
|
||||
# config section for the script that creates participant files automatically
|
||||
PARTICIPANT_FILES: # run snakemake -j1 -R parse_participant_files
|
||||
# run 'snakemake -j1 create_participants_files'
|
||||
CREATE_PARTICIPANT_FILES:
|
||||
SOURCE:
|
||||
TYPE: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE
|
||||
DATABASE_GROUP: *database_group
|
||||
CSV_FILE_PATH: "data/external/example_participants.csv" # must have columns: the phone DEVICE_ID_COLUMN, the Fitbit DEVICE_ID_COLUMN, pid, label, platform, start_date, end_date
|
||||
TIMEZONE: *timezone # only used for AWARE_DEVICE_TABLE
|
||||
PHONE_SECTION:
|
||||
ADD: TRUE
|
||||
PARSED_FROM: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE
|
||||
PARSED_SOURCE: *database_group # DB credentials group or CSV file path. If CSV file, it should have: device_id, pid (optional), label (optional), start_date (optional), end_date (optional)
|
||||
DEVICE_ID_COLUMN: device_id # column name
|
||||
IGNORED_DEVICE_IDS: []
|
||||
FITBIT_SECTION:
|
||||
ADD: FALSE
|
||||
SAME_AS_PHONE: FALSE # If TRUE, all config below is ignored
|
||||
PARSED_FROM: CSV_FILE
|
||||
PARSED_SOURCE: "external/my_fitbit_participants.csv" # CSV file should have: device_id, pid (optional), label (optional), start_date (optional), end_date (optional)
|
||||
ADD: TRUE
|
||||
DEVICE_ID_COLUMN: device_id # column name
|
||||
IGNORED_DEVICE_IDS: []
|
||||
|
||||
DAY_SEGMENTS: &day_segments
|
||||
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
|
||||
FILE: "data/external/daysegments_periodic.csv"
|
||||
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC. If set to TRUE, day segments are computed far enough back in the past to include the first day of data
|
||||
|
||||
SENSOR_DATA:
|
||||
PHONE:
|
||||
SOURCE:
|
||||
TYPE: DATABASE # Phone only supports DATABASE for now
|
||||
TYPE: DATABASE
|
||||
DATABASE_GROUP: *database_group
|
||||
DEVICE_ID_COLUMN: device_id # column name
|
||||
TIMEZONE:
|
||||
|
@ -46,6 +48,9 @@ SENSOR_DATA:
|
|||
TYPE: SINGLE # Fitbit only supports SINGLE timezones
|
||||
VALUE: *timezone # timezone code (e.g. America/New_York, see attribute TIMEZONE above and https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
|
||||
|
||||
############## PHONE ###########################################################
|
||||
################################################################################
|
||||
|
||||
PHONE_VALID_SENSED_BINS:
|
||||
COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features
|
||||
BIN_SIZE: &bin_size 5 # (in minutes)
|
||||
|
|
|
@ -15,14 +15,13 @@ rule create_example_participant_files:
|
|||
shell:
|
||||
"echo 'a748ee1a-1d0b-4ae9-9074-279a2b6ba524\nandroid\ntest01\n2020/04/23,2020/05/04\n' >> ./data/external/example01 && echo '13dbc8a3-dae3-4834-823a-4bc96a7d459d\nios\ntest02\n2020/04/23,2020/05/04\n' >> ./data/external/example02"
|
||||
|
||||
# rule download_participants:
|
||||
# params:
|
||||
# group = config["DOWNLOAD_PARTICIPANTS"]["GROUP"],
|
||||
# ignored_device_ids = config["DOWNLOAD_PARTICIPANTS"]["IGNORED_DEVICE_IDS"],
|
||||
# timezone = config["TIMEZONE"]
|
||||
# priority: 1
|
||||
# script:
|
||||
# "../src/data/download_participants.R"
|
||||
# Builds one participant file per participant by running the R script below.
# Invoke it by name: snakemake -j1 create_participants_files
rule create_participants_files:
    input:
        # The CSV is an input for every source type other than AWARE_DEVICE_TABLE;
        # for AWARE_DEVICE_TABLE the rule has no input file.
        participants_file = config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["CSV_FILE_PATH"] if config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["TYPE"] != "AWARE_DEVICE_TABLE" else []
    params:
        # The whole CREATE_PARTICIPANT_FILES section is handed to the script
        config = config["CREATE_PARTICIPANT_FILES"]
    script:
        "../src/data/create_participants_files.R"
|
||||
|
||||
rule download_phone_data:
|
||||
input:
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
# Creates one YAML participant file per participant under
# ./data/external/participant_files/. Participants are read either from the
# `aware_device` table of a MySQL database or from a CSV file, depending on
# SOURCE$TYPE in the CREATE_PARTICIPANT_FILES section of the config.
source("renv/activate.R")

library(RMySQL)
library(stringr)
library(purrr)
library(readr)
library("dplyr", warn.conflicts = FALSE)

config <- snakemake@params[["config"]]
group <- config$SOURCE$DATABASE_GROUP
timezone <- config$SOURCE$TIMEZONE
phone_device_id_column <- config$PHONE_SECTION$DEVICE_ID_COLUMN
fitbit_device_id_column <- config$FITBIT_SECTION$DEVICE_ID_COLUMN
# Each flag must be read from its own section
add_phone_section <- config$PHONE_SECTION$ADD
add_fitbit_section <- config$FITBIT_SECTION$ADD
phone_ignored <- config$PHONE_SECTION$IGNORED_DEVICE_IDS
fitbit_ignored <- config$FITBIT_SECTION$IGNORED_DEVICE_IDS

rmysql.settingsfile <- "./.env"
output_folder <- "./data/external/participant_files/"

# Replaces with NA every value of `column` in `data` that is listed in `ignored`
drop_ignored_ids <- function(data, column, ignored) {
  data %>%
    mutate(!!column := if_else(!!rlang::sym(column) %in% ignored, NA_character_, !!rlang::sym(column)))
}

if (config$SOURCE$TYPE == "AWARE_DEVICE_TABLE") {
  if (isTRUE(add_fitbit_section)) {
    # The Fitbit id column is aliased so that it never clashes with the phone
    # column, even when both are configured with the same name
    query <- paste("SELECT", phone_device_id_column, ",", fitbit_device_id_column, " as _temp_fitbit_id, brand, label, timestamp FROM aware_device order by timestamp asc")
    fitbit_device_id_column <- "_temp_fitbit_id"
  } else {
    query <- paste("SELECT ", phone_device_id_column, ", brand, label, timestamp FROM aware_device order by timestamp asc")
  }

  database <- dbConnect(MySQL(), default.file = rmysql.settingsfile, group = group)
  # Close the connection even if the query fails
  participants <- tryCatch(dbGetQuery(database, query), finally = dbDisconnect(database))

  participants <- participants %>%
    mutate(pid = sprintf("p%02d", row_number()), # p01, p02, ..., p10, ...
           platform = if_else(brand == "iPhone", "ios", "android"),
           brand = NULL,
           label = iconv(if_else(label == "", "EMPTY_LABEL", label), from = "UTF-8", to = "UTF-8", sub = ""),
           # timestamp is in milliseconds since the Unix epoch
           start_date = format(as.POSIXct(timestamp / 1000, origin = "1970-01-01", tz = timezone), "%Y-%m-%d"),
           end_date = format(Sys.Date(), "%Y-%m-%d"))
} else if (config$SOURCE$TYPE == "CSV_FILE") {
  participants <- read_csv(config$SOURCE$CSV_FILE_PATH,
                           col_types = cols_only(device_id = "c", pid = "c", label = "c", platform = "c",
                                                 start_date = col_date(format = "%Y-%m-%d"),
                                                 end_date = col_date(format = "%Y-%m-%d"),
                                                 fitbit_id = "c"))
  # Cells may hold several values separated by ";"; every separator is turned
  # into "," so the values can be written as a YAML list
  participants <- participants %>%
    mutate(!!phone_device_id_column := str_replace_all(!!rlang::sym(phone_device_id_column), ";", ","),
           platform = str_replace_all(platform, ";", ","))
} else {
  stop("Unsupported CREATE_PARTICIPANT_FILES SOURCE TYPE: ", config$SOURCE$TYPE,
       ". Use AWARE_DEVICE_TABLE or CSV_FILE", call. = FALSE)
}

participants <- drop_ignored_ids(participants, phone_device_id_column, phone_ignored)
# The Fitbit column is only guaranteed to exist when the Fitbit section is
# requested; filtering it otherwise could blank the phone ids when both
# sections share the same column name
if (isTRUE(add_fitbit_section)) {
  participants <- drop_ignored_ids(participants, fitbit_device_id_column, fitbit_ignored)
}

empty_phone <- c("PHONE:", " DEVICE_IDS:", " PLATFORMS:", " LABEL:", " START_DATE:", " END_DATE:")
empty_fitbit <- c("FITBIT:", " DEVICE_IDS:", " LABEL:", " START_DATE:", " END_DATE:")

dir.create(output_folder, recursive = TRUE, showWarnings = FALSE)

# One file per participant; a section is left empty when it is disabled or
# when the participant has no (non-ignored) device id for it
walk(seq_len(nrow(participants)), function(i) {
  row <- participants[i, ]
  lines <- c()

  if (isTRUE(add_phone_section) && !is.na(row[[phone_device_id_column]])) {
    lines <- append(lines, c("PHONE:",
                             paste0(" DEVICE_IDS: [", row[[phone_device_id_column]], "]"),
                             paste0(" PLATFORMS: [", row$platform, "]"),
                             paste(" LABEL:", row$label),
                             paste(" START_DATE:", row$start_date),
                             paste(" END_DATE:", row$end_date)))
  } else {
    lines <- append(lines, empty_phone)
  }

  if (isTRUE(add_fitbit_section) && !is.na(row[[fitbit_device_id_column]])) {
    lines <- append(lines, c("FITBIT:",
                             paste0(" DEVICE_IDS: [", row[[fitbit_device_id_column]], "]"),
                             paste(" LABEL:", row$label),
                             paste(" START_DATE:", row$start_date),
                             paste(" END_DATE:", row$end_date)))
  } else {
    lines <- append(lines, empty_fitbit)
  }

  writeLines(lines, paste0(output_folder, row$pid, ".yaml"))
})
|
|
@ -1,38 +0,0 @@
|
|||
# Downloads the participants registered in the `aware_device` table, writes one
# file per non-ignored device to ./data/external/ (device id, platform, label
# and "start_date,end_date", one per line) and updates the PIDS line of
# ./config.yaml with the ids of the files that were written.
source("renv/activate.R")

library(RMySQL)

group <- snakemake@params[["group"]]
ignored_device_ids <- snakemake@params[["ignored_device_ids"]]
timezone <- snakemake@params[["timezone"]]
rmysql.settingsfile <- "./.env"

stopDB <- dbConnect(MySQL(), default.file = rmysql.settingsfile, group = group)
query <- "SELECT device_id, brand, label, timestamp FROM aware_device order by timestamp asc"
# Close the connection even if the query fails
participants <- tryCatch(dbGetQuery(stopDB, query), finally = dbDisconnect(stopDB))

end_date <- format(Sys.Date(), "%Y/%m/%d")

# Per-participant fields, computed for the whole table at once
platforms <- ifelse(participants$brand == "iPhone", "ios", "android")
labels <- ifelse(participants$label == "", "EMPTY_LABEL", participants$label)
labels <- iconv(labels, from = "UTF-8", to = "UTF-8", sub = "")
# timestamp is in milliseconds since the Unix epoch
start_dates <- format(as.POSIXct(participants$timestamp / 1000, origin = "1970-01-01", tz = timezone), "%Y/%m/%d")

# A pid reflects the row position in the table (p01, p02, ...), so ignored
# devices leave gaps in the numbering
all_pids <- sprintf("p%02d", seq_len(nrow(participants)))
kept <- !(participants$device_id %in% ignored_device_ids)
pids <- all_pids[kept]

for (id in which(kept)) {
  writeLines(c(participants$device_id[[id]],
               platforms[[id]],
               labels[[id]],
               paste0(start_dates[[id]], ",", end_date)),
             paste0("./data/external/", all_pids[[id]]))
}

# Rewrite every line of the config that starts with "PIDS:"
file_lines <- readLines("./config.yaml")
file_lines[startsWith(file_lines, "PIDS:")] <- paste0("PIDS: [", paste(pids, collapse = ", "), "]")
writeLines(file_lines, con = "./config.yaml")
|
Loading…
Reference in New Issue