Compare commits

...

12 Commits

Author SHA1 Message Date
junos b99a3c19ed Update dbplyr to the latest version.
distinct changed its behaviour from 2.0.0 to 2.1.0.
2021-11-29 18:34:26 +01:00
junos 04ad2d0b81 Source specific container script.
It is probably not worth the effort of making this general.
2021-11-29 18:19:47 +01:00
junos da5ff0f36e Correct small errors in settings. 2021-11-29 18:04:06 +01:00
junos 35d9779026 Prepare the tibble in requested format.
Write it to a CSV file.
2021-11-29 17:54:16 +01:00
junos 32025cbd8c Start with a tibble from CSV. 2021-11-29 17:51:07 +01:00
junos 181e4f0118 Add parameters to yaml file.
And use these in the prepare_participants_file function.
2021-11-29 16:57:50 +01:00
junos 39bd244511 [WIP] Prepare yaml files.
These will be used to create participants files.
2021-11-24 19:11:19 +01:00
junos ab84109d55 Prepare a function to compile participants data.
It combines functions from container.R
2021-11-24 19:07:56 +01:00
junos f9863ec622 Fix small mistakes. 2021-11-24 19:01:30 +01:00
junos c1f56c61e8 Add a function to pull start and end datetimes. 2021-11-24 18:33:06 +01:00
junos 3acf6ece14 Add a function to pull device IDs. 2021-11-24 18:23:53 +01:00
junos 8b2717122d Add a function to get participants' IDs. 2021-11-24 18:05:17 +01:00
5 changed files with 177 additions and 2 deletions

View File

@ -7,6 +7,7 @@ PIDS: [nokia_0000003]
# See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files
CREATE_PARTICIPANT_FILES:
USERNAMES_CSV: "data/external/example_usernames.csv"
CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format
PHONE_SECTION:
ADD: True

View File

@ -319,10 +319,10 @@
},
"dbplyr": {
"Package": "dbplyr",
"Version": "2.0.0",
"Version": "2.1.1",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "714005206038b1dda74cb1de85029a20"
"Hash": "1f37fa4ab2f5f7eded42f78b9a887182"
},
"desc": {
"Package": "desc",

View File

@ -4,6 +4,19 @@ rule create_example_participant_files:
shell:
"echo 'PHONE:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n PLATFORMS: [android]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example01.yaml && echo 'PHONE:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n PLATFORMS: [ios]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example02.yaml"
# Builds the participants CSV from a list of usernames by running the R script named under `script:`.
rule prepare_participants_csv:
input:
# CSV listing the usernames to look up; its path is read from CREATE_PARTICIPANT_FILES in the config.
username_list = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"]
params:
# Settings of the phone data stream selected by PHONE_DATA_STREAMS -> USE.
data_configuration = config["PHONE_DATA_STREAMS"][config["PHONE_DATA_STREAMS"]["USE"]],
# Database table names handed to the script: where usernames are matched to participant IDs,
# where distinct device IDs are looked up, and where start/end timestamps are taken from.
participants_table = "participants",
device_id_table = "light_sensor",
start_end_date_table = "esm"
output:
# The generated participants CSV; the create_participants_files rule takes this same config path as its input.
participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"]
script:
"../src/data/translate_usernames_into_participants_data.R"
rule create_participants_files:
input:
participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"]

View File

@ -29,6 +29,7 @@ library(RPostgres)
# <stdin>:1:10: fatal error: libpq-fe.h: No such file or directory
# compilation terminated.
library(dbplyr)
library(yaml)
#' @description
@ -105,3 +106,101 @@ pull_data <- function(stream_parameters, device, sensor, sensor_container, colum
return(sensor_data)
}
#' @description
#' Gets participants' IDs for specified usernames.
#'
#' The database connection is opened inside the function and is always closed
#' on exit, including when the query fails.
#'
#' @param stream_parameters The PHONE_DATA_STREAMS key in config.yaml. If you need specific parameters add them there.
#' @param usernames A vector of usernames
#' @param participants_container The name of the database table containing participants data, such as their username.
#' @return A dataframe with the columns `username` and `id` for every requested username found in the table. A warning is raised if no username was found.
pull_participants_ids <- function(stream_parameters, usernames, participants_container) {
  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
  # Release the connection even if building or running the query errors.
  on.exit(dbDisconnect(dbEngine), add = TRUE)

  # `!!` inlines the local vector into the query (same convention as the
  # other pull_participants_* functions), so it cannot be mistaken for a column.
  query_participant_id <- tbl(dbEngine, participants_container) %>%
    filter(username %in% !!usernames) %>%
    select(username, id)

  message(paste0("Executing the following query to get the participant's id: \n", sql_render(query_participant_id)))
  participant_data <- query_participant_id %>% collect()

  if (nrow(participant_data) == 0) {
    # Collapse the vector first; otherwise paste() is vectorised over it and
    # the whole sentence is repeated once per username.
    warning(paste0(
      "We could not find requested usernames (",
      paste(usernames, collapse = ", "),
      ") in ", participants_container
    ))
  }

  return(participant_data)
}
#' @description
#' Gets distinct device IDs for specified participant IDs.
#'
#' The database connection is opened inside the function and is always closed
#' on exit, including when the query fails.
#'
#' @param stream_parameters The PHONE_DATA_STREAMS key in config.yaml. If you need specific parameters add them there.
#' @param participants_ids A vector of numeric participant IDs
#' @param device_id_container The name of the database table which will be used to determine distinct device ID. Ideally, a table that reliably contains data, but not too much.
#' @return A dataframe with a row matching each distinct device ID with a participant ID. A warning is raised if no device ID was found.
pull_participants_device_ids <- function(stream_parameters, participants_ids, device_id_container) {
  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
  # Release the connection even if building or running the query errors.
  on.exit(dbDisconnect(dbEngine), add = TRUE)

  # NOTE(review): the grouping is presumably what keeps participant_id next to
  # each distinct device_id in the result - confirm against the dbplyr version
  # pinned in renv.lock before changing it.
  query_device_id <- tbl(dbEngine, device_id_container) %>%
    filter(participant_id %in% !!participants_ids) %>%
    group_by(participant_id) %>%
    distinct(device_id, .keep_all = FALSE)

  message(paste0("Executing the following query to get the distinct device IDs: \n", sql_render(query_device_id)))
  device_ids <- query_device_id %>% collect()

  if (nrow(device_ids) == 0) {
    # Collapse the vector first; otherwise paste() is vectorised over it and
    # the whole sentence is repeated once per participant ID.
    warning(paste0(
      "We could not find device IDs for requested participant IDs (",
      paste(participants_ids, collapse = ", "),
      ") in ", device_id_container
    ))
  }

  return(device_ids)
}
#' @description
#' Gets start and end datetimes for specified participant IDs.
#'
#' The earliest and latest positive `double_esm_user_answer_timestamp` per
#' participant are taken as the start and end of participation. The database
#' connection is always closed on exit, including when the query fails.
#'
#' @param stream_parameters The PHONE_DATA_STREAMS key in config.yaml. If you need specific parameters add them there.
#' @param participants_ids A vector of numeric participant IDs
#' @param start_end_date_container The name of the database table which will be used to determine when a participant started and ended their participation. Briefing and debriefing EMAs can be meaningfully used here.
#' @return A dataframe relating participant IDs with their start and end datetimes (columns `participant_id`, `datetime_start`, `datetime_end`, both datetimes in UTC). A warning is raised if no timestamps were found.
pull_participants_start_end_dates <- function(stream_parameters, participants_ids, start_end_date_container) {
  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
  # Release the connection even if building or running the query errors.
  on.exit(dbDisconnect(dbEngine), add = TRUE)

  query_timestamps <- tbl(dbEngine, start_end_date_container) %>%
    filter(
      participant_id %in% !!participants_ids,
      # Non-positive timestamps are excluded so they cannot become the minimum.
      double_esm_user_answer_timestamp > 0
    ) %>%
    group_by(participant_id) %>%
    summarise(
      timestamp_min = min(double_esm_user_answer_timestamp, na.rm = TRUE),
      timestamp_max = max(double_esm_user_answer_timestamp, na.rm = TRUE)
    ) %>%
    select(participant_id, timestamp_min, timestamp_max)

  message(paste0("Executing the following query to get the starting and ending datetimes: \n", sql_render(query_timestamps)))
  start_end_timestamps <- query_timestamps %>% collect()

  if (nrow(start_end_timestamps) == 0) {
    # Collapse the vector first; otherwise paste() is vectorised over it and
    # the whole sentence is repeated once per participant ID.
    warning(paste0(
      "We could not find datetimes for requested participant IDs (",
      paste(participants_ids, collapse = ", "),
      ") in ", start_end_date_container
    ))
  }

  # The stored timestamps are divided by 1000 before conversion, i.e. they are
  # treated as milliseconds since the epoch.
  start_end_times <- start_end_timestamps %>%
    mutate(
      datetime_start = as_datetime(timestamp_min / 1000, tz = "UTC"),
      datetime_end = as_datetime(timestamp_max / 1000, tz = "UTC")
    ) %>%
    select(-c(timestamp_min, timestamp_max))

  return(start_end_times)
}

View File

@ -0,0 +1,62 @@
source("renv/activate.R")
source("src/data/streams/aware_postgresql/container.R")
library(RPostgres)
library(magrittr)
library(tidyverse)
library(lubridate)
#' @description
#' Compiles the participants CSV file from a list of usernames.
#'
#' Reads the usernames (column `label`) from the CSV given as the Snakemake
#' input, looks up each participant's ID, device IDs and first/last timestamps
#' in the database, and writes one row per username to the Snakemake output.
#' Multiple device IDs (and the matching platforms) are joined with ";".
#' All inputs come from the `snakemake` object, so the function takes no
#' arguments and is called for its side effect of writing the file.
prepare_participants_file <- function() {
  username_list_csv_location <- snakemake@input[["username_list"]]
  data_configuration <- snakemake@params[["data_configuration"]]
  participants_container <- snakemake@params[["participants_table"]]
  device_id_container <- snakemake@params[["device_id_table"]]
  start_end_date_container <- snakemake@params[["start_end_date_table"]]
  output_data_file <- snakemake@output[["participants_file"]]

  # Named differently from the `platform` output column so the value used
  # inside mutate() is unambiguous.
  default_platform <- "android"
  pid_format <- "p%03d"
  datetime_format <- "%Y-%m-%d %H:%M:%S"

  # Joins a vector into a single ";"-separated string. A participant without
  # any devices has a NULL/empty entry after the left join; that is mapped to
  # "" explicitly so map_chr() always receives exactly one string.
  collapse_with_semicolon <- function(x) {
    if (length(x) == 0) {
      return("")
    }
    str_c(x, collapse = ";")
  }

  participant_data <- read_csv(username_list_csv_location, col_types = "c", progress = FALSE)
  usernames <- participant_data$label

  participant_ids <- pull_participants_ids(data_configuration, usernames, participants_container)
  participant_data %<>%
    left_join(participant_ids, by = c("label" = "username")) %>%
    rename(participant_id = id)

  # One row per participant with all of their device IDs in a list column.
  device_ids <- pull_participants_device_ids(data_configuration, participant_data$participant_id, device_id_container)
  device_ids %<>%
    group_by(participant_id) %>%
    summarise(device_ids = list(unique(device_id)))
  participant_data %<>%
    left_join(device_ids, by = "participant_id")

  start_end_datetimes <- pull_participants_start_end_dates(data_configuration, participant_data$participant_id, start_end_date_container)
  participant_data %<>%
    left_join(start_end_datetimes, by = "participant_id")

  participant_data %<>%
    mutate(
      pid = sprintf(pid_format, participant_id),
      start_date = strftime(datetime_start, format = datetime_format, tz = "UTC", usetz = FALSE), #TODO Check what timezone is expected
      end_date = strftime(datetime_end, format = datetime_format, tz = "UTC", usetz = FALSE),
      empatica_id = "placeholder", #TODO Provide in file?
      device_id = map_chr(device_ids, collapse_with_semicolon),
      number_of_devices = map_int(device_ids, length),
      # The platform is repeated once per device so both lists line up.
      platform = map_chr(
        number_of_devices,
        function(n) collapse_with_semicolon(rep(default_platform, n))
      ),
      fitbit_id = ""
    ) %>%
    arrange(pid) %>%
    select(pid, label, start_date, end_date, empatica_id, device_id, platform, fitbit_id)

  write_csv(participant_data, output_data_file)
}

prepare_participants_file()