Update dbplyr to the latest version.

`distinct()` changed its behaviour between dbplyr 2.0.0 and 2.1.0.
Source specific container script.
2021-11-29 18:34:26 +01:00 · 2021-11-29 18:19:47 +01:00 · 2021-11-29 18:04:06 +01:00 · 2021-11-29 17:54:16 +01:00 · 2021-11-29 17:51:07 +01:00 · 2021-11-29 16:57:50 +01:00
5 changed files with 177 additions and 2 deletions
--- a/config.yaml
+++ b/config.yaml
@ -7,6 +7,7 @@ PIDS: [nokia_0000003]

 # See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files
 CREATE_PARTICIPANT_FILES:
+  USERNAMES_CSV: "data/external/example_usernames.csv"
  CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format
  PHONE_SECTION:
    ADD: True
--- a/renv.lock
+++ b/renv.lock
@ -319,10 +319,10 @@
    },
    "dbplyr": {
      "Package": "dbplyr",
-      "Version": "2.0.0",
+      "Version": "2.1.1",
      "Source": "Repository",
      "Repository": "CRAN",
-      "Hash": "714005206038b1dda74cb1de85029a20"
+      "Hash": "1f37fa4ab2f5f7eded42f78b9a887182"
    },
    "desc": {
      "Package": "desc",
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@ -4,6 +4,19 @@ rule create_example_participant_files:
    shell:
        "echo 'PHONE:\n  DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n  PLATFORMS: [android]\n  LABEL: test-01\n  START_DATE: 2020-04-23 00:00:00\n  END_DATE: 2020-05-04 23:59:59\nFITBIT:\n  DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n  LABEL: test-01\n  START_DATE: 2020-04-23 00:00:00\n  END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example01.yaml && echo 'PHONE:\n  DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n  PLATFORMS: [ios]\n  LABEL: test-02\n  START_DATE: 2020-04-23 00:00:00\n  END_DATE: 2020-05-04 23:59:59\nFITBIT:\n  DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n  LABEL: test-02\n  START_DATE: 2020-04-23 00:00:00\n  END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example02.yaml"

+# Builds the participants CSV (CREATE_PARTICIPANT_FILES.CSV_FILE_PATH) from the
+# list of usernames in CREATE_PARTICIPANT_FILES.USERNAMES_CSV by running
+# src/data/translate_usernames_into_participants_data.R.
+# The output path is the same config entry that create_participants_files
+# takes as its input.
+rule prepare_participants_csv:
+    input:
+        username_list = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"]
+    params:
+        # Settings of the phone data stream selected by PHONE_DATA_STREAMS.USE.
+        data_configuration = config["PHONE_DATA_STREAMS"][config["PHONE_DATA_STREAMS"]["USE"]],
+        # Hard-coded names of the database tables the script queries for
+        # participant IDs, device IDs and start/end timestamps, respectively.
+        participants_table = "participants",
+        device_id_table = "light_sensor",
+        start_end_date_table = "esm"
+    output:
+        participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"]
+    script:
+        "../src/data/translate_usernames_into_participants_data.R"
+
 rule create_participants_files:
    input:
        participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"] 
--- a/src/data/streams/aware_postgresql/container.R
+++ b/src/data/streams/aware_postgresql/container.R
@ -29,6 +29,7 @@ library(RPostgres)
 #   <stdin>:1:10: fatal error: libpq-fe.h: No such file or directory
 # compilation terminated.

+library(dbplyr)
 library(yaml)

 #' @description
@ -105,3 +106,101 @@ pull_data <- function(stream_parameters, device, sensor, sensor_container, colum
  return(sensor_data)
 }

+#' @description
+#' Gets participants' IDs for specified usernames.
+#'
+#' @param stream_parameters The PHONE_DATA_STREAMS key in config.yaml. Only DATABASE_GROUP is read here, to open the database connection.
+#' @param usernames A vector of usernames
+#' @param participants_container The name of the database table containing participants data, such as their username.
+#' @return A dataframe with the columns username and id for every row whose username is in usernames. A warning is raised if no row matched.
+
+pull_participants_ids <- function(stream_parameters, usernames, participants_container) {
+  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
+  # Close the connection on every exit path, including a failing query.
+  on.exit(dbDisconnect(dbEngine), add = TRUE)
+
+  # `!!` forces `usernames` to be evaluated locally, so it can never be
+  # mistaken for a column of the remote table.
+  query_participant_id <- tbl(dbEngine, participants_container) %>%
+    filter(username %in% !!usernames) %>%
+    select(username, id)
+
+  message(paste0("Executing the following query to get the participant's id: \n", sql_render(query_participant_id)))
+
+  participant_data <- query_participant_id %>% collect()
+
+  if (nrow(participant_data) == 0) {
+    # Collapse the vector first so that a single, readable warning is emitted.
+    warning(
+      "We could not find requested usernames (",
+      paste(usernames, collapse = ", "),
+      ") in ", participants_container,
+      call. = FALSE
+    )
+  }
+
+  return(participant_data)
+}
+
+#' @description
+#' Gets the distinct device IDs for specified participant IDs.
+#'
+#' @param stream_parameters The PHONE_DATA_STREAMS key in config.yaml. Only DATABASE_GROUP is read here, to open the database connection.
+#' @param participants_ids A vector of numeric participant IDs
+#' @param device_id_container The name of the database table which will be used to determine distinct device ID. Ideally, a table that reliably contains data, but not too much.
+#' @return A dataframe with the columns participant_id and device_id, one row per distinct combination. A warning is raised if no row matched.
+
+pull_participants_device_ids <- function(stream_parameters, participants_ids, device_id_container) {
+  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
+  # Close the connection on every exit path, including a failing query.
+  on.exit(dbDisconnect(dbEngine), add = TRUE)
+
+  # Name both columns explicitly instead of combining group_by() with
+  # distinct(): how grouping columns are treated by distinct() on a remote
+  # table differs between dbplyr versions.
+  query_device_id <- tbl(dbEngine, device_id_container) %>%
+    filter(participant_id %in% !!participants_ids) %>%
+    distinct(participant_id, device_id)
+
+  message(paste0("Executing the following query to get the distinct device IDs: \n", sql_render(query_device_id)))
+
+  device_ids <- query_device_id %>% collect()
+
+  if (nrow(device_ids) == 0) {
+    # Collapse the vector first so that a single, readable warning is emitted.
+    warning(
+      "We could not find device IDs for requested participant IDs (",
+      paste(participants_ids, collapse = ", "),
+      ") in ", device_id_container,
+      call. = FALSE
+    )
+  }
+
+  return(device_ids)
+}
+
+#' @description
+#' Gets start and end datetimes for specified participant IDs.
+#'
+#' @param stream_parameters The PHONE_DATA_STREAMS key in config.yaml. Only DATABASE_GROUP is read here, to open the database connection.
+#' @param participants_ids A vector of numeric participant IDs
+#' @param start_end_date_container The name of the database table which will be used to determine when a participant started and ended their participation. Briefing and debriefing EMAs can be meaningfully used here.
+#' @return A dataframe with the columns participant_id, datetime_start and datetime_end (both in UTC). A warning is raised if no row matched.
+
+pull_participants_start_end_dates <- function(stream_parameters, participants_ids, start_end_date_container) {
+  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
+  # Close the connection on every exit path, including a failing query.
+  on.exit(dbDisconnect(dbEngine), add = TRUE)
+
+  # Only strictly positive answer timestamps are considered; the earliest and
+  # latest one per participant are computed in the database.
+  query_timestamps <- tbl(dbEngine, start_end_date_container) %>%
+    filter(
+      participant_id %in% !!participants_ids,
+      double_esm_user_answer_timestamp > 0
+    ) %>%
+    group_by(participant_id) %>%
+    summarise(
+      timestamp_min = min(double_esm_user_answer_timestamp, na.rm = TRUE),
+      timestamp_max = max(double_esm_user_answer_timestamp, na.rm = TRUE)
+    ) %>%
+    select(participant_id, timestamp_min, timestamp_max)
+
+  message(paste0("Executing the following query to get the starting and ending datetimes: \n", sql_render(query_timestamps)))
+
+  start_end_timestamps <- query_timestamps %>% collect()
+
+  if (nrow(start_end_timestamps) == 0) {
+    # Collapse the vector first so that a single, readable warning is emitted.
+    warning(
+      "We could not find datetimes for requested participant IDs (",
+      paste(participants_ids, collapse = ", "),
+      ") in ", start_end_date_container,
+      call. = FALSE
+    )
+  }
+
+  # Timestamps are divided by 1000 before conversion, i.e. they are treated as
+  # milliseconds since the Unix epoch. as_datetime() is namespace-qualified so
+  # that this file does not rely on the caller having attached lubridate.
+  start_end_times <- start_end_timestamps %>%
+    mutate(
+      datetime_start = lubridate::as_datetime(timestamp_min / 1000, tz = "UTC"),
+      datetime_end = lubridate::as_datetime(timestamp_max / 1000, tz = "UTC")
+    ) %>%
+    select(-c(timestamp_min, timestamp_max))
+
+  return(start_end_times)
+}
+
+
--- a/src/data/translate_usernames_into_participants_data.R
+++ b/src/data/translate_usernames_into_participants_data.R
@ -0,0 +1,62 @@
+source("renv/activate.R")
+source("src/data/streams/aware_postgresql/container.R")
+
+library(RPostgres)
+library(magrittr)
+library(tidyverse)
+library(lubridate)
+
+#' @description
+#' Translates the usernames listed in the input CSV (column `label`) into the
+#' participants CSV: looks up each participant's ID, distinct device IDs and
+#' first/last answer datetimes in the database and writes one row per
+#' participant. All paths and table names are read from the `snakemake` object.
+#'
+#' Usernames that are not found in the participants table are dropped with a
+#' warning; the function stops if none of them is found.
+#'
+#' @return Called for its side effect of writing the participants CSV.
+prepare_participants_file <- function() {
+
+  username_list_csv_location <- snakemake@input[["username_list"]]
+
+  data_configuration <- snakemake@params[["data_configuration"]]
+  participants_container <- snakemake@params[["participants_table"]]
+  device_id_container <- snakemake@params[["device_id_table"]]
+  start_end_date_container <- snakemake@params[["start_end_date_table"]]
+
+  output_data_file <- snakemake@output[["participants_file"]]
+
+  # Named differently from the `platform` output column so that the constant
+  # can never be shadowed by a column inside mutate().
+  platform_name <- "android"
+  pid_format <- "p%03d"
+  datetime_format <- "%Y-%m-%d %H:%M:%S"
+
+  participant_data <- read_csv(username_list_csv_location, col_types = "c", progress = FALSE)
+  usernames <- participant_data$label
+
+  participant_ids <- pull_participants_ids(data_configuration, usernames, participants_container)
+  participant_data <- participant_data %>%
+    left_join(participant_ids, by = c("label" = "username")) %>%
+    rename(participant_id = id)
+
+  # Usernames missing from the database have no ID after the left join; they
+  # would otherwise end up as a "pNA" row without devices or dates.
+  unmatched <- is.na(participant_data$participant_id)
+  if (any(unmatched)) {
+    warning(
+      "Skipping usernames without a participant ID: ",
+      paste(participant_data$label[unmatched], collapse = ", "),
+      call. = FALSE
+    )
+    participant_data <- participant_data[!unmatched, ]
+  }
+  if (nrow(participant_data) == 0) {
+    stop(
+      "None of the usernames in ", username_list_csv_location,
+      " were found in ", participants_container,
+      call. = FALSE
+    )
+  }
+
+  # One list entry of distinct, non-missing device IDs per participant.
+  device_ids <- pull_participants_device_ids(data_configuration, participant_data$participant_id, device_id_container) %>%
+    group_by(participant_id) %>%
+    summarise(device_ids = list(unique(device_id[!is.na(device_id)])))
+  participant_data <- participant_data %>%
+    left_join(device_ids, by = "participant_id")
+
+  start_end_datetimes <- pull_participants_start_end_dates(data_configuration, participant_data$participant_id, start_end_date_container)
+  participant_data <- participant_data %>%
+    left_join(start_end_datetimes, by = "participant_id")
+
+  participant_data <- participant_data %>%
+    mutate(
+      pid = sprintf(pid_format, participant_id),
+      start_date = strftime(datetime_start, format = datetime_format, tz = "UTC", usetz = FALSE), #TODO Check what timezone is expected
+      end_date = strftime(datetime_end, format = datetime_format, tz = "UTC", usetz = FALSE),
+      empatica_id = "placeholder", #TODO Provide in file?
+      # paste() yields "" for a participant without devices (NULL entry).
+      device_id = map_chr(device_ids, ~ paste(.x, collapse = ";")),
+      number_of_devices = lengths(device_ids),
+      # The platform is repeated once per device, matching device_id.
+      platform = map_chr(number_of_devices, ~ paste(rep(platform_name, .x), collapse = ";")),
+      fitbit_id = ""
+    ) %>%
+    arrange(pid) %>%
+    select(pid, label, start_date, end_date, empatica_id, device_id, platform, fitbit_id)
+
+  write_csv(participant_data, output_data_file)
+}
+
+prepare_participants_file()
Author	SHA1	Message	Date
junos	b99a3c19ed	Update dbplyr to the latest version. distinct changed its behaviour from 2.0.0 to 2.1.0.	2021-11-29 18:34:26 +01:00
junos	04ad2d0b81	Source specific container script. It is probably not worth the effort of making this general.	2021-11-29 18:19:47 +01:00
junos	da5ff0f36e	Correct small errors in settings.	2021-11-29 18:04:06 +01:00
junos	35d9779026	Prepare the tibble in requested format. Write it to a CSV file.	2021-11-29 17:54:16 +01:00
junos	32025cbd8c	Start with a tibble from CSV.	2021-11-29 17:51:07 +01:00
junos	181e4f0118	Add parameters to yaml file. And use these in the prepare_participants_file function.	2021-11-29 16:57:50 +01:00
junos	39bd244511	[WIP] Prepare yaml files. These will be used to create participants files.	2021-11-24 19:11:19 +01:00
junos	ab84109d55	Prepare a function to compile participants data. It combines functions from container.R	2021-11-24 19:07:56 +01:00
junos	f9863ec622	Fix small mistakes.	2021-11-24 19:01:30 +01:00
junos	c1f56c61e8	Add a function to pull start and end datetimes.	2021-11-24 18:33:06 +01:00
junos	3acf6ece14	Add a function to pull device IDs.	2021-11-24 18:23:53 +01:00
junos	8b2717122d	Add a function to get participants' IDs.	2021-11-24 18:05:17 +01:00