# Project environment and shared helpers (paths are relative to the working directory)
source("renv/activate.R")
source("src/data/unify_utils.R")

# Packages used by this script
library(RMariaDB)
library(stringr)
library("dplyr", warn.conflicts = FALSE)
library(readr)
library(yaml)
library(lubridate)

# Strongly penalise scientific notation so large numbers (such as the
# millisecond timestamps pasted into SQL below) are printed in full
options(scipen = 999)

# Check that a participant's device ids and platforms are consistent.
#
# Rules enforced:
#   * one device id                       -> exactly one platform: 'android' or 'ios'
#   * several device ids, one platform    -> 'android', 'ios' or 'multiple'
#   * several device ids, many platforms  -> one platform per device id, each
#                                            being 'android' or 'ios'
#
# Arguments:
#   device_ids  vector of device ids
#   platforms   vector of platform labels
#
# Returns invisible NULL when the combination is valid, otherwise stops with
# an explanatory error. Combinations not listed above (e.g. no device ids)
# are not checked.
validate_deviceid_platforms <- function(device_ids, platforms) {
  # Label for error messages: the participant file path when the calling
  # script defined `participant_file`, otherwise a neutral placeholder
  participant_label <- if (exists("participant_file")) participant_file else "<unknown>"
  typed_platforms <- paste0(platforms, collapse = ",")
  device_platforms <- c("android", "ios")

  if (length(device_ids) == 1) {
    # length(platforms) != 1 also covers a missing (NULL) platform
    if (length(platforms) != 1 || !platforms %in% device_platforms) {
      stop(paste0("If you have 1 device_id, its platform should be 'android' or 'ios' but you typed: '", typed_platforms, "'. Participant file: ", participant_label))
    }
  } else if (length(device_ids) > 1 && length(platforms) == 1) {
    if (!platforms %in% c(device_platforms, "multiple")) {
      stop(paste0("If you have more than 1 device_id, platform should be 'android', 'ios' OR 'multiple' but you typed: '", typed_platforms, "'. Participant file: ", participant_label))
    }
  } else if (length(device_ids) > 1 && length(platforms) > 1) {
    if (length(device_ids) != length(platforms)) {
      stop(paste0("The number of device_ids should match the number of platforms. Participant file:", participant_label))
    }
    # Every listed platform has to be a real device platform; the previous
    # intersect-based test only failed when neither 'android' nor 'ios' appeared
    if (!all(platforms %in% device_platforms)) {
      stop(paste0("If you have more than 1 device_id and more than 1 platform, the platforms should be a mix of 'android' AND 'ios' but you typed: '", typed_platforms, "'. Participant file: ", participant_label))
    }
  }
  invisible(NULL)
}
# Decide whether a participant's data has to be unified across devices.
#
# Arguments:
#   dbEngine    open DBI connection; only queried when platforms is 'multiple'
#   device_ids  vector of device ids
#   platforms   vector of platform labels
#
# Returns TRUE when there is more than one device id and either
#   * more than one platform is listed, or
#   * the single listed platform is 'android' or 'ios', or
#   * the single listed platform is 'multiple' and the aware_device table
#     reports more than one distinct brand for these devices, one of them
#     being 'iPhone'.
# Returns FALSE otherwise.
is_multiplaform_participant <- function(dbEngine, device_ids, platforms) {
  if (length(device_ids) <= 1) {
    return(FALSE)
  }
  # Multiple android and ios platforms
  if (length(platforms) > 1) {
    return(TRUE)
  }
  if (length(platforms) != 1) {
    return(FALSE)
  }
  # The same platform (android, ios) for multiple devices
  if (platforms %in% c("android", "ios")) {
    return(TRUE)
  }
  if (!platforms %in% "multiple") {
    return(FALSE)
  }

  # Multiple platforms for multiple devices, we search the platform for every
  # device in the aware_device table. Device ids are bound as parameters
  # instead of being pasted into the SQL text.
  placeholders <- paste0(rep("?", length(device_ids)), collapse = ",")
  query <- paste0("SELECT device_id,brand FROM aware_device WHERE device_id IN (", placeholders, ")")
  devices_platforms <- dbGetQuery(dbEngine, query, params = unname(as.list(as.character(device_ids))))
  brands <- unique(devices_platforms[["brand"]])

  # Android phones have different brands so we check that we got at least two
  # different platforms and one of them is iPhone
  length(brands) > 1 && "iPhone" %in% brands
}
# Build the SQL fragment that restricts rows to the participant's date range.
#
# Arguments:
#   device_ids   vector of device ids, only used in log messages
#   participant  parsed participant file; PHONE$START_DATE and PHONE$END_DATE
#                are read from it
#   timezone     time zone name used to interpret both dates
#
# Returns " AND timestamp BETWEEN <start_ms> AND <end_ms>" covering
# START_DATE 00:00:00 to END_DATE 23:59:59 in `timezone`, expressed in
# milliseconds, or "" (no filtering) when either date is missing or cannot be
# parsed. Stops when the start date is after the end date.
get_timestamp_filter <- function(device_ids, participant, timezone) {
  devices_label <- paste0(device_ids, collapse = ",")

  # Read start and end date from the participant file to filter data within that range
  start_date <- ymd_hms(paste(participant$PHONE$START_DATE, "00:00:00"), tz = timezone, quiet = TRUE)
  end_date <- ymd_hms(paste(participant$PHONE$END_DATE, "23:59:59"), tz = timezone, quiet = TRUE)
  start_timestamp <- as.numeric(start_date) * 1000
  end_timestamp <- as.numeric(end_date) * 1000

  if (is.na(start_timestamp)) {
    message(paste("PHONE[START_DATE] was not provided or failed to parse (", participant$PHONE$START_DATE, "), all data for", devices_label, "is returned"))
    return("")
  }
  if (is.na(end_timestamp)) {
    message(paste("PHONE[END_DATE] was not provided or failed to parse (", participant$PHONE$END_DATE, "), all data for", devices_label, "is returned"))
    return("")
  }
  if (start_timestamp > end_timestamp) {
    # This aborts the run; no data is returned for an inverted range
    stop(paste("Start date has to be before end date in PHONE[START_DATE] and PHONE[END_DATE] (", start_date, ",", end_date, ") for", devices_label))
  }

  message(paste("Filtering data between", start_date, "and", end_date, "in", timezone, "for", devices_label))
  # Format explicitly so the SQL never receives scientific notation
  # (e.g. 1.62e+12), whatever the session's `scipen` option is
  paste0(
    " AND timestamp BETWEEN ", format(start_timestamp, scientific = FALSE),
    " AND ", format(end_timestamp, scientific = FALSE)
  )
}
# ---- Snakemake inputs, parameters and output ----
participant_file <- snakemake@input[[1]]
source <- snakemake@params[["source"]]
group <- source$DATABASE_GROUP
table <- snakemake@params[["table"]]
sensor <- snakemake@params[["sensor"]]
timezone <- snakemake@params[["timezone"]]
aware_multiplatform_tables <- str_split(snakemake@params[["aware_multiplatform_tables"]], ",")[[1]]
sensor_file <- snakemake@output[[1]]

# ---- Participant configuration ----
participant <- read_yaml(participant_file)
if (!"PHONE" %in% names(participant)) {
  stop(paste("The following participant file does not have a PHONE section, create one manually or automatically (see the docs):", participant_file))
}
device_ids <- participant$PHONE$DEVICE_IDS
# All rows are relabelled with the last listed device id further down
unified_device_id <- tail(device_ids, 1)
platforms <- participant$PHONE$PLATFORMS
validate_deviceid_platforms(device_ids, platforms)
timestamp_filter <- get_timestamp_filter(device_ids, participant, timezone)

# ---- Download, clean and save the sensor data ----
dbEngine <- dbConnect(MariaDB(), default.file = "./.env", group = group)
# `finally` guarantees the connection is closed even when a query or the
# CSV write fails
tryCatch({
  if (is_multiplaform_participant(dbEngine, device_ids, platforms)) {
    sensor_data <- unify_raw_data(dbEngine, table, sensor, timestamp_filter, aware_multiplatform_tables, device_ids, platforms)
  } else {
    # table has two elements for conversation and activity recognition (they
    # store data on a different table for ios and android)
    if (length(table) > 1) {
      table <- table[[toupper(platforms[1])]]
    }
    # Device ids are bound as query parameters rather than pasted into the
    # SQL text. With no device ids a single empty string is bound, which
    # matches the previous IN ('') behaviour.
    id_values <- if (length(device_ids) > 0) as.character(device_ids) else ""
    placeholders <- paste0(rep("?", length(id_values)), collapse = ",")
    query <- paste0("SELECT * FROM ", table, " WHERE ", source$DEVICE_ID_COLUMN, " IN (", placeholders, ")", timestamp_filter)
    sensor_data <- dbGetQuery(dbEngine, query, params = unname(as.list(id_values))) %>%
      rename(device_id = all_of(source$DEVICE_ID_COLUMN))
  }

  sensor_data <- sensor_data %>% arrange(timestamp)

  # Unify device_id
  sensor_data <- sensor_data %>% mutate(device_id = unified_device_id)

  # Removing blob_feature conversation column (it's loaded as a list column
  # that crashes write_csv)
  sensor_data <- sensor_data %>% select(-any_of("blob_feature"))

  # Dropping duplicates on all columns except for _id or id
  sensor_data <- sensor_data %>% distinct(!!!syms(setdiff(names(sensor_data), c("_id", "id"))))

  write_csv(sensor_data, sensor_file)
}, finally = dbDisconnect(dbEngine))