rapids/src/data/unify_utils.R

198 lines
11 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

library("dplyr", warn.conflicts = F)
library(stringr)
unify_ios_screen <- function(ios_screen){
# In Android we only process UNLOCK to OFF episodes. In iOS we only process UNLOCK to LOCKED episodes,
# thus, we replace LOCKED with OFF episodes (2 to 0) so we can use Android's code for iOS
ios_screen <- ios_screen %>%
# only keep consecutive pairs of 3,2 events
filter( (screen_status == 3 & lead(screen_status) == 2) | (screen_status == 2 & lag(screen_status) == 3) ) %>%
mutate(screen_status = replace(screen_status, screen_status == 2, 0))
return(ios_screen)
}
unify_ios_battery <- function(ios_battery){
# We only need to unify battery data for iOS client V1. V2 does it out-of-the-box
# V1 will not have rows where battery_status is equal to 4
if(nrow(ios_battery %>% filter(battery_status == 4)) == 0)
ios_battery <- ios_battery %>%
mutate(battery_status = replace(battery_status, battery_status == 3, 5),
battery_status = replace(battery_status, battery_status == 1, 3))
return(ios_battery)
}
unify_ios_calls <- function(ios_calls){
# Androids call types 1=incoming, 2=outgoing, 3=missed
# iOS' call status 1=incoming, 2=connected, 3=dialing, 4=disconnected
# iOS' call types based on call status: (1,2,4)=incoming=1, (3,2,4)=outgoing=2, (1,4) or (3,4)=missed=3
# Sometimes (due to a possible bug in Aware) sequences get logged on the exact same timestamp, thus 3-item sequences can be 2,3,4 or 3,2,4
# Even tho iOS stores the duration of ringing/dialing for missed calls, we set it to 0 to match Android
ios_calls <- ios_calls %>%
arrange(trace, timestamp, call_type) %>%
group_by(trace) %>%
# search for the disconnect event, as it is common to outgoing, received and missed calls
mutate(completed_call = ifelse(call_type == 4, 2, 0),
# assign the same ID to all events before a 4
completed_call = cumsum(c(1, head(completed_call, -1) != tail(completed_call, -1))),
# hack to match ID of last event (4) to that of the previous rows
completed_call = ifelse(call_type == 4, completed_call - 1, completed_call))
# We check utc_date_time and local_date_time exist because sometimes we call this function from
# download_dataset to unify multi-platform participants. At that point such time columns are missing
if("utc_date_time" %in% colnames(ios_calls) && "local_date_time" %in% colnames(ios_calls)){
ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), # collapse all events before a 4
# sanity check, timestamp_diff should be equal or close to duration sum
# timestamp_diff = trunc((last(timestamp) - first(timestamp)) / 1000)
# use call_duration = last(call_duration) if you want duration from pick up to hang up
# use call_duration = sum(call_duration) if you want duration from dialing/ringing to hang up
call_duration = last(call_duration),
timestamp = first(timestamp),
utc_date_time = first(utc_date_time),
local_date_time = first(local_date_time),
local_date = first(local_date),
local_time = first(local_time),
local_hour = first(local_hour),
local_minute = first(local_minute),
local_timezone = first(local_timezone),
assigned_segments = first(assigned_segments))
}
else {
ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), call_duration = sum(call_duration), timestamp = first(timestamp))
}
ios_calls <- ios_calls %>% mutate(call_type = case_when(
call_type_sequence == "1,2,4" | call_type_sequence == "2,1,4" ~ 1, # incoming
call_type_sequence == "1,4" ~ 3, # missed
call_type_sequence == "3,2,4" | call_type_sequence == "2,3,4" ~ 2, # outgoing
call_type_sequence == "3,4" ~ 4, # outgoing missed, we create this temp missed state to assign a duration of 0 below
TRUE ~ -1), # other, call sequences without a disconnect (4) event are discarded
# assign a duration of 0 to incoming and outgoing missed calls
call_duration = ifelse(call_type == 3 | call_type == 4, 0, call_duration),
# get rid of the temp missed call type, set to 2 to match Android. See https://github.com/carissalow/rapids/issues/79
call_type = ifelse(call_type == 4, 2, call_type)
) %>%
# discard sequences without an event 4 (disconnect)
filter(call_type > 0) %>%
ungroup() %>%
arrange(timestamp)
return(ios_calls)
}
clean_ios_activity_column <- function(ios_gar){
ios_gar <- ios_gar %>%
mutate(activities = str_replace_all(activities, pattern = '("|\\[|\\])', replacement = ""))
existent_multiple_activities <- ios_gar %>%
filter(str_detect(activities, ",")) %>%
group_by(activities) %>%
summarise(mutiple_activities = unique(activities)) %>%
pull(mutiple_activities)
known_multiple_activities <- c("stationary,automotive")
unkown_multiple_actvities <- setdiff(existent_multiple_activities, known_multiple_activities)
if(length(unkown_multiple_actvities) > 0){
stop(paste0("There are unkwown combinations of ios activities, you need to implement the decision of the ones to keep: ", unkown_multiple_actvities))
}
ios_gar <- ios_gar %>%
mutate(activities = str_replace_all(activities, pattern = "stationary,automotive", replacement = "automotive"))
return(ios_gar)
}
unify_ios_activity_recognition <- function(ios_gar){
# We only need to unify Google Activity Recognition data for iOS
# discard rows where activities column is blank
ios_gar <- ios_gar[-which(ios_gar$activities == ""), ]
# clean "activities" column of ios_gar
ios_gar <- clean_ios_activity_column(ios_gar)
# make it compatible with android version: generate "activity_name" and "activity_type" columns
ios_gar <- ios_gar %>%
mutate(activity_name = case_when(activities == "automotive" ~ "in_vehicle",
activities == "cycling" ~ "on_bicycle",
activities == "walking" | activities == "running" ~ "on_foot",
activities == "stationary" ~ "still"),
activity_type = case_when(activities == "automotive" ~ 0,
activities == "cycling" ~ 1,
activities == "walking" | activities == "running" ~ 2,
activities == "stationary" ~ 3,
activities == "unknown" ~ 4))
return(ios_gar)
}
unify_ios_conversation <- function(conversation){
if(nrow(conversation) > 0){
duration_check <- conversation %>%
select(double_convo_start, double_convo_end) %>%
mutate(start_is_seconds = double_convo_start <= 9999999999,
end_is_seconds = double_convo_end <= 9999999999) # Values smaller than 9999999999 are in seconds instead of milliseconds
start_end_in_seconds = sum(duration_check$start_is_seconds) + sum(duration_check$end_is_seconds)
if(start_end_in_seconds > 0) # convert seconds to milliseconds
conversation <- conversation %>% mutate(double_convo_start = double_convo_start * 1000, double_convo_end = double_convo_end * 1000)
}
return(conversation)
}
# This function is used in download_dataset.R
unify_raw_data <- function(dbEngine, sensor_table, sensor, timestamp_filter, aware_multiplatform_tables, device_ids, platforms){
# If platforms is 'multiple', fetch each device_id's platform from aware_device, otherwise, use those given by the user
if(length(platforms) == 1 && platforms == "multiple")
devices_platforms <- dbGetQuery(dbEngine, paste0("SELECT device_id,brand FROM aware_device WHERE device_id IN ('", paste0(device_ids, collapse = "','"), "')")) %>%
mutate(platform = ifelse(brand == "iPhone", "ios", "android"))
else
devices_platforms <- data.frame(device_id = device_ids, platform = platforms)
# Get existent tables in database
available_tables_in_db <- dbGetQuery(dbEngine, paste0("SELECT table_name FROM information_schema.tables WHERE table_schema='", dbGetInfo(dbEngine)$dbname,"'"))[[1]]
if(!any(sensor_table %in% available_tables_in_db))
stop(paste0("You requested data from these table(s) ", paste0(sensor_table, collapse=", "), " but they don't exist in your database ", dbGetInfo(dbEngine)$dbname))
# Parse the table names for activity recognition and conversation plugins because they are different between android and ios
ar_tables <- setNames(aware_multiplatform_tables[1:2], c("android", "ios"))
conversation_tables <- setNames(aware_multiplatform_tables[3:4], c("android", "ios"))
participants_sensordata <- list()
for(i in 1:nrow(devices_platforms)) {
row <- devices_platforms[i,]
device_id <- row$device_id
platform <- row$platform
# Handle special cases when tables for the same sensor have different names for Android and iOS (AR and conversation)
if(length(sensor_table) == 1)
table <- sensor_table
else if(all(sensor_table == ar_tables))
table <- ar_tables[[platform]]
else if(all(sensor_table == conversation_tables))
table <- conversation_tables[[platform]]
if(table %in% available_tables_in_db){
query <- paste0("SELECT * FROM ", table, " WHERE device_id IN ('", device_id, "')", timestamp_filter)
sensor_data <- unify_data(dbGetQuery(dbEngine, query), sensor, platform)
participants_sensordata <- append(participants_sensordata, list(sensor_data))
}else{
warning(paste0("Missing ", table, " table. We unified the data from ", paste0(devices_platforms$device_id, collapse = " and "), " but without records from this missing table for ", device_id))
}
}
unified_data <- bind_rows(participants_sensordata)
return(unified_data)
}
# This function is used in unify_ios_android.R and unify_raw_data function
unify_data <- function(sensor_data, sensor, platform){
if(sensor == "phone_calls" & platform == "ios"){
sensor_data = unify_ios_calls(sensor_data)
} else if(sensor == "phone_battery" & platform == "ios"){
sensor_data = unify_ios_battery(sensor_data)
} else if(sensor == "phone_activity_recognition" & platform == "ios"){
sensor_data = unify_ios_activity_recognition(sensor_data)
} else if(sensor == "phone_screen" & platform == "ios"){
sensor_data = unify_ios_screen(sensor_data)
} else if(sensor == "phone_conversation" & platform == "ios"){
sensor_data = unify_ios_conversation(sensor_data)
}
return(sensor_data)
}