Fix datetime labels of event segments across multiple tzs

pull/130/head
JulioV 2021-04-06 13:58:58 -04:00
parent 8909876cff
commit 1025e6d9d8
5 changed files with 51 additions and 14 deletions

View File

@ -271,6 +271,19 @@ Time segments (or epochs) are the time windows on which you want to extract beha
The three `mood` segments are 1 hour, 1 day and 7 days long and have no shift. In addition, these `mood` segments are grouped together, meaning that although RAPIDS will compute features on each one of them, some necessary information to compute a few of such features will be extracted from all three segments, for example the phone contact that called a participant the most or the location clusters visited by a participant. The three `mood` segments are 1 hour, 1 day and 7 days long and have no shift. In addition, these `mood` segments are grouped together, meaning that although RAPIDS will compute features on each one of them, some necessary information to compute a few of such features will be extracted from all three segments, for example the phone contact that called a participant the most or the location clusters visited by a participant.
??? info "Date time labels of event segments"
In the final feature file, you will find a row per event segment. The `local_segment` column of each row has a `label`, a start date-time string, and an end date-time string.
```bash
weeklysurvey2060#2020-09-12 01:00:00,2020-09-18 23:59:59
```
All sensor data is always segmented based on timestamps, and the date-time strings are attached for informative purposes. For example, you can plot your features based on these strings.
When you configure RAPIDS to work with a single time zone, that time zone code is used to convert the start/end timestamps (the ones you typed in the event segments file) into start/end date-time strings. However, when you configure RAPIDS to work with multiple time zones, RAPIDS will, for each participant, use the most common time zone across all of that participant's devices to do the conversion. The most common time zone is the one in which a participant spent the most time.
In practical terms, this means that the date-time strings of event segments that happened in uncommon time zones will have shifted start/end date-time labels. However, the data within each segment was correctly filtered based on timestamps.
### Segment Examples ### Segment Examples
=== "5-minutes" === "5-minutes"

View File

@ -29,14 +29,9 @@ infer_event_segments <- function(tz, segments){
return(inferred) return(inferred)
} }
assign_to_event_segments <- function(sensor_data, time_segments){ assign_to_event_segments <- function(sensor_data, time_segments, most_common_tz){
inferred_time_segments <- infer_event_segments(most_common_tz, time_segments)
sensor_data <- sensor_data %>% sensor_data <- sensor_data %>%
group_by(local_timezone) %>% assign_rows_to_segments(inferred_time_segments) %>%
nest() %>% arrange(timestamp)
mutate(inferred_time_segments = map(local_timezone, infer_event_segments, time_segments),
data = map2(data, inferred_time_segments, assign_rows_to_segments)) %>%
select(-inferred_time_segments) %>%
unnest(data) %>%
arrange(timestamp) %>%
ungroup()
} }

View File

@ -79,6 +79,31 @@ infer_tz_codes_from_phones <- function(data_device_ids, tz_codes, participant_fi
data_tz_codes data_tz_codes
} }
# Collect the device ids listed under the DEVICE_IDS entry of every element of
# a participant structure.
#
# participant_data: a list with one element per device section; elements that
#   have no DEVICE_IDS name are ignored.
# Returns the DEVICE_IDS values concatenated in element order, or NULL when no
#   element has a DEVICE_IDS entry.
get_devices_ids <- function(participant_data){
  ids_per_device <- lapply(participant_data, function(device){
    # Elements without names (NULL, bare scalars) cannot hold DEVICE_IDS
    if("DEVICE_IDS" %in% names(device))
      device[["DEVICE_IDS"]]
    else
      NULL
  })
  # unname() keeps the outer element names from being pasted onto the ids;
  # do.call(c, list()) yields NULL, matching the "nothing found" result
  do.call(c, unname(ids_per_device))
}
# Resolve the time zone code used to label a participant's event segments.
#
# tz_codes_file: path to a CSV that is read with read.csv and filtered on its
#   device_id column.
# participant_file: path to the participant YAML file; its DEVICE_IDS entries
#   select the rows of the CSV that belong to this participant.
# Returns the tzcode of the interval with the largest
#   end_timestamp - timestamp among those produced by buils_tz_intervals, or
#   "UTC" when no interval is selected.
# NOTE(review): this picks the longest single interval, not the time zone with
#   the largest total duration -- confirm this is the intended meaning of
#   "most common".
get_participant_most_common_tz <- function(tz_codes_file, participant_file){
  all_tz_codes <- read.csv(tz_codes_file)
  participant_device_ids <- get_devices_ids(read_yaml(participant_file))
  own_tz_codes <- filter(all_tz_codes, device_id %in% participant_device_ids)

  tz_intervals <- buils_tz_intervals(own_tz_codes, "all")
  tz_intervals <- mutate(tz_intervals, duration = end_timestamp - timestamp)

  # Keep only the first row when several intervals tie for the maximum
  longest_interval <- head(filter(tz_intervals, duration == max(duration)), 1)
  candidate_tz <- pull(longest_interval, tzcode)

  # Fall back to UTC when nothing was selected
  if(length(candidate_tz) > 0)
    return(candidate_tz)
  return("UTC")
}
# TODO include CSV timezone file in rule # TODO include CSV timezone file in rule
multiple_time_zone_assignment <- function(sensor_data, timezone_parameters, device_type, pid, participant_file){ multiple_time_zone_assignment <- function(sensor_data, timezone_parameters, device_type, pid, participant_file){
if(nrow(sensor_data) == 0) if(nrow(sensor_data) == 0)

View File

@ -16,7 +16,7 @@ assign_rows_to_segments <- function(data, segments){
data data
} }
assign_to_time_segment <- function(sensor_data, time_segments, time_segments_type, include_past_periodic_segments){ assign_to_time_segment <- function(sensor_data, time_segments, time_segments_type, include_past_periodic_segments, most_common_tz){
if(nrow(sensor_data) == 0 || nrow(time_segments) == 0) if(nrow(sensor_data) == 0 || nrow(time_segments) == 0)
return(sensor_data %>% mutate(assigned_segments = NA)) return(sensor_data %>% mutate(assigned_segments = NA))
@ -28,7 +28,7 @@ assign_to_time_segment <- function(sensor_data, time_segments, time_segments_typ
} else if ( time_segments_type == "EVENT"){ } else if ( time_segments_type == "EVENT"){
source("src/data/datetime/assign_to_event_segments.R") source("src/data/datetime/assign_to_event_segments.R")
sensor_data <- assign_to_event_segments(sensor_data, time_segments) sensor_data <- assign_to_event_segments(sensor_data, time_segments, most_common_tz)
return(sensor_data) return(sensor_data)
} }
} }

View File

@ -111,14 +111,18 @@ readable_datetime <- function(){
validate_user_timezones(timezone_parameters) validate_user_timezones(timezone_parameters)
if(timezone_parameters$TYPE == "SINGLE") if(timezone_parameters$TYPE == "SINGLE"){
output <- input %>% mutate(local_timezone = timezone_parameters$SINGLE$TZCODE) output <- input %>% mutate(local_timezone = timezone_parameters$SINGLE$TZCODE)
else if(timezone_parameters$TYPE == "MULTIPLE") most_common_tz <- timezone_parameters$SINGLE$TZCODE
}
else if(timezone_parameters$TYPE == "MULTIPLE"){
output <- multiple_time_zone_assignment(input, timezone_parameters, device_type, pid, participant_file) output <- multiple_time_zone_assignment(input, timezone_parameters, device_type, pid, participant_file)
most_common_tz <- get_participant_most_common_tz(timezone_parameters$MULTIPLE$TZCODES_FILE, participant_file) # in assign_to_multiple_timezones.R
}
output <- create_mising_temporal_column(output, device_type) output <- create_mising_temporal_column(output, device_type)
output <- split_local_date_time(output) output <- split_local_date_time(output)
output <- assign_to_time_segment(output, time_segments, time_segments_type, include_past_periodic_segments) output <- assign_to_time_segment(output, time_segments, time_segments_type, include_past_periodic_segments, most_common_tz)
output <- filter_wanted_dates(output, participant_file, device_type) output <- filter_wanted_dates(output, participant_file, device_type)
output <- output %>% arrange(timestamp) output <- output %>% arrange(timestamp)