Fix datetime labels of event segments across multiple tzs

pull/130/head
JulioV 2021-04-06 13:58:58 -04:00
parent 8909876cff
commit 1025e6d9d8
5 changed files with 51 additions and 14 deletions

View File

@ -271,6 +271,19 @@ Time segments (or epochs) are the time windows on which you want to extract beha
The three `mood` segments are 1 hour, 1 day and 7 days long and have no shift. In addition, these `mood` segments are grouped together, meaning that although RAPIDS will compute features on each one of them, some necessary information to compute a few of such features will be extracted from all three segments, for example the phone contact that called a participant the most or the location clusters visited by a participant.
??? info "Date time labels of event segments"
In the final feature file, you will find a row per event segment. The `local_segment` column of each row has a `label`, a start date-time string, and an end date-time string.
```bash
weeklysurvey2060#2020-09-12 01:00:00,2020-09-18 23:59:59
```
All sensor data is always segmented based on timestamps, and the date-time strings are attached for informative purposes. For example, you can plot your features based on these strings.
When you configure RAPIDS to work with a single time zone, that time zone code is used to convert the start/end timestamps you typed in the event segments file into start/end date-time strings. However, when you configure RAPIDS to work with multiple time zones, RAPIDS uses each participant's most common time zone to do the conversion — that is, the time zone in which the participant spent the most time across all of their devices.
In practical terms, this means that event segments that happened in a time zone other than the participant's most common one will show shifted start/end date-time labels. However, the data within each segment is still correctly filtered based on timestamps.
### Segment Examples
=== "5-minutes"

View File

@ -29,14 +29,9 @@ infer_event_segments <- function(tz, segments){
return(inferred)
}
assign_to_event_segments <- function(sensor_data, time_segments){
assign_to_event_segments <- function(sensor_data, time_segments, most_common_tz){
inferred_time_segments <- infer_event_segments(most_common_tz, time_segments)
sensor_data <- sensor_data %>%
group_by(local_timezone) %>%
nest() %>%
mutate(inferred_time_segments = map(local_timezone, infer_event_segments, time_segments),
data = map2(data, inferred_time_segments, assign_rows_to_segments)) %>%
select(-inferred_time_segments) %>%
unnest(data) %>%
arrange(timestamp) %>%
ungroup()
assign_rows_to_segments(inferred_time_segments) %>%
arrange(timestamp)
}

View File

@ -79,6 +79,31 @@ infer_tz_codes_from_phones <- function(data_device_ids, tz_codes, participant_fi
data_tz_codes
}
# Collect every DEVICE_IDS entry across the device sections (PHONE, FITBIT, ...)
# of a parsed participant YAML file.
#
# @param participant_data A named list as returned by read_yaml() on a
#   participant file; each element may contain a DEVICE_IDS vector.
# @return A character vector with all device ids, or NULL when none exist
#   (matching the previous behavior, where an empty c() is NULL).
get_devices_ids <- function(participant_data){
  # unlist(lapply(...)) avoids growing a vector with c() inside a loop,
  # which copies the accumulator on every append (O(n^2)).
  devices_ids <- unlist(
    lapply(participant_data, function(device){
      if("DEVICE_IDS" %in% names(device)) device[["DEVICE_IDS"]] else NULL
    }),
    use.names = FALSE
  )
  return(devices_ids)
}
# Determine the time zone in which a participant spent the most time.
#
# @param tz_codes_file CSV with device_id/tzcode/timestamp rows for the study.
# @param participant_file YAML participant file listing the DEVICE_IDS to keep.
# @return A single tz code string; "UTC" when the participant has no tz rows.
get_participant_most_common_tz <- function(tz_codes_file, participant_file){
  study_tz_codes <- read.csv(tz_codes_file)
  participant_device_ids <- get_devices_ids(read_yaml(participant_file))
  participant_tz_codes <- study_tz_codes %>%
    filter(device_id %in% participant_device_ids)
  # NOTE(review): buils_tz_intervals looks like a typo of build_tz_intervals;
  # it is defined elsewhere, so renaming it needs a coordinated change.
  # Each interval carries timestamp/end_timestamp bounds; keep the single
  # longest one (ties broken by taking the first row, as before).
  longest_stay_tz <- buils_tz_intervals(participant_tz_codes, "all") %>%
    mutate(duration = end_timestamp - timestamp) %>%
    filter(duration == max(duration)) %>%
    head(1) %>%
    pull(tzcode)
  # Fall back to UTC when no interval exists for this participant.
  if(length(longest_stay_tz) == 0)
    longest_stay_tz <- "UTC"
  return(longest_stay_tz)
}
# TODO include CSV timezone file in rule
multiple_time_zone_assignment <- function(sensor_data, timezone_parameters, device_type, pid, participant_file){
if(nrow(sensor_data) == 0)

View File

@ -16,7 +16,7 @@ assign_rows_to_segments <- function(data, segments){
data
}
assign_to_time_segment <- function(sensor_data, time_segments, time_segments_type, include_past_periodic_segments){
assign_to_time_segment <- function(sensor_data, time_segments, time_segments_type, include_past_periodic_segments, most_common_tz){
if(nrow(sensor_data) == 0 || nrow(time_segments) == 0)
return(sensor_data %>% mutate(assigned_segments = NA))
@ -28,7 +28,7 @@ assign_to_time_segment <- function(sensor_data, time_segments, time_segments_typ
} else if ( time_segments_type == "EVENT"){
source("src/data/datetime/assign_to_event_segments.R")
sensor_data <- assign_to_event_segments(sensor_data, time_segments)
sensor_data <- assign_to_event_segments(sensor_data, time_segments, most_common_tz)
return(sensor_data)
}
}

View File

@ -111,14 +111,18 @@ readable_datetime <- function(){
validate_user_timezones(timezone_parameters)
if(timezone_parameters$TYPE == "SINGLE")
if(timezone_parameters$TYPE == "SINGLE"){
output <- input %>% mutate(local_timezone = timezone_parameters$SINGLE$TZCODE)
else if(timezone_parameters$TYPE == "MULTIPLE")
most_common_tz <- timezone_parameters$SINGLE$TZCODE
}
else if(timezone_parameters$TYPE == "MULTIPLE"){
output <- multiple_time_zone_assignment(input, timezone_parameters, device_type, pid, participant_file)
most_common_tz <- get_participant_most_common_tz(timezone_parameters$MULTIPLE$TZCODES_FILE, participant_file) # in assign_to_multiple_timezones.R
}
output <- create_mising_temporal_column(output, device_type)
output <- split_local_date_time(output)
output <- assign_to_time_segment(output, time_segments, time_segments_type, include_past_periodic_segments)
output <- assign_to_time_segment(output, time_segments, time_segments_type, include_past_periodic_segments, most_common_tz)
output <- filter_wanted_dates(output, participant_file, device_type)
output <- output %>% arrange(timestamp)