From 1025e6d9d8b0f3064f7afa09655161aa26468743 Mon Sep 17 00:00:00 2001
From: JulioV <JulioV@users.noreply.github.com>
Date: Tue, 6 Apr 2021 13:58:58 -0400
Subject: [PATCH] Fix datetime labels of event segments across multiple tzs

---
 docs/setup/configuration.md                   | 13 ++++++++++
 src/data/datetime/assign_to_event_segments.R  | 13 +++-------
 .../datetime/assign_to_multiple_timezones.R   | 25 +++++++++++++++++++
 src/data/datetime/assign_to_time_segment.R    |  4 +--
 src/data/datetime/readable_datetime.R         | 10 +++++---
 5 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/docs/setup/configuration.md b/docs/setup/configuration.md
index a00f1e55..b6532069 100644
--- a/docs/setup/configuration.md
+++ b/docs/setup/configuration.md
@@ -271,6 +271,19 @@ Time segments (or epochs) are the time windows on which you want to extract beha
         
         The three `mood` segments are 1 hour, 1 day and 7 days long and have no shift. In addition, these `mood` segments are grouped together, meaning that although RAPIDS will compute features on each one of them, some necessary information to compute a few of such features will be extracted from all three segments, for example the phone contact that called a participant the most or the location clusters visited by a participant.
 
+    ??? info "Date time labels of event segments"
+        In the final feature file, you will find a row per event segment. The `local_segment` column of each row has a `label`, a start date-time string, and an end date-time string.
+
+        ```bash
+        weeklysurvey2060#2020-09-12 01:00:00,2020-09-18 23:59:59
+        ```
+
+        All sensor data is always segmented based on timestamps; the date-time strings are attached for informational purposes only. For example, you can plot your features based on these strings.
+
+        When you configure RAPIDS to work with a single time zone, that time zone code is used to convert the start/end timestamps (the ones you typed in the event segments file) into start/end date-time strings. However, when you configure RAPIDS to work with multiple time zones, RAPIDS uses the most common time zone across all devices of each participant to do the conversion. The most common time zone is the one in which a participant spent the most time.
+
+        In practical terms, this means that event segments that happened in uncommon time zones will have shifted start/end date-time labels. However, the data within each segment is still filtered correctly because filtering is based on timestamps.
+
 ### Segment Examples
 
 === "5-minutes"
diff --git a/src/data/datetime/assign_to_event_segments.R b/src/data/datetime/assign_to_event_segments.R
index 3b94ab05..aab500d8 100644
--- a/src/data/datetime/assign_to_event_segments.R
+++ b/src/data/datetime/assign_to_event_segments.R
@@ -29,14 +29,9 @@ infer_event_segments <- function(tz, segments){
   return(inferred)
 }
 
-assign_to_event_segments <- function(sensor_data, time_segments){
+assign_to_event_segments <- function(sensor_data, time_segments, most_common_tz){ # most_common_tz: the single tz code handed to infer_event_segments
+  inferred_time_segments <- infer_event_segments(most_common_tz, time_segments) # inferred once for all rows, no longer once per local_timezone group
   sensor_data <- sensor_data %>% 
-    group_by(local_timezone) %>% 
-    nest() %>% 
-    mutate(inferred_time_segments = map(local_timezone, infer_event_segments, time_segments),
-           data = map2(data, inferred_time_segments, assign_rows_to_segments)) %>% 
-    select(-inferred_time_segments) %>% 
-    unnest(data) %>% 
-    arrange(timestamp) %>%
-    ungroup()
+    assign_rows_to_segments(inferred_time_segments) %>% 
+    arrange(timestamp) # the assigned rows end up ordered by timestamp
 }
\ No newline at end of file
diff --git a/src/data/datetime/assign_to_multiple_timezones.R b/src/data/datetime/assign_to_multiple_timezones.R
index 9263fac9..3e73ea92 100644
--- a/src/data/datetime/assign_to_multiple_timezones.R
+++ b/src/data/datetime/assign_to_multiple_timezones.R
@@ -79,6 +79,31 @@ infer_tz_codes_from_phones <- function(data_device_ids, tz_codes, participant_fi
   data_tz_codes
 }
 
+get_devices_ids <- function(participant_data){ # participant_data: nested list (the caller passes a parsed participant YAML)
+  devices_ids <- c() # stays NULL when no element has a DEVICE_IDS attribute
+  for(device in participant_data)
+    for(attribute in names(device))
+      if(attribute == "DEVICE_IDS")
+        devices_ids <- c(devices_ids, device[[attribute]]) # append the ids listed under this element
+  return(devices_ids) # runs once, after both loops have finished (it is not part of the for/if bodies)
+}
+
+get_participant_most_common_tz <- function(tz_codes_file, participant_file){
+  tz_codes <- read.csv(tz_codes_file, stringsAsFactors = FALSE) # keep tzcode as character on R < 4.0 too
+  participant_device_ids <- get_devices_ids(read_yaml(participant_file))
+  # Only the tz code rows that belong to this participant's devices are considered
+  participant_tz_codes <- tz_codes %>% filter(device_id %in% participant_device_ids)
+  most_common_tz <- buils_tz_intervals(participant_tz_codes, "all") %>%
+    mutate(duration = end_timestamp - timestamp) %>%
+    filter(duration == max(duration)) %>%
+    head(1) %>%
+    pull(tzcode)
+  # NOTE(review): this keeps the tz of the single longest interval, not the largest total time per tz - confirm intent
+  if(length(most_common_tz) == 0)
+    most_common_tz <- "UTC" # fallback when no interval is left for this participant
+  return(most_common_tz)
+}
+
 # TODO include CSV timezone file in rule
 multiple_time_zone_assignment <- function(sensor_data, timezone_parameters, device_type, pid, participant_file){
   if(nrow(sensor_data) == 0)
diff --git a/src/data/datetime/assign_to_time_segment.R b/src/data/datetime/assign_to_time_segment.R
index bf2eb551..4375cb73 100644
--- a/src/data/datetime/assign_to_time_segment.R
+++ b/src/data/datetime/assign_to_time_segment.R
@@ -16,7 +16,7 @@ assign_rows_to_segments <- function(data, segments){
   data
 }
 
-assign_to_time_segment <- function(sensor_data, time_segments, time_segments_type, include_past_periodic_segments){
+assign_to_time_segment <- function(sensor_data, time_segments, time_segments_type, include_past_periodic_segments, most_common_tz){
   
   if(nrow(sensor_data) == 0 || nrow(time_segments) == 0)
     return(sensor_data %>% mutate(assigned_segments = NA))
@@ -28,7 +28,7 @@ assign_to_time_segment <- function(sensor_data, time_segments, time_segments_typ
     
   } else if ( time_segments_type == "EVENT"){
     source("src/data/datetime/assign_to_event_segments.R")
-    sensor_data <- assign_to_event_segments(sensor_data, time_segments)
+    sensor_data <- assign_to_event_segments(sensor_data, time_segments, most_common_tz)
     return(sensor_data)
   }
 }
\ No newline at end of file
diff --git a/src/data/datetime/readable_datetime.R b/src/data/datetime/readable_datetime.R
index 4f2a289c..79fc19d2 100644
--- a/src/data/datetime/readable_datetime.R
+++ b/src/data/datetime/readable_datetime.R
@@ -111,14 +111,18 @@ readable_datetime <- function(){
 
   validate_user_timezones(timezone_parameters)
   
-  if(timezone_parameters$TYPE == "SINGLE")
+  if(timezone_parameters$TYPE == "SINGLE"){
     output <- input %>% mutate(local_timezone = timezone_parameters$SINGLE$TZCODE)
-  else if(timezone_parameters$TYPE == "MULTIPLE")
+    most_common_tz <- timezone_parameters$SINGLE$TZCODE
+  }
+  else if(timezone_parameters$TYPE == "MULTIPLE"){
     output <- multiple_time_zone_assignment(input, timezone_parameters, device_type, pid, participant_file)
+    most_common_tz <- get_participant_most_common_tz(timezone_parameters$MULTIPLE$TZCODES_FILE, participant_file) # in assign_to_multiple_timezones.R
+  }
 
   output <- create_mising_temporal_column(output, device_type)
   output <- split_local_date_time(output)
-  output <- assign_to_time_segment(output, time_segments, time_segments_type, include_past_periodic_segments)
+  output <- assign_to_time_segment(output, time_segments, time_segments_type, include_past_periodic_segments, most_common_tz)
   output <- filter_wanted_dates(output, participant_file, device_type)
   output <- output %>% arrange(timestamp)