From 2fe5a0d8226d9e0b7d829fcc5e234b68bc2ba25f Mon Sep 17 00:00:00 2001 From: JulioV Date: Mon, 26 Oct 2020 15:10:32 -0400 Subject: [PATCH] Update R chunk_episodes --- src/features/utils/utils.R | 47 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/src/features/utils/utils.R b/src/features/utils/utils.R index bfc3cb71..3ff3caac 100644 --- a/src/features/utils/utils.R +++ b/src/features/utils/utils.R @@ -15,31 +15,32 @@ filter_data_by_segment <- function(data, day_segment){ } chunk_episodes <- function(sensor_episodes){ - columns_to_drop <- c("timestamp", "duration", "utc_date_time", "local_date_time", "local_date", "local_time", "local_hour", "local_minute", "segment_start", "segment_end", 'timestamp_plus_duration' ) - - chunked_episodes <- sensor_episodes %>% separate(col = local_segment, - into = c("local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"), - sep = "#", - remove = FALSE) %>% - unite(col = "segment_start", "local_start_date", "local_start_time", sep = " ",remove = TRUE) %>% - unite(col = "segment_end", "local_end_date", "local_end_time", sep = " ",remove = TRUE) %>% - mutate(local_segment_label = NULL, - timestamp_plus_duration = timestamp + (duration * 1000 * 60)) %>% + columns_to_drop <- c("^timestamp$", "utc_date_time", "local_date_time", "local_date", "local_time", "local_hour", "local_minute", "segment_start", "segment_end" ) + + chunked_episodes <- sensor_episodes %>% + separate(col = timestamps_segment, + into = c("segment_start_timestamp", "segment_end_timestamp"), + sep = ",", convert = TRUE, remove = TRUE) %>% group_by(local_timezone) %>% nest() %>% - mutate( - data = map(data, ~.x %>% mutate(segment_start = as.numeric(lubridate::ymd_hms(segment_start, tz = local_timezone)) * 1000, - segment_end = as.numeric(lubridate::ymd_hms(segment_end, tz = local_timezone)) * 1000)), - # We group by episode_id and those variables from the original episodes we want to keep once we summarise - data = map(data, ~.x %>% group_by_at(vars(c("episode_id", setdiff(colnames(.x), columns_to_drop) ))) %>% - summarize(chunked_start = max(first(timestamp), first(segment_start)), - chunked_end = min(last(timestamp_plus_duration), last(segment_end)), - duration = (chunked_end - chunked_start) / (1000 * 60 ), - chunked_start = format(lubridate::as_datetime(chunked_start / 1000, tz = local_timezone), "%Y-%m-%d %H:%M:%S"), - chunked_end = format(lubridate::as_datetime(chunked_end / 1000, tz = local_timezone), "%Y-%m-%d %H:%M:%S"))) - ) %>% - unnest(data) - + mutate(data = map(data, ~.x %>% + distinct(start_timestamp, end_timestamp, local_segment, .keep_all = TRUE) %>% + mutate(start_timestamp = pmax(start_timestamp, segment_start_timestamp), + end_timestamp = pmin(end_timestamp, segment_end_timestamp), + duration = (end_timestamp - start_timestamp) / (1000 * 60)) %>% + select(-matches(columns_to_drop)) %>% + group_by_at(vars(setdiff(colnames(.), c("start_timestamp", "end_timestamp", "duration")))) %>% + summarize(start_timestamp = first(start_timestamp), + end_timestamp = last(end_timestamp), + duration = sum(duration)) %>% + mutate(local_start_date_time = format(lubridate::as_datetime(start_timestamp / 1000, tz = local_timezone), "%Y-%m-%d %H:%M:%S"), + local_end_date_time = format(lubridate::as_datetime(end_timestamp / 1000, tz = local_timezone), "%Y-%m-%d %H:%M:%S")) %>% + ungroup()) + ) %>% + unnest(data) %>% + ungroup() %>% + select(-local_timezone) + return(chunked_episodes) }