Update day segment format

pull/103/head
JulioV 2020-09-28 11:38:47 -04:00
parent 0469f78210
commit 9e15f46fc3
3 changed files with 105 additions and 69 deletions


@@ -1,32 +1,35 @@
library("tidyverse")
library("lubridate")
options(scipen=999)
find_segments_frequency <- function(local_date, local_time, segments){
find_segments_frequency <- function(local_date, local_time, local_timezone, segments){
assigned_segments <- segments[segments$segment_start<= local_time & segments$segment_end >= local_time, ]
assigned_segments["segment_start_ts"] = as.numeric(lubridate::as_datetime(stringi::stri_c(local_date,assigned_segments$segment_id_start_time), tz = local_timezone)) * 1000
assigned_segments["segment_end_ts"] = as.numeric(lubridate::as_datetime(stringi::stri_c(local_date,assigned_segments$segment_id_end_time), tz = local_timezone)) * 1000 + 999
return(stringi::stri_c(stringi::stri_c("[",
assigned_segments[["label"]], "#",
local_date, "#",
assigned_segments[["segment_id_start_time"]], "#",
local_date, "#",
assigned_segments[["segment_id_end_time"]],
local_date, " ",
assigned_segments[["segment_id_start_time"]], ",",
local_date, " ",
assigned_segments[["segment_id_end_time"]], ";",
assigned_segments[["segment_start_ts"]], ",",
assigned_segments[["segment_end_ts"]],
"]"), collapse = "|"))
}
find_segments_periodic <- function(timestamp, segments){
# crossing and pivot_longer make segments a tibble, thus we need to extract [["segment_id"]]
return(stringi::stri_c(segments[[1]][segments[[1]]$segment_start_ts<= timestamp & segments[[1]]$segment_end_ts >= timestamp, "segment_id"][["segment_id"]], collapse = "|"))
}
# We might need to optimise the event function as well; filter and pull are slow
find_segments_event <- function(timestamp, segments){
return(stringi::stri_c(segments %>%
filter(segment_start <= timestamp & segment_end >= timestamp) %>%
pull(segment_id), collapse = "|"))
# segments is a data.frame, so we don't need to extract [["segment_id"]] as in find_segments_periodic
return(stringi::stri_c(segments[[1]][segments[[1]]$segment_start_ts<= timestamp & segments[[1]]$segment_end_ts >= timestamp, "segment_id"], collapse = "|"))
}
assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, include_past_periodic_segments){
if(nrow(sensor_data) == 0)
return(sensor_data %>% mutate(assigned_segments = NA))
if(day_segments_type == "FREQUENCY"){ #FREQUENCY
@@ -36,8 +39,9 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type,
segment_id_end_time = paste(str_pad(hour(ymd("1970-01-01") + end_time),2, pad="0"), str_pad(minute(ymd("1970-01-01") + end_time),2, pad="0"), str_pad(second(ymd("1970-01-01") + end_time),2, pad="0"),sep =":"), # add ymd("1970-01-01") to get a real time instead of duration
segment_start = as.numeric(start_time),
segment_end = as.numeric(end_time))
sensor_data <- sensor_data %>% mutate(local_time_obj = as.numeric(lubridate::hms(local_time)),
assigned_segments = map2_chr(local_date, local_time_obj, ~find_segments_frequency(.x, .y, day_segments))) %>% select(-local_time_obj)
assigned_segments = pmap_chr(list(local_date, local_time_obj, local_timezone), find_segments_frequency, day_segments)) %>% select(-local_time_obj)
} else if (day_segments_type == "PERIODIC"){ #PERIODIC
@@ -104,24 +108,21 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type,
filter(repeats_on == day_type & repeats_value == day_value) %>%
mutate(segment_id_start = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM")), # The segment ids (label#start#end) are computed in UTC to avoid having different labels for instances of a segment that happen in different timezones
segment_id_end = segment_id_start + lubridate::duration(length),
segment_start_ts = as.numeric(lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)), # The actual segments are computed using timestamps taking into account the timezone
segment_end_ts = segment_start_ts + as.numeric(lubridate::duration(length)),
segment_start_ts = as.numeric(lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)) * 1000, # The actual segments are computed using timestamps taking into account the timezone
segment_end_ts = segment_start_ts + as.numeric(lubridate::duration(length)) * 1000 + 999,
segment_id = paste0("[",
paste(sep= "#",
label,
lubridate::date(segment_id_start),
paste(str_pad(hour(segment_id_start),2, pad="0"),
str_pad(minute(segment_id_start),2, pad="0"),
str_pad(second(segment_id_start),2, pad="0"),sep =":"),
lubridate::date(segment_id_end),
paste(str_pad(hour(segment_id_end),2, pad="0"),
str_pad(minute(segment_id_end),2, pad="0"),
str_pad(second(segment_id_end),2, pad="0"),sep =":")
paste0(
label,"#",
paste0(lubridate::date(segment_id_start), " ",
paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",",
lubridate::date(segment_id_end), " ",
paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";",
paste0(segment_start_ts, ",", segment_end_ts)
),
"]")) %>%
select(segment_start_ts, segment_end_ts, segment_id)),
# loop through every day segment and assign it to the rows that fall within its start and end
data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = as.numeric(lubridate::ymd_hms(local_date_time, tz = local_timezone)),
"]")) %>%
select(segment_start_ts, segment_end_ts, segment_id) %>%
drop_na(segment_start_ts, segment_end_ts)), # drop day segments with an invalid start or end time (mostly due to daylight saving changes, e.g. 2020-03-08 02:00:00 EST does not exist because the clock skips from 1:59am to 3am)
data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = as.numeric(lubridate::ymd_hms(local_date_time, tz = local_timezone)) * 1000,
assigned_segments = map_chr(row_date_time, ~find_segments_periodic(.x, inferred_day_segments)),
row_date_time = NULL))
) %>%
@@ -132,28 +133,31 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type,
} else if ( day_segments_type == "EVENT"){
most_common_tz <- sensor_data %>% count(local_timezone) %>% slice(which.max(n)) %>% pull(local_timezone)
day_segments <- day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift),
segment_start = event_timestamp + (as.integer(seconds(lubridate::duration(shift))) * ifelse(shift_direction >= 0, 1, -1) * 1000),
segment_end = segment_start + (as.integer(seconds(lubridate::duration(length))) * 1000),
segment_start_datetime = lubridate::as_datetime(segment_start/1000, tz = most_common_tz), # these start and end datetime objects are for labeling only
segment_end_datetime = lubridate::as_datetime(segment_end/1000, tz = most_common_tz),
segment_id = paste0("[",
paste(sep= "#",
label,
lubridate::date(segment_start_datetime),
paste(str_pad(hour(segment_start_datetime),2, pad="0"),
str_pad(minute(segment_start_datetime),2, pad="0"),
str_pad(second(segment_start_datetime),2, pad="0"),sep =":"),
lubridate::date(segment_end_datetime),
paste(str_pad(hour(segment_end_datetime),2, pad="0"),
str_pad(minute(segment_end_datetime),2, pad="0"),
str_pad(second(segment_end_datetime),2, pad="0"),sep =":")
),
"]")) %>%
select(-segment_start_datetime, -segment_end_datetime)
sensor_data <- sensor_data %>% mutate(assigned_segments = map_chr(timestamp, ~find_segments_event(.x, day_segments)))
sensor_data <- sensor_data %>%
group_by(local_timezone) %>%
nest() %>%
mutate(inferred_day_segments = map(local_timezone, ~ day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift),
segment_start_ts = event_timestamp + (as.integer(seconds(lubridate::duration(shift))) * ifelse(shift_direction >= 0, 1, -1) * 1000),
segment_end_ts = segment_start_ts + (as.integer(seconds(lubridate::duration(length))) * 1000),
segment_id_start = lubridate::as_datetime(segment_start_ts/1000, tz = .x), # these start and end datetime objects are for labeling only
segment_id_end = lubridate::as_datetime(segment_end_ts/1000, tz = .x),
segment_end_ts = segment_end_ts + 999,
segment_id = paste0("[",
paste0(
label,"#",
paste0(lubridate::date(segment_id_start), " ",
paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",",
lubridate::date(segment_id_end), " ",
paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";",
paste0(segment_start_ts, ",", segment_end_ts)
),
"]")) %>%
select(-segment_id_start, -segment_id_end)),
data = map2(data, inferred_day_segments, ~ .x %>% mutate(assigned_segments = map_chr(timestamp, ~find_segments_event(.x, inferred_day_segments))))) %>%
select(-inferred_day_segments) %>%
unnest(data) %>%
arrange(timestamp)
}
return(sensor_data)
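For reference, the segment id format this commit introduces is [label#start_datetime,end_datetime;start_ts,end_ts]: the human-readable start and end are joined by a comma, the millisecond timestamps follow after a semicolon, and 999 ms is added to the end timestamp so the last second of the segment is fully inclusive. Matching segments are joined with "|" into assigned_segments, which is what the filter helpers in the next file parse. A minimal Python sketch of the format and the millisecond containment check (illustrative only, not part of this commit; make_segment_id and ts_in_segment are made-up names, and UTC is used here for simplicity while the commit computes the timestamps in each row's local timezone):

from datetime import datetime, timezone

def make_segment_id(label, start_dt, end_dt, tz=timezone.utc):
    # Build "[label#YYYY-MM-DD HH:MM:SS,YYYY-MM-DD HH:MM:SS;start_ts,end_ts]"
    start_ts = int(start_dt.replace(tzinfo=tz).timestamp()) * 1000      # segment start in ms
    end_ts = int(end_dt.replace(tzinfo=tz).timestamp()) * 1000 + 999    # last ms of the final second
    return "[{}#{:%Y-%m-%d %H:%M:%S},{:%Y-%m-%d %H:%M:%S};{},{}]".format(label, start_dt, end_dt, start_ts, end_ts)

def ts_in_segment(row_ts_ms, start_ts, end_ts):
    # Mirrors the segment_start_ts <= timestamp <= segment_end_ts comparison above
    return start_ts <= row_ts_ms <= end_ts

print(make_segment_id("morning", datetime(2020, 9, 28, 6, 0, 0), datetime(2020, 9, 28, 11, 59, 59)))
# [morning#2020-09-28 06:00:00,2020-09-28 11:59:59;1601272800000,1601294399999]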


@@ -3,14 +3,44 @@ library("stringr")
rapids_log_tag <- "RAPIDS:"
filter_data_by_segment <- function(data, day_segment){
# Filter the rows that belong to day_segment, and put the segment full name in a new column for grouping
date_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2}"
hour_regex = "[0-9]{2}:[0-9]{2}:[0-9]{2}"
data <- data %>%
filter(grepl(paste0("\\[", day_segment, "#"), assigned_segments)) %>%
mutate(local_segment = str_extract(assigned_segments, paste0("\\[", day_segment, "#", date_regex, "#", hour_regex, "#", date_regex, "#", hour_regex, "\\]")),
local_segment = str_sub(local_segment, 2, -2)) # get rid of first and last character([])
return(data)
# Filter the rows that belong to day_segment, and put the segment full name in a new column for grouping
datetime_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
timestamp_regex = "[0-9]{13}"
data <- data %>%
filter(grepl(paste0("\\[", day_segment, "#"), assigned_segments)) %>%
mutate(local_segment = str_extract(assigned_segments, paste0("\\[", day_segment, "#", datetime_regex, ",", datetime_regex, ";", timestamp_regex, ",", timestamp_regex, "\\]"))) %>%
extract(local_segment, into = c("local_segment", "timestamps_segment"), paste0("\\[(", day_segment, "#", datetime_regex, ",", datetime_regex, ");(", timestamp_regex, ",", timestamp_regex, ")\\]")) %>%
select(-assigned_segments)
return(data)
}
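To make the new regex above concrete, here is a rough Python equivalent (illustrative only, not part of this commit; the sample assigned_segments value is made up) that pulls local_segment and timestamps_segment out of a single assigned segment:

import re

day_segment = "morning"
datetime_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
timestamp_regex = r"[0-9]{13}"
# Group 1 is the human-readable part (local_segment), group 2 the millisecond timestamps (timestamps_segment)
pattern = r"\[({}#{},{});({},{})\]".format(day_segment, datetime_regex, datetime_regex, timestamp_regex, timestamp_regex)

assigned = "[morning#2020-09-28 06:00:00,2020-09-28 11:59:59;1601272800000,1601294399999]"
local_segment, timestamps_segment = re.search(pattern, assigned).groups()
print(local_segment)       # morning#2020-09-28 06:00:00,2020-09-28 11:59:59
print(timestamps_segment)  # 1601272800000,1601294399999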
chunk_episodes <- function(sensor_episodes){
columns_to_drop <- c("timestamp", "duration", "utc_date_time", "local_date_time", "local_date", "local_time", "local_hour", "local_minute", "segment_start", "segment_end", "timestamp_plus_duration")
chunked_episodes <- sensor_episodes %>% separate(col = local_segment,
into = c("local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"),
sep = "#",
remove = FALSE) %>%
unite(col = "segment_start", "local_start_date", "local_start_time", sep = " ",remove = TRUE) %>%
unite(col = "segment_end", "local_end_date", "local_end_time", sep = " ",remove = TRUE) %>%
mutate(local_segment_label = NULL,
timestamp_plus_duration = timestamp + (duration * 1000 * 60)) %>%
group_by(local_timezone) %>%
nest() %>%
mutate(
data = map(data, ~.x %>% mutate(segment_start = as.numeric(lubridate::ymd_hms(segment_start, tz = local_timezone)) * 1000,
segment_end = as.numeric(lubridate::ymd_hms(segment_end, tz = local_timezone)) * 1000)),
# We group by episode_id and those variables from the original episodes we want to keep once we summarise
data = map(data, ~.x %>% group_by_at(vars(c("episode_id", setdiff(colnames(.x), columns_to_drop) ))) %>%
summarize(chunked_start = max(first(timestamp), first(segment_start)),
chunked_end = min(last(timestamp_plus_duration), last(segment_end)),
duration = (chunked_end - chunked_start) / (1000 * 60 ),
chunked_start = format(lubridate::as_datetime(chunked_start / 1000, tz = local_timezone), "%Y-%m-%d %H:%M:%S"),
chunked_end = format(lubridate::as_datetime(chunked_end / 1000, tz = local_timezone), "%Y-%m-%d %H:%M:%S")))
) %>%
unnest(data)
return(chunked_episodes)
}
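The chunking boils down to clipping an episode to the segment it was assigned to and re-expressing the clipped duration in minutes. A toy example with made-up millisecond values:

# Episode: 05:55 to 06:55; segment: 06:00:00.000 to 11:59:59.999 (all in ms since epoch)
episode_start_ms = 1601272500000
episode_end_ms   = 1601276100000   # timestamp + duration
segment_start_ms = 1601272800000
segment_end_ms   = 1601294399999

chunked_start = max(episode_start_ms, segment_start_ms)   # clip the start to the segment
chunked_end   = min(episode_end_ms, segment_end_ms)       # clip the end to the segment
duration_minutes = (chunked_end - chunked_start) / (1000 * 60)
print(duration_minutes)   # 55.0 -> the 5 minutes before the segment started are dropped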
fetch_provider_features <- function(provider, provider_key, config_key, sensor_data_file, day_segments_file){
@@ -39,14 +69,14 @@ fetch_provider_features <- function(provider, provider_key, config_key, sensor_d
sensor_features <- merge(sensor_features, features, all = TRUE)
}
} else {
} else { # This is redundant; if COMPUTE is FALSE this script will never be executed
for(feature in provider[["FEATURES"]])
sensor_features[,feature] <- NA
}
sensor_features <- sensor_features %>% separate(col = local_segment,
into = c("local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"),
sep = "#",
remove = FALSE)
sensor_features <- sensor_features %>% extract(col = local_segment,
into = c("local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"),
"(.*)#(.*),(.*)",
remove = FALSE)
return(sensor_features)
}


@@ -1,10 +1,12 @@
rapids_log_tag = "RAPIDS:"
def filter_data_by_segment(data, day_segment):
date_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2}"
hour_regex = "[0-9]{2}:[0-9]{2}:[0-9]{2}"
segment_regex = "\[({}#{}#{}#{}#{})\]".format(day_segment, date_regex, hour_regex, date_regex, hour_regex)
datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
timestamps_regex = "[0-9]{13}"
segment_regex = "\[({}#{},{};{},{})\]".format(day_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)
data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True)
data = data.drop(columns=["assigned_segments"])
return(data.dropna(subset = ["local_segment"]))
def chunk_episodes(sensor_episodes):
@@ -80,9 +82,9 @@ def fetch_provider_features(provider, provider_key, config_key, sensor_data_file
for feature in provider["FEATURES"]:
sensor_features[feature] = None
segment_colums = pd.DataFrame()
split_segemnt_columns = sensor_features["local_segment"].str.split(pat="#", expand=True)
new_segment_columns = split_segemnt_columns if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"])
segment_colums[["local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"]] = new_segment_columns
split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
segment_colums[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
for i in range(segment_colums.shape[1]):
sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
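A side note on the new split pattern: because "(.*)#(.*),(.*)" contains capture groups, the split keeps the captured text plus an empty string before and after the full match, which is why the result has 5 columns and only columns 1 to 3 are kept. A quick illustration with plain re.split (sample value assumed; pandas' str.split falls back to regex splitting for a multi-character pattern, so it behaves the same way here):

import re

local_segment = "morning#2020-09-28 06:00:00,2020-09-28 11:59:59"
parts = re.split(r"(.*)#(.*),(.*)", local_segment)
print(parts)
# ['', 'morning', '2020-09-28 06:00:00', '2020-09-28 11:59:59', '']
# columns 1, 2 and 3 hold the label and the start/end datetimes; 0 and 4 are empty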