Update day segment format
parent
0469f78210
commit
9e15f46fc3
|
@ -1,32 +1,35 @@
|
|||
library("tidyverse")
|
||||
library("lubridate")
|
||||
options(scipen=999)
|
||||
|
||||
# Build the assigned-segments label for one row of a FREQUENCY segmentation.
#
# Arguments:
#   local_date      the row's local date (presumably a "YYYY-MM-DD" string; it is
#                   pasted directly in front of the segment times)
#   local_time      the row's local time of day, as a number on the same scale as
#                   segments$segment_start / segments$segment_end
#   local_timezone  time zone used to convert each segment's local start/end into
#                   epoch timestamps
#   segments        data frame with columns label, segment_start, segment_end,
#                   segment_id_start_time and segment_id_end_time
#
# Returns a single string with one entry per matching segment, joined by "|":
#   [label#date start_time,date end_time;start_ts,end_ts]
# where start_ts/end_ts are epoch milliseconds and end_ts includes the last
# millisecond (+999) of the closing second.
find_segments_frequency <- function(local_date, local_time, local_timezone, segments){
  # Segments whose [start, end] interval (both ends inclusive) contains this row
  assigned_segments <- segments[segments$segment_start <= local_time & segments$segment_end >= local_time, ]

  # Epoch milliseconds of the segment boundaries on the row's date and time zone
  assigned_segments["segment_start_ts"] <- as.numeric(lubridate::as_datetime(stringi::stri_c(local_date, assigned_segments$segment_id_start_time), tz = local_timezone)) * 1000
  assigned_segments["segment_end_ts"] <- as.numeric(lubridate::as_datetime(stringi::stri_c(local_date, assigned_segments$segment_id_end_time), tz = local_timezone)) * 1000 + 999

  segment_ids <- stringi::stri_c("[",
                                 assigned_segments[["label"]], "#",
                                 local_date, " ",
                                 assigned_segments[["segment_id_start_time"]], ",",
                                 local_date, " ",
                                 assigned_segments[["segment_id_end_time"]], ";",
                                 assigned_segments[["segment_start_ts"]], ",",
                                 assigned_segments[["segment_end_ts"]],
                                 "]")
  return(stringi::stri_c(segment_ids, collapse = "|"))
}
|
||||
|
||||
# Return the ids of every periodic segment that contains `timestamp`, joined by "|".
#
# `segments` is a one-element list wrapping the table of inferred segments
# (columns segment_start_ts, segment_end_ts, segment_id). Both interval ends
# are inclusive.
find_segments_periodic <- function(timestamp, segments){
  inferred_segments <- segments[[1]]
  contains_timestamp <- inferred_segments$segment_start_ts <= timestamp &
    inferred_segments$segment_end_ts >= timestamp
  # crossing and pivot_longer turn the table into a tibble, so the row subset is
  # still a table and the id column has to be pulled out with [["segment_id"]]
  matching_segments <- inferred_segments[contains_timestamp, "segment_id"]
  return(stringi::stri_c(matching_segments[["segment_id"]], collapse = "|"))
}
|
||||
|
||||
# We might need to optimise the event function as well, filter, and pull are slow
|
||||
# Return the ids of every event segment that contains `timestamp`, joined by "|".
#
# `segments` is a one-element list wrapping the table of event segments
# (columns segment_start_ts, segment_end_ts, segment_id). Both interval ends
# are inclusive.
find_segments_event <- function(timestamp, segments){
  event_segments <- segments[[1]]
  contains_timestamp <- event_segments$segment_start_ts <= timestamp &
    event_segments$segment_end_ts >= timestamp
  # Subset the id column itself so the result is a character vector whether the
  # table is a plain data.frame or a tibble
  return(stringi::stri_c(event_segments[["segment_id"]][contains_timestamp], collapse = "|"))
}
|
||||
|
||||
assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, include_past_periodic_segments){
|
||||
if(nrow(sensor_data) == 0)
|
||||
return(sensor_data %>% mutate(assigned_segments = NA))
|
||||
|
||||
if(day_segments_type == "FREQUENCY"){ #FREQUENCY
|
||||
|
||||
|
@ -36,8 +39,9 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type,
|
|||
segment_id_end_time = paste(str_pad(hour(ymd("1970-01-01") + end_time),2, pad="0"), str_pad(minute(ymd("1970-01-01") + end_time),2, pad="0"), str_pad(second(ymd("1970-01-01") + end_time),2, pad="0"),sep =":"), # add ymd("1970-01-01") to get a real time instead of duration
|
||||
segment_start = as.numeric(start_time),
|
||||
segment_end = as.numeric(end_time))
|
||||
|
||||
sensor_data <- sensor_data %>% mutate(local_time_obj = as.numeric(lubridate::hms(local_time)),
|
||||
assigned_segments = map2_chr(local_date, local_time_obj, ~find_segments_frequency(.x, .y, day_segments))) %>% select(-local_time_obj)
|
||||
assigned_segments = pmap_chr(list(local_date, local_time_obj, local_timezone), find_segments_frequency, day_segments)) %>% select(-local_time_obj)
|
||||
|
||||
} else if (day_segments_type == "PERIODIC"){ #PERIODIC
|
||||
|
||||
|
@ -104,24 +108,21 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type,
|
|||
filter(repeats_on == day_type & repeats_value == day_value) %>%
|
||||
mutate(segment_id_start = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM")), # The segment ids (label#start#end) are computed in UTC to avoid having different labels for instances of a segment that happen in different timezones
|
||||
segment_id_end = segment_id_start + lubridate::duration(length),
|
||||
segment_start_ts = as.numeric(lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)), # The actual segments are computed using timestamps taking into account the timezone
|
||||
segment_end_ts = segment_start_ts + as.numeric(lubridate::duration(length)),
|
||||
segment_start_ts = as.numeric(lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)) * 1000, # The actual segments are computed using timestamps taking into account the timezone
|
||||
segment_end_ts = segment_start_ts + as.numeric(lubridate::duration(length)) * 1000 + 999,
|
||||
segment_id = paste0("[",
|
||||
paste(sep= "#",
|
||||
label,
|
||||
lubridate::date(segment_id_start),
|
||||
paste(str_pad(hour(segment_id_start),2, pad="0"),
|
||||
str_pad(minute(segment_id_start),2, pad="0"),
|
||||
str_pad(second(segment_id_start),2, pad="0"),sep =":"),
|
||||
lubridate::date(segment_id_end),
|
||||
paste(str_pad(hour(segment_id_end),2, pad="0"),
|
||||
str_pad(minute(segment_id_end),2, pad="0"),
|
||||
str_pad(second(segment_id_end),2, pad="0"),sep =":")
|
||||
paste0(
|
||||
label,"#",
|
||||
paste0(lubridate::date(segment_id_start), " ",
|
||||
paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",",
|
||||
lubridate::date(segment_id_end), " ",
|
||||
paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";",
|
||||
paste0(segment_start_ts, ",", segment_end_ts)
|
||||
),
|
||||
"]")) %>%
|
||||
select(segment_start_ts, segment_end_ts, segment_id)),
|
||||
# loop through every day segment and assign it to the rows that fall within its start and end
|
||||
data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = as.numeric(lubridate::ymd_hms(local_date_time, tz = local_timezone)),
|
||||
"]")) %>%
|
||||
select(segment_start_ts, segment_end_ts, segment_id) %>%
|
||||
drop_na(segment_start_ts, segment_end_ts)), # drop day segments with an invalid start or end time (mostly due to daylight saving changes, e.g. 2020-03-08 02:00:00 EST does not exist, clock jumps from 1am to 3am)
|
||||
data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = as.numeric(lubridate::ymd_hms(local_date_time, tz = local_timezone)) * 1000,
|
||||
assigned_segments = map_chr(row_date_time, ~find_segments_periodic(.x, inferred_day_segments)),
|
||||
row_date_time = NULL))
|
||||
) %>%
|
||||
|
@ -132,28 +133,31 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type,
|
|||
|
||||
} else if ( day_segments_type == "EVENT"){
|
||||
|
||||
most_common_tz <- sensor_data %>% count(local_timezone) %>% slice(which.max(n)) %>% pull(local_timezone)
|
||||
day_segments <- day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift),
|
||||
segment_start = event_timestamp + (as.integer(seconds(lubridate::duration(shift))) * ifelse(shift_direction >= 0, 1, -1) * 1000),
|
||||
segment_end = segment_start + (as.integer(seconds(lubridate::duration(length))) * 1000),
|
||||
segment_start_datetime = lubridate::as_datetime(segment_start/1000, tz = most_common_tz), # these start and end datetime objects are for labeling only
|
||||
segment_end_datetime = lubridate::as_datetime(segment_end/1000, tz = most_common_tz),
|
||||
segment_id = paste0("[",
|
||||
paste(sep= "#",
|
||||
label,
|
||||
lubridate::date(segment_start_datetime),
|
||||
paste(str_pad(hour(segment_start_datetime),2, pad="0"),
|
||||
str_pad(minute(segment_start_datetime),2, pad="0"),
|
||||
str_pad(second(segment_start_datetime),2, pad="0"),sep =":"),
|
||||
lubridate::date(segment_end_datetime),
|
||||
paste(str_pad(hour(segment_end_datetime),2, pad="0"),
|
||||
str_pad(minute(segment_end_datetime),2, pad="0"),
|
||||
str_pad(second(segment_end_datetime),2, pad="0"),sep =":")
|
||||
),
|
||||
"]")) %>%
|
||||
select(-segment_start_datetime, -segment_end_datetime)
|
||||
|
||||
sensor_data <- sensor_data %>% mutate(assigned_segments = map_chr(timestamp, ~find_segments_event(.x, day_segments)))
|
||||
sensor_data <- sensor_data %>%
|
||||
group_by(local_timezone) %>%
|
||||
nest() %>%
|
||||
mutate(inferred_day_segments = map(local_timezone, ~ day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift),
|
||||
segment_start_ts = event_timestamp + (as.integer(seconds(lubridate::duration(shift))) * ifelse(shift_direction >= 0, 1, -1) * 1000),
|
||||
segment_end_ts = segment_start_ts + (as.integer(seconds(lubridate::duration(length))) * 1000),
|
||||
segment_id_start = lubridate::as_datetime(segment_start_ts/1000, tz = .x), # these start and end datetime objects are for labeling only
|
||||
segment_id_end = lubridate::as_datetime(segment_end_ts/1000, tz = .x),
|
||||
segment_end_ts = segment_end_ts + 999,
|
||||
segment_id = paste0("[",
|
||||
paste0(
|
||||
label,"#",
|
||||
paste0(lubridate::date(segment_id_start), " ",
|
||||
paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",",
|
||||
lubridate::date(segment_id_end), " ",
|
||||
paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";",
|
||||
paste0(segment_start_ts, ",", segment_end_ts)
|
||||
),
|
||||
"]")) %>%
|
||||
select(-segment_id_start, -segment_id_end)),
|
||||
data = map2(data, inferred_day_segments, ~ .x %>% mutate(assigned_segments = map_chr(timestamp, ~find_segments_event(.x, inferred_day_segments))))) %>%
|
||||
select(-inferred_day_segments) %>%
|
||||
unnest(data) %>%
|
||||
arrange(timestamp)
|
||||
|
||||
}
|
||||
|
||||
return(sensor_data)
|
||||
|
|
|
@ -3,14 +3,44 @@ library("stringr")
|
|||
rapids_log_tag <- "RAPIDS:"
|
||||
|
||||
# Keep the rows assigned to `day_segment` and split the segment id into columns.
#
# Arguments:
#   data         table with an `assigned_segments` column holding "|"-joined ids
#                of the form [label#start_datetime,end_datetime;start_ts,end_ts]
#   day_segment  segment label to keep (interpolated into the regex as-is)
#
# Returns `data` restricted to rows whose `assigned_segments` mentions
# `day_segment`, with two new columns and without `assigned_segments`:
#   local_segment       "label#start_datetime,end_datetime" (used for grouping)
#   timestamps_segment  "start_ts,end_ts" (13-digit epoch milliseconds)
filter_data_by_segment <- function(data, day_segment){
  datetime_regex <- "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
  timestamp_regex <- "[0-9]{13}"
  # One pattern serves both steps: str_extract ignores the capture groups and
  # returns the whole bracketed id, extract() then splits it on those groups
  segment_regex <- paste0("\\[(", day_segment, "#", datetime_regex, ",", datetime_regex, ");(", timestamp_regex, ",", timestamp_regex, ")\\]")

  data <- data %>%
    filter(grepl(paste0("\\[", day_segment, "#"), assigned_segments)) %>%
    mutate(local_segment = str_extract(assigned_segments, segment_regex)) %>%
    # tidyr:: is spelled out because magrittr also exports an extract()
    tidyr::extract(local_segment, into = c("local_segment", "timestamps_segment"), regex = segment_regex) %>%
    select(-assigned_segments)
  return(data)
}
|
||||
|
||||
# Clip every episode to the boundaries of the segment it was assigned to.
#
# Arguments:
#   sensor_episodes  table with (at least) episode_id, timestamp (epoch ms),
#                    duration (minutes), local_timezone and local_segment, where
#                    local_segment is "label#start_datetime,end_datetime"
#
# Returns one row per episode_id and remaining (non-dropped) column combination,
# with:
#   chunked_start / chunked_end  the episode's start/end clipped to the segment,
#                                formatted "%Y-%m-%d %H:%M:%S" in local_timezone
#   duration                     clipped length in minutes
chunk_episodes <- function(sensor_episodes){
  columns_to_drop <- c("timestamp", "duration", "utc_date_time", "local_date_time", "local_date", "local_time", "local_hour", "local_minute", "segment_start", "segment_end", "timestamp_plus_duration")

  chunked_episodes <- sensor_episodes %>%
    # local_segment is "label#start_datetime,end_datetime"; pull the two
    # datetimes out directly (same pattern fetch_provider_features uses)
    tidyr::extract(col = local_segment,
                   into = c("local_segment_label", "segment_start", "segment_end"),
                   regex = "(.*)#(.*),(.*)",
                   remove = FALSE) %>%
    mutate(local_segment_label = NULL,
           timestamp_plus_duration = timestamp + (duration * 1000 * 60)) %>% # duration is in minutes, timestamps in ms
    group_by(local_timezone) %>%
    nest() %>%
    mutate(
      # Segment boundaries are local datetimes, so convert them per time zone
      data = map(data, ~.x %>% mutate(segment_start = as.numeric(lubridate::ymd_hms(segment_start, tz = local_timezone)) * 1000,
                                      segment_end = as.numeric(lubridate::ymd_hms(segment_end, tz = local_timezone)) * 1000)),
      # We group by episode_id and those variables from the original episodes we want to keep once we summarise
      data = map(data, ~.x %>% group_by_at(vars(c("episode_id", setdiff(colnames(.x), columns_to_drop) ))) %>%
                   summarize(chunked_start = max(first(timestamp), first(segment_start)),
                             chunked_end = min(last(timestamp_plus_duration), last(segment_end)),
                             duration = (chunked_end - chunked_start) / (1000 * 60 ),
                             chunked_start = format(lubridate::as_datetime(chunked_start / 1000, tz = local_timezone), "%Y-%m-%d %H:%M:%S"),
                             chunked_end = format(lubridate::as_datetime(chunked_end / 1000, tz = local_timezone), "%Y-%m-%d %H:%M:%S")))
    ) %>%
    unnest(data)

  return(chunked_episodes)
}
|
||||
|
||||
fetch_provider_features <- function(provider, provider_key, config_key, sensor_data_file, day_segments_file){
|
||||
|
@ -39,14 +69,14 @@ fetch_provider_features <- function(provider, provider_key, config_key, sensor_d
|
|||
|
||||
sensor_features <- merge(sensor_features, features, all = TRUE)
|
||||
}
|
||||
} else {
|
||||
} else { # This is redundant: if COMPUTE is FALSE this script will never be executed
|
||||
for(feature in provider[["FEATURES"]])
|
||||
sensor_features[,feature] <- NA
|
||||
}
|
||||
|
||||
sensor_features <- sensor_features %>% separate(col = local_segment,
|
||||
into = c("local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"),
|
||||
sep = "#",
|
||||
remove = FALSE)
|
||||
sensor_features <- sensor_features %>% extract(col = local_segment,
|
||||
into = c("local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"),
|
||||
"(.*)#(.*),(.*)",
|
||||
remove = FALSE)
|
||||
return(sensor_features)
|
||||
}
|
|
@ -1,10 +1,12 @@
|
|||
rapids_log_tag = "RAPIDS:"
|
||||
|
||||
def filter_data_by_segment(data, day_segment):
    """Keep the rows assigned to `day_segment` and split its id into two columns.

    Args:
        data: DataFrame with an ``assigned_segments`` string column whose entries
            look like ``[label#start_datetime,end_datetime;start_ts,end_ts]``.
        day_segment: segment label to keep; interpolated into the regex as-is.

    Returns:
        A new DataFrame without ``assigned_segments`` and with two added columns,
        ``local_segment`` (``label#start_datetime,end_datetime``) and
        ``timestamps_segment`` (``start_ts,end_ts`` as 13-digit epoch
        milliseconds). Rows that do not belong to ``day_segment`` are dropped.
        The caller's frame is left unmodified.
    """
    # Raw strings: "\-", "\/" and "\[" are regex escapes, not Python escapes
    datetime_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
    timestamps_regex = r"[0-9]{13}"
    segment_regex = r"\[({}#{},{});({},{})\]".format(day_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)

    # Two capture groups always yield two columns, even when no row matches
    # (splitting an all-NaN column on ";" yields a single column instead)
    extracted = data["assigned_segments"].str.extract(segment_regex, expand=True)

    data = data.drop(columns=["assigned_segments"])
    data["local_segment"] = extracted[0].to_numpy()
    data["timestamps_segment"] = extracted[1].to_numpy()
    return data.dropna(subset=["local_segment"])
|
||||
|
||||
def chunk_episodes(sensor_episodes):
|
||||
|
@ -80,9 +82,9 @@ def fetch_provider_features(provider, provider_key, config_key, sensor_data_file
|
|||
for feature in provider["FEATURES"]:
|
||||
sensor_features[feature] = None
|
||||
segment_colums = pd.DataFrame()
|
||||
split_segemnt_columns = sensor_features["local_segment"].str.split(pat="#", expand=True)
|
||||
new_segment_columns = split_segemnt_columns if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"])
|
||||
segment_colums[["local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"]] = new_segment_columns
|
||||
split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
|
||||
new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
|
||||
segment_colums[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
|
||||
for i in range(segment_colums.shape[1]):
|
||||
sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
|
||||
|
||||
|
|
Loading…
Reference in New Issue