Optimise assign to day segment

pull/103/head
JulioV 2020-09-16 14:54:20 -04:00
parent c577229f18
commit 4dc8d38c66
5 changed files with 90 additions and 46 deletions

View File

@ -6,6 +6,7 @@ PIDS: [test01]
DAY_SEGMENTS: &day_segments DAY_SEGMENTS: &day_segments
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
FILE: "data/external/daysegments_periodic.csv" FILE: "data/external/daysegments_periodic.csv"
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, if set to TRUE we consider day segments back enough in the past as to include the first day of data
# Global timezone # Global timezone
# Use codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones # Use codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones

View File

@ -1,6 +1,6 @@
label,start_time,length,repeats_on,repeats_value label,start_time,length,repeats_on,repeats_value
daily,00:00:00,23H 59M 59S,every_day,0 daily,00:00:00,23H 59M 59S,every_day,0
weekly,00:00:00,6D 23H 59M 59S,mday,31 weekly,00:00:00,6D 23H 59M 59S,wday,1
morning,06:00:00,5H 59M 59S,every_day,0 morning,06:00:00,5H 59M 59S,every_day,0
afternoon,12:00:00,5H 59M 59S,every_day,0 afternoon,12:00:00,5H 59M 59S,every_day,0
evening,18:00:00,5H 59M 59S,wday,2 evening,18:00:00,5H 59M 59S,wday,2

1 label start_time length repeats_on repeats_value
2 daily 00:00:00 23H 59M 59S every_day 0
3 weekly 00:00:00 6D 23H 59M 59S mday wday 31 1
4 morning 06:00:00 5H 59M 59S every_day 0
5 afternoon 12:00:00 5H 59M 59S every_day 0
6 evening 18:00:00 5H 59M 59S wday 2

View File

@ -67,7 +67,8 @@ rule readable_datetime:
params: params:
timezones = None, timezones = None,
fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"], fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
day_segments_type = config["DAY_SEGMENTS"]["TYPE"] day_segments_type = config["DAY_SEGMENTS"]["TYPE"],
include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
wildcard_constraints: wildcard_constraints:
sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # only process smartphone sensors, not fitbit sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # only process smartphone sensors, not fitbit
output: output:

View File

@ -5,29 +5,28 @@ find_segments_frequency <- function(local_date, local_time_obj, segments){
return(paste(segments %>% return(paste(segments %>%
filter(local_time_obj >= segment_start & local_time_obj <= segment_end) %>% filter(local_time_obj >= segment_start & local_time_obj <= segment_end) %>%
mutate(segment_id = paste0("[", mutate(segment_id = paste0("[",
label, "#", label, "#",
local_date, "#", local_date, "#",
paste(str_pad(hour(segment_start),2, pad="0"), str_pad(minute(segment_start),2, pad="0"), str_pad(second(segment_start),2, pad="0"),sep =":"), "#", paste(str_pad(hour(segment_start),2, pad="0"), str_pad(minute(segment_start),2, pad="0"), str_pad(second(segment_start),2, pad="0"),sep =":"), "#",
local_date, "#", local_date, "#",
paste(str_pad(hour(segment_end),2, pad="0"), str_pad(minute(segment_end),2, pad="0"), str_pad(second(segment_end),2, pad="0"),sep =":"), paste(str_pad(hour(segment_end),2, pad="0"), str_pad(minute(segment_end),2, pad="0"), str_pad(second(segment_end),2, pad="0"),sep =":"),
"]")) %>% "]")) %>%
pull(segment_id), collapse = "|")) pull(segment_id), collapse = "|"))
} }
find_segments_periodic <- function(date_time, segments){ find_segments_periodic <- function(timestamp, segments){
return(stringi::stri_c(segments[[1]] %>% # We might need to optimise the frequency and event functions as well
filter(date_time %within% segment_interval) %>% return(stringi::stri_c(segments[[1]][segments[[1]]$segment_start_ts<= timestamp & segments[[1]]$segment_end_ts >= timestamp, "segment_id"][["segment_id"]], collapse = "|"))
pull(segment_id), collapse = "|"))
} }
find_segments_event <- function(timestamp, segments){ find_segments_event <- function(timestamp, segments){
return(stringi::stri_c(segments %>% return(stringi::stri_c(segments %>%
filter(timestamp >= segment_start & timestamp <= segment_end) %>% filter(timestamp >= segment_start & timestamp <= segment_end) %>%
pull(segment_id), collapse = "|")) pull(segment_id), collapse = "|"))
} }
assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type){ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, include_past_periodic_segments){
if(day_segments_type == "FREQUENCY"){ #FREQUENCY if(day_segments_type == "FREQUENCY"){ #FREQUENCY
day_segments <- day_segments %>% mutate(segment_start = lubridate::parse_date_time(start_time, orders = c("HMS", "HM")), day_segments <- day_segments %>% mutate(segment_start = lubridate::parse_date_time(start_time, orders = c("HMS", "HM")),
@ -37,20 +36,62 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type){
} else if (day_segments_type == "PERIODIC"){ #PERIODIC } else if (day_segments_type == "PERIODIC"){ #PERIODIC
# We need to take into account segment start dates that could include the first day of data
day_segments <- day_segments %>% mutate(length_duration = duration(length))
wday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "wday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
wday_delay <- if_else(is.na(wday_delay) | include_past_periodic_segments == FALSE, duration("0days"), wday_delay)
mday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "mday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
mday_delay <- if_else(is.na(mday_delay) | include_past_periodic_segments == FALSE, duration("0days"), mday_delay)
qday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "qday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
qday_delay <- if_else(is.na(qday_delay) | include_past_periodic_segments == FALSE, duration("0days"), qday_delay)
yday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "yday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
yday_delay <- if_else(is.na(yday_delay) | include_past_periodic_segments == FALSE, duration("0days"), yday_delay)
sensor_data <- sensor_data %>% sensor_data <- sensor_data %>%
mutate(row_n = row_number()) %>% # mutate(row_n = row_number()) %>%
group_by(local_timezone) %>% group_by(local_timezone) %>%
nest() %>% nest() %>%
# get existent days that we need to start segments from # get existent days that we need to start segments from
mutate(existent_dates = map(data, ~.x %>% mutate(every_date = map(data, ~.x %>%
distinct(local_date) %>% distinct(local_date) %>%
mutate(local_date_obj = lubridate::ymd(local_date, tz = local_timezone), mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
every_day = 0, complete(local_date_obj = seq(min(local_date_obj), max(local_date_obj), by="days")) %>%
wday = wday(local_date_obj, week_start = 1), mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
mday = mday(local_date_obj), mutate(every_day = 0)),
qday = qday(local_date_obj), week_dates = map(data, ~.x %>%
yday = yday(local_date_obj) distinct(local_date) %>%
) ), mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
complete(local_date_obj = seq(date(min(local_date_obj) - wday_delay), max(local_date_obj), by="days")) %>%
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
mutate(wday = wday(local_date_obj, week_start = 1)) ),
month_dates = map(data, ~.x %>%
distinct(local_date) %>%
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
complete(local_date_obj = seq(date(min(local_date_obj) - mday_delay), max(local_date_obj), by="days")) %>%
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
mutate(mday = mday(local_date_obj))),
quarter_dates = map(data, ~.x %>%
distinct(local_date) %>%
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
complete(local_date_obj = seq(date(min(local_date_obj) - qday_delay), max(local_date_obj), by="days")) %>%
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
mutate(qday = qday(local_date_obj)) ),
year_dates = map(data, ~.x %>%
distinct(local_date) %>%
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
complete(local_date_obj = seq(date(min(local_date_obj) - yday_delay), max(local_date_obj), by="days")) %>%
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
mutate(yday = yday(local_date_obj)) ),
existent_dates = pmap(list(every_date, week_dates, month_dates, quarter_dates, year_dates),
function(every_date, week_dates, month_dates, quarter_dates, year_dates) reduce(list(every_date, week_dates,month_dates, quarter_dates, year_dates), .f=full_join)),
every_date = NULL,
week_dates = NULL,
month_dates = NULL,
quarter_dates = NULL,
year_dates = NULL,
# build the actual day segments taking into account the users requested leangth and repeat schedule # build the actual day segments taking into account the users requested leangth and repeat schedule
inferred_day_segments = map(existent_dates, inferred_day_segments = map(existent_dates,
~ crossing(day_segments, .x) %>% ~ crossing(day_segments, .x) %>%
@ -58,31 +99,31 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type){
filter(repeats_on == day_type & repeats_value == day_value) %>% filter(repeats_on == day_type & repeats_value == day_value) %>%
mutate(segment_start = (lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)), mutate(segment_start = (lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)),
segment_end = segment_start + lubridate::period(length), segment_end = segment_start + lubridate::period(length),
segment_interval = lubridate::interval(segment_start, segment_end), segment_start_ts = as.numeric(segment_start),
segment_end_ts = as.numeric(segment_end),
segment_id = paste0("[", segment_id = paste0("[",
paste(sep= "#", paste(sep= "#",
label, label,
lubridate::date(int_start(segment_interval)), lubridate::date(segment_start),
paste(str_pad(hour(int_start(segment_interval)),2, pad="0"), paste(str_pad(hour(segment_start),2, pad="0"),
str_pad(minute(int_start(segment_interval)),2, pad="0"), str_pad(minute(segment_start),2, pad="0"),
str_pad(second(int_start(segment_interval)),2, pad="0"),sep =":"), str_pad(second(segment_start),2, pad="0"),sep =":"),
lubridate::date(int_end(segment_interval)), lubridate::date(segment_end),
paste(str_pad(hour(int_end(segment_interval)),2, pad="0"), paste(str_pad(hour(segment_end),2, pad="0"),
str_pad(minute(int_end(segment_interval)),2, pad="0"), str_pad(minute(segment_end),2, pad="0"),
str_pad(second(int_end(segment_interval)),2, pad="0"),sep =":") str_pad(second(segment_end),2, pad="0"),sep =":")
), ),
"]")) %>% "]")) %>%
select(segment_interval, label, segment_id)), select(segment_start_ts, segment_end_ts, segment_id)),
# loop thorugh every day segment and assigned it to the rows that fall within its start and end # loop thorugh every day segment and assigned it to the rows that fall within its start and end
data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = lubridate::ymd_hms(local_date_time, tz = local_timezone), data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = as.numeric(lubridate::ymd_hms(local_date_time, tz = local_timezone)),
assigned_segments = map_chr(row_date_time, ~find_segments_periodic(.x, inferred_day_segments))) %>% assigned_segments = map_chr(row_date_time, ~find_segments_periodic(.x, inferred_day_segments)),
select(-row_date_time)) row_date_time = NULL))
) %>% ) %>%
select(-existent_dates, -inferred_day_segments) %>% select(-existent_dates, -inferred_day_segments) %>%
unnest(cols = data) %>% unnest(cols = data) %>%
arrange(row_n) %>% arrange(timestamp)
select(-row_n)
} else if ( day_segments_type == "EVENT"){ } else if ( day_segments_type == "EVENT"){

View File

@ -10,6 +10,7 @@ day_segments_type <- snakemake@params[["day_segments_type"]]
sensor_output <- snakemake@output[[1]] sensor_output <- snakemake@output[[1]]
timezone_periods <- snakemake@params[["timezone_periods"]] timezone_periods <- snakemake@params[["timezone_periods"]]
fixed_timezone <- snakemake@params[["fixed_timezone"]] fixed_timezone <- snakemake@params[["fixed_timezone"]]
include_past_periodic_segments <- snakemake@params[["include_past_periodic_segments"]]
split_local_date_time <- function(data, day_segments){ split_local_date_time <- function(data, day_segments){
split_data <- data %>% split_data <- data %>%
@ -43,6 +44,6 @@ if(!is.null(timezone_periods)){
local_timezone = fixed_timezone, local_timezone = fixed_timezone,
local_date_time = format(utc_date_time, tz = fixed_timezone, "%Y-%m-%d %H:%M:%S")) local_date_time = format(utc_date_time, tz = fixed_timezone, "%Y-%m-%d %H:%M:%S"))
output <- split_local_date_time(output, day_segments) output <- split_local_date_time(output, day_segments)
output <- assign_to_day_segment(output, day_segments, day_segments_type) output <- assign_to_day_segment(output, day_segments, day_segments_type, include_past_periodic_segments)
write_csv(output, sensor_output) write_csv(output, sensor_output)
} }