Optimise assign to day segment
parent
c577229f18
commit
4dc8d38c66
|
@ -6,6 +6,7 @@ PIDS: [test01]
|
|||
DAY_SEGMENTS: &day_segments
|
||||
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
|
||||
FILE: "data/external/daysegments_periodic.csv"
|
||||
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, if set to TRUE we consider day segments back enough in the past as to include the first day of data
|
||||
|
||||
# Global timezone
|
||||
# Use codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
label,start_time,length,repeats_on,repeats_value
|
||||
daily,00:00:00,23H 59M 59S,every_day,0
|
||||
weekly,00:00:00,6D 23H 59M 59S,mday,31
|
||||
weekly,00:00:00,6D 23H 59M 59S,wday,1
|
||||
morning,06:00:00,5H 59M 59S,every_day,0
|
||||
afternoon,12:00:00,5H 59M 59S,every_day,0
|
||||
evening,18:00:00,5H 59M 59S,wday,2
|
||||
|
|
|
|
@ -67,7 +67,8 @@ rule readable_datetime:
|
|||
params:
|
||||
timezones = None,
|
||||
fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
|
||||
day_segments_type = config["DAY_SEGMENTS"]["TYPE"]
|
||||
day_segments_type = config["DAY_SEGMENTS"]["TYPE"],
|
||||
include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
|
||||
wildcard_constraints:
|
||||
sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # only process smartphone sensors, not fitbit
|
||||
output:
|
||||
|
|
|
@ -5,28 +5,27 @@ find_segments_frequency <- function(local_date, local_time_obj, segments){
|
|||
return(paste(segments %>%
|
||||
filter(local_time_obj >= segment_start & local_time_obj <= segment_end) %>%
|
||||
mutate(segment_id = paste0("[",
|
||||
label, "#",
|
||||
local_date, "#",
|
||||
paste(str_pad(hour(segment_start),2, pad="0"), str_pad(minute(segment_start),2, pad="0"), str_pad(second(segment_start),2, pad="0"),sep =":"), "#",
|
||||
local_date, "#",
|
||||
paste(str_pad(hour(segment_end),2, pad="0"), str_pad(minute(segment_end),2, pad="0"), str_pad(second(segment_end),2, pad="0"),sep =":"),
|
||||
"]")) %>%
|
||||
label, "#",
|
||||
local_date, "#",
|
||||
paste(str_pad(hour(segment_start),2, pad="0"), str_pad(minute(segment_start),2, pad="0"), str_pad(second(segment_start),2, pad="0"),sep =":"), "#",
|
||||
local_date, "#",
|
||||
paste(str_pad(hour(segment_end),2, pad="0"), str_pad(minute(segment_end),2, pad="0"), str_pad(second(segment_end),2, pad="0"),sep =":"),
|
||||
"]")) %>%
|
||||
pull(segment_id), collapse = "|"))
|
||||
}
|
||||
|
||||
find_segments_periodic <- function(date_time, segments){
|
||||
return(stringi::stri_c(segments[[1]] %>%
|
||||
filter(date_time %within% segment_interval) %>%
|
||||
pull(segment_id), collapse = "|"))
|
||||
find_segments_periodic <- function(timestamp, segments){
|
||||
# We might need to optimise the frequency and event functions as well
|
||||
return(stringi::stri_c(segments[[1]][segments[[1]]$segment_start_ts<= timestamp & segments[[1]]$segment_end_ts >= timestamp, "segment_id"][["segment_id"]], collapse = "|"))
|
||||
}
|
||||
|
||||
find_segments_event <- function(timestamp, segments){
|
||||
return(stringi::stri_c(segments %>%
|
||||
filter(timestamp >= segment_start & timestamp <= segment_end) %>%
|
||||
pull(segment_id), collapse = "|"))
|
||||
filter(timestamp >= segment_start & timestamp <= segment_end) %>%
|
||||
pull(segment_id), collapse = "|"))
|
||||
}
|
||||
|
||||
assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type){
|
||||
assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, include_past_periodic_segments){
|
||||
|
||||
if(day_segments_type == "FREQUENCY"){ #FREQUENCY
|
||||
|
||||
|
@ -37,20 +36,62 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type){
|
|||
|
||||
} else if (day_segments_type == "PERIODIC"){ #PERIODIC
|
||||
|
||||
# We need to take into account segment start dates that could include the first day of data
|
||||
day_segments <- day_segments %>% mutate(length_duration = duration(length))
|
||||
wday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "wday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
|
||||
wday_delay <- if_else(is.na(wday_delay) | include_past_periodic_segments == FALSE, duration("0days"), wday_delay)
|
||||
|
||||
mday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "mday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
|
||||
mday_delay <- if_else(is.na(mday_delay) | include_past_periodic_segments == FALSE, duration("0days"), mday_delay)
|
||||
|
||||
qday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "qday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
|
||||
qday_delay <- if_else(is.na(qday_delay) | include_past_periodic_segments == FALSE, duration("0days"), qday_delay)
|
||||
|
||||
yday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "yday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
|
||||
yday_delay <- if_else(is.na(yday_delay) | include_past_periodic_segments == FALSE, duration("0days"), yday_delay)
|
||||
|
||||
sensor_data <- sensor_data %>%
|
||||
mutate(row_n = row_number()) %>%
|
||||
# mutate(row_n = row_number()) %>%
|
||||
group_by(local_timezone) %>%
|
||||
nest() %>%
|
||||
# get existent days that we need to start segments from
|
||||
mutate(existent_dates = map(data, ~.x %>%
|
||||
mutate(every_date = map(data, ~.x %>%
|
||||
distinct(local_date) %>%
|
||||
mutate(local_date_obj = lubridate::ymd(local_date, tz = local_timezone),
|
||||
every_day = 0,
|
||||
wday = wday(local_date_obj, week_start = 1),
|
||||
mday = mday(local_date_obj),
|
||||
qday = qday(local_date_obj),
|
||||
yday = yday(local_date_obj)
|
||||
) ),
|
||||
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
|
||||
complete(local_date_obj = seq(min(local_date_obj), max(local_date_obj), by="days")) %>%
|
||||
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
|
||||
mutate(every_day = 0)),
|
||||
week_dates = map(data, ~.x %>%
|
||||
distinct(local_date) %>%
|
||||
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
|
||||
complete(local_date_obj = seq(date(min(local_date_obj) - wday_delay), max(local_date_obj), by="days")) %>%
|
||||
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
|
||||
mutate(wday = wday(local_date_obj, week_start = 1)) ),
|
||||
month_dates = map(data, ~.x %>%
|
||||
distinct(local_date) %>%
|
||||
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
|
||||
complete(local_date_obj = seq(date(min(local_date_obj) - mday_delay), max(local_date_obj), by="days")) %>%
|
||||
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
|
||||
mutate(mday = mday(local_date_obj))),
|
||||
quarter_dates = map(data, ~.x %>%
|
||||
distinct(local_date) %>%
|
||||
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
|
||||
complete(local_date_obj = seq(date(min(local_date_obj) - qday_delay), max(local_date_obj), by="days")) %>%
|
||||
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
|
||||
mutate(qday = qday(local_date_obj)) ),
|
||||
year_dates = map(data, ~.x %>%
|
||||
distinct(local_date) %>%
|
||||
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
|
||||
complete(local_date_obj = seq(date(min(local_date_obj) - yday_delay), max(local_date_obj), by="days")) %>%
|
||||
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
|
||||
mutate(yday = yday(local_date_obj)) ),
|
||||
existent_dates = pmap(list(every_date, week_dates, month_dates, quarter_dates, year_dates),
|
||||
function(every_date, week_dates, month_dates, quarter_dates, year_dates) reduce(list(every_date, week_dates,month_dates, quarter_dates, year_dates), .f=full_join)),
|
||||
every_date = NULL,
|
||||
week_dates = NULL,
|
||||
month_dates = NULL,
|
||||
quarter_dates = NULL,
|
||||
year_dates = NULL,
|
||||
# build the actual day segments taking into account the users requested leangth and repeat schedule
|
||||
inferred_day_segments = map(existent_dates,
|
||||
~ crossing(day_segments, .x) %>%
|
||||
|
@ -58,30 +99,30 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type){
|
|||
filter(repeats_on == day_type & repeats_value == day_value) %>%
|
||||
mutate(segment_start = (lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)),
|
||||
segment_end = segment_start + lubridate::period(length),
|
||||
segment_interval = lubridate::interval(segment_start, segment_end),
|
||||
segment_start_ts = as.numeric(segment_start),
|
||||
segment_end_ts = as.numeric(segment_end),
|
||||
segment_id = paste0("[",
|
||||
paste(sep= "#",
|
||||
label,
|
||||
lubridate::date(int_start(segment_interval)),
|
||||
paste(str_pad(hour(int_start(segment_interval)),2, pad="0"),
|
||||
str_pad(minute(int_start(segment_interval)),2, pad="0"),
|
||||
str_pad(second(int_start(segment_interval)),2, pad="0"),sep =":"),
|
||||
lubridate::date(int_end(segment_interval)),
|
||||
paste(str_pad(hour(int_end(segment_interval)),2, pad="0"),
|
||||
str_pad(minute(int_end(segment_interval)),2, pad="0"),
|
||||
str_pad(second(int_end(segment_interval)),2, pad="0"),sep =":")
|
||||
),
|
||||
"]")) %>%
|
||||
select(segment_interval, label, segment_id)),
|
||||
paste(sep= "#",
|
||||
label,
|
||||
lubridate::date(segment_start),
|
||||
paste(str_pad(hour(segment_start),2, pad="0"),
|
||||
str_pad(minute(segment_start),2, pad="0"),
|
||||
str_pad(second(segment_start),2, pad="0"),sep =":"),
|
||||
lubridate::date(segment_end),
|
||||
paste(str_pad(hour(segment_end),2, pad="0"),
|
||||
str_pad(minute(segment_end),2, pad="0"),
|
||||
str_pad(second(segment_end),2, pad="0"),sep =":")
|
||||
),
|
||||
"]")) %>%
|
||||
select(segment_start_ts, segment_end_ts, segment_id)),
|
||||
# loop thorugh every day segment and assigned it to the rows that fall within its start and end
|
||||
data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = lubridate::ymd_hms(local_date_time, tz = local_timezone),
|
||||
assigned_segments = map_chr(row_date_time, ~find_segments_periodic(.x, inferred_day_segments))) %>%
|
||||
select(-row_date_time))
|
||||
data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = as.numeric(lubridate::ymd_hms(local_date_time, tz = local_timezone)),
|
||||
assigned_segments = map_chr(row_date_time, ~find_segments_periodic(.x, inferred_day_segments)),
|
||||
row_date_time = NULL))
|
||||
) %>%
|
||||
select(-existent_dates, -inferred_day_segments) %>%
|
||||
unnest(cols = data) %>%
|
||||
arrange(row_n) %>%
|
||||
select(-row_n)
|
||||
arrange(timestamp)
|
||||
|
||||
|
||||
} else if ( day_segments_type == "EVENT"){
|
||||
|
|
|
@ -10,6 +10,7 @@ day_segments_type <- snakemake@params[["day_segments_type"]]
|
|||
sensor_output <- snakemake@output[[1]]
|
||||
timezone_periods <- snakemake@params[["timezone_periods"]]
|
||||
fixed_timezone <- snakemake@params[["fixed_timezone"]]
|
||||
include_past_periodic_segments <- snakemake@params[["include_past_periodic_segments"]]
|
||||
|
||||
split_local_date_time <- function(data, day_segments){
|
||||
split_data <- data %>%
|
||||
|
@ -43,6 +44,6 @@ if(!is.null(timezone_periods)){
|
|||
local_timezone = fixed_timezone,
|
||||
local_date_time = format(utc_date_time, tz = fixed_timezone, "%Y-%m-%d %H:%M:%S"))
|
||||
output <- split_local_date_time(output, day_segments)
|
||||
output <- assign_to_day_segment(output, day_segments, day_segments_type)
|
||||
output <- assign_to_day_segment(output, day_segments, day_segments_type, include_past_periodic_segments)
|
||||
write_csv(output, sensor_output)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue