Optimise assign to day segment
parent
c577229f18
commit
4dc8d38c66
|
@ -6,6 +6,7 @@ PIDS: [test01]
|
||||||
DAY_SEGMENTS: &day_segments
|
DAY_SEGMENTS: &day_segments
|
||||||
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
|
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
|
||||||
FILE: "data/external/daysegments_periodic.csv"
|
FILE: "data/external/daysegments_periodic.csv"
|
||||||
|
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, if set to TRUE we consider day segments back enough in the past as to include the first day of data
|
||||||
|
|
||||||
# Global timezone
|
# Global timezone
|
||||||
# Use codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
|
# Use codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
label,start_time,length,repeats_on,repeats_value
|
label,start_time,length,repeats_on,repeats_value
|
||||||
daily,00:00:00,23H 59M 59S,every_day,0
|
daily,00:00:00,23H 59M 59S,every_day,0
|
||||||
weekly,00:00:00,6D 23H 59M 59S,mday,31
|
weekly,00:00:00,6D 23H 59M 59S,wday,1
|
||||||
morning,06:00:00,5H 59M 59S,every_day,0
|
morning,06:00:00,5H 59M 59S,every_day,0
|
||||||
afternoon,12:00:00,5H 59M 59S,every_day,0
|
afternoon,12:00:00,5H 59M 59S,every_day,0
|
||||||
evening,18:00:00,5H 59M 59S,wday,2
|
evening,18:00:00,5H 59M 59S,wday,2
|
||||||
|
|
|
|
@ -67,7 +67,8 @@ rule readable_datetime:
|
||||||
params:
|
params:
|
||||||
timezones = None,
|
timezones = None,
|
||||||
fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
|
fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
|
||||||
day_segments_type = config["DAY_SEGMENTS"]["TYPE"]
|
day_segments_type = config["DAY_SEGMENTS"]["TYPE"],
|
||||||
|
include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
|
||||||
wildcard_constraints:
|
wildcard_constraints:
|
||||||
sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # only process smartphone sensors, not fitbit
|
sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # only process smartphone sensors, not fitbit
|
||||||
output:
|
output:
|
||||||
|
|
|
@ -5,29 +5,28 @@ find_segments_frequency <- function(local_date, local_time_obj, segments){
|
||||||
return(paste(segments %>%
|
return(paste(segments %>%
|
||||||
filter(local_time_obj >= segment_start & local_time_obj <= segment_end) %>%
|
filter(local_time_obj >= segment_start & local_time_obj <= segment_end) %>%
|
||||||
mutate(segment_id = paste0("[",
|
mutate(segment_id = paste0("[",
|
||||||
label, "#",
|
label, "#",
|
||||||
local_date, "#",
|
local_date, "#",
|
||||||
paste(str_pad(hour(segment_start),2, pad="0"), str_pad(minute(segment_start),2, pad="0"), str_pad(second(segment_start),2, pad="0"),sep =":"), "#",
|
paste(str_pad(hour(segment_start),2, pad="0"), str_pad(minute(segment_start),2, pad="0"), str_pad(second(segment_start),2, pad="0"),sep =":"), "#",
|
||||||
local_date, "#",
|
local_date, "#",
|
||||||
paste(str_pad(hour(segment_end),2, pad="0"), str_pad(minute(segment_end),2, pad="0"), str_pad(second(segment_end),2, pad="0"),sep =":"),
|
paste(str_pad(hour(segment_end),2, pad="0"), str_pad(minute(segment_end),2, pad="0"), str_pad(second(segment_end),2, pad="0"),sep =":"),
|
||||||
"]")) %>%
|
"]")) %>%
|
||||||
pull(segment_id), collapse = "|"))
|
pull(segment_id), collapse = "|"))
|
||||||
}
|
}
|
||||||
|
|
||||||
find_segments_periodic <- function(date_time, segments){
|
find_segments_periodic <- function(timestamp, segments){
|
||||||
return(stringi::stri_c(segments[[1]] %>%
|
# We might need to optimise the frequency and event functions as well
|
||||||
filter(date_time %within% segment_interval) %>%
|
return(stringi::stri_c(segments[[1]][segments[[1]]$segment_start_ts<= timestamp & segments[[1]]$segment_end_ts >= timestamp, "segment_id"][["segment_id"]], collapse = "|"))
|
||||||
pull(segment_id), collapse = "|"))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
find_segments_event <- function(timestamp, segments){
|
find_segments_event <- function(timestamp, segments){
|
||||||
return(stringi::stri_c(segments %>%
|
return(stringi::stri_c(segments %>%
|
||||||
filter(timestamp >= segment_start & timestamp <= segment_end) %>%
|
filter(timestamp >= segment_start & timestamp <= segment_end) %>%
|
||||||
pull(segment_id), collapse = "|"))
|
pull(segment_id), collapse = "|"))
|
||||||
}
|
}
|
||||||
|
|
||||||
assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type){
|
assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, include_past_periodic_segments){
|
||||||
|
|
||||||
if(day_segments_type == "FREQUENCY"){ #FREQUENCY
|
if(day_segments_type == "FREQUENCY"){ #FREQUENCY
|
||||||
|
|
||||||
day_segments <- day_segments %>% mutate(segment_start = lubridate::parse_date_time(start_time, orders = c("HMS", "HM")),
|
day_segments <- day_segments %>% mutate(segment_start = lubridate::parse_date_time(start_time, orders = c("HMS", "HM")),
|
||||||
|
@ -37,20 +36,62 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type){
|
||||||
|
|
||||||
} else if (day_segments_type == "PERIODIC"){ #PERIODIC
|
} else if (day_segments_type == "PERIODIC"){ #PERIODIC
|
||||||
|
|
||||||
|
# We need to take into account segment start dates that could include the first day of data
|
||||||
|
day_segments <- day_segments %>% mutate(length_duration = duration(length))
|
||||||
|
wday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "wday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
|
||||||
|
wday_delay <- if_else(is.na(wday_delay) | include_past_periodic_segments == FALSE, duration("0days"), wday_delay)
|
||||||
|
|
||||||
|
mday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "mday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
|
||||||
|
mday_delay <- if_else(is.na(mday_delay) | include_past_periodic_segments == FALSE, duration("0days"), mday_delay)
|
||||||
|
|
||||||
|
qday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "qday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
|
||||||
|
qday_delay <- if_else(is.na(qday_delay) | include_past_periodic_segments == FALSE, duration("0days"), qday_delay)
|
||||||
|
|
||||||
|
yday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "yday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
|
||||||
|
yday_delay <- if_else(is.na(yday_delay) | include_past_periodic_segments == FALSE, duration("0days"), yday_delay)
|
||||||
|
|
||||||
sensor_data <- sensor_data %>%
|
sensor_data <- sensor_data %>%
|
||||||
mutate(row_n = row_number()) %>%
|
# mutate(row_n = row_number()) %>%
|
||||||
group_by(local_timezone) %>%
|
group_by(local_timezone) %>%
|
||||||
nest() %>%
|
nest() %>%
|
||||||
# get existent days that we need to start segments from
|
# get existent days that we need to start segments from
|
||||||
mutate(existent_dates = map(data, ~.x %>%
|
mutate(every_date = map(data, ~.x %>%
|
||||||
distinct(local_date) %>%
|
distinct(local_date) %>%
|
||||||
mutate(local_date_obj = lubridate::ymd(local_date, tz = local_timezone),
|
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
|
||||||
every_day = 0,
|
complete(local_date_obj = seq(min(local_date_obj), max(local_date_obj), by="days")) %>%
|
||||||
wday = wday(local_date_obj, week_start = 1),
|
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
|
||||||
mday = mday(local_date_obj),
|
mutate(every_day = 0)),
|
||||||
qday = qday(local_date_obj),
|
week_dates = map(data, ~.x %>%
|
||||||
yday = yday(local_date_obj)
|
distinct(local_date) %>%
|
||||||
) ),
|
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
|
||||||
|
complete(local_date_obj = seq(date(min(local_date_obj) - wday_delay), max(local_date_obj), by="days")) %>%
|
||||||
|
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
|
||||||
|
mutate(wday = wday(local_date_obj, week_start = 1)) ),
|
||||||
|
month_dates = map(data, ~.x %>%
|
||||||
|
distinct(local_date) %>%
|
||||||
|
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
|
||||||
|
complete(local_date_obj = seq(date(min(local_date_obj) - mday_delay), max(local_date_obj), by="days")) %>%
|
||||||
|
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
|
||||||
|
mutate(mday = mday(local_date_obj))),
|
||||||
|
quarter_dates = map(data, ~.x %>%
|
||||||
|
distinct(local_date) %>%
|
||||||
|
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
|
||||||
|
complete(local_date_obj = seq(date(min(local_date_obj) - qday_delay), max(local_date_obj), by="days")) %>%
|
||||||
|
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
|
||||||
|
mutate(qday = qday(local_date_obj)) ),
|
||||||
|
year_dates = map(data, ~.x %>%
|
||||||
|
distinct(local_date) %>%
|
||||||
|
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
|
||||||
|
complete(local_date_obj = seq(date(min(local_date_obj) - yday_delay), max(local_date_obj), by="days")) %>%
|
||||||
|
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
|
||||||
|
mutate(yday = yday(local_date_obj)) ),
|
||||||
|
existent_dates = pmap(list(every_date, week_dates, month_dates, quarter_dates, year_dates),
|
||||||
|
function(every_date, week_dates, month_dates, quarter_dates, year_dates) reduce(list(every_date, week_dates,month_dates, quarter_dates, year_dates), .f=full_join)),
|
||||||
|
every_date = NULL,
|
||||||
|
week_dates = NULL,
|
||||||
|
month_dates = NULL,
|
||||||
|
quarter_dates = NULL,
|
||||||
|
year_dates = NULL,
|
||||||
# build the actual day segments taking into account the users requested leangth and repeat schedule
|
# build the actual day segments taking into account the users requested leangth and repeat schedule
|
||||||
inferred_day_segments = map(existent_dates,
|
inferred_day_segments = map(existent_dates,
|
||||||
~ crossing(day_segments, .x) %>%
|
~ crossing(day_segments, .x) %>%
|
||||||
|
@ -58,31 +99,31 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type){
|
||||||
filter(repeats_on == day_type & repeats_value == day_value) %>%
|
filter(repeats_on == day_type & repeats_value == day_value) %>%
|
||||||
mutate(segment_start = (lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)),
|
mutate(segment_start = (lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)),
|
||||||
segment_end = segment_start + lubridate::period(length),
|
segment_end = segment_start + lubridate::period(length),
|
||||||
segment_interval = lubridate::interval(segment_start, segment_end),
|
segment_start_ts = as.numeric(segment_start),
|
||||||
|
segment_end_ts = as.numeric(segment_end),
|
||||||
segment_id = paste0("[",
|
segment_id = paste0("[",
|
||||||
paste(sep= "#",
|
paste(sep= "#",
|
||||||
label,
|
label,
|
||||||
lubridate::date(int_start(segment_interval)),
|
lubridate::date(segment_start),
|
||||||
paste(str_pad(hour(int_start(segment_interval)),2, pad="0"),
|
paste(str_pad(hour(segment_start),2, pad="0"),
|
||||||
str_pad(minute(int_start(segment_interval)),2, pad="0"),
|
str_pad(minute(segment_start),2, pad="0"),
|
||||||
str_pad(second(int_start(segment_interval)),2, pad="0"),sep =":"),
|
str_pad(second(segment_start),2, pad="0"),sep =":"),
|
||||||
lubridate::date(int_end(segment_interval)),
|
lubridate::date(segment_end),
|
||||||
paste(str_pad(hour(int_end(segment_interval)),2, pad="0"),
|
paste(str_pad(hour(segment_end),2, pad="0"),
|
||||||
str_pad(minute(int_end(segment_interval)),2, pad="0"),
|
str_pad(minute(segment_end),2, pad="0"),
|
||||||
str_pad(second(int_end(segment_interval)),2, pad="0"),sep =":")
|
str_pad(second(segment_end),2, pad="0"),sep =":")
|
||||||
),
|
),
|
||||||
"]")) %>%
|
"]")) %>%
|
||||||
select(segment_interval, label, segment_id)),
|
select(segment_start_ts, segment_end_ts, segment_id)),
|
||||||
# loop thorugh every day segment and assigned it to the rows that fall within its start and end
|
# loop thorugh every day segment and assigned it to the rows that fall within its start and end
|
||||||
data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = lubridate::ymd_hms(local_date_time, tz = local_timezone),
|
data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = as.numeric(lubridate::ymd_hms(local_date_time, tz = local_timezone)),
|
||||||
assigned_segments = map_chr(row_date_time, ~find_segments_periodic(.x, inferred_day_segments))) %>%
|
assigned_segments = map_chr(row_date_time, ~find_segments_periodic(.x, inferred_day_segments)),
|
||||||
select(-row_date_time))
|
row_date_time = NULL))
|
||||||
) %>%
|
) %>%
|
||||||
select(-existent_dates, -inferred_day_segments) %>%
|
select(-existent_dates, -inferred_day_segments) %>%
|
||||||
unnest(cols = data) %>%
|
unnest(cols = data) %>%
|
||||||
arrange(row_n) %>%
|
arrange(timestamp)
|
||||||
select(-row_n)
|
|
||||||
|
|
||||||
|
|
||||||
} else if ( day_segments_type == "EVENT"){
|
} else if ( day_segments_type == "EVENT"){
|
||||||
|
|
||||||
|
|
|
@ -10,6 +10,7 @@ day_segments_type <- snakemake@params[["day_segments_type"]]
|
||||||
sensor_output <- snakemake@output[[1]]
|
sensor_output <- snakemake@output[[1]]
|
||||||
timezone_periods <- snakemake@params[["timezone_periods"]]
|
timezone_periods <- snakemake@params[["timezone_periods"]]
|
||||||
fixed_timezone <- snakemake@params[["fixed_timezone"]]
|
fixed_timezone <- snakemake@params[["fixed_timezone"]]
|
||||||
|
include_past_periodic_segments <- snakemake@params[["include_past_periodic_segments"]]
|
||||||
|
|
||||||
split_local_date_time <- function(data, day_segments){
|
split_local_date_time <- function(data, day_segments){
|
||||||
split_data <- data %>%
|
split_data <- data %>%
|
||||||
|
@ -43,6 +44,6 @@ if(!is.null(timezone_periods)){
|
||||||
local_timezone = fixed_timezone,
|
local_timezone = fixed_timezone,
|
||||||
local_date_time = format(utc_date_time, tz = fixed_timezone, "%Y-%m-%d %H:%M:%S"))
|
local_date_time = format(utc_date_time, tz = fixed_timezone, "%Y-%m-%d %H:%M:%S"))
|
||||||
output <- split_local_date_time(output, day_segments)
|
output <- split_local_date_time(output, day_segments)
|
||||||
output <- assign_to_day_segment(output, day_segments, day_segments_type)
|
output <- assign_to_day_segment(output, day_segments, day_segments_type, include_past_periodic_segments)
|
||||||
write_csv(output, sensor_output)
|
write_csv(output, sensor_output)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue