Add check for non-overlapping event day segments

pull/103/head
JulioV 2020-10-23 12:15:26 -04:00
parent 86509207ac
commit 1d04aa6807
4 changed files with 40 additions and 24 deletions

View File

@ -1,9 +1,9 @@
label,event_timestamp,length,shift,shift_direction,pid
stress,1587661220000,1hours,0minutes,1,test01
stress,1587747620000,4hours,4hours,-1,test01
stress,1587906020000,3hours,0minutes,1,test01
stress,1588003220000,7hours,4hours,-1,test01
stress,1588172420000,9hours,0,-1,test01
mood,1587661220000,7days,0,0,p02
mood,1587747620000,7days,0,0,p02
mood,1587906020000,7days,0,0,p02
label,event_timestamp,length,shift,shift_direction,device_id
stress,1587661220000,1hours,0minutes,1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
stress,1587747620000,4hours,4hours,-1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
stress,1587906020000,3hours,0minutes,1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
stress,1588003220000,7hours,4hours,-1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
stress,1588172420000,9hours,0,-1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
mood,1587661220000,1hour,0,0,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
mood,1587747620000,1days,0,0,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
mood,1587906020000,7days,0,0,a748ee1a-1d0b-4ae9-9074-279a2b6ba524

1 label event_timestamp length shift shift_direction pid device_id
2 stress 1587661220000 1hours 0minutes 1 test01 a748ee1a-1d0b-4ae9-9074-279a2b6ba524
3 stress 1587747620000 4hours 4hours -1 test01 a748ee1a-1d0b-4ae9-9074-279a2b6ba524
4 stress 1587906020000 3hours 0minutes 1 test01 a748ee1a-1d0b-4ae9-9074-279a2b6ba524
5 stress 1588003220000 7hours 4hours -1 test01 a748ee1a-1d0b-4ae9-9074-279a2b6ba524
6 stress 1588172420000 9hours 0 -1 test01 a748ee1a-1d0b-4ae9-9074-279a2b6ba524
7 mood 1587661220000 7days 1hour 0 0 p02 a748ee1a-1d0b-4ae9-9074-279a2b6ba524
8 mood 1587747620000 7days 1days 0 0 p02 a748ee1a-1d0b-4ae9-9074-279a2b6ba524
9 mood 1587906020000 7days 0 0 p02 a748ee1a-1d0b-4ae9-9074-279a2b6ba524

View File

@ -53,7 +53,8 @@ rule download_fitbit_data:
rule compute_day_segments:
input:
config["DAY_SEGMENTS"]["FILE"]
config["DAY_SEGMENTS"]["FILE"],
"data/external/participant_files/{pid}.yaml"
params:
day_segments_type = config["DAY_SEGMENTS"]["TYPE"],
pid = "{pid}"

View File

@ -138,22 +138,33 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type,
sensor_data <- sensor_data %>%
group_by(local_timezone) %>%
nest() %>%
mutate(inferred_day_segments = map(local_timezone, ~ day_segments %>%
mutate(inferred_day_segments = map(local_timezone, function(tz){
inferred <- day_segments %>%
mutate(shift = ifelse(shift == "0", "0seconds", shift),
segment_start_ts = event_timestamp + (as.integer(seconds(lubridate::duration(shift))) * ifelse(shift_direction >= 0, 1, -1) * 1000),
segment_end_ts = segment_start_ts + (as.integer(seconds(lubridate::duration(length))) * 1000),
# these start and end datetime objects are for labeling only
segment_id_start = lubridate::as_datetime(segment_start_ts/1000, tz = .x),
segment_id_end = lubridate::as_datetime(segment_end_ts/1000, tz = .x),
segment_id_start = lubridate::as_datetime(segment_start_ts/1000, tz = tz),
segment_id_end = lubridate::as_datetime(segment_end_ts/1000, tz = tz),
segment_end_ts = segment_end_ts + 999,
segment_id = paste0("[",
paste0(label,"#",
paste0(lubridate::date(segment_id_start), " ",
paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",",
lubridate::date(segment_id_end), " ",
paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";",
paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",",
lubridate::date(segment_id_end), " ",
paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";",
paste0(segment_start_ts, ",", segment_end_ts)),
"]"))),
"]"))
# Check for overlapping segments (not allowed because our resampling episode algorithm would need second-level instead of minute-level granularity, which would increase storage and computation time)
overlapping <- inferred %>% group_by(label) %>% arrange(segment_start_ts) %>%
mutate(overlaps = if_else(segment_start_ts <= lag(segment_end_ts), TRUE, FALSE),
overlapping_segments = paste(paste(lag(label), lag(event_timestamp), lag(length), lag(shift), lag(shift_direction), lag(device_id), sep = ","),"and",
paste(label, event_timestamp, length, shift, shift_direction, device_id, sep = ",")))
if(any(overlapping$overlaps, na.rm = TRUE)){
stop(paste0("\n\nOne or more event day segments overlap for ",overlapping$device_id[[1]],", modify their lengths so they don't:\n", paste0(overlapping %>% filter(overlaps == TRUE) %>% pull(overlapping_segments), collapse = "\n"), "\n\n"))
} else{
return(inferred)
}}),
data = map2(data, inferred_day_segments, assign_rows_to_segments)) %>%
select(-inferred_day_segments) %>%
unnest(data) %>%

View File

@ -1,5 +1,6 @@
import pandas as pd
import warnings
import yaml
def is_valid_frequency_segments(day_segments, day_segments_file):
"""
@ -107,9 +108,9 @@ def is_valid_periodic_segments(day_segments, day_segments_file):
def is_valid_event_segments(day_segments, day_segments_file):
day_segments = day_segments.copy(deep=True)
valid_columns = ["label", "event_timestamp", "length", "shift", "shift_direction", "pid"]
valid_columns = ["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]
if len(list(set(day_segments.columns) - set(valid_columns))) > 0:
error_message = 'The EVENT day segments file in [DAY_SEGMENTS][FILE] must have six columns: label, event_timestamp, length, shift, shift_direction and pid ' \
error_message = 'The EVENT day segments file in [DAY_SEGMENTS][FILE] must have six columns: label, event_timestamp, length, shift, shift_direction and device_id ' \
'but instead we found {}. Modify {}'.format(list(day_segments.columns), day_segments_file)
raise ValueError(error_message)
@ -167,10 +168,10 @@ def parse_periodic_segments(day_segments):
day_segments.loc[day_segments["repeats_on"] == "every_day", "repeats_value"] = 0
return day_segments
def parse_event_segments(day_segments, pid):
return day_segments.query("pid == @pid")
def parse_event_segments(day_segments, device_id):
return day_segments.query("device_id == @device_id")
def parse_day_segments(day_segments_file, segments_type, pid):
def parse_day_segments(day_segments_file, segments_type, device_id):
# Add code to validate and parse frequencies, intervals, and events
# Expected formats:
# Frequency: label, length columns (e.g. my_prefix, 5) length has to be in minutes (int)
@ -195,12 +196,15 @@ def parse_day_segments(day_segments_file, segments_type, pid):
elif(segments_type == "PERIODIC" and is_valid_periodic_segments(day_segments, day_segments_file)):
day_segments = parse_periodic_segments(day_segments)
elif(segments_type == "EVENT" and is_valid_event_segments(day_segments, day_segments_file)):
day_segments = parse_event_segments(day_segments, pid)
day_segments = parse_event_segments(day_segments, device_id)
else:
raise ValueError("{} does not have a format compatible with frequency, periodic or event day segments. Please refer to [LINK]".format(day_segments_file))
return day_segments
final_day_segments = parse_day_segments(snakemake.input[0], snakemake.params["day_segments_type"], snakemake.params["pid"])
participant_file = yaml.load(open(snakemake.input[1], 'r'), Loader=yaml.FullLoader)
device_id = participant_file["PHONE"]["DEVICE_IDS"]
device_id = device_id[len(device_id) -1 ]
final_day_segments = parse_day_segments(snakemake.input[0], snakemake.params["day_segments_type"], device_id)
if snakemake.params["day_segments_type"] == "EVENT" and final_day_segments.shape[0] == 0:
warnings.warn("There are no event day segments for {}. Check your day segment file {}".format(snakemake.params["pid"], snakemake.input[0]))