diff --git a/data/external/daysegments_event.csv b/data/external/daysegments_event.csv index b46c9ae2..ef6d58bd 100644 --- a/data/external/daysegments_event.csv +++ b/data/external/daysegments_event.csv @@ -1,9 +1,9 @@ -label,event_timestamp,length,shift,shift_direction,pid -stress,1587661220000,1hours,0minutes,1,test01 -stress,1587747620000,4hours,4hours,-1,test01 -stress,1587906020000,3hours,0minutes,1,test01 -stress,1588003220000,7hours,4hours,-1,test01 -stress,1588172420000,9hours,0,-1,test01 -mood,1587661220000,7days,0,0,p02 -mood,1587747620000,7days,0,0,p02 -mood,1587906020000,7days,0,0,p02 +label,event_timestamp,length,shift,shift_direction,device_id +stress,1587661220000,1hours,0minutes,1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524 +stress,1587747620000,4hours,4hours,-1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524 +stress,1587906020000,3hours,0minutes,1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524 +stress,1588003220000,7hours,4hours,-1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524 +stress,1588172420000,9hours,0,-1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524 +mood,1587661220000,1hour,0,0,a748ee1a-1d0b-4ae9-9074-279a2b6ba524 +mood,1587747620000,1days,0,0,a748ee1a-1d0b-4ae9-9074-279a2b6ba524 +mood,1587906020000,7days,0,0,a748ee1a-1d0b-4ae9-9074-279a2b6ba524 diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index fe2866f6..b0a59255 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -53,7 +53,8 @@ rule download_fitbit_data: rule compute_day_segments: input: - config["DAY_SEGMENTS"]["FILE"] + config["DAY_SEGMENTS"]["FILE"], + "data/external/participant_files/{pid}.yaml" params: day_segments_type = config["DAY_SEGMENTS"]["TYPE"], pid = "{pid}" diff --git a/src/data/assign_to_day_segment.R b/src/data/assign_to_day_segment.R index 26ffcdc1..18dab2df 100644 --- a/src/data/assign_to_day_segment.R +++ b/src/data/assign_to_day_segment.R @@ -138,22 +138,33 @@ assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, sensor_data <- sensor_data %>% group_by(local_timezone) %>% nest() %>% - mutate(inferred_day_segments = map(local_timezone, ~ day_segments %>% + mutate(inferred_day_segments = map(local_timezone, function(tz){ + inferred <- day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift), segment_start_ts = event_timestamp + (as.integer(seconds(lubridate::duration(shift))) * ifelse(shift_direction >= 0, 1, -1) * 1000), segment_end_ts = segment_start_ts + (as.integer(seconds(lubridate::duration(length))) * 1000), # these start and end datetime objects are for labeling only - segment_id_start = lubridate::as_datetime(segment_start_ts/1000, tz = .x), - segment_id_end = lubridate::as_datetime(segment_end_ts/1000, tz = .x), + segment_id_start = lubridate::as_datetime(segment_start_ts/1000, tz = tz), + segment_id_end = lubridate::as_datetime(segment_end_ts/1000, tz = tz), segment_end_ts = segment_end_ts + 999, segment_id = paste0("[", paste0(label,"#", paste0(lubridate::date(segment_id_start), " ", - paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",", - lubridate::date(segment_id_end), " ", - paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";", + paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",", + lubridate::date(segment_id_end), " ", + paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";", paste0(segment_start_ts, ",", segment_end_ts)), - "]"))), + "]")) + # Check that for overlapping segments (not allowed because our resampling episode algorithm would have to have a second instead of minute granularity that increases storage and computation time) + overlapping <- inferred %>% group_by(label) %>% arrange(segment_start_ts) %>% + mutate(overlaps = if_else(segment_start_ts <= lag(segment_end_ts), TRUE, FALSE), + overlapping_segments = paste(paste(lag(label), lag(event_timestamp), lag(length), lag(shift), lag(shift_direction), lag(device_id), sep = ","),"and", + paste(label, event_timestamp, length, shift, shift_direction, device_id, sep = ","))) + if(any(overlapping$overlaps, na.rm = TRUE)){ + stop(paste0("\n\nOne or more event day segments overlap for ",overlapping$device_id[[1]],", modify their lengths so they don't:\n", paste0(overlapping %>% filter(overlaps == TRUE) %>% pull(overlapping_segments), collapse = "\n"), "\n\n")) + } else{ + return(inferred) + }}), data = map2(data, inferred_day_segments, assign_rows_to_segments)) %>% select(-inferred_day_segments) %>% unnest(data) %>% diff --git a/src/data/compute_day_segments.py b/src/data/compute_day_segments.py index 0b66593c..be07dddc 100644 --- a/src/data/compute_day_segments.py +++ b/src/data/compute_day_segments.py @@ -1,5 +1,6 @@ import pandas as pd import warnings +import yaml def is_valid_frequency_segments(day_segments, day_segments_file): """ @@ -107,9 +108,9 @@ def is_valid_periodic_segments(day_segments, day_segments_file): def is_valid_event_segments(day_segments, day_segments_file): day_segments = day_segments.copy(deep=True) - valid_columns = ["label", "event_timestamp", "length", "shift", "shift_direction", "pid"] + valid_columns = ["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"] if len(list(set(day_segments.columns) - set(valid_columns))) > 0: - error_message = 'The EVENT day segments file in [DAY_SEGMENTS][FILE] must have six columns: label, event_timestamp, length, shift, shift_direction and pid ' \ + error_message = 'The EVENT day segments file in [DAY_SEGMENTS][FILE] must have six columns: label, event_timestamp, length, shift, shift_direction and device_id ' \ 'but instead we found {}. Modify {}'.format(list(day_segments.columns), day_segments_file) raise ValueError(error_message) @@ -167,10 +168,10 @@ def parse_periodic_segments(day_segments): day_segments.loc[day_segments["repeats_on"] == "every_day", "repeats_value"] = 0 return day_segments -def parse_event_segments(day_segments, pid): - return day_segments.query("pid == @pid") +def parse_event_segments(day_segments, device_id): + return day_segments.query("device_id == @device_id") -def parse_day_segments(day_segments_file, segments_type, pid): +def parse_day_segments(day_segments_file, segments_type, device_id): # Add code to validate and parse frequencies, intervals, and events # Expected formats: # Frequency: label, length columns (e.g. my_prefix, 5) length has to be in minutes (int) @@ -195,12 +196,15 @@ def parse_day_segments(day_segments_file, segments_type, pid): elif(segments_type == "PERIODIC" and is_valid_periodic_segments(day_segments, day_segments_file)): day_segments = parse_periodic_segments(day_segments) elif(segments_type == "EVENT" and is_valid_event_segments(day_segments, day_segments_file)): - day_segments = parse_event_segments(day_segments, pid) + day_segments = parse_event_segments(day_segments, device_id) else: raise ValueError("{} does not have a format compatible with frequency, periodic or event day segments. Please refer to [LINK]".format(day_segments_file)) return day_segments -final_day_segments = parse_day_segments(snakemake.input[0], snakemake.params["day_segments_type"], snakemake.params["pid"]) +participant_file = yaml.load(open(snakemake.input[1], 'r'), Loader=yaml.FullLoader) +device_id = participant_file["PHONE"]["DEVICE_IDS"] +device_id = device_id[len(device_id) -1 ] +final_day_segments = parse_day_segments(snakemake.input[0], snakemake.params["day_segments_type"], device_id) if snakemake.params["day_segments_type"] == "EVENT" and final_day_segments.shape[0] == 0: warnings.warn("There are no event day segments for {}. Check your day segment file {}".format(snakemake.params["pid"], snakemake.input[0]))