From a0b5b5982bcb31ffaf5d44921ead5233fc30c2b0 Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 22 Jul 2020 21:54:19 -0400 Subject: [PATCH] Setup rules and files to support multiple --- config.yaml | 17 ++++++- rules/common.smk | 26 +++++++++++ rules/features.smk | 6 +-- rules/preprocessing.smk | 17 ++++++- src/data/compute_day_segments.py | 25 +++++++++++ src/data/readable_datetime.R | 59 +++++++++++++++++++------ src/features/bluetooth/bluetooth_base.R | 2 +- src/features/bluetooth_features.R | 8 ++-- 8 files changed, 135 insertions(+), 25 deletions(-) create mode 100644 src/data/compute_day_segments.py diff --git a/config.yaml b/config.yaml index c40a2e02..b75946aa 100644 --- a/config.yaml +++ b/config.yaml @@ -1,11 +1,24 @@ # Participants to include in the analysis # You must create a file for each participant named pXXX containing their device_id. This can be done manually or automatically -PIDS: [test01] +PIDS: [t01] # Global var with common day segments DAY_SEGMENTS: &day_segments [daily, morning, afternoon, evening, night] +DAY_SEGMENTS2: &day_segments2 + # Day segments can be computed based on three strategies + # Frequency based: Set SEGMENTS to a number representing the length of a segment in minutes: 15. Every day will be divided in n segments of SEGMENTS minutes starting at midnight. + # Interval based: Set SEGMENTS to a string containing a JSON array with an element for each segment containing a label, and start and end time in 24 hour format. + # For example: '[["daily", "00:00", "23:59"], ["morning", "06:00", "11:59"]]'. Note the string is single quoted and each value double quoted. + # Event based: Set SEGMENTS to a string with a path to a csv file with two columns, a unix timestamp column in milliseconds called "timestamp" and a string column called "label". + # Every row represents a meaningful event around which features will be extracted, each label should be unique.
See EVENT_TIME_SHIFT and EVENT_SEGMENT_DURATION + # If you want daily features, create a segment with label "daily". DO NOT use "daily" to label any other segment + # ------------------------------------------------------------------------------ + SEGMENTS: '[["daily", "00:00", "23:59"], ["morning", "06:00", "11:59"], ["evening", "18:00", "23:59"]]' + EVENT_TIME_SHIFT: 0 # Positive or negative number of minutes. A day segment will start EVENT_TIME_SHIFT minutes before or after each meaningful event. Only used if SEGMENTS is a valid event file (see above). + EVENT_SEGMENT_DURATION: 60 # Length of every day_segment around each meaningful event. Only used if SEGMENTS is a valid event file (see above). + # Global timezone # Use codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones # Double check your code, for example EST is not US Eastern Time. @@ -97,7 +110,7 @@ DORYAB_LOCATION: BLUETOOTH: COMPUTE: False DB_TABLE: bluetooth - DAY_SEGMENTS: *day_segments + DAY_SEGMENTS: *day_segments2 FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] ACTIVITY_RECOGNITION: diff --git a/rules/common.smk b/rules/common.smk index bebbac44..15836272 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -111,3 +111,29 @@ def optional_heatmap_days_by_sensors_input(wildcards): tables_platform = [table for table in config["HEATMAP_DAYS_BY_SENSORS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform) +def optional_day_segments_input(wildcards): + return [] + +def find_day_segments_argument(wildcards, argument): + for key, values in config.items(): + if "DAY_SEGMENTS" in config[key] and "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == wildcards.sensor: + return config[key]["DAY_SEGMENTS"][argument] + +def
hash_day_segments(config_section): + # TODO hash the content of the interval file instead of SEGMENTS when SEGMENTS is a path + return hashlib.sha1(config_section["SEGMENTS"].encode('utf-8')).hexdigest() + +def is_valid_day_segment_configuration(sensor, config_section): + if not (isinstance(config_section, collections.OrderedDict) or isinstance(config_section, dict)): + raise ValueError("The DAY_SEGMENTS parameter in the {} config section should be a dictionary with three parameters: SEGMENTS (str), EVENT_TIME_SHIFT (int), and EVENT_SEGMENT_DURATION (int)".format(sensor)) + for attribute in ["SEGMENTS", "EVENT_TIME_SHIFT", "EVENT_SEGMENT_DURATION"]: + if not attribute in config_section: + raise ValueError("The config[{}][DAY_SEGMENTS] section should have an attribute named {}".format(sensor, attribute)) + + if not isinstance(config_section["SEGMENTS"], str): + raise ValueError("The config[{}][DAY_SEGMENTS][SEGMENTS] variable should be a string".format(sensor)) + if not isinstance(config_section["EVENT_TIME_SHIFT"], int): + raise ValueError("The config[{}][DAY_SEGMENTS][EVENT_TIME_SHIFT] variable should be an integer".format(sensor)) + if not isinstance(config_section["EVENT_SEGMENT_DURATION"], int): + raise ValueError("The config[{}][DAY_SEGMENTS][EVENT_SEGMENT_DURATION] variable should be an integer".format(sensor)) + return True diff --git a/rules/features.smk b/rules/features.smk index 4a55059d..57f5db21 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -88,12 +88,12 @@ rule location_doryab_features: rule bluetooth_features: input: - expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"]) + expand("data/raw/{{pid}}/{sensor}_with_datetime_{{hash}}.csv", sensor=config["BLUETOOTH"]["DB_TABLE"]), + day_segments = expand("data/interim/{{pid}}/{sensor}_day_segments_{{hash}}.csv", sensor=config["BLUETOOTH"]["DB_TABLE"]) params: - day_segment = "{day_segment}", features = config["BLUETOOTH"]["FEATURES"] output: - 
"data/processed/{pid}/bluetooth_{day_segment}.csv" + "data/processed/{pid}/bluetooth_{hash}.csv" script: "../src/features/bluetooth_features.R" diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index fdcc4f3f..ba2273b4 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -38,6 +38,18 @@ rule download_dataset: script: "../src/data/download_dataset.R" +rule compute_day_segments: + input: + optional_day_segments_input, + params: + segments = lambda wildcards: find_day_segments_argument(wildcards, "SEGMENTS"), + event_time_shift = lambda wildcards: find_day_segments_argument(wildcards, "EVENT_TIME_SHIFT"), + event_segment_duration = lambda wildcards: find_day_segments_argument(wildcards, "EVENT_SEGMENT_DURATION"), + output: + "data/interim/{pid}/{sensor}_day_segments_{hash}.csv" + script: + "../src/data/compute_day_segments.py" + PHONE_SENSORS = [] PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["BARNETT_LOCATION"]["DB_TABLE"], config["DORYAB_LOCATION"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]) PHONE_SENSORS.extend(config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]) @@ -50,14 +62,15 @@ if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0: rule readable_datetime: input: - sensor_input = rules.download_dataset.output + sensor_input = "data/raw/{pid}/{sensor}_raw.csv", + day_segments = "data/interim/{pid}/{sensor}_day_segments_{hash}.csv" params: timezones = None, fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"] wildcard_constraints: sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # 
only process smartphone sensors, not fitbit output: - "data/raw/{pid}/{sensor}_with_datetime.csv" + "data/raw/{pid}/{sensor}_with_datetime_{hash}.csv" script: "../src/data/readable_datetime.R" diff --git a/src/data/compute_day_segments.py b/src/data/compute_day_segments.py new file mode 100644 index 00000000..d29d30df --- /dev/null +++ b/src/data/compute_day_segments.py @@ -0,0 +1,25 @@ +import pandas as pd +import json + +def parse_day_segments(segments, event_time_shift, event_segment_duration): + # Temporal code to parse segments, should substitute with the code to parse + # frequencies, intervals, and events + data = json.loads(segments) + label = [] + start = [] + end = [] + for d in data: + label.append(d[0]) + start.append(d[1]) + end.append(d[2]) + + day_segments = pd.DataFrame(list(zip([1]*len(label), start, end, label)), columns =['local_date','start_time','end_time','label']) + return day_segments + ########################## + +segments = snakemake.params["segments"] +event_time_shift = snakemake.params["event_time_shift"] +event_segment_duration = snakemake.params["event_segment_duration"] + +day_segments = parse_day_segments(segments, event_time_shift, event_segment_duration) +day_segments.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/data/readable_datetime.R b/src/data/readable_datetime.R index f3cb0ebc..dfc019b9 100644 --- a/src/data/readable_datetime.R +++ b/src/data/readable_datetime.R @@ -1,24 +1,55 @@ source("renv/activate.R") library("tidyverse") -library(readr) +library("readr") +library("lubridate") -input <- read.csv(snakemake@input[[1]]) %>% arrange(timestamp) +input <- read.csv(snakemake@input[["sensor_input"]]) %>% arrange(timestamp) +day_segments <- read.csv(snakemake@input[["day_segments"]]) %>% filter(label != "daily") #daily is done by default by all scripts sensor_output <- snakemake@output[[1]] timezone_periods <- snakemake@params[["timezone_periods"]] fixed_timezone <- 
snakemake@params[["fixed_timezone"]] -split_local_date_time <- function(data){ - return(data %>% - separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>% - separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>% - mutate(local_hour = as.numeric(local_hour), - local_minute = as.numeric(local_minute), - local_day_segment = case_when(local_hour %in% 0:5 ~ "night", - local_hour %in% 6:11 ~ "morning", - local_hour %in% 12:17 ~ "afternoon", - local_hour %in% 18:23 ~ "evening"))) +assign_to_day_segment <- function(data, day_segments){ + data <- data %>% mutate(local_day_segment = NA) + + # All segments belong to the same date, so we assume all days have the same segments + if(length(unique(day_segments$local_date)) == 1){ + data <- data %>% mutate(local_time_obj = lubridate::hms(local_time)) + day_segments <- day_segments %>% mutate(start_time = lubridate::hm(start_time), + end_time = lubridate::hm(end_time)) + for(row_id in 1:nrow(day_segments)){ + row = day_segments[row_id,] + data <- data %>% mutate(local_day_segment = ifelse(local_time_obj >= row$start_time & local_time_obj <= row$end_time, row$label, local_day_segment)) + } + data <- data %>% select(-local_time_obj) + # Segments belong to different dates, so each day can have different segments + }else{ + data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time)) + day_segments <- day_segments %>% mutate(start_local_date_time_obj = lubridate::ymd_hm(paste(local_date, start_time)), + end_local_date_time_obj = lubridate::ymd_hm(paste(local_date, end_time)), + date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj)) + for(row_id in 1:nrow(day_segments)){ + row = day_segments[row_id,] + data <- data %>% mutate(local_day_segment = ifelse(local_date_time_obj %within% row$date_time_interval, row$label, local_day_segment)) + } + data <- data %>% select(-local_date_time_obj) + } + + return(data) } + 
+split_local_date_time <- function(data, day_segments){ + split_data <- data %>% + separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>% + separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>% + mutate(local_hour = as.numeric(local_hour), + local_minute = as.numeric(local_minute)) + + split_data <- assign_to_day_segment(split_data, day_segments) + return(split_data) +} + if(!is.null(timezone_periods)){ timezones <- read_csv(timezone_periods) tz_starts <- timezones$start @@ -30,12 +61,12 @@ if(!is.null(timezone_periods)){ rowwise() %>% mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"), local_date_time = format(utc_date_time, tz = timezone, usetz = T)) - output <- split_local_date_time(output) + output <- split_local_date_time(output, day_segments) write.csv(output, sensor_output) } else if(!is.null(fixed_timezone)){ output <- input %>% mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"), local_date_time = format(utc_date_time, tz = fixed_timezone, usetz = F)) - output <- split_local_date_time(output) + output <- split_local_date_time(output, day_segments) write_csv(output, sensor_output) } diff --git a/src/features/bluetooth/bluetooth_base.R b/src/features/bluetooth/bluetooth_base.R index e525b723..f444fb22 100644 --- a/src/features/bluetooth/bluetooth_base.R +++ b/src/features/bluetooth/bluetooth_base.R @@ -2,7 +2,7 @@ library(dplyr) library(tidyr) filter_by_day_segment <- function(data, day_segment) { - if(day_segment %in% c("morning", "afternoon", "evening", "night")) + if(day_segment != "daily") data <- data %>% filter(local_day_segment == day_segment) return(data %>% group_by(local_date)) diff --git a/src/features/bluetooth_features.R b/src/features/bluetooth_features.R index ee181852..de072a29 100644 --- a/src/features/bluetooth_features.R +++ b/src/features/bluetooth_features.R @@ -4,14 +4,16 @@ library(dplyr) library(tidyr) 
bluetooth_data <- read.csv(snakemake@input[[1]], stringsAsFactors = FALSE) -day_segment <- snakemake@params[["day_segment"]] +day_segments <- read.csv(snakemake@input[["day_segments"]], stringsAsFactors = FALSE) requested_features <- snakemake@params[["features"]] features = data.frame(local_date = character(), stringsAsFactors = FALSE) +day_segments <- day_segments %>% distinct(label) %>% pull(label) # Compute base bluetooth features -features <- merge(features, base_bluetooth_features(bluetooth_data, day_segment, requested_features), by="local_date", all = TRUE) +for (day_segment in day_segments) + features <- merge(features, base_bluetooth_features(bluetooth_data, day_segment, requested_features), by="local_date", all = TRUE) -if(ncol(features) != length(requested_features) + 1) +if(ncol(features) != (length(requested_features)) * length(day_segments) + 1) stop(paste0("The number of features in the output dataframe (=", ncol(features),") does not match the expected value (=", length(requested_features)," + 1). Verify your bluetooth feature extraction functions"))