Setup rules and files to support multiple
parent b116accb6d
commit a0b5b5982b
17  config.yaml
@@ -1,11 +1,24 @@
 # Participants to include in the analysis
 # You must create a file for each participant named pXXX containing their device_id. This can be done manually or automatically
-PIDS: [test01]
+PIDS: [t01]
 
 # Global var with common day segments
 DAY_SEGMENTS: &day_segments
     [daily, morning, afternoon, evening, night]
 
+DAY_SEGMENTS2: &day_segments2
+    # Day segments can be computed based on three strategies
+    # Frequency based: Set SEGMENTS to a number representing the length of a segment in minutes, for example 15. Every day will be divided into segments of SEGMENTS minutes starting at midnight.
+    # Interval based: Set SEGMENTS to a string containing a JSON array with an element for each segment containing a label and a start and end time in 24-hour format.
+    # For example: '[["daily", "00:00", "23:59"], ["morning", "06:00", "11:59"]]'. Note the string is single quoted and each value double quoted.
+    # Event based: Set SEGMENTS to a string with a path to a csv file with two columns, a unix timestamp column in milliseconds called "timestamp" and a string column called "label".
+    # Every row represents a meaningful event around which features will be extracted; each label should be unique. See EVENT_TIME_SHIFT and EVENT_SEGMENT_DURATION
+    # If you want daily features, create a segment with label "daily". DO NOT use "daily" to label any other segment
+    # ------------------------------------------------------------------------------
+    SEGMENTS: '[["daily", "00:00", "23:59"], ["morning", "06:00", "11:59"], ["evening", "18:00", "23:59"]]'
+    EVENT_TIME_SHIFT: 0 # Positive or negative number of minutes. A day segment will start EVENT_TIME_SHIFT minutes before or after each meaningful event. Only used if SEGMENTS is a valid event file (see above).
+    EVENT_SEGMENT_DURATION: 60 # Length of every day_segment around each meaningful event. Only used if SEGMENTS is a valid event file (see above).
+
 # Global timezone
 # Use codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
 # Double check your code, for example EST is not US Eastern Time.
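For the event-based strategy described above, the file pointed to by SEGMENTS could look like the sample below. The timestamps and labels are hypothetical and only illustrate the two required columns: a unix timestamp in milliseconds and a unique label per event.

timestamp,label
1583625600000,survey_wave1
1584316800000,survey_wave2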
@@ -97,7 +110,7 @@ DORYAB_LOCATION:
 BLUETOOTH:
     COMPUTE: False
     DB_TABLE: bluetooth
-    DAY_SEGMENTS: *day_segments
+    DAY_SEGMENTS: *day_segments2
     FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
 
 ACTIVITY_RECOGNITION:
@@ -111,3 +111,29 @@ def optional_heatmap_days_by_sensors_input(wildcards):
         tables_platform = [table for table in config["HEATMAP_DAYS_BY_SENSORS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist
 
     return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
+def optional_day_segments_input(wildcards):
+    return []
+
+def find_day_segments_argument(wildcards, argument):
+    for key, values in config.items():
+        if "DAY_SEGMENTS" in config[key] and "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == wildcards.sensor:
+            return config[key]["DAY_SEGMENTS"][argument]
+
+def hash_day_segments(config_section):
+    # TODO hash the content of the interval file instead of SEGMENTS when SEGMENTS is a path
+    return hashlib.sha1(config_section["SEGMENTS"].encode('utf-8')).hexdigest()
+
+def is_valid_day_segment_configuration(sensor, config_section):
+    if not (isinstance(config_section, collections.OrderedDict) or isinstance(config_section, dict)):
+        raise ValueError("The DAY_SEGMENTS parameter in the {} config section should be a dictionary with three parameters: SEGMENTS (str), EVENT_TIME_SHIFT (int), and EVENT_SEGMENT_DURATION (int)".format(sensor))
+    for attribute in ["SEGMENTS", "EVENT_TIME_SHIFT", "EVENT_SEGMENT_DURATION"]:
+        if not attribute in config_section:
+            raise ValueError("The config[{}][DAY_SEGMENTS] section should have an attribute named {}".format(sensor, attribute))
+
+    if not isinstance(config_section["SEGMENTS"], str):
+        raise ValueError("The config[{}][DAY_SEGMENTS][SEGMENTS] variable should be a string".format(sensor))
+    if not isinstance(config_section["EVENT_TIME_SHIFT"], int):
+        raise ValueError("The config[{}][DAY_SEGMENTS][EVENT_TIME_SHIFT] variable should be an integer".format(sensor))
+    if not isinstance(config_section["EVENT_SEGMENT_DURATION"], int):
+        raise ValueError("The config[{}][DAY_SEGMENTS][EVENT_SEGMENT_DURATION] variable should be an integer".format(sensor))
+    return True
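A standalone sketch of how these helpers fit together: find_day_segments_argument looks up the DAY_SEGMENTS block of whichever config section owns the {sensor} table, and hash_day_segments turns its SEGMENTS string into the {hash} wildcard used in the rules below, so editing a segment definition changes the target file names. The config dictionary here is a hand-written stand-in for the parsed config.yaml, not part of the commit.

import hashlib

config = {
    "BLUETOOTH": {
        "DB_TABLE": "bluetooth",
        "DAY_SEGMENTS": {
            "SEGMENTS": '[["daily", "00:00", "23:59"], ["morning", "06:00", "11:59"], ["evening", "18:00", "23:59"]]',
            "EVENT_TIME_SHIFT": 0,
            "EVENT_SEGMENT_DURATION": 60,
        },
    }
}

# Same lookup as find_day_segments_argument, with the sensor name passed directly
# instead of coming from a Snakemake wildcards object.
section = next(v for v in config.values() if "DAY_SEGMENTS" in v and v.get("DB_TABLE") == "bluetooth")
segments = section["DAY_SEGMENTS"]["SEGMENTS"]

# hash_day_segments: the sha1 of SEGMENTS becomes the {hash} in
# data/interim/{pid}/{sensor}_day_segments_{hash}.csv
print(hashlib.sha1(segments.encode("utf-8")).hexdigest())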
@@ -88,12 +88,12 @@ rule location_doryab_features:
 
 rule bluetooth_features:
     input:
-        expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])
+        expand("data/raw/{{pid}}/{sensor}_with_datetime_{{hash}}.csv", sensor=config["BLUETOOTH"]["DB_TABLE"]),
+        day_segments = expand("data/interim/{{pid}}/{sensor}_day_segments_{{hash}}.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])
     params:
-        day_segment = "{day_segment}",
         features = config["BLUETOOTH"]["FEATURES"]
     output:
-        "data/processed/{pid}/bluetooth_{day_segment}.csv"
+        "data/processed/{pid}/bluetooth_{hash}.csv"
     script:
         "../src/features/bluetooth_features.R"
 
@@ -38,6 +38,18 @@ rule download_dataset:
     script:
         "../src/data/download_dataset.R"
 
+rule compute_day_segments:
+    input:
+        optional_day_segments_input,
+    params:
+        segments = lambda wildcards: find_day_segments_argument(wildcards, "SEGMENTS"),
+        event_time_shift = lambda wildcards: find_day_segments_argument(wildcards, "EVENT_TIME_SHIFT"),
+        event_segment_duration = lambda wildcards: find_day_segments_argument(wildcards, "EVENT_SEGMENT_DURATION"),
+    output:
+        "data/interim/{pid}/{sensor}_day_segments_{hash}.csv"
+    script:
+        "../src/data/compute_day_segments.py"
+
 PHONE_SENSORS = []
 PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["BARNETT_LOCATION"]["DB_TABLE"], config["DORYAB_LOCATION"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]])
 PHONE_SENSORS.extend(config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"])
@@ -50,14 +62,15 @@ if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
 
 rule readable_datetime:
     input:
-        sensor_input = rules.download_dataset.output
+        sensor_input = "data/raw/{pid}/{sensor}_raw.csv",
+        day_segments = "data/interim/{pid}/{sensor}_day_segments_{hash}.csv"
     params:
         timezones = None,
         fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
     wildcard_constraints:
         sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # only process smartphone sensors, not fitbit
     output:
-        "data/raw/{pid}/{sensor}_with_datetime.csv"
+        "data/raw/{pid}/{sensor}_with_datetime_{hash}.csv"
     script:
         "../src/data/readable_datetime.R"
 
@@ -0,0 +1,25 @@
+import pandas as pd
+import json
+
+def parse_day_segments(segments, event_time_shift, event_segment_duration):
+    # Temporary code to parse segments; it should be replaced by the code to parse
+    # frequencies, intervals, and events
+    data = json.loads(segments)
+    label = []
+    start = []
+    end = []
+    for d in data:
+        label.append(d[0])
+        start.append(d[1])
+        end.append(d[2])
+
+    day_segments = pd.DataFrame(list(zip([1]*len(label), start, end, label)), columns = ['local_date','start_time','end_time','label'])
+    return day_segments
+##########################
+
+segments = snakemake.params["segments"]
+event_time_shift = snakemake.params["event_time_shift"]
+event_segment_duration = snakemake.params["event_segment_duration"]
+
+day_segments = parse_day_segments(segments, event_time_shift, event_segment_duration)
+day_segments.to_csv(snakemake.output[0], index=False)
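With the default interval-style SEGMENTS string from config.yaml, the temporary parser above writes a day segments file along these lines (local_date is a constant placeholder for now, as the comment in the script notes):

local_date,start_time,end_time,label
1,00:00,23:59,daily
1,06:00,11:59,morning
1,18:00,23:59,evening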
@@ -1,24 +1,55 @@
 source("renv/activate.R")
 
 library("tidyverse")
-library(readr)
+library("readr")
+library("lubridate")
 
-input <- read.csv(snakemake@input[[1]]) %>% arrange(timestamp)
+input <- read.csv(snakemake@input[["sensor_input"]]) %>% arrange(timestamp)
+day_segments <- read.csv(snakemake@input[["day_segments"]]) %>% filter(label != "daily") # daily is done by default by all scripts
 sensor_output <- snakemake@output[[1]]
 timezone_periods <- snakemake@params[["timezone_periods"]]
 fixed_timezone <- snakemake@params[["fixed_timezone"]]
 
-split_local_date_time <- function(data){
-  return(data %>%
-           separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
-           separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
-           mutate(local_hour = as.numeric(local_hour),
-                  local_minute = as.numeric(local_minute),
-                  local_day_segment = case_when(local_hour %in% 0:5 ~ "night",
-                                                local_hour %in% 6:11 ~ "morning",
-                                                local_hour %in% 12:17 ~ "afternoon",
-                                                local_hour %in% 18:23 ~ "evening")))
+assign_to_day_segment <- function(data, day_segments){
+  data <- data %>% mutate(local_day_segment = NA)
+
+  # All segments belong to the same date, so we assume all days have the same segments
+  if(length(unique(day_segments$local_date)) == 1){
+    data <- data %>% mutate(local_time_obj = lubridate::hms(local_time))
+    day_segments <- day_segments %>% mutate(start_time = lubridate::hm(start_time),
+                                            end_time = lubridate::hm(end_time))
+    for(row_id in 1:nrow(day_segments)){
+      row = day_segments[row_id,]
+      data <- data %>% mutate(local_day_segment = ifelse(local_time_obj >= row$start_time & local_time_obj <= row$end_time, row$label, local_day_segment))
+    }
+    data <- data %>% select(-local_time_obj)
+  # Segments belong to different dates, so each day can have different segments
+  }else{
+    data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time))
+    day_segments <- day_segments %>% mutate(start_local_date_time_obj = lubridate::ymd_hm(paste(local_date, start_time)),
+                                            end_local_date_time_obj = lubridate::ymd_hm(paste(local_date, end_time)),
+                                            date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj))
+    for(row_id in 1:nrow(day_segments)){
+      row = day_segments[row_id,]
+      data <- data %>% mutate(local_day_segment = ifelse(local_date_time_obj %within% row$date_time_interval, row$label, local_day_segment))
+    }
+    data <- data %>% select(-local_date_time_obj)
+  }
+
+  return(data)
+}
+
+split_local_date_time <- function(data, day_segments){
+  split_data <- data %>%
+    separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
+    separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
+    mutate(local_hour = as.numeric(local_hour),
+           local_minute = as.numeric(local_minute))
+
+  split_data <- assign_to_day_segment(split_data, day_segments)
+  return(split_data)
+}
 
 if(!is.null(timezone_periods)){
   timezones <- read_csv(timezone_periods)
   tz_starts <- timezones$start
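The single-date branch of assign_to_day_segment above compares each row's local_time against every segment's start_time/end_time window and keeps the label of the last matching segment; rows matching no segment stay NA. A minimal Python re-sketch of that logic (not part of the commit, pandas-based, with hypothetical times):

import pandas as pd

data = pd.DataFrame({"local_time": ["05:30:00", "09:15:00", "20:45:00"]})
day_segments = pd.DataFrame({
    "label": ["morning", "evening"],
    "start_time": ["06:00", "18:00"],
    "end_time": ["11:59", "23:59"],
})

def to_minutes(t):
    # "HH:MM" or "HH:MM:SS" -> minutes since midnight
    hours, minutes = t.split(":")[:2]
    return int(hours) * 60 + int(minutes)

data["local_day_segment"] = None
minutes = data["local_time"].map(to_minutes)
for _, segment in day_segments.iterrows():
    in_segment = minutes.between(to_minutes(segment["start_time"]), to_minutes(segment["end_time"]))
    data.loc[in_segment, "local_day_segment"] = segment["label"]

print(data)  # 05:30 -> None, 09:15 -> morning, 20:45 -> evening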
@@ -30,12 +61,12 @@ if(!is.null(timezone_periods)){
     rowwise() %>%
     mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
            local_date_time = format(utc_date_time, tz = timezone, usetz = T))
-  output <- split_local_date_time(output)
+  output <- split_local_date_time(output, day_segments)
   write.csv(output, sensor_output)
 } else if(!is.null(fixed_timezone)){
   output <- input %>%
     mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
            local_date_time = format(utc_date_time, tz = fixed_timezone, usetz = F))
-  output <- split_local_date_time(output)
+  output <- split_local_date_time(output, day_segments)
   write_csv(output, sensor_output)
 }
@@ -2,7 +2,7 @@ library(dplyr)
 library(tidyr)
 
 filter_by_day_segment <- function(data, day_segment) {
-  if(day_segment %in% c("morning", "afternoon", "evening", "night"))
+  if(day_segment != "daily")
     data <- data %>% filter(local_day_segment == day_segment)
 
   return(data %>% group_by(local_date))
@@ -4,14 +4,16 @@ library(dplyr)
 library(tidyr)
 
 bluetooth_data <- read.csv(snakemake@input[[1]], stringsAsFactors = FALSE)
-day_segment <- snakemake@params[["day_segment"]]
+day_segments <- read.csv(snakemake@input[["day_segments"]], stringsAsFactors = FALSE)
 requested_features <- snakemake@params[["features"]]
 features = data.frame(local_date = character(), stringsAsFactors = FALSE)
 
+day_segments <- day_segments %>% distinct(label) %>% pull(label)
 # Compute base bluetooth features
-features <- merge(features, base_bluetooth_features(bluetooth_data, day_segment, requested_features), by="local_date", all = TRUE)
+for (day_segment in day_segments)
+  features <- merge(features, base_bluetooth_features(bluetooth_data, day_segment, requested_features), by="local_date", all = TRUE)
 
-if(ncol(features) != length(requested_features) + 1)
+if(ncol(features) != (length(requested_features)) * length(day_segments) + 1)
   stop(paste0("The number of features in the output dataframe (=", ncol(features),") does not match the expected value (=", length(requested_features)," + 1). Verify your bluetooth feature extraction functions"))
 
 
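The new column check reflects that base_bluetooth_features now runs once per segment label, so the output is expected to hold one column per (feature, segment) pair plus the local_date key. For example, with the three FEATURES configured above and, hypothetically, the three labels from the default SEGMENTS string:

requested_features = ["countscans", "uniquedevices", "countscansmostuniquedevice"]
day_segments = ["daily", "morning", "evening"]  # hypothetical labels from the default SEGMENTS
expected_columns = len(requested_features) * len(day_segments) + 1  # 3 * 3 + 1 = 10
print(expected_columns)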