diff --git a/Snakefile b/Snakefile
index 3afde276..86c4d16e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -11,6 +11,7 @@ rule all:
         expand("data/processed/{pid}/screen_deltas.csv", pid=config["PIDS"]),
         expand("data/processed/{pid}/google_activity_recognition_deltas.csv", pid=config["PIDS"]),
         expand("data/interim/{pid}/phone_valid_sensed_days.csv", pid=config["PIDS"]),
+        expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]),
         expand("data/processed/{pid}/sms_{sms_type}_{day_segment}.csv",
                             pid=config["PIDS"],
                             sms_type = config["SMS"]["TYPES"],
diff --git a/rules/preprocessing.snakefile b/rules/preprocessing.snakefile
index 1eae30f8..564b2a57 100644
--- a/rules/preprocessing.snakefile
+++ b/rules/preprocessing.snakefile
@@ -32,6 +32,16 @@ rule phone_valid_sensed_days:
     script:
         "../src/data/phone_valid_sensed_days.R"
 
+rule phone_sensed_bins:
+    input:
+        all_sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["SENSORS"])
+    params:
+        bin_size = config["PHONE_VALID_SENSED_DAYS"]["BIN_SIZE"]
+    output:
+        "data/interim/{pid}/phone_sensed_bins.csv"
+    script:
+        "../src/data/phone_sensed_bins.R"
+
 rule unify_ios_android:
     input:
         sensor_data = "data/raw/{pid}/{sensor}_with_datetime.csv",
diff --git a/src/data/phone_sensed_bins.R b/src/data/phone_sensed_bins.R
new file mode 100644
index 00000000..329fc905
--- /dev/null
+++ b/src/data/phone_sensed_bins.R
@@ -0,0 +1,35 @@
+source("packrat/init.R")
+
+library(dplyr)
+library(tidyr)
+
+# Values supplied by the phone_sensed_bins Snakemake rule
+all_sensors <- snakemake@input[["all_sensors"]]  # paths of the per-sensor CSV files
+bin_size <- snakemake@params[["bin_size"]]       # bin width, in minutes
+output_file <- snakemake@output[[1]]
+
+# Load every sensor file, keeping only the local date/hour/minute of each row
+# and tagging each row with the name of the file it came from
+all_sensor_data <- do.call(rbind, lapply(all_sensors, function(sensor) {
+  read.csv(sensor, stringsAsFactors = FALSE) %>%
+    select(local_date, local_hour, local_minute) %>%
+    mutate(sensor = basename(sensor))
+}))
+
+# Count how many distinct sensors logged at least one row in each
+# (date, hour, bin) slot, then spread the slots into one column per (hour, bin) pair
+phone_sensed_bins <- all_sensor_data %>%
+  mutate(bin = (local_minute %/% bin_size) * bin_size) %>% # bin rows into bin_size-minute bins
+  group_by(local_date, local_hour, bin) %>%
+  summarise(sensor_count = n_distinct(sensor)) %>%
+  ungroup() %>%
+  # add the slots with no data for every observed date, with a count of 0
+  complete(nesting(local_date),
+           local_hour = seq(0, 23, 1),
+           bin = seq(0, (59 %/% bin_size) * bin_size, bin_size),
+           fill = list(sensor_count=0)) %>%
+  pivot_wider(names_from = c(local_hour, bin), values_from = sensor_count)
+
+# row.names = FALSE keeps write.csv from prepending an unnamed index column
+write.csv(phone_sensed_bins, output_file, row.names = FALSE)
+