Add valid sensed days

replace/8130ef4b00b70733374407f83bbd20ea618b48ee
JulioV 2019-11-05 12:34:22 -05:00
parent 6eb7bc9e70
commit cca1633728
4 changed files with 52 additions and 3 deletions

View File

@ -8,6 +8,7 @@ rule all:
expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]), expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]), expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]), expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]),
expand("data/interim/{pid}/phone_valid_sensed_days.csv", pid=config["PIDS"]),
expand("data/processed/{pid}/com_sms_{sms_type}_{day_segment}_{metric}.csv", expand("data/processed/{pid}/com_sms_{sms_type}_{day_segment}_{metric}.csv",
pid=config["PIDS"], pid=config["PIDS"],
sms_type = config["COM_SMS"]["SMS_TYPES"], sms_type = config["COM_SMS"]["SMS_TYPES"],

View File

@ -1,5 +1,5 @@
# Valid database table names # Valid database table names
SENSORS: [messages, calls, battery, screen] SENSORS: [applications_crashes, applications_foreground, applications_notifications, battery, bluetooth, calls, locations, messages, plugin_ambient_noise, plugin_device_usage, plugin_google_activity_recognition, screen]
# Participants to include in the analysis # Participants to include in the analysis
# You must create a file for each participant # You must create a file for each participant
@ -31,4 +31,9 @@ COM_CALL:
CALL_TYPE_TAKEN : [incoming, outgoing] CALL_TYPE_TAKEN : [incoming, outgoing]
DAY_SEGMENTS: *day_segments DAY_SEGMENTS: *day_segments
METRICS_MISSED: [count, distinctcontacts] METRICS_MISSED: [count, distinctcontacts]
METRICS_TAKEN: [count, distinctcontacts, meanduration, sumduration, hubermduration, varqnduration, entropyduration] METRICS_TAKEN: [count, distinctcontacts, meanduration, sumduration, hubermduration, varqnduration, entropyduration]
PHONE_VALID_SENSED_DAYS:
BIN_SIZE: 5 # (in minutes)
MIN_VALID_HOURS: 20 # (out of 24)
MIN_BINS_PER_HOUR: 8 # (out of 60min/BIN_SIZE bins)

View File

@ -18,4 +18,16 @@ rule readable_datetime:
output: output:
"data/raw/{pid}/{sensor}_with_datetime.csv" "data/raw/{pid}/{sensor}_with_datetime.csv"
script: script:
"../src/data/readable_datetime.R" "../src/data/readable_datetime.R"
# Produces, for one participant ({pid}), the CSV of valid sensed days by
# running src/data/phone_valid_sensed_days.R over that participant's
# datetime-annotated sensor files.
rule phone_valid_sensed_days:
input:
# One "<sensor>_with_datetime.csv" per entry in config["SENSORS"]; the doubled
# braces keep {pid} as a wildcard while {sensor} is expanded here.
all_sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["SENSORS"])
params:
# Thresholds read from the PHONE_VALID_SENSED_DAYS section of the config and
# exposed to the script as snakemake@params.
bin_size = config["PHONE_VALID_SENSED_DAYS"]["BIN_SIZE"],
min_valid_hours = config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_HOURS"],
min_bins_per_hour = config["PHONE_VALID_SENSED_DAYS"]["MIN_BINS_PER_HOUR"]
output:
"data/interim/{pid}/phone_valid_sensed_days.csv"
script:
"../src/data/phone_valid_sensed_days.R"

View File

@ -0,0 +1,31 @@
source("packrat/init.R")
library(dplyr)
# Compute the days on which the phone sensed "enough" data.
#
# Inputs (provided by Snakemake through the `snakemake` object):
#   input  all_sensors        paths to the per-sensor CSV files; each must
#                             contain local_date, local_hour, local_minute
#   params bin_size           width, in minutes, of the bins an hour is
#                             split into
#   params min_bins_per_hour  minimum number of distinct bins with at least
#                             one row for an hour to count as valid
#   params min_valid_hours    minimum number of valid hours for a day to
#                             count as valid
#   output [[1]]              path of the CSV to write
#
# Output: one row per valid local_date with its count of valid hours.
all_sensors <- snakemake@input[["all_sensors"]]
bin_size <- snakemake@params[["bin_size"]]
min_valid_hours <- snakemake@params[["min_valid_hours"]]
min_bins_per_hour <- snakemake@params[["min_bins_per_hour"]]
output_file <- snakemake@output[[1]]

# Without any sensor file there are no time columns to bin; fail here with
# a clear message instead of an obscure "object not found" error later on.
if (length(all_sensors) == 0) {
  stop("phone_valid_sensed_days: no sensor files were provided",
       call. = FALSE)
}

# Load all sensors and keep only the columns that locate each row in time.
# The files are read into a list and bound once, rather than growing a
# data frame with rbind() on every iteration.
sensor_times <- lapply(unname(all_sensors), function(sensor) {
  read.csv(sensor, stringsAsFactors = FALSE) %>%
    select(local_date, local_hour, local_minute)
})
all_sensor_data <- do.call(rbind, sensor_times)

phone_valid_sensed_days <- all_sensor_data %>%
  # Assign each row to the bin_size-minute bin its minute falls into
  mutate(bin = (local_minute %/% bin_size) * bin_size) %>%
  # Keep one row per bin (several rows may be logged within the same bin)
  distinct(local_date, local_hour, bin) %>%
  # Count how many bins with data there are per hour
  group_by(local_date, local_hour) %>%
  summarise(bins = n()) %>%
  ungroup() %>%
  # Discard hours with fewer than min_bins_per_hour bins
  filter(bins >= min_bins_per_hour) %>%
  # Count how many valid hours each day has
  group_by(local_date) %>%
  summarise(valid_hours = n()) %>%
  # Discard days with fewer than min_valid_hours valid hours
  filter(valid_hours >= min_valid_hours)

# Row names are written as well (write.csv default), which keeps the output
# format unchanged for whatever reads this file downstream.
write.csv(phone_valid_sensed_days, output_file)