diff --git a/Snakefile b/Snakefile
index 069466c6..4d64cfc6 100644
--- a/Snakefile
+++ b/Snakefile
@@ -8,6 +8,7 @@ rule all:
         expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
         expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
         expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]),
+        expand("data/interim/{pid}/phone_valid_sensed_days.csv", pid=config["PIDS"]),
         expand("data/processed/{pid}/com_sms_{sms_type}_{day_segment}_{metric}.csv",
                             pid=config["PIDS"],
                             sms_type = config["COM_SMS"]["SMS_TYPES"],
diff --git a/config.yaml b/config.yaml
index 5b574029..66b251b8 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,5 +1,5 @@
 # Valid database table names
-SENSORS: [messages, calls, battery, screen]
+SENSORS: [applications_crashes, applications_foreground, applications_notifications, battery, bluetooth, calls, locations, messages, plugin_ambient_noise, plugin_device_usage, plugin_google_activity_recognition, screen]
 
 # Participants to include in the analysis
 # You must create a file for each participant
@@ -31,4 +31,9 @@ COM_CALL:
     CALL_TYPE_TAKEN : [incoming, outgoing]
     DAY_SEGMENTS: *day_segments
     METRICS_MISSED: [count, distinctcontacts]
-    METRICS_TAKEN: [count, distinctcontacts, meanduration, sumduration, hubermduration, varqnduration, entropyduration]
\ No newline at end of file
+    METRICS_TAKEN: [count, distinctcontacts, meanduration, sumduration, hubermduration, varqnduration, entropyduration]
+
+PHONE_VALID_SENSED_DAYS:
+    BIN_SIZE: 5 # (in minutes)
+    MIN_VALID_HOURS: 20 # (out of 24)
+    MIN_BINS_PER_HOUR: 8 # (out of 60min/BIN_SIZE bins)
\ No newline at end of file
diff --git a/rules/preprocessing.snakefile b/rules/preprocessing.snakefile
index 987a8dad..20b92301 100644
--- a/rules/preprocessing.snakefile
+++ b/rules/preprocessing.snakefile
@@ -18,4 +18,17 @@ rule readable_datetime:
     output:
         "data/raw/{pid}/{sensor}_with_datetime.csv"
     script:
-        "../src/data/readable_datetime.R"
\ No newline at end of file
+        "../src/data/readable_datetime.R"
+
+# Days with >= MIN_VALID_HOURS hours that each have >= MIN_BINS_PER_HOUR BIN_SIZE-minute bins with rows from any sensor
+rule phone_valid_sensed_days:
+    input:
+        all_sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["SENSORS"])
+    params:
+        bin_size = config["PHONE_VALID_SENSED_DAYS"]["BIN_SIZE"],
+        min_valid_hours = config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_HOURS"],
+        min_bins_per_hour = config["PHONE_VALID_SENSED_DAYS"]["MIN_BINS_PER_HOUR"]
+    output:
+        "data/interim/{pid}/phone_valid_sensed_days.csv"
+    script:
+        "../src/data/phone_valid_sensed_days.R"
\ No newline at end of file
diff --git a/src/data/phone_valid_sensed_days.R b/src/data/phone_valid_sensed_days.R
new file mode 100644
index 00000000..f76541bc
--- /dev/null
+++ b/src/data/phone_valid_sensed_days.R
@@ -0,0 +1,40 @@
+source("packrat/init.R")
+
+library(dplyr)
+
+# Inputs, thresholds and output path handed over by the Snakemake rule
+all_sensors <- snakemake@input[["all_sensors"]]
+bin_size <- snakemake@params[["bin_size"]]
+min_valid_hours <- snakemake@params[["min_valid_hours"]]
+min_bins_per_hour <- snakemake@params[["min_bins_per_hour"]]
+output_file <- snakemake@output[[1]]
+
+# Load all sensors and keep only the local date, hour and minute of each row.
+# The per-file frames are stacked with a single rbind call instead of growing
+# a data frame inside a loop, which re-copies the accumulated rows every time.
+read_sensor_times <- function(sensor) {
+  read.csv(sensor, stringsAsFactors = FALSE) %>%
+    select(local_date, local_hour, local_minute)
+}
+all_sensor_data <- do.call(rbind, lapply(all_sensors, read_sensor_times))
+
+phone_valid_sensed_days <- all_sensor_data %>%
+  # Bin rows into bin_size-minute bins
+  mutate(bin = (local_minute %/% bin_size) * bin_size) %>%
+  # Keep each bin once (rows logged within the same bin_size minutes collapse)
+  distinct(local_date, local_hour, bin) %>%
+  # Count how many bins there are per hour
+  group_by(local_date, local_hour) %>%
+  summarise(bins = n()) %>%
+  ungroup() %>%
+  # Discard those hours where there were fewer than min_bins_per_hour
+  filter(bins >= min_bins_per_hour) %>%
+  # Count how many valid hours each day has
+  group_by(local_date) %>%
+  summarise(valid_hours = n()) %>%
+  # Discard those days where there were fewer than min_valid_hours
+  filter(valid_hours >= min_valid_hours)
+
+# NOTE(review): write.csv also emits row names as a leading unnamed column;
+# confirm downstream readers expect it before passing row.names = FALSE.
+write.csv(phone_valid_sensed_days, output_file)