Add phone yield sensor

pull/103/head
JulioV 2020-11-24 19:12:16 -05:00
parent 555811211d
commit f02ca2624d
6 changed files with 132 additions and 67 deletions

View File

@ -12,20 +12,13 @@ files_to_compute = []
if len(config["PIDS"]) == 0:
raise ValueError("Add participants IDs to PIDS in config.yaml. Remember to create their participant files in data/external")
if config["PHONE_VALID_SENSED_BINS"]["COMPUTE"] or config["PHONE_VALID_SENSED_DAYS"]["COMPUTE"]: # valid sensed bins is necessary for sensed days, so we add these files anyways if sensed days are requested
if len(config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]) == 0:
raise ValueError("If you want to compute PHONE_VALID_SENSED_BINS or PHONE_VALID_SENSED_DAYS, you need to add at least one PHONE_SENSOR to [PHONE_VALID_SENSED_BINS][PHONE_SENSORS] in config.yaml")
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"])))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"])))
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_timestamps.csv", pid=config["PIDS"]))
if config["PHONE_VALID_SENSED_DAYS"]["COMPUTE"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv",
pid=config["PIDS"],
min_valid_hours_per_day=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_HOURS_PER_DAY"],
min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
for provider in config["PHONE_DATA_YIELD"]["PROVIDERS"].keys():
if config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_DATA_YIELD"]["SENSORS"])))
files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_data_yield_features/phone_data_yield_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_data_yield.csv", pid=config["PIDS"]))
for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
if config["PHONE_MESSAGES"]["PROVIDERS"][provider]["COMPUTE"]:
@ -73,10 +66,10 @@ for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys():
for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]:
if "PHONE_SCREEN" in config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]:
if "PHONE_SCREEN" in config["PHONE_DATA_YIELD"]["SENSORS"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
else:
raise ValueError("Error: Add PHONE_SCREEN (and as many phone sensor as you have in your database) to [PHONE_VALID_SENSED_BINS][PHONE_SENSORS] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data)")
raise ValueError("Error: Add PHONE_SCREEN (and as many phone sensor as you have in your database) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data)")
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime_unified.csv", pid=config["PIDS"]))
@ -133,10 +126,10 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys():
for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
if config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]:
if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] == "FUSED_RESAMPLED":
if "PHONE_LOCATIONS" in config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]:
if "PHONE_LOCATIONS" in config["PHONE_DATA_YIELD"]["SENSORS"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
else:
raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_VALID_SENSED_BINS][PHONE_SENSORS] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
raise ValueError("Error: Add PHONE_LOCATIONS (and as many SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"]))
@ -200,22 +193,22 @@ for provider in config["FITBIT_CALORIES"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_{fitbit_data_type}_parsed_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"]))
# visualization for data exploration
if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]:
files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_features_correlations.html", min_valid_hours_per_day=config["HEATMAP_FEATURES_CORRELATIONS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
# if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]:
# files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_features_correlations.html", min_valid_hours_per_day=config["HEATMAP_FEATURES_CORRELATIONS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
if config["HISTOGRAM_VALID_SENSED_HOURS"]["PLOT"]:
files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/histogram_valid_sensed_hours.html", min_valid_hours_per_day=config["HISTOGRAM_VALID_SENSED_HOURS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
# if config["HISTOGRAM_VALID_SENSED_HOURS"]["PLOT"]:
# files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/histogram_valid_sensed_hours.html", min_valid_hours_per_day=config["HISTOGRAM_VALID_SENSED_HOURS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
if config["HEATMAP_DAYS_BY_SENSORS"]["PLOT"]:
files_to_compute.extend(expand("reports/interim/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{pid}/heatmap_days_by_sensors.html", pid=config["PIDS"], min_valid_hours_per_day=config["HEATMAP_DAYS_BY_SENSORS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_days_by_sensors_all_participants.html", min_valid_hours_per_day=config["HEATMAP_DAYS_BY_SENSORS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
# if config["HEATMAP_DAYS_BY_SENSORS"]["PLOT"]:
# files_to_compute.extend(expand("reports/interim/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{pid}/heatmap_days_by_sensors.html", pid=config["PIDS"], min_valid_hours_per_day=config["HEATMAP_DAYS_BY_SENSORS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
# files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_days_by_sensors_all_participants.html", min_valid_hours_per_day=config["HEATMAP_DAYS_BY_SENSORS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
if config["HEATMAP_SENSED_BINS"]["PLOT"]:
files_to_compute.extend(expand("reports/interim/heatmap_sensed_bins/{pid}/heatmap_sensed_bins.html", pid=config["PIDS"]))
files_to_compute.extend(["reports/data_exploration/heatmap_sensed_bins_all_participants.html"])
# if config["HEATMAP_SENSED_BINS"]["PLOT"]:
# files_to_compute.extend(expand("reports/interim/heatmap_sensed_bins/{pid}/heatmap_sensed_bins.html", pid=config["PIDS"]))
# files_to_compute.extend(["reports/data_exploration/heatmap_sensed_bins_all_participants.html"])
if config["OVERALL_COMPLIANCE_HEATMAP"]["PLOT"]:
files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/overall_compliance_heatmap.html", min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
# if config["OVERALL_COMPLIANCE_HEATMAP"]["PLOT"]:
# files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/overall_compliance_heatmap.html", min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
rule all:

View File

@ -27,8 +27,8 @@ CREATE_PARTICIPANT_FILES:
# See https://www.rapids.science/setup/configuration/#day-segments
DAY_SEGMENTS: &day_segments
TYPE: EVENT # FREQUENCY, PERIODIC, EVENT
FILE: "data/external/daysegments_event.csv"
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
FILE: "data/external/daysegments_periodic.csv"
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs
# See https://www.rapids.science/setup/configuration/#device-data-source-configuration
@ -54,27 +54,22 @@ DEVICE_DATA:
############## PHONE ###########################################################
################################################################################
PHONE_VALID_SENSED_BINS:
COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features
BIN_SIZE: &bin_size 5 # (in minutes)
# Add as many PHONE sensors as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS.
# If you are extracting screen or Barnett/Doryab location features, PHONE_SCREEN and PHONE_LOCATIONS tables are mandatory.
# You can choose any of the keys shown below, just make sure its TABLE exists in your database!
# PHONE_MESSAGES, PHONE_CALLS, PHONE_LOCATIONS, PHONE_BLUETOOTH, PHONE_ACTIVITY_RECOGNITION, PHONE_BATTERY, PHONE_SCREEN, PHONE_LIGHT,
# PHONE_ACCELEROMETER, PHONE_APPLICATIONS_FOREGROUND, PHONE_WIFI_VISIBLE, PHONE_WIFI_CONNECTED, PHONE_CONVERSATION
PHONE_SENSORS: []
PHONE_VALID_SENSED_DAYS:
COMPUTE: False
MIN_VALID_BINS_PER_HOUR: &min_valid_bins_per_hour [6] # (out of 60min/BIN_SIZE bins)
MIN_VALID_HOURS_PER_DAY: &min_valid_hours_per_day [16] # (out of 24) MIN_HOURS_PER_DAY
PHONE_DATA_YIELD:
SENSORS: [PHONE_MESSAGES, PHONE_CALLS, PHONE_ACCELEROMETER]
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1 representing the number of minutes with at least
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_data_yield
# Communication SMS features config, TYPES and FEATURES keys need to match
PHONE_MESSAGES:
TABLE: messages
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
MESSAGES_TYPES : [received, sent]
FEATURES:
received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
@ -329,8 +324,8 @@ FITBIT_CALORIES:
HEATMAP_FEATURES_CORRELATIONS:
PLOT: False
MIN_ROWS_RATIO: 0.5
MIN_VALID_HOURS_PER_DAY: *min_valid_hours_per_day
MIN_VALID_BINS_PER_HOUR: *min_valid_bins_per_hour
MIN_VALID_HOURS_PER_DAY: #*min_valid_hours_per_day
MIN_VALID_BINS_PER_HOUR: #*min_valid_bins_per_hour
PHONE_FEATURES: [accelerometer, activity_recognition, applications_foreground, battery, calls_incoming, calls_missed, calls_outgoing, conversation, light, location_doryab, messages_received, messages_sent, screen]
FITBIT_FEATURES: [fitbit_heartrate, fitbit_step, fitbit_sleep]
CORR_THRESHOLD: 0.1
@ -338,25 +333,25 @@ HEATMAP_FEATURES_CORRELATIONS:
HISTOGRAM_VALID_SENSED_HOURS:
PLOT: False
MIN_VALID_HOURS_PER_DAY: *min_valid_hours_per_day
MIN_VALID_BINS_PER_HOUR: *min_valid_bins_per_hour
MIN_VALID_HOURS_PER_DAY: #*min_valid_hours_per_day
MIN_VALID_BINS_PER_HOUR: #*min_valid_bins_per_hour
HEATMAP_DAYS_BY_SENSORS:
PLOT: False
MIN_VALID_HOURS_PER_DAY: *min_valid_hours_per_day
MIN_VALID_BINS_PER_HOUR: *min_valid_bins_per_hour
MIN_VALID_HOURS_PER_DAY: #*min_valid_hours_per_day
MIN_VALID_BINS_PER_HOUR: #*min_valid_bins_per_hour
EXPECTED_NUM_OF_DAYS: -1
DB_TABLES: [accelerometer, applications_foreground, battery, bluetooth, calls, light, locations, messages, screen, wifi, sensor_wifi, plugin_google_activity_recognition, plugin_ios_activity_recognition, plugin_studentlife_audio_android, plugin_studentlife_audio]
HEATMAP_SENSED_BINS:
PLOT: False
BIN_SIZE: *bin_size
BIN_SIZE: #*bin_size
OVERALL_COMPLIANCE_HEATMAP:
PLOT: False
ONLY_SHOW_VALID_DAYS: False
EXPECTED_NUM_OF_DAYS: -1
BIN_SIZE: *bin_size
MIN_VALID_HOURS_PER_DAY: *min_valid_hours_per_day
MIN_VALID_BINS_PER_HOUR: *min_valid_bins_per_hour
BIN_SIZE: #*bin_size
MIN_VALID_HOURS_PER_DAY: #*min_valid_hours_per_day
MIN_VALID_BINS_PER_HOUR: #*min_valid_bins_per_hour

View File

@ -6,6 +6,32 @@ rule join_features_from_providers:
script:
"../src/features/join_features_from_providers.R"
rule phone_data_yield_python_features:
input:
sensor_data = "data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["PHONE_DATA_YIELD"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "phone_data_yield"
output:
"data/interim/{pid}/phone_data_yield_features/phone_data_yield_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule phone_data_yield_r_features:
input:
sensor_data = "data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["PHONE_DATA_YIELD"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "phone_data_yield"
output:
"data/interim/{pid}/phone_data_yield_features/phone_data_yield_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule phone_accelerometer_python_features:
input:
sensor_data = "data/raw/{pid}/phone_accelerometer_with_datetime.csv",

View File

@ -77,23 +77,27 @@ rule phone_readable_datetime:
script:
"../src/data/readable_datetime.R"
rule phone_sensed_bins:
rule phone_yielded_timestamps:
input:
all_sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor = map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]))
params:
bin_size = config["PHONE_VALID_SENSED_BINS"]["BIN_SIZE"]
all_sensors = expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor = map(str.lower, config["PHONE_DATA_YIELD"]["SENSORS"]))
output:
"data/interim/{pid}/phone_sensed_bins.csv"
"data/interim/{pid}/phone_yielded_timestamps.csv"
script:
"../src/data/phone_sensed_bins.R"
"../src/data/phone_yielded_timestamps.R"
rule phone_sensed_timestamps:
rule phone_yielded_timestamps_with_datetime:
input:
all_sensors = expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor = map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]))
sensor_input = "data/interim/{pid}/phone_yielded_timestamps.csv",
day_segments = "data/interim/day_segments/{pid}_day_segments.csv"
params:
timezones = config["DEVICE_DATA"]["PHONE"]["TIMEZONE"]["TYPE"],
fixed_timezone = config["DEVICE_DATA"]["PHONE"]["TIMEZONE"]["VALUE"],
day_segments_type = config["DAY_SEGMENTS"]["TYPE"],
include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
output:
"data/interim/{pid}/phone_sensed_timestamps.csv"
"data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv"
script:
"../src/data/phone_sensed_timestamps.R"
"../src/data/readable_datetime.R"
rule phone_valid_sensed_days:
input:

View File

@ -0,0 +1,47 @@
library("dplyr", warn.conflicts = F)
library(tidyr)
library(readr)
compute_data_yield_features <- function(data, feature_name, day_segment, provider){
data <- data %>% filter_data_by_segment(day_segment)
features <- data %>%
separate(timestamps_segment, into = c("start_timestamp", "end_timestamp"), convert = T, sep = ",") %>%
mutate(duration_minutes = (end_timestamp - start_timestamp) / 60000,
timestamp_since_segment_start = timestamp - start_timestamp,
minute_bin = timestamp_since_segment_start %/% 60000, # 60 * 1000
hour_bin = timestamp_since_segment_start %/% 3600000) %>% # (60 * 60 * 1000)
group_by(local_segment, hour_bin) %>%
summarise(minute_count = n_distinct(minute_bin),
duration_minutes = first(duration_minutes),
valid_hour = (minute_count/min(duration_minutes, 60)) > provider$MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS) %>%
group_by(local_segment) %>%
summarise(valid_yielded_minutes = sum(minute_count),
valid_yielded_hours = sum(valid_hour == TRUE) / 1.0,
duration_minutes = first(duration_minutes),
duration_hours = duration_minutes / 60.0,
phone_data_yield_rapids_ratiovalidyieldedminutes = valid_yielded_minutes / duration_minutes,
phone_data_yield_rapids_ratiovalidyieldedhours = if_else(duration_hours > 1, valid_yielded_hours / duration_hours, valid_yielded_hours))
return(features)
}
rapids_features <- function(sensor_data_files, day_segment, provider){
yield_data <- read_csv(sensor_data_files[["sensor_data"]], col_types = cols_only(timestamp ="d", assigned_segments = "c"))
requested_features <- provider[["FEATURES"]]
# Output dataframe
features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
# The name of the features this function can compute
base_features_names <- c("ratiovalidyieldedminutes", "ratiovalidyieldedhours")
# The subset of requested features this function can compute
features_to_compute <- intersect(base_features_names, requested_features)
features <- compute_data_yield_features(yield_data, feature_name, day_segment, provider) %>%
select(c("local_segment", paste0("phone_data_yield_rapids_", features_to_compute)))
return(features)
}