Refactor Snakefile and docs. Rename SMS scripts

pull/95/head
JulioV 2020-06-23 11:33:34 -04:00
parent 9da4fb165c
commit 36017d5dca
13 changed files with 363 additions and 463 deletions

Snakefile

@ -6,174 +6,97 @@ include: "rules/models.snakefile"
include: "rules/reports.snakefile"
include: "rules/mystudy.snakefile" # You can add snakfiles with rules tailored to your project
models, scalers = [], []
for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]:
models = models + [model_name] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name])
scalers = scalers + config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]
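# Illustration with hypothetical config values (not part of this commit): if MODEL_NAMES
# were ["LogReg", "RF"] and MODEL_SCALER were {"LogReg": ["minmaxscaler", "notnormalized"],
# "RF": ["notnormalized"]}, the loop above would produce models = ["LogReg", "LogReg", "RF"]
# and scalers = ["minmaxscaler", "notnormalized", "notnormalized"], so that
# expand(..., zip, model=models, scaler=scalers) pairs each model with each of its scalers
# instead of taking their full cross product.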
files_to_compute = []
if len(config["PIDS"]) == 0:
raise ValueError("Add participants IDs to PIDS in config.yaml. Remember to create their participant files in data/external")
if config["MESSAGES"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/messages_{messages_type}_{day_segment}.csv", pid=config["PIDS"], messages_type = config["MESSAGES"]["TYPES"], day_segment = config["MESSAGES"]["DAY_SEGMENTS"]))
if config["CALLS"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}_{segment}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"], segment = config["CALLS"]["DAY_SEGMENTS"]))
if config["BARNETT_LOCATION"]["COMPUTE"]:
# TODO add files_to_compute.extend(optional_location_input(None))
if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED" and config["BARNETT_LOCATION"]["DB_TABLE"] not in config["TABLES_FOR_SENSED_BINS"]:
raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to TABLES_FOR_SENSED_BINS in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/location_barnett_{segment}.csv", pid=config["PIDS"], segment = config["BARNETT_LOCATION"]["DAY_SEGMENTS"]))
if config["BLUETOOTH"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/bluetooth_{segment}.csv", pid=config["PIDS"], segment = config["BLUETOOTH"]["DAY_SEGMENTS"]))
if config["ACTIVITY_RECOGNITION"]["COMPUTE"]:
# TODO add files_to_compute.extend(optional_ar_input(None)), the Android or iOS table gets processed depending on each participant
files_to_compute.extend(expand("data/processed/{pid}/activity_recognition_{segment}.csv",pid=config["PIDS"], segment = config["ACTIVITY_RECOGNITION"]["DAY_SEGMENTS"]))
if config["BATTERY"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BATTERY"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BATTERY"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["BATTERY"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/{pid}/battery_{day_segment}.csv", pid = config["PIDS"], day_segment = config["BATTERY"]["DAY_SEGMENTS"]))
if config["SCREEN"]["COMPUTE"]:
if config["SCREEN"]["DB_TABLE"] not in config["TABLES_FOR_SENSED_BINS"]:
raise ValueError("Error: Add your screen table (and as many sensor tables as you have) to TABLES_FOR_SENSED_BINS in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data)")
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/screen_deltas.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/{pid}/screen_{day_segment}.csv", pid = config["PIDS"], day_segment = config["SCREEN"]["DAY_SEGMENTS"]))
if config["LIGHT"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["LIGHT"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["LIGHT"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/light_{day_segment}.csv", pid = config["PIDS"], day_segment = config["LIGHT"]["DAY_SEGMENTS"]))
if config["ACCELEROMETER"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/accelerometer_{day_segment}.csv", pid = config["PIDS"], day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"]))
if config["APPLICATIONS_FOREGROUND"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/applications_foreground_{day_segment}.csv", pid = config["PIDS"], day_segment = config["APPLICATIONS_FOREGROUND"]["DAY_SEGMENTS"]))
if config["WIFI"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/wifi_{day_segment}.csv", pid = config["PIDS"], day_segment = config["WIFI"]["DAY_SEGMENTS"]))
if config["HEARTRATE"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["HEARTRATE"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"]))
files_to_compute.extend(expand("data/processed/{pid}/fitbit_heartrate_{day_segment}.csv", pid = config["PIDS"], day_segment = config["HEARTRATE"]["DAY_SEGMENTS"]))
if config["STEP"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["STEP"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_step_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday"]))
files_to_compute.extend(expand("data/processed/{pid}/fitbit_step_{day_segment}.csv", pid = config["PIDS"], day_segment = config["STEP"]["DAY_SEGMENTS"]))
if config["SLEEP"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SLEEP"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday"]))
files_to_compute.extend(expand("data/processed/{pid}/fitbit_sleep_{day_segment}.csv", pid = config["PIDS"], day_segment = config["SLEEP"]["DAY_SEGMENTS"]))
if config["CONVERSATION"]["COMPUTE"]:
# TODO add files_to_compute.extend(optional_conversation_input(None)), the Android or iOS table gets processed depending on each participant
files_to_compute.extend(expand("data/processed/{pid}/conversation_{segment}.csv",pid=config["PIDS"], segment = config["CONVERSATION"]["DAY_SEGMENTS"]))
rule all:
input:
# My study (this is an example of a rule created specifically for a study)
expand("data/interim/{pid}/days_to_analyse_{days_before_surgery}_{days_in_hospital}_{days_after_discharge}.csv",
pid = config["PIDS"],
days_before_surgery = config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"],
days_after_discharge = config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"],
days_in_hospital = config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]),
expand("data/processed/{pid}/targets_{summarised}.csv",
pid = config["PIDS"],
summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
expand("data/processed/{pid}/demographic_features.csv", pid=config["PIDS"]),
# Feature extraction
expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["FITBIT_TABLE"]),
expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]),
expand("data/interim/{pid}/applications_foreground_with_datetime_with_genre.csv", pid=config["PIDS"]),
expand("data/processed/{pid}/screen_deltas.csv", pid=config["PIDS"]),
expand("data/processed/{pid}/plugin_google_activity_recognition_deltas.csv", pid=config["PIDS"]),
expand("data/interim/{pid}/phone_valid_sensed_days.csv", pid=config["PIDS"]),
expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]),
expand("data/processed/{pid}/sms_{sms_type}_{day_segment}.csv",
pid=config["PIDS"],
sms_type = config["SMS"]["TYPES"],
day_segment = config["SMS"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/call_{call_type}_{segment}.csv",
pid=config["PIDS"],
call_type=config["CALLS"]["TYPES"],
segment = config["CALLS"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/location_barnett_{segment}.csv",
pid=config["PIDS"],
segment = config["BARNETT_LOCATION"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/bluetooth_{segment}.csv",
pid=config["PIDS"],
segment = config["BLUETOOTH"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/activity_recognition_{segment}.csv",pid=config["PIDS"],
segment = config["ACTIVITY_RECOGNITION"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/battery_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["BATTERY"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/screen_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["SCREEN"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/light_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["LIGHT"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/conversation_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["CONVERSATION"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/accelerometer_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/applications_foreground_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["APPLICATIONS_FOREGROUND"]["DAY_SEGMENTS"]),
expand("data/raw/{pid}/fitbit_{fitbit_sensor}_{fitbit_data_type}_with_datetime.csv",
pid=config["PIDS"],
fitbit_sensor=config["FITBIT_SENSORS"],
fitbit_data_type=config["FITBIT_DATA_TYPE"]),
expand("data/processed/{pid}/fitbit_heartrate_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["HEARTRATE"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/fitbit_step_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["STEP"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/fitbit_sleep_{day_segment}.csv",
pid = config["PIDS"],
day_segment = config["SLEEP"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/wifi_{segment}.csv",
pid=config["PIDS"],
segment = config["WIFI"]["DAY_SEGMENTS"]),
# Models
expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv",
pid = config["PIDS"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
expand("data/processed/data_for_population_model/{source}_{day_segment}_original.csv",
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/data_for_individual_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
pid = config["PIDS"],
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
expand("data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
expand("data/processed/data_for_population_model/demographic_features.csv"),
expand("data/processed/data_for_population_model/targets_{summarised}.csv",
summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
expand("data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv",
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
expand("data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv",
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"],
summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
expand("data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}_{cv_method}_baseline.csv",
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"],
summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
expand(
expand("data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{{model}}/{cv_method}/{source}_{day_segment}_{summarised}_{{scaler}}/{result_component}.csv",
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"],
summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"],
result_component = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"]),
zip,
model = models,
scaler = scalers),
expand(
expand("data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{{model}}/{cv_method}/{source}_{day_segment}_{summarised}_{{scaler}}/merged_population_model_results.csv",
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"],
summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
zip,
model = models,
scaler = scalers),
# Vizualisations
expand("reports/figures/{pid}/{sensor}_heatmap_rows.html", pid=config["PIDS"], sensor=config["SENSORS"]),
expand("reports/figures/{pid}/compliance_heatmap.html", pid=config["PIDS"]),
expand("reports/figures/{pid}/battery_consumption_rates_barchart.html", pid=config["PIDS"]),
expand("reports/compliance/{pid}/compliance_report.html", pid=config["PIDS"]),
expand("reports/figures/overall_compliance_heatmap.html"),
files_to_compute
rule clean:
shell:


@ -1,14 +1,10 @@
# Valid database table names
SENSORS: [applications_crashes, applications_foreground, applications_notifications, battery, bluetooth, calls, locations, messages, plugin_ambient_noise, plugin_device_usage, plugin_google_activity_recognition, plugin_ios_activity_recognition, screen,plugin_studentlife_audio]
FITBIT_TABLE: [fitbit_data]
FITBIT_SENSORS: [heartrate, steps, sleep, calories]
FITBIT_DATA_TYPE: [summary, intraday]
# Add as many sensor tables as you have, they all improve the computation of PHONE_SENSED_BINS.
# If you are extracting screen or Barnett's location features, screen and locations tables are mandatory.
TABLES_FOR_SENSED_BINS: []
# Participants to include in the analysis
# You must create a file for each participant
# named pXXX containing their device_id
PIDS: [p01, p02]
# You must create a file for each participant named pXXX containing their device_id. This can be done manually or automatically
PIDS: []
# Global var with common day segments
DAY_SEGMENTS: &day_segments
@ -36,7 +32,9 @@ READABLE_DATETIME:
FIXED_TIMEZONE: *timezone
# Communication SMS features config, TYPES and FEATURES keys need to match
SMS:
MESSAGES:
COMPUTE: False
DB_TABLE: messages
TYPES : [received, sent]
FEATURES:
received: [count, distinctcontacts, timefirstsms, timelastsms, countmostfrequentcontact]
@ -45,6 +43,8 @@ SMS:
# Communication call features config, TYPES and FEATURES keys need to match
CALLS:
COMPUTE: False
DB_TABLE: calls
TYPES: [missed, incoming, outgoing]
FEATURES:
missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
@ -69,36 +69,52 @@ RESAMPLE_FUSED_LOCATION:
TIMEZONE: *timezone
BARNETT_LOCATION:
COMPUTE: False
DB_TABLE: locations
DAY_SEGMENTS: [daily] # These features are only available on a daily basis
FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
LOCATIONS_TO_USE: ALL # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED
LOCATIONS_TO_USE: RESAMPLE_FUSED # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED
ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
TIMEZONE: *timezone
MINUTES_DATA_USED: False # Use this for quality control purposes to check how many minutes of data (location coordinates grouped by minute) were used to compute features
BLUETOOTH:
COMPUTE: False
DB_TABLE: bluetooth
DAY_SEGMENTS: *day_segments
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
ACTIVITY_RECOGNITION:
COMPUTE: False
DB_TABLE:
ANDROID: plugin_google_activity_recognition
IOS: plugin_ios_activity_recognition
DAY_SEGMENTS: *day_segments
FEATURES: ["count","mostcommonactivity","countuniqueactivities","activitychangecount","sumstationary","summobile","sumvehicle"]
BATTERY:
COMPUTE: False
DB_TABLE: battery
DAY_SEGMENTS: *day_segments
FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
SCREEN:
COMPUTE: False
DB_TABLE: screen
DAY_SEGMENTS: *day_segments
REFERENCE_HOUR_FIRST_USE: 0
FEATURES_DELTAS: ["countepisode", "episodepersensedminutes", "sumduration", "maxduration", "minduration", "avgduration", "stdduration", "firstuseafter"]
EPISODE_TYPES: ["unlock"]
LIGHT:
COMPUTE: False
DB_TABLE: light
DAY_SEGMENTS: *day_segments
FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
ACCELEROMETER:
COMPUTE: False
DB_TABLE: accelerometer
DAY_SEGMENTS: *day_segments
FEATURES:
MAGNITUDE: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
@ -107,6 +123,8 @@ ACCELEROMETER:
VALID_SENSED_MINUTES: True
APPLICATIONS_FOREGROUND:
COMPUTE: False
DB_TABLE: applications_foreground
DAY_SEGMENTS: *day_segments
SINGLE_CATEGORIES: ["all", "email"]
MULTIPLE_CATEGORIES:
@ -118,12 +136,15 @@ APPLICATIONS_FOREGROUND:
FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
HEARTRATE:
COMPUTE: False
DB_TABLE: fitbit_data
DAY_SEGMENTS: *day_segments
# Only daily features are extracted from summary data
SUMMARY_FEATURES: ["restinghr"] # calories related features might be inaccurate: ["caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"]
SUMMARY_FEATURES: ["restinghr"] # calories features' accuracy depend on the accuracy of the participants fitbit profile (e.g. heigh, weight) use with care: ["caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"]
INTRADAY_FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "minutesonoutofrangezone", "minutesonfatburnzone", "minutesoncardiozone", "minutesonpeakzone"]
STEP:
COMPUTE: False
DB_TABLE: fitbit_data
DAY_SEGMENTS: *day_segments
FEATURES:
ALL_STEPS: ["sumallsteps", "maxallsteps", "minallsteps", "avgallsteps", "stdallsteps"]
@ -133,16 +154,23 @@ STEP:
INCLUDE_ZERO_STEP_ROWS: True
SLEEP:
COMPUTE: False
DB_TABLE: fitbit_data
DAY_SEGMENTS: *day_segments
SLEEP_TYPES: ["main", "nap", "all"]
# Only daily features are extracted from summary data
SUMMARY_FEATURES: ["sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgefficiency", "countepisode"]
WIFI:
COMPUTE: False
DB_TABLE: wifi
DAY_SEGMENTS: *day_segments
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
CONVERSATION:
COMPUTE: False
DB_TABLE:
ANDROID: plugin_studentlife_audio_android
IOS: plugin_studentlife_audio
DAY_SEGMENTS: *day_segments
FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
"sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy",
@ -152,6 +180,8 @@ CONVERSATION:
RECORDINGMINUTES: 1
PAUSEDMINUTES : 3
### Analysis ################################################################
PARAMS_FOR_ANALYSIS:
GROUNDTRUTH_TABLE: participant_info
SOURCES: &sources ["phone_features", "fitbit_features", "phone_fitbit_features"]


@ -3,12 +3,23 @@
RAPIDS Features
===============
*How do I compute any of these features?* In your ``config.yaml``, go to the sensor section you are interested in and set the corresponding ``COMPUTE`` option to ``TRUE`` as well as ``DB_TABLE`` to the sensor's table name in your database (the default table name is the one assigned by Aware), for example:
| ``MESSAGES:``
| ``COMPUTE: True``
| ``DB_TABLE: messages``
| ``...``
If you want to extract phone_valid_sensed_days.csv, screen features or location features based on fused location data, don't forget to configure ``TABLES_FOR_SENSED_BINS`` (see below).
.. _global-sensor-doc:
Global Parameters
"""""""""""""""""
.. _sensor-list:
- ``SENSORS`` - List of sensors to include in the pipeline that have to match existent tables in your AWARE_ database. See SENSORS_ variable in ``config`` file.
- ``TABLES_FOR_SENSED_BINS`` - Add as many sensor tables as you have in your database. All sensors included are used to compute ``phone_sensed_bins.csv`` (bins of time when the smartphone was sensing data). In turn, these bins are used to compute ``PHONE_VALID_SENSED_DAYS`` (see below), ``episodepersensedminutes`` feature of :ref:`Screen<screen-sensor-doc>` and to resample fused location data if you configure Barnett's location features to use ``RESAMPLE_FUSED``. See TABLES_FOR_SENSED_BINS_ variable in ``config`` file (therefore, when you are extracting screen or Barnett's location features, screen and locations tables are mandatory).
.. _fitbit-table:
@ -53,34 +64,28 @@ Global Parameters
Contains three attributes: ``BIN_SIZE``, ``MIN_VALID_HOURS``, ``MIN_BINS_PER_HOUR``.
On any given day, Aware could have sensed data only for a few minutes or for 24 hours. Daily estimates of features should be considered more reliable the more hours Aware was running and logging data (for example, 10 calls logged on a day when only one hour of data was recorded is a less reliable measurement compared to 10 calls on a day when 23 hours of data were recorded.
On any given day, Aware could have sensed data only for a few minutes or for 24 hours. Daily estimates of features should be considered more reliable the more hours Aware was running and logging data (for example, 10 calls logged on a day when only one hour of data was recorded is a less reliable feature compared to 10 calls on a day when 23 hours of data were recorded).
Therefore, we define a valid hour as those that contain at least a certain number of valid bins. In turn, a valid bin are those that contain at least one row of data from any sensor logged within that period. We divide an hour into N bins of size ``BIN_SIZE`` (in minutes) and we mark an hour as valid if contains at least ``MIN_BINS_PER_HOUR`` of valid bins (out of the total possible number of bins that can be captured in an hour i.e. out of 60min/``BIN_SIZE`` bins). Days with valid sensed hours less than ``MIN_VALID_HOURS`` will be excluded form the output of this file. See PHONE_VALID_SENSED_DAYS_ in ``config.yaml``.
Therefore, we define a valid hour as one that contains at least a minimum number of valid bins. In turn, a valid bin is one that contains at least one row of data from any sensor logged within that period. We divide an hour into N bins of size ``BIN_SIZE`` (in minutes) and we mark an hour as valid if it contains at least ``MIN_BINS_PER_HOUR`` valid bins (out of the total possible number of bins that can be captured in an hour based on their length, i.e. 60min/``BIN_SIZE`` bins). Days with fewer valid sensed hours than ``MIN_VALID_HOURS`` will be excluded from the output of this file. See PHONE_VALID_SENSED_DAYS_ in ``config.yaml``.
In RAPIDS, we use ``phone_sensed_bins`` (a list of all valid and invalid bins of all monitored days) to improve the estimation of features that are ratios over time periods like ``episodepersensedminutes`` of :ref:`Screen<screen-sensor-doc>` or for resampling data like fused location coordinates.
Note that RAPIDS *DOES NOT* filter your feature files automatically; you need to do this manually based on ``"data/interim/{pid}/phone_valid_sensed_days.csv"``.
You can get access to every phone's sensed bins matrix (days x bins) in ``data/interim/{pid}/phone_sensed_bins.csv``. As mentioned above, RAPIDS uses this file to compute ``phone_valid_sensed_days.csv``, ``episodepersensedminutes`` feature of :ref:`Screen<screen-sensor-doc>` and to resample fused location data if you configure Barnett's location features to use ``RESAMPLE_FUSED``.
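As a rough illustration of how ``BIN_SIZE``, ``MIN_BINS_PER_HOUR`` and ``MIN_VALID_HOURS`` interact (a sketch with made-up data and example parameter values, not RAPIDS code)::

    import numpy as np

    BIN_SIZE = 5             # minutes per bin (example value)
    MIN_BINS_PER_HOUR = 6    # valid bins needed for an hour to count as valid (example value)
    MIN_VALID_HOURS = 16     # valid hours needed for a day to count as valid (example value)

    bins_per_hour = 60 // BIN_SIZE
    rng = np.random.default_rng(0)
    rows_per_bin = rng.integers(0, 3, size=24 * bins_per_hour)  # fake count of sensed rows per bin

    valid_bins = rows_per_bin > 0                  # a bin is valid if any sensor logged at least one row
    hours = valid_bins.reshape(24, bins_per_hour)  # group the day's bins by hour
    valid_hours = int((hours.sum(axis=1) >= MIN_BINS_PER_HOUR).sum())
    day_is_valid = valid_hours >= MIN_VALID_HOURS
    print(valid_hours, day_is_valid)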
.. _individual-sensor-settings:
.. _sms-sensor-doc:
SMS
Messages (SMS)
"""""
See `SMS Config Code`_
See `Messages Config Code`_
**Available Epochs (day_segment) :** daily, morning, afternoon, evening, night
**Available Platforms:** Android
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/sms_{sms_type}_{day_segment}.csv".``
| ``pid=config["PIDS"],``
| ``sms_type = config["SMS"]["TYPES"],``
| ``day_segment = config["SMS"]["DAY_SEGMENTS"]),``
**Rule Chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -134,13 +139,6 @@ See `Call Config Code`_
**Available Platforms:** Android and iOS
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/call_{call_type}_{segment}.csv",``
| ``pid=config["PIDS"],``
| ``call_type=config["CALLS"]["TYPES"],``
| ``segment = config["CALLS"]["DAY_SEGMENTS"]),``
**Rule Chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -217,12 +215,6 @@ See `Bluetooth Config Code`_
**Available Platforms:** Android and iOS
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/bluetooth_{segment}.csv",``
| ``pid=config["PIDS"],``
| ``segment = config["BLUETOOTH"]["DAY_SEGMENTS"]),``
**Snakemake rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -254,6 +246,48 @@ countscansmostuniquedevice scans Number of scans of the most scanned
**Assumptions/Observations:** N/A
.. _wifi-sensor-doc:
WiFi
""""""""""
See `WiFi Config Code`_
**Available Epochs (day_segment) :** daily, morning, afternoon, evening, night
**Available Platforms:** Android and iOS
**Snakemake rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
- Rule ``rules/preprocessing.snakefile/readable_datetime``
- Rule ``rules/features.snakefile/wifi_features``
.. _wifi-parameters:
**WiFi Rule Parameters (wifi_features):**
============ ===================
Name Description
============ ===================
day_segment The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
features Features to be computed, see table below
============ ===================
.. _wifi-available-features:
**Available WiFi Features**
=========================== ========= =============
Name Units Description
=========================== ========= =============
countscans devices Number of scanned WiFi access points during a ``day_segment``, an access point can be detected multiple times over time and these appearances are counted separately
uniquedevices devices Number of unique access points during a ``day_segment`` as identified by their hardware address
countscansmostuniquedevice scans Number of scans of the most scanned access point during a ``day_segment`` across the whole monitoring period
=========================== ========= =============
**Assumptions/Observations:** N/A
.. _accelerometer-sensor-doc:
@ -266,12 +300,6 @@ See `Accelerometer Config Code`_
**Available Platforms:** Android and iOS
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/accelerometer_{day_segment}.csv",``
| ``pid=config["PIDS"],``
| ``day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"]),``
**Rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -325,12 +353,6 @@ See `Applications Foreground Config Code`_
**Available Platforms:** Android
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/applications_foreground_{day_segment}.csv",``
| ``pid=config["PIDS"],``
| ``day_segment = config["APPLICATIONS_FOREGROUND"]["DAY_SEGMENTS"]),``
**Snakemake rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -392,12 +414,6 @@ See `Battery Config Code`_
**Available Platforms:** Android and iOS
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/battery_{day_segment}.csv",``
| ``pid=config["PIDS"],``
| ``day_segment = config["BATTERY"]["DAY_SEGMENTS"]),``
**Snakemake rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -444,11 +460,6 @@ Activity Recognition
**Available Platforms:** Android and iOS
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/activity_recognition_{segment}.csv",pid=config["PIDS"],``
| ``segment = config["ACTIVITY_RECOGNITION"]["DAY_SEGMENTS"]),``
**Snakemake rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -502,12 +513,6 @@ See `Light Config Code`_
**Available Platforms:** Android
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/light_{day_segment}.csv",``
| ``pid=config["PIDS"],``
| ``day_segment = config["LIGHT"]["DAY_SEGMENTS"]),``
**Rule Chain:**
- **Rule:** ``rules/preprocessing.snakefile/download_dataset`` - See the download_dataset_ rule.
@ -557,12 +562,6 @@ See `Location (Barnetts) Config Code`_
**Available Platforms:** Android and iOS
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/location_barnett_{segment}.csv",``
| ``pid=config["PIDS"],``
| ``segment = config["BARNETT_LOCATION"]["DAY_SEGMENTS"]),``
**Snakemake rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -643,12 +642,6 @@ See `Screen Config Code`_
**Available Platforms:** Android and iOS
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/screen_{day_segment}.csv",``
| ``pid=config["PIDS"],``
| ``day_segment = config["SCREEN"]["DAY_SEGMENTS"]),``
**Snakemake rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -701,12 +694,6 @@ See `Conversation Config Code`_
**Available Platforms:** Android and iOS
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/conversation_{day_segment}.csv",``
| ``pid = config["PIDS"],``
| ``day_segment = config["CONVERSATION"]["DAY_SEGMENTS"]),``
**Snakemake rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -721,9 +708,9 @@ See `Conversation Config Code`_
Name Description
========================= ===================
day_segment The particular ``day_segments`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
recordingMinutes The current default configuration is 1 min recording/3 min pause.
features_deltas Features to be computed, see table below
pausedMinutes The current default configuration is 1 min recording/3 min pause.
recordingMinutes Minutes the plugin was recording audio (default 1 min)
pausedMinutes Minutes the plugin was NOT recording audio (default 3 min)
features Features to be computed, see table below
========================= ===================
.. _conversation-available-features:
@ -733,30 +720,30 @@ pausedMinutes The current default configuration is 1 min recordin
========================= ================= =============
Name Units Description
========================= ================= =============
minutessilence minutes Total duration of all minutes silence.
minutesnoise minutes Total duration of all minutes noise.
minutesvoice minutes Total duration of all minutes voice.
minutesunknown minutes Total duration of all minutes unknown.
sumconversationduration minutes Total duration of all the conversation.
maxconversationduration minutes Longest duration of all the conversation.
minconversationduration minutes Shortest duration of all the conversation.
avgconversationduration minutes Average duration of all the conversation.
sdconversationduration minutes Standard Deviation duration of all the conversation.
timefirstconversation minutes Starting time of first conversation of the Day/Epoch.
timelastconversation minutes Starting time of last conversation of the Day/Epoch.
sumenergy L2-norm Total sum of all the energy.
avgenergy L2-norm Average of all the energy.
sdenergy L2-norm Standard Deviation of all the energy.
minenergy L2-norm Minimum of all the energy.
maxenergy L2-norm Maximum of all the energy.
silencesensedfraction minutes
noisesensedfraction minutes
voicesensedfraction minutes
unknownsensedfraction minutes
silenceexpectedfraction minutes
noiseexpectedfraction minutes
voiceexpectedfraction minutes
unknownexpectedfraction minutes
minutessilence minutes Minutes labeled as silence
minutesnoise minutes Minutes labeled as noise
minutesvoice minutes Minutes labeled as voice
minutesunknown minutes Minutes labeled as unknown
sumconversationduration minutes Total duration of all conversations
maxconversationduration minutes Longest duration of all conversations
minconversationduration minutes Shortest duration of all conversations
avgconversationduration minutes Average duration of all conversations
sdconversationduration minutes Standard Deviation of the duration of all conversations
timefirstconversation minutes Minutes since midnight when the first conversation for a day segment was detected
timelastconversation minutes Minutes since midnight when the last conversation for a day segment was detected
sumenergy L2-norm Sum of all energy values
avgenergy L2-norm Average of all energy values
sdenergy L2-norm Standard Deviation of all energy values
minenergy L2-norm Minimum of all energy values
maxenergy L2-norm Maximum of all energy values
silencesensedfraction Ratio between minutessilence and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
noisesensedfraction Ratio between minutesnoise and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
voicesensedfraction Ratio between minutesvoice and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
unknownsensedfraction Ratio between minutesunknown and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
silenceexpectedfraction Ratio between minutessilence and the number of minutes that in theory should have been sensed based on the record and pause cycle of the plugin (1440 / (recordingMinutes + pausedMinutes))
noiseexpectedfraction Ratio between minutesnoise and the number of minutes that in theory should have been sensed based on the record and pause cycle of the plugin (1440 / (recordingMinutes + pausedMinutes))
voiceexpectedfraction Ratio between minutesvoice and the number of minutes that in theory should have been sensed based on the record and pause cycle of the plugin (1440 / (recordingMinutes + pausedMinutes))
unknownexpectedfraction Ratio between minutesunknown and the number of minutes that in theory should have been sensed based on the record and pause cycle of the plugin (1440 / (recordingMinutes + pausedMinutes))
========================= ================= =============
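For concreteness, this is how the sensed and expected fraction features above relate to the minute counts (a sketch of the formulas in the table with made-up values, not RAPIDS code)::

    recording_minutes, paused_minutes = 1, 3  # plugin record/pause cycle (defaults above)
    minutes = {"silence": 300, "noise": 60, "voice": 90, "unknown": 30}  # made-up minute counts

    sensed_total = sum(minutes.values())
    sensed_fraction = {label: count / sensed_total for label, count in minutes.items()}

    # minutes the plugin should in theory have sensed in a day given its record/pause cycle
    expected_total = 1440 / (recording_minutes + paused_minutes)
    expected_fraction = {label: count / expected_total for label, count in minutes.items()}

    print(sensed_fraction["silence"], expected_fraction["silence"])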
**Assumptions/Observations:**
@ -774,13 +761,6 @@ See `Fitbit: Sleep Config Code`_
**Available Epochs (day_segment) :** daily
**Available Platforms:** Fitbit
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/fitbit_sleep_{day_segment}.csv",``
| ``pid = config["PIDS"],``
| ``day_segment = config["SLEEP"]["DAY_SEGMENTS"]),``
**Snakemake rule chain:**
@ -818,25 +798,25 @@ countepisode episodes Number of sleep episodes for ``sleep_t
**Assumptions/Observations:**
The `fitbit_with_datetime` rule will extract Summary data (`fitbit_sleep_summary_with_datetime.csv`) Intraday data (`fitbit_sleep_intraday_with_datetime.csv`). There are two versions of Fitbit's sleep API(`version 1`_ and `version 1.2`_), and each provides raw sleep data with different formats.
Only features from summary data are available at the moment.
The differences between both API versions are:
The `fitbit_with_datetime` rule will extract Summary data (`fitbit_sleep_summary_with_datetime.csv`) and Intraday data (`fitbit_sleep_intraday_with_datetime.csv`). There are two versions of Fitbit's sleep API (`version 1`_ and `version 1.2`_), and each provides raw sleep data in a different format:
- Sleep level. In `v1`, it is an integer with three possible values {1, 2, 3} while in `v1.2` it is a string. We convert integer levels of `v1` to strings: "asleep", "restless" or "awake" respectively.
- Count summaries. For Summary data, `v1` contains "count_awake", "duration_awake", "count_awakenings", "count_restless", and "duration_restless" fields in the summary of each sleep record while `v1.2` does not.
- Types of sleep records. `v1.2` has two types of sleep records: "classic" and "stages". The "classic" type contains three sleep levels: "awake", "restless" and "asleep". The "stages" type contains four sleep levels {"wake", "deep", "light", "rem"}. Sleep records from `v1` will have the same sleep levels as `v1.2` classic types; therefore we set their type to "classic".
- Sleep level. In ``v1``, sleep level is an integer with three possible values (1, 2, 3) while in ``v1.2`` it is a string. We convert the integer levels to the strings "asleep", "restless" or "awake", respectively.
- Count summaries. For Summary data, ``v1`` contains "count_awake", "duration_awake", "count_awakenings", "count_restless", and "duration_restless" fields for every sleep record while ``v1.2`` does not.
- Types of sleep records. ``v1.2`` has two types of sleep records: "classic" and "stages". The "classic" type contains three sleep levels: "awake", "restless" and "asleep". The "stages" type contains four sleep levels: "wake", "deep", "light", and "rem". Sleep records from ``v1`` will have the same sleep levels as the ``v1.2`` "classic" type; therefore we set their type to "classic".
- Unified level of sleep. For intraday data, we unify sleep levels of each sleep record with a column named "unified_level". Based on `this Fitbit forum post`_, we merge levels into two categories (see the sketch after this list):
- For the "classic" type: unified_level is one of {0, 1} where 0 means awake and groups "awake" + "restless", while 1 means asleep and groups "asleep".
- For the "stages" type, unified_level is one of {0, 1} where 0 means awake and groups "wake" while 1 means asleep and groups "deep" + "light" + "rem".
- Short Data. In `v1.2`, records of type "stages" contain "shortData" in addition to "data". We merge "data" part and "shortData" part to extract intraday data.
- The "data" grouping displays the sleep stages and any wake periods > 3 minutes (180 seconds).
- The "shortData" grouping displays the short wake periods representing physiological awakenings that are <= 3 minutes (180 seconds).
- The following columns of Summary data are not computed by RAPIDS but taken directly from columns with a similar name provided by the API: `efficiency`, `minutes_after_wakeup`, `minutes_asleep`, `minutes_awake`, `minutes_to_fall_asleep`, `minutes_in_bed`, `is_main_sleep` and `type`
- The following columns of Intraday data are not computed by RAPIDS but taken directly from columns with a similar name provided by the API: `original_level`, `is_main_sleep` and `type`. We compute `unified_level` as explained above.
- For the "classic" type unified_level is one of {0, 1} where 0 means awake and groups "awake" + "restless", while 1 means asleep and groups "asleep".
- For the "stages" type, unified_level is one of {0, 1} where 0 means awake and groups "wake" while 1 means asleep and groups "deep" + "light" + "rem".
- Short Data. In ``v1.2``, records of type "stages" contain "shortData" in addition to "data". We merge both to extract intraday data.
- "data" contains sleep stages and any wake periods > 3 minutes (180 seconds).
- "shortData" contains short wake periods representing physiological awakenings that are <= 3 minutes (180 seconds).
- The following columns of Summary data are not computed by RAPIDS but taken directly from columns with a similar name provided by Fitbit's API: `efficiency`, `minutes_after_wakeup`, `minutes_asleep`, `minutes_awake`, `minutes_to_fall_asleep`, `minutes_in_bed`, `is_main_sleep` and `type`
- The following columns of Intraday data are not computed by RAPIDS but taken directly from columns with a similar name provided by Fitbit's API: `original_level`, `is_main_sleep` and `type`. We compute `unified_level` as explained above.
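A small sketch of the level grouping just described (an illustration, not the RAPIDS implementation)::

    # 0 = awake, 1 = asleep, following the grouping described above
    UNIFIED_LEVEL = {
        "classic": {"awake": 0, "restless": 0, "asleep": 1},
        "stages": {"wake": 0, "deep": 1, "light": 1, "rem": 1},
    }

    def unified_level(sleep_type, original_level):
        return UNIFIED_LEVEL[sleep_type][original_level]

    print(unified_level("classic", "restless"), unified_level("stages", "rem"))  # 0 1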
Detailed sleep data is stored in Intraday data every 30 seconds (for "stages" type) or 60 seconds (for "classic" type) while a summary is stored in Summary data. For example:
These are examples of intraday and summary data:
- Intraday data
- Intraday data (at 30-second intervals for "stages" type or 60-second intervals for "classic" type)
========= ============== ============= ============= ====== =================== ========== =========== ========= ================= ========== ========== ============ =================
device_id original_level unified_level is_main_sleep type local_date_time local_date local_month local_day local_day_of_week local_time local_hour local_minute local_day_segment
@ -868,13 +848,6 @@ See `Fitbit: Heart Rate Config Code`_
**Available Platforms:** Fitbit
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/fitbit_heartrate_{day_segment}.csv",``
| ``pid=config["PIDS"],``
| ``day_segment = config["HEARTRATE"]["DAY_SEGMENTS"]),``
**Snakemake rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -910,7 +883,7 @@ stdhr beats/mins The standard deviation of heart rate during
diffmaxmodehr beats/mins The difference between the maximum and mode heart rate during ``day_segment`` epoch.
diffminmodehr beats/mins The difference between the mode and minimum heart rate during ``day_segment`` epoch.
entropyhr nats Shannon's entropy measurement based on heart rate during ``day_segment`` epoch.
lengthZONE minutes Number of minutes the user's heartrate fell within each ``heartrate_zone`` during ``day_segment`` epoch.
minutesonZONE minutes Number of minutes the user's heartrate fell within each ``heartrate_zone`` during ``day_segment`` epoch.
================== =========== =============
**Assumptions/Observations:**
@ -930,12 +903,6 @@ See `Fitbit: Steps Config Code`_
**Available Platforms:** Fitbit
**Snakefile entry to compute these features:**
| ``expand("data/processed/{pid}/fitbit_step_{day_segment}.csv",``
| ``pid=config["PIDS"],``
| ``day_segment = config["STEP"]["DAY_SEGMENTS"]),``
**Snakemake rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
@ -985,7 +952,7 @@ Active and sedentary bouts. If the step count per minute is smaller than ``THRES
.. -------------------------Links ------------------------------------ ..
.. _SENSORS: https://github.com/carissalow/rapids/blob/f22d1834ee24ab3bcbf051bc3cc663903d822084/config.yaml#L2
.. _TABLES_FOR_SENSED_BINS: https://github.com/carissalow/rapids/blob/f22d1834ee24ab3bcbf051bc3cc663903d822084/config.yaml#L2
.. _`SMS Config Code`: https://github.com/carissalow/rapids/blob/f22d1834ee24ab3bcbf051bc3cc663903d822084/config.yaml#L38
.. _AWARE: https://awareframework.com/what-is-aware/
.. _`List of Timezones`: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones


@ -154,14 +154,9 @@ Once RAPIDS is installed, follow these steps to start processing mobile data.
Participant01
2020/02/01,2020/03/03
#. Configure the sensors to process:
#. Choose what features to extract:
- See :ref:`Minimal Working Example<minimal-working-example>`. The variable ``SENSORS`` in the ``config.yaml`` file_ should match existent sensor tables in your Aware database (See :ref:`rapids-structure` for more information). Each sensor in this list will be processed in RAPIDS.
.. note::
It is beneficial to list all collected sensors even if you don't plan to include them in a model later on in the pipeline. This is because we use all data available to estimate whether the phone was sensing data or not (i.e. to know if Aware crashed or the battery died). See :ref:`PHONE_VALID_SENSED_DAYS<phone-valid-sensed-days>` for more information.
- See :ref:`Minimal Working Example<minimal-working-example>`.
#. Execute RAPIDS


@ -13,6 +13,7 @@ Available features:
- :ref:`applications-foreground-sensor-doc`
- :ref:`battery-sensor-doc`
- :ref:`bluetooth-sensor-doc`
- :ref:`wifi-sensor-doc`
- :ref:`call-sensor-doc`
- :ref:`activity-recognition-sensor-doc`
- :ref:`light-doc`


@ -3,7 +3,7 @@
Minimal Working Example
=======================
The following is a quick guide for creating and running a simple pipeline to extract Call metrics for daily and night epochs of one participant monitored on the US East coast.
This is a quick guide for creating and running a simple pipeline to extract call features for daily and night epochs of one participant monitored on the US East coast.
#. Make sure your database connection credentials in ``.env`` are correct. See step 1 of :ref:`Usage Section <db-configuration>`.
@ -11,33 +11,9 @@ The following is a quick guide for creating and running a simple pipeline to ext
#. Make sure your Conda (python) environment is active. See step 6 of :ref:`install-page`.
#. Replace the contents of the ``Snakefile`` with the following snippet
::
configfile: "config.yaml"
include: "rules/renv.snakefile"
include: "rules/preprocessing.snakefile"
include: "rules/features.snakefile"
include: "rules/reports.snakefile"
rule all:
input:
expand("data/processed/{pid}/call_{call_type}_{day_segment}.csv",
pid=config["PIDS"],
call_type=config["CALLS"]["TYPES"],
day_segment = config["CALLS"]["DAY_SEGMENTS"]),
#. Modify the following settings in the ``config.yaml`` file with the values shown below (leave all other settings as they are)
::
SENSORS: [calls]
FITBIT_TABLE: []
FITBIT_SENSORS: []
PIDS: [p01]
DAY_SEGMENTS: &day_segments
@ -47,7 +23,11 @@ The following is a quick guide for creating and running a simple pipeline to ext
America/New_York
DATABASE_GROUP: &database_group
MY_GROUP
MY_GROUP (change this if you added your DB credentials to .env with a different label)
CALLS:
COMPUTE: True
DB_TABLE: calls (only change DB_TABLE if your database calls table has a different name)
For more information on the ``calls`` sensor see :ref:`call-sensor-doc`


@ -3,11 +3,25 @@
RAPIDS Structure
=================
.. _the-config-file:
The ``config.yaml`` File
------------------------
RAPIDS configuration settings are defined in ``config.yaml`` (See `config.yaml`_). This is the only file that you need to understand in order to compute the features that RAPIDS ships with.
It has global settings like ``TABLES_FOR_SENSED_BINS``, ``PIDS``, ``DAY_SEGMENTS``, among others (see :ref:`global-sensor-doc` for more information), as well as per-sensor settings, for example, for the :ref:`sms-sensor-doc`:
| ``MESSAGES:``
| ``COMPUTE: True``
| ``DB_TABLE: messages``
| ``...``
.. _the-snakefile-file:
The ``Snakefile`` File
----------------------
The ``Snakefile`` file (see the actual `Snakefile`_) pulls the entire system together and can be thought of as the menu of RAPIDS allowing the user to define the sensor data that is desired. The first line in this file identifies the configuration file. Next are a list of included files that define the rules used to pull, clean, process, analyze and report on the data. Next is the ``all`` rule that list the sensor data (menu items) that would be processed by the pipeline.
The ``Snakefile`` file (see the actual `Snakefile`_) pulls the entire system together. The first line in this file identifies the configuration file. Next is a list of include directives that import the rules used to pull, clean, process, analyze and report data. Finally, the ``all`` rule lists the files that need to be computed (raw files, intermediate files, feature files, reports, etc.).
.. _includes-section:
@ -15,22 +29,20 @@ Includes
"""""""""
There are six included files in the ``Snakefile`` file.
- ``renv.snakefile`` - This file defines the rules to manager the R packages that are used by RAPIDS. (See `renv`_)
- ``preprocessing.snakefile`` - This file contains the rules that are used to preprocess the data such as downloading, cleaning and formatting. (See `preprocessing`_)
- ``features.snakefile`` - This file contains the rules that used for behavioral feature extraction. (See `features`_)
- ``models.snakefile`` - This file contains the rules that are used to build models from features that have been extreacted from the sensor data. (See `models`_)
- ``reports.snakefile`` - The file contains the rules that are used to produce the reports based on the models produced. (See `reports`_)
- ``mystudy.snakefile`` - The file contains the rules that you add that are specifically tailored to your project/study. (See `mystudy`_)
.. - ``analysis.snakefile`` - The rules that define how the data is analyzed is outlined in this file. (see `analysis <https://github.com/carissalow/rapids/blob/master/rules/analysis.snakefile>`_)
- ``renv.snakefile`` - Rules to create, back up and restore the R renv virtual environment for RAPIDS. (See `renv`_)
- ``preprocessing.snakefile`` - Rules that are used to preprocess the data such as downloading, cleaning and formatting. (See `preprocessing`_)
- ``features.snakefile`` - Rules used for behavioral feature extraction. (See `features`_)
- ``models.snakefile`` - Rules that are used to build models from features that have been extracted from the sensor data. (See `models`_)
- ``reports.snakefile`` - Rules that are used to produce reports and visualizations. (See `reports`_)
- ``mystudy.snakefile`` - Example file that contains rules specific to your project/study. (See `mystudy`_)
Includes are relative to the directory of the Snakefile in which they occur. For example, if above Snakefile resides in the directory ``my/dir``, then Snakemake will search for the include file at ``my/dir/path/to/other/snakefile``, regardless of the working directory.
Includes are relative to the root directory.
.. _rule-all-section:
``Rule all:``
"""""""""""""
In RAPIDS the ``all`` rule indirectly specifies the features/sensors that are desired by listing the output files of the pipeline using the ``expand`` directive. The ``expand`` function allows the combination of different variables. Consider the following::
In RAPIDS the ``all`` rule lists the output files we expect the pipeline to compute using the ``expand`` directive. The ``expand`` function allows us to generate a list of file paths that have a common structure except for PIDS or other parameters. Consider the following::
expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
@ -38,59 +50,33 @@ If ``pids = ['p01','p02']`` and ``sensor = ['sms', 'calls']`` then the above dir
["data/raw/p01/sms_raw.csv", "data/raw/p01/calls_raw.csv", "data/raw/p02/sms_raw.csv", "data/raw/p02/calls_raw.csv"]
Thus, this allows the user of RAPIDS to define all of the desired output files without having to manually list all for the participants of the research. The way Snakemake works is that it looks for the rule that produces the desired output files and then executes that rule. For more information on ``expand`` see `The Expand Function`_
Thus, this allows us to define all the desired output files without having to manually list each path for every participant and every sensor. The way Snakemake works is that it looks for the rule that produces the desired output files and then executes that rule. For more information on ``expand`` see `The Expand Function`_
.. _the-env-file:
The ``.env`` File
-------------------
The database credentials for the database server are placed in the ``.env`` file (remember step 9 on the :ref:`install-page` page). The format of the configuration is shown below::
Your database credentials are stored in the ``.env`` file (See :ref:`install-page`)::
[MY_GROUP_NAME]
user=MyUSER
password=MyPassword
host=MyIP
host=MyIP/DOMAIN
port=3306
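The group name in square brackets (``MY_GROUP_NAME`` above) is the label you point to with ``DATABASE_GROUP`` in ``config.yaml``, which tells the pipeline which set of credentials to use.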
.. _the-config-file:
The ``config.yaml`` File
------------------------
The configurations for the pipeline are defined in ``config.yaml`` (See `config.yaml`_). It contains global settings and variables that are used by the rules. Some of the global variables defined in the ``config.yaml`` file are briefly explained below and illustrated in the sketch after this list:
- ``SENSORS`` - This is a global variable that contains a list of the sensor/feature tables in the database that will be analyzed.
- ``PIDS`` - This is the list of the participant IDs to include in the analysis. For each participant, create a file named ``pXXX`` containing their device_id in the ``data/external/`` directory. (Remember step 8 on the :ref:`install-page` page)
- ``DAY_SEGMENTS`` - A variable used to list all of the common day segments.
- ``TIMEZONE`` - Time variable. Use timezone names from the `List of Timezone`_ and double-check your choice; for example, ``EST`` is not the same as US Eastern Time.
- ``DATABASE_GROUP`` - Label for the database credentials group. (See :ref:`Configure the database connection <db-configuration>`.)
- ``DOWNLOAD_DATASET`` - Variable used to store the name of the dataset that will be downloaded for analysis.
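A minimal sketch of these global settings is shown below; the values are only illustrative and must be adapted to your own study and database::

    SENSORS: [messages, calls]
    PIDS: [p01, p02]
    DAY_SEGMENTS: &day_segments [daily, morning, afternoon, evening, night]
    TIMEZONE: America/New_York
    DATABASE_GROUP: MY_GROUP_NAME

Here ``&day_segments`` defines a YAML anchor that the sensor sections reference with ``*day_segments`` (as in the SMS example below).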
There are a number of other settings that are specific to the sensor/feature that will be pulled and analyzed by the pipeline. An example of the configuration settings for the :ref:`sms-sensor-doc` data is shown below::
SMS:
TYPES : [received, sent]
FEATURES:
received: [count, distinctcontacts, timefirstsms, timelastsms, countmostfrequentcontact]
sent: [count, distinctcontacts, timefirstsms, timelastsms, countmostfrequentcontact]
DAY_SEGMENTS: *day_segments
The ``TYPES`` setting defines the types of SMS data that will be analyzed. ``FEATURES`` defines the features to compute for each of those types. Finally, ``DAY_SEGMENTS`` lists the day segments (times of day) for which features are computed.
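These settings feed the ``expand`` calls in the ``all`` rule. A sketch of how the SMS output files could be requested is shown below; the exact call in your ``Snakefile`` may differ::

    expand("data/processed/{pid}/sms_{sms_type}_{day_segment}.csv",
           pid=config["PIDS"],
           sms_type=config["SMS"]["TYPES"],
           day_segment=config["SMS"]["DAY_SEGMENTS"])

With two participants, two SMS types and one day segment this yields four output files, and Snakemake then looks for the rule whose ``output`` pattern matches each of them.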
.. _rules-syntax:
The ``Rules`` Directory
------------------------
The ``rules`` directory contains the ``snakefiles`` that were included in the ``Snakefile`` file. A short description of these files is given in the :ref:`includes-section` section.
The ``rules`` directory contains the ``snakefiles`` that were included in the main ``Snakefile`` file. A short description of these files is given in the :ref:`includes-section` section.
Rules
""""""
A Snakemake workflow is defined by specifying rules in a ``Snakefile`` (See the features_ snakefile as an actual example). Rules decompose the workflow into small steps (e.g., the application of a single tool) by specifying how to create sets of output files from sets of input files. Snakemake automatically determines the dependencies between the rules by matching file names. Thus, a rule can consist of a name, input files, output files, and a command to generate the output from the input. The following is the basic structure of a Snakemake rule::
A Snakemake workflow is defined by rules (See the features_ snakefile as an actual example). Rules decompose the workflow into small steps by specifying what output files should be created by running a script on a set of input files. Snakemake automatically determines the dependencies between the rules by matching file names. Thus, a rule can consist of a name, input files, output files, and a command to generate the output from the input. The following is the basic structure of a Snakemake rule::
rule NAME:
input: "path/to/inputfile", "path/to/other/inputfile"
@ -113,17 +99,17 @@ A sample rule from the RAPIDS source code is shown below::
"../src/features/sms_features.R"
The ``rule`` directive specifies the name of the rule that is being defined. ``params`` defines the additional parameters that need to be set for the rule. In the example immediately above, the parameters are passed to the script defined in the ``script`` directive of the rule. Instead of ``script``, a ``shell`` command can also be used by replacing the ``script`` directive of the rule with lines similar to the following::
The ``rule`` directive specifies the name of the rule that is being defined. ``params`` defines additional parameters for the rule's script. In the example above, the parameters are passed to the ``sms_features.R`` script as a dictionary. Instead of ``script``, a ``shell`` command can also be used by replacing the ``script`` directive of the rule with::
shell: "somecommand {input} {output}"
Here input and output (and in general any list or tuple) automatically evaluate to a space-separated list of files (i.e. ``path/to/inputfile path/to/other/inputfile``). It should be noted that rules can be defined without input and output, as seen in ``renv.snakefile``. For more information see `Rules documentation`_ and for an actual example see the `renv`_ snakefile.
It should be noted that rules can be defined without input and output, as seen in ``renv.snakefile``. For more information see `Rules documentation`_ and for an actual example see the `renv`_ snakefile.
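For reference, a minimal self-contained rule is sketched below; the paths, parameter and script name are illustrative rather than actual RAPIDS code::

    rule example_features:
        input:
            "data/raw/p01/example_with_datetime.csv"
        params:
            day_segment = "daily"
        output:
            "data/processed/p01/example_daily.csv"
        script:
            "../src/features/example_features.R"

Inside an R script, the values declared under ``params`` are available through the ``snakemake`` object, for example ``snakemake@params[["day_segment"]]`` (Python scripts use ``snakemake.params`` instead).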
.. _wildcards:
Wildcards
""""""""""
There are times when it would be useful to generalize a rule to be applicable to a number of datasets, for example. For this purpose, wildcards can be used. Consider the sample code from above, repeated below for quick reference::
There are times when the same rule should be applied to different participants and day segments. For this we use wildcards ``{my_wildcard}``. All wildcards are inferred from the files listed in the ``all`` rule of the ``Snakefile`` file and therefore from the output of any rule::
rule sms_features:
input:
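For example, if the ``all`` rule requests ``data/processed/p01/sms_received_daily.csv`` (an illustrative path), Snakemake matches it against the output pattern ``data/processed/{pid}/sms_{sms_type}_{day_segment}.csv`` and runs this rule with ``pid=p01``, ``sms_type=received`` and ``day_segment=daily``; the same wildcard values are then substituted into the rule's ``input`` and ``params``.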
@ -147,10 +133,10 @@ The ``data`` Directory
This directory contains the data files for the project, organized into the following subdirectories:
- ``external`` - This directory stores the participant `pxxx` files that contains the device_id and the type of device as well as data from third party sources. (Remember step 8 on :ref:`install-page` page)
- ``raw`` - This directory contains the original, immutable data dump from the sensor database.
- ``interim`` - This directory would contain intermediate data that has been transformed but has not been completely analyzed.
- ``processed`` - This directory contains the final canonical data sets for modeling.
- ``external`` - This directory stores the participant `pxxx` files as well as data from third party sources (see :ref:`install-page` page).
- ``raw`` - This directory contains the original, immutable data dump from your database.
- ``interim`` - This directory contains intermediate data that has been transformed but does not represent features.
- ``processed`` - This directory contains all behavioral features.
.. _the-src-directory:
@ -158,12 +144,12 @@ This directory contains the data files for the project. These directories are as
The ``src`` Directory
----------------------
The ``src`` directory holds all of the scripts used by the pipeline for data manipulation. These scripts can be in any programming language including but not limited to Python_, R_ and Julia_. This directory is organized into the following directories:
The ``src`` directory holds all the scripts used by the pipeline for data manipulation. These scripts can be in any programming language including but not limited to Python_, R_ and Julia_. This directory is organized into the following directories:
- ``data`` - This directory contains scripts that are used to download and preprocess raw data that will be used in analysis. See `data directory`_
- ``features`` - This directory contains scripts to extract behavioral features. See `features directory`_
- ``models`` - This directory contains the model scripts for building and training models. See `models directory`_
- ``visualization`` - This directory contains the scripts to create plots and reports that visualize the results of the models. See `visualization directory`_
- ``models`` - This directory contains the scripts for building and training models. See `models directory`_
- ``visualization`` - This directory contains the scripts to create plots and reports. See `visualization directory`_
.. _the-report-directory:
@ -171,7 +157,7 @@ The ``src`` directory holds all of the scripts used by the pipeline for data man
The ``reports`` Directory
--------------------------
This contains the reports of the results of the analysis done by the pipeline.
This directory contains reports and visualizations.
.. _Python: https://www.python.org/
.. _Julia: https://julialang.org/
View File
@ -2,55 +2,57 @@ def optional_ar_input(wildcards):
with open("data/external/"+wildcards.pid, encoding="ISO-8859-1") as external_file:
external_file_content = external_file.readlines()
platform = external_file_content[1].strip()
if platform == "android":
return ["data/raw/{pid}/plugin_google_activity_recognition_with_datetime_unified.csv",
"data/processed/{pid}/plugin_google_activity_recognition_deltas.csv"]
if platform == "android":
return ["data/raw/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv",
"data/processed/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_deltas.csv"]
elif platform == "ios":
return ["data/raw/{pid}/"+config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]+"_with_datetime_unified.csv",
"data/processed/{pid}/"+config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]+"_deltas.csv"]
else:
return ["data/raw/{pid}/plugin_ios_activity_recognition_with_datetime_unified.csv",
"data/processed/{pid}/plugin_ios_activity_recognition_deltas.csv"]
return []
def optional_conversation_input(wildcards):
with open("data/external/"+wildcards.pid, encoding="ISO-8859-1") as external_file:
external_file_content = external_file.readlines()
platform = external_file_content[1].strip()
if platform == "android":
return ["data/raw/{pid}/plugin_studentlife_audio_android_with_datetime.csv"]
return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["ANDROID"] + "_with_datetime.csv"]
else:
return ["data/raw/{pid}/plugin_studentlife_audio_with_datetime.csv"]
return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["ANDROID"] + "_with_datetime.csv"]
def optional_location_input(wildcards):
if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
return rules.resample_fused_location.output
return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
else:
return "data/raw/{pid}/locations_with_datetime.csv",
return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
rule sms_features:
rule messages_features:
input:
"data/raw/{pid}/messages_with_datetime.csv"
expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])
params:
sms_type = "{sms_type}",
messages_type = "{messages_type}",
day_segment = "{day_segment}",
features = lambda wildcards: config["SMS"]["FEATURES"][wildcards.sms_type]
features = lambda wildcards: config["MESSAGES"]["FEATURES"][wildcards.messages_type]
output:
"data/processed/{pid}/sms_{sms_type}_{day_segment}.csv"
"data/processed/{pid}/messages_{messages_type}_{day_segment}.csv"
script:
"../src/features/sms_features.R"
"../src/features/messages_features.R"
rule call_features:
input:
"data/raw/{pid}/calls_with_datetime_unified.csv"
expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"])
params:
call_type = "{call_type}",
day_segment = "{day_segment}",
features = lambda wildcards: config["CALLS"]["FEATURES"][wildcards.call_type]
output:
"data/processed/{pid}/call_{call_type}_{day_segment}.csv"
"data/processed/{pid}/calls_{call_type}_{day_segment}.csv"
script:
"../src/features/call_features.R"
rule battery_deltas:
input:
"data/raw/{pid}/battery_with_datetime_unified.csv"
expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["BATTERY"]["DB_TABLE"])
output:
"data/processed/{pid}/battery_deltas.csv"
script:
@ -58,7 +60,7 @@ rule battery_deltas:
rule screen_deltas:
input:
screen = "data/raw/{pid}/screen_with_datetime.csv",
screen = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["SCREEN"]["DB_TABLE"]),
participant_info = "data/external/{pid}"
output:
"data/processed/{pid}/screen_deltas.csv"
@ -67,17 +69,17 @@ rule screen_deltas:
rule google_activity_recognition_deltas:
input:
"data/raw/{pid}/plugin_google_activity_recognition_with_datetime_unified.csv"
expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"])
output:
"data/processed/{pid}/plugin_google_activity_recognition_deltas.csv"
expand("data/processed/{{pid}}/{sensor}_deltas.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"])
script:
"../src/features/activity_recognition_deltas.R"
rule ios_activity_recognition_deltas:
input:
"data/raw/{pid}/plugin_ios_activity_recognition_with_datetime_unified.csv"
expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"])
output:
"data/processed/{pid}/plugin_ios_activity_recognition_deltas.csv"
expand("data/processed/{{pid}}/{sensor}_deltas.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"])
script:
"../src/features/activity_recognition_deltas.R"
@ -98,7 +100,7 @@ rule location_barnett_features:
rule bluetooth_features:
input:
"data/raw/{pid}/bluetooth_with_datetime.csv"
expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])
params:
day_segment = "{day_segment}",
features = config["BLUETOOTH"]["FEATURES"]
@ -146,7 +148,7 @@ rule screen_features:
rule light_features:
input:
"data/raw/{pid}/light_with_datetime.csv",
expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"]),
params:
day_segment = "{day_segment}",
features = config["LIGHT"]["FEATURES"],
@ -170,7 +172,7 @@ rule conversation_features:
rule accelerometer_features:
input:
"data/raw/{pid}/accelerometer_with_datetime.csv",
expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["ACCELEROMETER"]["DB_TABLE"]),
params:
day_segment = "{day_segment}",
magnitude = config["ACCELEROMETER"]["FEATURES"]["MAGNITUDE"],
@ -184,7 +186,7 @@ rule accelerometer_features:
rule applications_foreground_features:
input:
"data/interim/{pid}/applications_foreground_with_datetime_with_genre.csv",
expand("data/interim/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])
params:
day_segment = "{day_segment}",
single_categories = config["APPLICATIONS_FOREGROUND"]["SINGLE_CATEGORIES"],
@ -200,7 +202,7 @@ rule applications_foreground_features:
rule wifi_features:
input:
"data/raw/{pid}/wifi_with_datetime.csv"
expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"])
params:
day_segment = "{day_segment}",
features = config["WIFI"]["FEATURES"]
@ -224,7 +226,7 @@ rule fitbit_heartrate_features:
rule fitbit_step_features:
input:
step_data = "data/raw/{pid}/fitbit_steps_intraday_with_datetime.csv"
step_data = "data/raw/{pid}/fitbit_step_intraday_with_datetime.csv"
params:
day_segment = "{day_segment}",
features_all_steps = config["STEP"]["FEATURES"]["ALL_STEPS"],
View File
@ -19,6 +19,9 @@ rule download_dataset:
script:
"../src/data/download_dataset.R"
PHONE_SENSORS = []
PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["BARNETT_LOCATION"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"],config["WIFI"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]])
rule readable_datetime:
input:
sensor_input = rules.download_dataset.output
@ -26,7 +29,7 @@ rule readable_datetime:
timezones = None,
fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
wildcard_constraints:
sensor = '(' + '|'.join([re.escape(x) for x in config["SENSORS"]]) + ')' # only process smartphone sensors, not fitbit
sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # only process smartphone sensors, not fitbit
output:
"data/raw/{pid}/{sensor}_with_datetime.csv"
script:
@ -34,7 +37,7 @@ rule readable_datetime:
rule phone_valid_sensed_days:
input:
all_sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["SENSORS"])
all_sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["TABLES_FOR_SENSED_BINS"])
params:
bin_size = config["PHONE_VALID_SENSED_DAYS"]["BIN_SIZE"],
min_valid_hours = config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_HOURS"],
@ -46,7 +49,7 @@ rule phone_valid_sensed_days:
rule phone_sensed_bins:
input:
all_sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["SENSORS"])
all_sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["TABLES_FOR_SENSED_BINS"])
params:
bin_size = config["PHONE_VALID_SENSED_DAYS"]["BIN_SIZE"]
output:
@ -67,7 +70,7 @@ rule unify_ios_android:
rule resample_fused_location:
input:
locations = "data/raw/{pid}/locations_raw.csv",
locations = "data/raw/{pid}/{sensor}_raw.csv",
phone_sensed_bins = rules.phone_sensed_bins.output
params:
bin_size = config["PHONE_VALID_SENSED_DAYS"]["BIN_SIZE"],
@ -75,32 +78,54 @@ rule resample_fused_location:
consecutive_threshold = config["RESAMPLE_FUSED_LOCATION"]["CONSECUTIVE_THRESHOLD"],
time_since_valid_location = config["RESAMPLE_FUSED_LOCATION"]["TIME_SINCE_VALID_LOCATION"]
output:
"data/raw/{pid}/locations_resampled.csv"
"data/raw/{pid}/{sensor}_resampled.csv"
script:
"../src/data/resample_fused_location.R"
rule application_genres:
input:
"data/raw/{pid}/applications_foreground_with_datetime.csv"
"data/raw/{pid}/{sensor}_with_datetime.csv"
params:
catalogue_source = config["APPLICATION_GENRES"]["CATALOGUE_SOURCE"],
catalogue_file = config["APPLICATION_GENRES"]["CATALOGUE_FILE"],
update_catalogue_file = config["APPLICATION_GENRES"]["UPDATE_CATALOGUE_FILE"],
scrape_missing_genres = config["APPLICATION_GENRES"]["SCRAPE_MISSING_GENRES"]
output:
"data/interim/{pid}/applications_foreground_with_datetime_with_genre.csv"
"data/interim/{pid}/{sensor}_with_datetime_with_genre.csv"
script:
"../src/data/application_genres.R"
rule fitbit_with_datetime:
rule fitbit_heartrate_with_datetime:
input:
expand("data/raw/{{pid}}/{fitbit_table}_raw.csv", fitbit_table=config["FITBIT_TABLE"])
expand("data/raw/{{pid}}/{fitbit_table}_raw.csv", fitbit_table=config["HEARTRATE"]["DB_TABLE"])
params:
local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
fitbit_sensor = "{fitbit_sensor}"
fitbit_sensor = "heartrate"
output:
summary_data = "data/raw/{pid}/fitbit_{fitbit_sensor}_summary_with_datetime.csv",
intraday_data = "data/raw/{pid}/fitbit_{fitbit_sensor}_intraday_with_datetime.csv"
summary_data = "data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv",
intraday_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv"
script:
"../src/data/fitbit_readable_datetime.py"
rule fitbit_step_with_datetime:
input:
expand("data/raw/{{pid}}/{fitbit_table}_raw.csv", fitbit_table=config["STEP"]["DB_TABLE"])
params:
local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
fitbit_sensor = "steps"
output:
intraday_data = "data/raw/{pid}/fitbit_step_intraday_with_datetime.csv"
script:
"../src/data/fitbit_readable_datetime.py"
rule fitbit_sleep_with_datetime:
input:
expand("data/raw/{{pid}}/{fitbit_table}_raw.csv", fitbit_table=config["SLEEP"]["DB_TABLE"])
params:
local_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
fitbit_sensor = "sleep"
output:
summary_data = "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv",
intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv"
script:
"../src/data/fitbit_readable_datetime.py"
View File
@ -48,9 +48,12 @@ rule battery_consumption_rates_barchart:
script:
"../src/visualization/battery_consumption_rates_barchart.py"
PHONE_SENSORS = []
PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["BARNETT_LOCATION"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"],config["WIFI"]["DB_TABLE"]])
rule compliance_report:
input:
sensor_heatmaps = expand("reports/figures/{{pid}}/{sensor}_heatmap_rows.html", sensor=config["SENSORS"]),
sensor_heatmaps = expand("reports/figures/{{pid}}/{sensor}_heatmap_rows.html", sensor=PHONE_SENSORS),
compliance_heatmap = rules.compliance_heatmap.output
output:
"reports/compliance/{pid}/compliance_report.html",
View File
@ -45,8 +45,9 @@ elif sensor == "steps":
elif sensor == "calories":
summary_data, intraday_data = parseCaloriesData(data, HOUR2EPOCH)
else:
raise ValueError("Please check the FITBIT_SENSORS list in config.yaml file.")
raise ValueError("We only support heartrate, sleep, step, or calories sensors on Fitbit devices.")
# Summary data will be empty for steps and calories as it is not provided by Fitbit's API
summary_data.to_csv(snakemake.output["summary_data"], index=False)
# Summary data does not exist for steps and calories as it is not provided by Fitbit's API
if sensor == "heartrate" or sensor == "sleep":
summary_data.to_csv(snakemake.output["summary_data"], index=False)
intraday_data.to_csv(snakemake.output["intraday_data"], index=False)
View File
@ -1,5 +1,3 @@
library('tidyr')
filter_by_day_segment <- function(data, day_segment) {
if(day_segment %in% c("morning", "afternoon", "evening", "night"))
data <- data %>% filter(local_day_segment == day_segment)
@ -31,7 +29,7 @@ base_sms_features <- function(sms, sms_type, day_segment, requested_features){
for(feature_name in features_to_compute){
if(feature_name == "countmostfrequentcontact"){
# Get the number of messages for the most frequent contact throughout the study
# Get the number of messages for the most frequent contact throughout the study
mostfrequentcontact <- sms %>%
group_by(trace) %>%
mutate(N=n()) %>%
@ -45,17 +43,6 @@ base_sms_features <- function(sms, sms_type, day_segment, requested_features){
summarise(!!paste("sms", sms_type, day_segment, feature_name, sep = "_") := n()) %>%
replace(is.na(.), 0)
features <- merge(features, feature, by="local_date", all = TRUE)
# # Get the number of messages for the most frequent contact throughout the study
# feature <- sms %>% group_by(trace) %>%
# mutate(N=n()) %>%
# ungroup() %>%
# filter(N == max(N)) %>%
# head(1) %>% # if there are multiple contacts with the same amount of messages pick the first one only
# group_by(local_date) %>%
# summarise(!!paste("sms", sms_type, day_segment, feature_name, sep = "_") := N) %>%
# replace(is.na(.), 0)
# features <- merge(features, feature, by="local_date", all = TRUE)
} else {
feature <- sms %>%
group_by(local_date)
@ -69,6 +56,6 @@ base_sms_features <- function(sms, sms_type, day_segment, requested_features){
features <- merge(features, feature, by="local_date", all = TRUE)
}
}
features <- features %>% mutate_at(vars(contains("countmostfrequentcontact")), list( ~ replace_na(., 0)))
return(features)
}
View File
@ -2,13 +2,13 @@
# swap base_sms_features(...) for your own function
source("renv/activate.R")
source("src/features/sms/sms_base.R")
source("src/features/messages/messages_base.R")
library(dplyr, warn.conflicts = FALSE)
sms <- read.csv(snakemake@input[[1]])
day_segment <- snakemake@params[["day_segment"]]
requested_features <- snakemake@params[["features"]]
sms_type <- snakemake@params[["sms_type"]]
sms_type <- snakemake@params[["messages_type"]]
features <- data.frame(local_date = character(), stringsAsFactors = FALSE)
# Compute base SMS features