Update file names

pull/103/head
JulioV 2020-10-19 15:07:12 -04:00
parent d32771fd9e
commit 24bf62a7ab
68 changed files with 495 additions and 669 deletions

220
Snakefile
View File

@ -13,17 +13,11 @@ if len(config["PIDS"]) == 0:
raise ValueError("Add participants IDs to PIDS in config.yaml. Remember to create their participant files in data/external")
if config["PHONE_VALID_SENSED_BINS"]["COMPUTE"] or config["PHONE_VALID_SENSED_DAYS"]["COMPUTE"]: # valid sensed bins is necessary for sensed days, so we add these files anyways if sensed days are requested
if len(config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]) == 0:
raise ValueError("If you want to compute PHONE_VALID_SENSED_BINS or PHONE_VALID_SENSED_DAYS, you need to add at least one table to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml")
if len(config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]) == 0:
raise ValueError("If you want to compute PHONE_VALID_SENSED_BINS or PHONE_VALID_SENSED_DAYS, you need to add at least one PHONE_SENSOR to [PHONE_VALID_SENSED_BINS][PHONE_SENSORS] in config.yaml")
pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"]))
pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"]))
tables_android = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist
tables_ios = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist
for pids,table in zip([pids_android, pids_ios], [tables_android, tables_ios]):
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"])))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"])))
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_timestamps.csv", pid=config["PIDS"]))
@ -33,106 +27,100 @@ if config["PHONE_VALID_SENSED_DAYS"]["COMPUTE"]:
min_valid_hours_per_day=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_HOURS_PER_DAY"],
min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"]))
for provider in config["MESSAGES"]["PROVIDERS"].keys():
if config["MESSAGES"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["MESSAGES"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="MESSAGES".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="MESSAGES".lower()))
for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
if config["PHONE_MESSAGES"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_messages_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_messages_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_messages_features/phone_messages_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_MESSAGES"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_messages.csv", pid=config["PIDS"]))
for provider in config["CALLS"]["PROVIDERS"].keys():
if config["CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="CALLS".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="CALLS".lower()))
for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"], sensor=config["PHONE_CALLS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"], sensor=config["PHONE_CALLS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["PHONE_CALLS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
for provider in config["BLUETOOTH"]["PROVIDERS"].keys():
if config["BLUETOOTH"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["BLUETOOTH"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="BLUETOOTH".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="BLUETOOTH".lower()))
for provider in config["PHONE_BLUETOOTH"]["PROVIDERS"].keys():
if config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_bluetooth_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_bluetooth_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_bluetooth.csv", pid=config["PIDS"]))
for provider in config["ACTIVITY_RECOGNITION"]["PROVIDERS"].keys():
if config["ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]:
pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"]))
pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"]))
for pids,table in zip([pids_android, pids_ios], [config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]):
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="ACTIVITY_RECOGNITION".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="ACTIVITY_RECOGNITION".lower()))
for provider in config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"].keys():
if config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_with_datetime_unified.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_features/phone_activity_recognition_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_activity_recognition.csv", pid=config["PIDS"]))
for provider in config["BATTERY"]["PROVIDERS"].keys():
if config["BATTERY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BATTERY"]["DB_TABLE"]))
files_to_compute.extend(expand("data/interim/{pid}/battery_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/battery_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/battery_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["SCREEN"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="BATTERY".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="BATTERY".lower()))
for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys():
if config["PHONE_BATTERY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_battery_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_features/phone_battery_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BATTERY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_battery.csv", pid=config["PIDS"]))
for provider in config["SCREEN"]["PROVIDERS"].keys():
if config["SCREEN"]["PROVIDERS"][provider]["COMPUTE"]:
if config["SCREEN"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]:
for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]:
if "PHONE_SCREEN" in config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
else:
raise ValueError("Error: Add your screen table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data)")
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"]))
files_to_compute.extend(expand("data/interim/{pid}/screen_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/screen_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/screen_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["SCREEN"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="SCREEN".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="SCREEN".lower()))
raise ValueError("Error: Add PHONE_SCREEN (and as many phone sensor as you have in your database) to [PHONE_VALID_SENSED_BINS][PHONE_SENSORS] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data)")
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime_unified.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_features/phone_screen_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_SCREEN"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_screen.csv", pid=config["PIDS"]))
for provider in config["LIGHT"]["PROVIDERS"].keys():
if config["LIGHT"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["LIGHT"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["LIGHT"]["DB_TABLE"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["LIGHT"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="LIGHT".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="LIGHT".lower()))
for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys():
if config["PHONE_LIGHT"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_light_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_light_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_light_features/phone_light_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LIGHT"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_light.csv", pid=config["PIDS"],))
for provider in config["ACCELEROMETER"]["PROVIDERS"].keys():
if config["ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["ACCELEROMETER"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="ACCELEROMETER".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="ACCELEROMETER".lower()))
for provider in config["PHONE_ACCELEROMETER"]["PROVIDERS"].keys():
if config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_accelerometer_features/phone_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_accelerometer.csv", pid=config["PIDS"]))
for provider in config["APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys():
if config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="APPLICATIONS_FOREGROUND".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="APPLICATIONS_FOREGROUND".lower()))
for provider in config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys():
if config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_foreground_features/phone_applications_foreground_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_foreground.csv", pid=config["PIDS"]))
for provider in config["WIFI"]["PROVIDERS"].keys():
if config["WIFI"]["PROVIDERS"][provider]["COMPUTE"]:
if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor_key}_with_datetime_visibleandconnected.csv", pid=config["PIDS"], sensor_key="WIFI".lower()))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["WIFI"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="WIFI".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="WIFI".lower()))
if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor_key}_with_datetime_visibleandconnected.csv", pid=config["PIDS"], sensor_key="WIFI".lower()))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["WIFI"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="WIFI".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="WIFI".lower()))
for provider in config["PHONE_WIFI_VISIBLE"]["PROVIDERS"].keys():
if config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_visible_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_visible_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_visible_features/phone_wifi_visible_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_visible.csv", pid=config["PIDS"]))
for provider in config["PHONE_WIFI_CONNECTED"]["PROVIDERS"].keys():
if config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_connected_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_connected_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_connected_features/phone_wifi_connected_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_connected.csv", pid=config["PIDS"]))
if config["HEARTRATE"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["HEARTRATE"]["DB_TABLE"]))
@ -151,31 +139,27 @@ if config["SLEEP"]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday", "summary"]))
files_to_compute.extend(expand("data/processed/{pid}/fitbit_sleep_{day_segment}.csv", pid = config["PIDS"], day_segment = config["SLEEP"]["DAY_SEGMENTS"]))
for provider in config["CONVERSATION"]["PROVIDERS"].keys():
if config["CONVERSATION"]["PROVIDERS"][provider]["COMPUTE"]:
pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"]))
pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"]))
for pids,table in zip([pids_android, pids_ios], [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"]]):
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["CONVERSATION"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="CONVERSATION".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="CONVERSATION".lower()))
for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys():
if config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_with_datetime_unified.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_conversation_features/phone_conversation_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_conversation.csv", pid=config["PIDS"]))
for provider in config["LOCATIONS"]["PROVIDERS"].keys():
if config["LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]:
if config["LOCATIONS"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
if config["LOCATIONS"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]:
for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
if config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]:
if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
if config["PHONE_LOCATIONS"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
else:
raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor}_processed_{locations_to_use}.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor}_processed_{locations_to_use}_with_datetime.csv", pid=config["PIDS"], sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="LOCATIONS".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="LOCATIONS".lower()))
files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"]))
# visualization for data exploration
if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]:

View File

@ -32,9 +32,12 @@ READABLE_DATETIME:
PHONE_VALID_SENSED_BINS:
COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features
BIN_SIZE: &bin_size 5 # (in minutes)
# Add as many sensor tables as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS.
# If you are extracting screen or Barnett's location features, screen and locations tables are mandatory.
DB_TABLES: []
# Add as many PHONE sensors as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS.
# If you are extracting screen or Barnett/Doryab location features, PHONE_SCREEN and PHONE_LOCATIONS tables are mandatory.
# You can choose any of the keys shown below, just make sure its DB_TABLE exists in your database!
# PHONE_MESSAGES, PHONE_CALLS, PHONE_LOCATIONS, PHONE_BLUETOOTH, PHONE_ACTIVITY_RECOGNITION, PHONE_BATTERY, PHONE_SCREEN, PHONE_LIGHT,
# PHONE_ACCELEROMETER, PHONE_APPLICATIONS_FOREGROUND, PHONE_WIFI_VISIBLE, PHONE_WIFI_CONNECTED, PHONE_CONVERSATION
PHONE_SENSORS: []
PHONE_VALID_SENSED_DAYS:
COMPUTE: False
@ -42,7 +45,7 @@ PHONE_VALID_SENSED_DAYS:
MIN_VALID_BINS_PER_HOUR: &min_valid_bins_per_hour [6] # (out of 60min/BIN_SIZE bins)
# Communication SMS features config, TYPES and FEATURES keys need to match
MESSAGES:
PHONE_MESSAGES:
DB_TABLE: messages
PROVIDERS:
RAPIDS:
@ -52,10 +55,10 @@ MESSAGES:
received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/messages
SRC_FOLDER: "rapids" # inside src/features/phone_messages
# Communication call features config, TYPES and FEATURES keys need to match
CALLS:
PHONE_CALLS:
DB_TABLE: calls
PROVIDERS:
RAPIDS:
@ -66,20 +69,13 @@ CALLS:
incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/calls
SRC_FOLDER: "rapids" # inside src/features/phone_calls
APPLICATION_GENRES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
UPDATE_CATALOGUE_FILE: false # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
SCRAPE_MISSING_GENRES: false # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
LOCATIONS:
PHONE_LOCATIONS:
DB_TABLE: locations
LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED
FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
TIMEZONE: *timezone
PROVIDERS:
DORYAB:
COMPUTE: False
@ -90,7 +86,7 @@ LOCATIONS:
MAXIMUM_GAP_ALLOWED: 300
MINUTES_DATA_USED: False
SAMPLING_FREQUENCY: 0
SRC_FOLDER: "doryab" # inside src/features/locations
SRC_FOLDER: "doryab" # inside src/features/phone_locations
SRC_LANGUAGE: "python"
BARNETT:
@ -99,20 +95,20 @@ LOCATIONS:
ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
TIMEZONE: *timezone
MINUTES_DATA_USED: False # Use this for quality control purposes: how many minutes of data (location coordinates grouped by minute) were used to compute features
SRC_FOLDER: "barnett" # inside src/features/locations
SRC_FOLDER: "barnett" # inside src/features/phone_locations
SRC_LANGUAGE: "r"
BLUETOOTH:
PHONE_BLUETOOTH:
DB_TABLE: bluetooth
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/bluetooth
SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth
SRC_LANGUAGE: "r"
ACTIVITY_RECOGNITION:
PHONE_ACTIVITY_RECOGNITION:
DB_TABLE:
ANDROID: plugin_google_activity_recognition
IOS: plugin_ios_activity_recognition
@ -124,19 +120,19 @@ ACTIVITY_RECOGNITION:
STATIONARY: ["still", "tilting"]
MOBILE: ["on_foot", "walking", "running", "on_bicycle"]
VEHICLE: ["in_vehicle"]
SRC_FOLDER: "rapids" # inside src/features/activity_recognition
SRC_FOLDER: "rapids" # inside src/features/phone_activity_recognition
SRC_LANGUAGE: "python"
BATTERY:
PHONE_BATTERY:
DB_TABLE: battery
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
SRC_FOLDER: "rapids" # inside src/features/battery
SRC_FOLDER: "rapids" # inside src/features/phone_battery
SRC_LANGUAGE: "python"
SCREEN:
PHONE_SCREEN:
DB_TABLE: screen
PROVIDERS:
RAPIDS:
@ -146,25 +142,25 @@ SCREEN:
IGNORE_EPISODES_LONGER_THAN: 0 # in minutes, set to 0 to disable
FEATURES: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration", "firstuseafter"] # "episodepersensedminutes" needs to be added later
EPISODE_TYPES: ["unlock"]
SRC_FOLDER: "rapids" # inside src/features/screen
SRC_FOLDER: "rapids" # inside src/features/phone_screen
SRC_LANGUAGE: "python"
LIGHT:
PHONE_LIGHT:
DB_TABLE: light
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
SRC_FOLDER: "rapids" # inside src/features/light
SRC_FOLDER: "rapids" # inside src/features/phone_light
SRC_LANGUAGE: "python"
ACCELEROMETER:
PHONE_ACCELEROMETER:
DB_TABLE: accelerometer
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
SRC_FOLDER: "rapids" # inside src/features/accelerometer
SRC_FOLDER: "rapids" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
PANDA:
@ -173,11 +169,16 @@ ACCELEROMETER:
FEATURES:
exertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
nonexertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
SRC_FOLDER: "panda" # inside src/features/accelerometer
SRC_FOLDER: "panda" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
APPLICATIONS_FOREGROUND:
PHONE_APPLICATIONS_FOREGROUND:
DB_TABLE: applications_foreground
APPLICATION_CATEGORIES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
PROVIDERS:
RAPIDS:
COMPUTE: False
@ -189,9 +190,45 @@ APPLICATIONS_FOREGROUND:
EXCLUDED_CATEGORIES: []
EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"]
FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
SRC_FOLDER: "rapids" # inside src/features/applications_foreground
SRC_FOLDER: "rapids" # inside src/features/phone_applications_foreground
SRC_LANGUAGE: "python"
PHONE_WIFI_VISIBLE:
DB_TABLE: "wifi"
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_wifi_visible
SRC_LANGUAGE: "r"
PHONE_WIFI_CONNECTED:
DB_TABLE: "sensor_wifi"
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_wifi_connected
SRC_LANGUAGE: "r"
PHONE_CONVERSATION:
DB_TABLE:
ANDROID: plugin_studentlife_audio_android
IOS: plugin_studentlife_audio
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
"sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy",
"avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction",
"voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction",
"unknownexpectedfraction","countconversation"]
RECORDING_MINUTES: 1
PAUSED_MINUTES : 3
SRC_FOLDER: "rapids" # inside src/features/phone_conversation
SRC_LANGUAGE: "python"
HEARTRATE:
COMPUTE: False
DB_TABLE: fitbit_data
@ -223,34 +260,6 @@ SLEEP:
SLEEP_TYPES: ["main", "nap", "all"]
SUMMARY_FEATURES: ["sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgefficiency", "countepisode"]
WIFI:
DB_TABLE:
VISIBLE_ACCESS_POINTS: "wifi" # if you only have a CONNECTED_ACCESS_POINTS table, set this value to ""
CONNECTED_ACCESS_POINTS: "sensor_wifi" # if you only have a VISIBLE_ACCESS_POINTS table, set this value to ""
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/bluetooth
SRC_LANGUAGE: "r"
CONVERSATION:
DB_TABLE:
ANDROID: plugin_studentlife_audio_android
IOS: plugin_studentlife_audio
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
"sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy",
"avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction",
"voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction",
"unknownexpectedfraction","countconversation"]
RECORDING_MINUTES: 1
PAUSED_MINUTES : 3
SRC_FOLDER: "rapids" # inside src/features/conversation
SRC_LANGUAGE: "python"
### Visualizations ################################################################
HEATMAP_FEATURES_CORRELATIONS:
PLOT: False

View File

@ -14,69 +14,20 @@ def infer_participant_platform(participant_file):
return platform
# Preprocessing.smk ####################################################################################################
def optional_phone_sensed_bins_input(wildcards):
platform = infer_participant_platform("data/external/"+wildcards.pid)
if platform == "android":
tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist
elif platform == "ios":
tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist
return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
def optional_phone_sensed_timestamps_input(wildcards):
platform = infer_participant_platform("data/external/"+wildcards.pid)
if platform == "android":
tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist
elif platform == "ios":
tables_platform = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist
return expand("data/raw/{{pid}}/{table}_raw.csv", table = tables_platform)
# Features.smk #########################################################################################################
def find_features_files(wildcards):
feature_files = []
for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():
if provider["COMPUTE"]:
feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=(wildcards.sensor_key).lower(), language=provider["SRC_LANGUAGE"].lower(), provider_key=provider_key))
feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=wildcards.sensor_key.lower(), language=provider["SRC_LANGUAGE"].lower(), provider_key=provider_key.lower()))
return(feature_files)
def optional_ar_input(wildcards):
platform = infer_participant_platform("data/external/"+wildcards.pid)
if platform == "android":
return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"])
elif platform == "ios":
return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"])
def optional_conversation_input(wildcards):
platform = infer_participant_platform("data/external/"+wildcards.pid)
if platform == "android":
return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CONVERSATION"]["DB_TABLE"]["ANDROID"])[0]
elif platform == "ios":
return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CONVERSATION"]["DB_TABLE"]["IOS"])[0]
def optional_steps_sleep_input(wildcards):
if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED":
return "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv"
else:
return []
def optional_wifi_input(wildcards):
if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) == 0:
return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])}
elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) == 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
return {"connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])}
elif len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0 and len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
return {"visible_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]), "connected_access_points": expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])}
else:
raise ValueError("If you are computing WIFI features you need to provide either VISIBLE_ACCESS_POINTS, CONNECTED_ACCESS_POINTS or both")
# Models.smk ###########################################################################################################
def input_merge_features_of_single_participant(wildcards):

View File

@ -28,341 +28,211 @@ rule resample_episodes_with_datetime:
script:
"../src/data/readable_datetime.R"
rule accelerometer_r_features:
rule phone_accelerometer_python_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["ACCELEROMETER"]["DB_TABLE"])[0],
sensor_data = "data/raw/{pid}/phone_accelerometer_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["ACCELEROMETER"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_ACCELEROMETER"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "accelerometer"
sensor_key = "phone_accelerometer"
output:
"data/interim/{pid}/accelerometer_features/accelerometer_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule accelerometer_python_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["ACCELEROMETER"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["ACCELEROMETER"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
sensor_key = "accelerometer"
output:
"data/interim/{pid}/accelerometer_features/accelerometer_python_{provider_key}.csv"
"data/interim/{pid}/phone_accelerometer_features/phone_accelerometer_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule activity_recognition_episodes:
input:
optional_ar_input
sensor_data = "data/raw/{pid}/phone_activity_recognition_with_datetime_unified.csv"
output:
"data/interim/{pid}/activity_recognition_episodes.csv"
"data/interim/{pid}/phone_activity_recognition_episodes.csv"
script:
"../src/features/activity_recognition/episodes/activity_recognition_episodes.R"
"../src/features/phone_activity_recognition/episodes/activity_recognition_episodes.R"
rule activity_recognition_r_features:
rule phone_activity_recognition_python_features:
input:
sensor_episodes = "data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv",
sensor_episodes = "data/interim/{pid}/phone_activity_recognition_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "activity_recognition"
sensor_key = "phone_activity_recognition"
output:
"data/interim/{pid}/activity_recognition_features/activity_recognition_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule activity_recognition_python_features:
input:
sensor_episodes = "data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
sensor_key = "activity_recognition"
output:
"data/interim/{pid}/activity_recognition_features/activity_recognition_python_{provider_key}.csv"
"data/interim/{pid}/phone_activity_recognition_features/phone_activity_recognition_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule applications_foreground_r_features:
rule phone_applications_foreground_python_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])[0],
sensor_data = "data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "applications_foreground"
sensor_key = "phone_applications_foreground"
output:
"data/interim/{pid}/applications_foreground_features/applications_foreground_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule applications_foreground_python_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
sensor_key = "applications_foreground"
output:
"data/interim/{pid}/applications_foreground_features/applications_foreground_python_{provider_key}.csv"
"data/interim/{pid}/phone_applications_foreground_features/phone_applications_foreground_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule battery_episodes:
input:
expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor=config["BATTERY"]["DB_TABLE"])
"data/raw/{pid}/phone_battery_raw.csv"
output:
"data/interim/{pid}/battery_episodes.csv"
"data/interim/{pid}/phone_battery_episodes.csv"
script:
"../src/features/battery/episodes/battery_episodes.R"
"../src/features/phone_battery/episodes/battery_episodes.R"
rule battery_r_features:
rule phone_battery_python_features:
input:
sensor_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv",
sensor_episodes = "data/interim/{pid}/phone_battery_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_BATTERY"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "battery"
sensor_key = "phone_battery"
output:
"data/interim/{pid}/battery_features/battery_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule battery_python_features:
input:
sensor_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
sensor_key = "battery"
output:
"data/interim/{pid}/battery_features/battery_python_{provider_key}.csv"
"data/interim/{pid}/phone_battery_features/phone_battery_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule bluetooth_r_features:
rule phone_bluetooth_r_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])[0],
sensor_data = "data/raw/{pid}/phone_bluetooth_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_BLUETOOTH"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "bluetooth"
sensor_key = "phone_bluetooth"
output:
"data/interim/{pid}/bluetooth_features/bluetooth_r_{provider_key}.csv"
"data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule bluetooth_python_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
sensor_key = "bluetooth"
output:
"data/interim/{pid}/bluetooth_features/bluetooth_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule calls_r_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"])[0],
sensor_data = "data/raw/{pid}/phone_calls_with_datetime_unified.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "calls"
sensor_key = "phone_calls"
output:
"data/interim/{pid}/calls_features/calls_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule calls_python_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
sensor_key = "calls"
output:
"data/interim/{pid}/calls_features/calls_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule conversation_r_features:
input:
sensor_data = optional_conversation_input,
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
sensor_key = "conversation"
output:
"data/interim/{pid}/conversation_features/conversation_r_{provider_key}.csv"
"data/interim/{pid}/phone_calls_features/phone_calls_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule conversation_python_features:
input:
sensor_data = optional_conversation_input,
sensor_data = "data/raw/{pid}/phone_conversation_with_datetime_unified.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_CONVERSATION"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "conversation"
sensor_key = "phone_conversation"
output:
"data/interim/{pid}/conversation_features/conversation_python_{provider_key}.csv"
"data/interim/{pid}/phone_conversation_features/phone_conversation_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule light_r_features:
rule phone_light_python_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"])[0],
sensor_data = "data/raw/{pid}/phone_light_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_LIGHT"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "light"
sensor_key = "phone_light"
output:
"data/interim/{pid}/light_features/light_r_{provider_key}.csv"
"data/interim/{pid}/phone_light_features/phone_light_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule phone_locations_r_features:
input:
sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "phone_locations"
output:
"data/interim/{pid}/phone_locations_features/phone_locations_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule light_python_features:
rule phone_locations_python_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"])[0],
sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "light"
sensor_key = "phone_locations"
output:
"data/interim/{pid}/light_features/light_python_{provider_key}.csv"
"data/interim/{pid}/phone_locations_features/phone_locations_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule locations_r_features:
rule phone_messages_r_features:
input:
sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])[0],
sensor_data = "data/raw/{pid}/phone_messages_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_MESSAGES"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "locations"
sensor_key = "phone_messages"
output:
"data/interim/{pid}/locations_features/locations_r_{provider_key}.csv"
"data/interim/{pid}/phone_messages_features/phone_messages_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule locations_python_features:
input:
sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
sensor_key = "locations"
output:
"data/interim/{pid}/locations_features/locations_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule messages_r_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
sensor_key = "messages"
output:
"data/interim/{pid}/messages_features/messages_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule messages_python_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
sensor_key = "messages"
output:
"data/interim/{pid}/messages_features/messages_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule screen_episodes:
input:
screen = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["SCREEN"]["DB_TABLE"])
screen = "data/raw/{pid}/phone_screen_with_datetime_unified.csv"
output:
"data/interim/{pid}/screen_episodes.csv"
"data/interim/{pid}/phone_screen_episodes.csv"
script:
"../src/features/screen/episodes/screen_episodes.R"
"../src/features/phone_screen/episodes/screen_episodes.R"
rule screen_r_features:
rule phone_screen_python_features:
input:
sensor_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv",
sensor_episodes = "data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_SCREEN"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "screen"
sensor_key = "phone_screen"
output:
"data/interim/{pid}/screen_features/screen_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule screen_python_features:
input:
sensor_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
sensor_key = "screen"
output:
"data/interim/{pid}/screen_features/screen_python_{provider_key}.csv"
"data/interim/{pid}/phone_screen_features/phone_screen_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule wifi_r_features:
rule phone_wifi_connected_r_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower())[0],
sensor_data = "data/raw/{pid}/phone_wifi_connected_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "wifi"
sensor_key = "phone_wifi_connected"
output:
"data/interim/{pid}/wifi_features/wifi_r_{provider_key}.csv"
"data/interim/{pid}/phone_wifi_connected_features/phone_wifi_connected_r_{provider_key}.csv"
script:
"../src/features/entry.R"
rule wifi_python_features:
rule phone_wifi_visible_r_features:
input:
sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower())[0],
sensor_data = "data/raw/{pid}/phone_wifi_visible_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key],
provider = lambda wildcards: config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "wifi"
sensor_key = "phone_wifi_visible"
output:
"data/interim/{pid}/wifi_features/wifi_python_{provider_key}.csv"
"data/interim/{pid}/phone_wifi_visible_features/phone_wifi_visible_r_{provider_key}.csv"
script:
"../src/features/entry.py"
"../src/features/entry.R"
rule fitbit_heartrate_features:
input:

View File

@ -29,10 +29,10 @@ rule download_dataset:
"data/external/{pid}"
params:
group = config["DOWNLOAD_DATASET"]["GROUP"],
table = "{sensor}",
sensor = "{sensor}",
table = lambda wildcards: config[str(wildcards.sensor).upper()]["DB_TABLE"],
timezone = config["TIMEZONE"],
aware_multiplatform_tables = config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "," + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "," + config["CONVERSATION"]["DB_TABLE"]["ANDROID"] + "," + config["CONVERSATION"]["DB_TABLE"]["IOS"],
unifiable_sensors = {"calls": config["CALLS"]["DB_TABLE"], "battery": config["BATTERY"]["DB_TABLE"], "screen": config["SCREEN"]["DB_TABLE"], "ios_activity_recognition": config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"], "ios_conversation": config["CONVERSATION"]["DB_TABLE"]["IOS"]}
aware_multiplatform_tables = config["PHONE_ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "," + config["PHONE_ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"] + "," + config["PHONE_CONVERSATION"]["DB_TABLE"]["ANDROID"] + "," + config["PHONE_CONVERSATION"]["DB_TABLE"]["IOS"],
output:
"data/raw/{pid}/{sensor}_raw.csv"
script:
@ -50,35 +50,23 @@ rule compute_day_segments:
script:
"../src/data/compute_day_segments.py"
PHONE_SENSORS = []
PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["LOCATIONS"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]])
PHONE_SENSORS.extend(config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"])
if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0:
PHONE_SENSORS.append(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])
if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0:
PHONE_SENSORS.append(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])
rule readable_datetime:
rule phone_readable_datetime:
input:
sensor_input = "data/raw/{pid}/{sensor}_raw.csv",
sensor_input = "data/raw/{pid}/phone_{sensor}_raw.csv",
day_segments = "data/interim/day_segments/{pid}_day_segments.csv"
params:
timezones = None,
fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
day_segments_type = config["DAY_SEGMENTS"]["TYPE"],
include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
wildcard_constraints:
sensor = '(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ')' # only process smartphone sensors, not fitbit
output:
"data/raw/{pid}/{sensor}_with_datetime.csv"
"data/raw/{pid}/phone_{sensor}_with_datetime.csv"
script:
"../src/data/readable_datetime.R"
rule phone_sensed_bins:
input:
all_sensors = optional_phone_sensed_bins_input
all_sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor = map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]))
params:
bin_size = config["PHONE_VALID_SENSED_BINS"]["BIN_SIZE"]
output:
@ -88,7 +76,7 @@ rule phone_sensed_bins:
rule phone_sensed_timestamps:
input:
all_sensors = optional_phone_sensed_timestamps_input
all_sensors = expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor = map(str.lower, config["PHONE_VALID_SENSED_BINS"]["PHONE_SENSORS"]))
output:
"data/interim/{pid}/phone_sensed_timestamps.csv"
script:
@ -112,55 +100,50 @@ rule unify_ios_android:
participant_info = "data/external/{pid}"
params:
sensor = "{sensor}",
unifiable_sensors = {"calls": config["CALLS"]["DB_TABLE"], "battery": config["BATTERY"]["DB_TABLE"], "screen": config["SCREEN"]["DB_TABLE"], "ios_activity_recognition": config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"], "ios_conversation": config["CONVERSATION"]["DB_TABLE"]["IOS"]}
output:
"data/raw/{pid}/{sensor}_with_datetime_unified.csv"
script:
"../src/data/unify_ios_android.R"
rule process_location_types:
rule process_phone_location_types:
input:
locations = "data/raw/{pid}/{sensor}_raw.csv",
locations = "data/raw/{pid}/phone_locations_raw.csv",
phone_sensed_timestamps = "data/interim/{pid}/phone_sensed_timestamps.csv",
params:
consecutive_threshold = config["LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"],
time_since_valid_location = config["LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"],
locations_to_use = "{locations_to_use}"
wildcard_constraints:
locations_to_use = '(ALL|GPS|FUSED_RESAMPLED)'
consecutive_threshold = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"],
time_since_valid_location = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"],
locations_to_use = config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"]
output:
"data/interim/{pid}/{sensor}_processed_{locations_to_use}.csv"
"data/interim/{pid}/phone_locations_processed.csv"
script:
"../src/data/process_location_types.R"
rule readable_datetime_location_processed:
input:
sensor_input = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]),
sensor_input = "data/interim/{pid}/phone_locations_processed.csv",
day_segments = "data/interim/day_segments/{pid}_day_segments.csv"
params:
timezones = None,
fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
day_segments_type = config["DAY_SEGMENTS"]["TYPE"],
include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
wildcard_constraints:
locations_to_use = '(ALL|GPS|FUSED_RESAMPLED)'
output:
expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])
"data/interim/{pid}/phone_locations_processed_with_datetime.csv"
script:
"../src/data/readable_datetime.R"
rule application_genres:
rule phone_application_categories:
input:
"data/raw/{pid}/{sensor}_with_datetime.csv"
"data/raw/{pid}/phone_applications_foreground_with_datetime.csv"
params:
catalogue_source = config["APPLICATION_GENRES"]["CATALOGUE_SOURCE"],
catalogue_file = config["APPLICATION_GENRES"]["CATALOGUE_FILE"],
update_catalogue_file = config["APPLICATION_GENRES"]["UPDATE_CATALOGUE_FILE"],
scrape_missing_genres = config["APPLICATION_GENRES"]["SCRAPE_MISSING_GENRES"]
catalogue_source = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["CATALOGUE_SOURCE"],
catalogue_file = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["CATALOGUE_FILE"],
update_catalogue_file = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["UPDATE_CATALOGUE_FILE"],
scrape_missing_genres = config["PHONE_APPLICATIONS_FOREGROUND"]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"]
output:
"data/raw/{pid}/{sensor}_with_datetime_with_genre.csv"
"data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv"
script:
"../src/data/application_genres.R"
"../src/data/application_categories.R"
rule fitbit_heartrate_with_datetime:
input:
@ -196,11 +179,3 @@ rule fitbit_sleep_with_datetime:
intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv"
script:
"../src/data/fitbit_readable_datetime.py"
rule join_wifi_tables:
input:
unpack(optional_wifi_input)
output:
"data/raw/{pid}/wifi_with_datetime_visibleandconnected.csv"
script:
"../src/data/join_visible_and_connected_wifi.R"

View File

@ -2,166 +2,163 @@ library("tidyverse")
library("lubridate")
options(scipen=999)
find_segments_frequency <- function(local_date, local_time, local_timezone, segments){
assigned_segments <- segments[segments$segment_start<= local_time & segments$segment_end >= local_time, ]
assigned_segments["segment_start_ts"] = as.numeric(lubridate::as_datetime(stringi::stri_c(local_date,assigned_segments$segment_id_start_time), tz = local_timezone)) * 1000
assigned_segments["segment_end_ts"] = as.numeric(lubridate::as_datetime(stringi::stri_c(local_date,assigned_segments$segment_id_end_time), tz = local_timezone)) * 1000 + 999
return(stringi::stri_c(stringi::stri_c("[",
assigned_segments[["label"]], "#",
local_date, " ",
assigned_segments[["segment_id_start_time"]], ",",
local_date, " ",
assigned_segments[["segment_id_end_time"]], ";",
assigned_segments[["segment_start_ts"]], ",",
assigned_segments[["segment_end_ts"]],
"]"), collapse = "|"))
# Compute how far back in time periodic segments of a given repeat type may start.
#
# day_type: value of `repeats_on` to look up (e.g. "wday", "mday", "qday", "yday").
# include_past_periodic_segments: when FALSE the delay is always zero.
#
# Returns a lubridate duration: the longest `length` among the segments in `day_segments`
# that repeat on `day_type`, or a zero duration when there is none or past segments are disabled.
# NOTE(review): `day_segments` is not a parameter; it is resolved from the calling environment.
day_type_delay <- function(day_type, include_past_periodic_segments){
  segments_of_type <- day_segments %>%
    mutate(length_duration = duration(length)) %>%
    filter(repeats_on == day_type)

  # Longest segment first, so the first element is the largest delay we may need
  longest_length <- segments_of_type %>%
    arrange(-length_duration) %>%
    pull(length_duration) %>%
    first()

  no_delay_needed <- is.na(longest_length) | include_past_periodic_segments == FALSE
  return(if_else(no_delay_needed, duration("0days"), longest_length))
}
find_segments_periodic <- function(timestamp, segments){
# crossing and pivot_longer make segments a tibble, thus we need to extract [["segment_id"]]
return(stringi::stri_c(segments[[1]][segments[[1]]$segment_start_ts<= timestamp & segments[[1]]$segment_end_ts >= timestamp, "segment_id"][["segment_id"]], collapse = "|"))
# Build the list of calendar dates that segments of one repeat type can start on.
#
# data: sensor rows with a `local_date` column.
# local_timezone: timezone passed to lubridate::ymd when parsing `local_date`.
# day_type: one of "every_day", "wday", "mday", "qday", "yday"; names the extra column that is added.
# delay: amount subtracted from the earliest date so that segments starting before the data are included.
#
# Returns one row per day between (earliest date - delay) and the latest date, with the columns
# `local_date`, `local_date_obj` and a column named after `day_type` holding that day's value
# (0 for "every_day"). Any other `day_type` leaves the dates without an extra column.
get_segment_dates <- function(data, local_timezone, day_type, delay){
  observed_dates <- data %>%
    distinct(local_date) %>%
    mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone)))

  # Fill in the days without data and regenerate the text version of every date
  dates <- observed_dates %>%
    complete(local_date_obj = seq(date(min(local_date_obj) - delay), max(local_date_obj), by="days")) %>%
    mutate(local_date = replace_na(as.character(date(local_date_obj))))

  # The trailing unnamed argument is the fallback for an unrecognised day_type
  dates <- switch(day_type,
                  "every_day" = dates %>% mutate(every_day = 0),
                  "wday" = dates %>% mutate(wday = wday(local_date_obj, week_start = 1)),
                  "mday" = dates %>% mutate(mday = mday(local_date_obj)),
                  "qday" = dates %>% mutate(qday = qday(local_date_obj)),
                  "yday" = dates %>% mutate(yday = yday(local_date_obj)),
                  dates)
  return(dates)
}
find_segments_event <- function(timestamp, segments){
# segments is a data.frame, we don't need to extract [["segment_id"]] like in find_segments_periodic
return(stringi::stri_c(segments[[1]][segments[[1]]$segment_start_ts<= timestamp & segments[[1]]$segment_end_ts >= timestamp, "segment_id"], collapse = "|"))
# Label every row of a sensor data chunk with the ids of all the segments that contain it.
#
# nested_data: data frame with a `timestamp` column that is compared directly against the segment bounds.
# nested_inferred_day_segments: data frame with one row per segment and the columns
#   `segment_start_ts`, `segment_end_ts` (both bounds inclusive) and `segment_id`.
#
# Returns nested_data with an extra character column `assigned_segments` holding the ids of the
# matching segments joined by "|", or "" for rows that fall in no segment.
assign_rows_to_segments <- function(nested_data, nested_inferred_day_segments){
  nested_data <- nested_data %>% mutate(assigned_segments = "")
  # seq_len() instead of 1:nrow(): with zero segments 1:0 would iterate over c(1, 0)
  # and index rows that do not exist
  for(i in seq_len(nrow(nested_inferred_day_segments))) {
    segment <- nested_inferred_day_segments[i,]
    nested_data$assigned_segments <- ifelse(segment$segment_start_ts<= nested_data$timestamp & segment$segment_end_ts >= nested_data$timestamp,
                                            stringi::stri_c(nested_data$assigned_segments, segment$segment_id, sep = "|"), nested_data$assigned_segments)
  }
  # Every appended id was prefixed with "|", so drop the leading separator
  nested_data$assigned_segments <- substring(nested_data$assigned_segments, 2)
  return(nested_data)
}
# Label every row of a sensor data chunk with the frequency segment its local time falls in.
#
# nested_data: data frame with the columns `local_time_obj`, `local_date` and an already
#   initialised `assigned_segments` column.
# nested_timezone: timezone used to convert each row's local date plus the segment's start/end
#   time into the timestamps embedded in the segment id.
# day_segments: data frame with one row per segment and the columns `segment_start_ts`,
#   `segment_end_ts` (compared against `local_time_obj`, both bounds inclusive), `label`,
#   `segment_id_start_time` and `segment_id_end_time`.
#
# Returns nested_data with `assigned_segments` overwritten for the rows that match a segment;
# when a row matches more than one segment the last matching one wins.
assign_rows_to_segments_frequency <- function(nested_data, nested_timezone, day_segments){
  # seq_len() instead of 1:nrow(): with zero segments 1:0 would iterate over c(1, 0)
  # and index rows that do not exist
  for(i in seq_len(nrow(day_segments))) {
    segment <- day_segments[i,]
    nested_data$assigned_segments <- ifelse(segment$segment_start_ts<= nested_data$local_time_obj & segment$segment_end_ts >= nested_data$local_time_obj,
                                            # The segment_id is assembled on the fly because it depends on each row's local_date and timezone
                                            stringi::stri_c("[",
                                                            segment[["label"]], "#",
                                                            nested_data$local_date, " ",
                                                            segment[["segment_id_start_time"]], ",",
                                                            nested_data$local_date, " ",
                                                            segment[["segment_id_end_time"]], ";",
                                                            as.numeric(lubridate::as_datetime(stringi::stri_c(nested_data$local_date, segment$segment_id_start_time), tz = nested_timezone)) * 1000, ",",
                                                            as.numeric(lubridate::as_datetime(stringi::stri_c(nested_data$local_date, segment$segment_id_end_time), tz = nested_timezone)) * 1000 + 999,
                                                            "]"),
                                            nested_data$assigned_segments)
  }
  return(nested_data)
}
assign_to_day_segment <- function(sensor_data, day_segments, day_segments_type, include_past_periodic_segments){
if(nrow(sensor_data) == 0)
return(sensor_data %>% mutate(assigned_segments = NA))
if(day_segments_type == "FREQUENCY"){ #FREQUENCY
if(day_segments_type == "FREQUENCY"){
day_segments <- day_segments %>% mutate(start_time = lubridate::hm(start_time),
end_time = start_time + minutes(length) - seconds(1),
segment_id_start_time = paste(str_pad(hour(start_time),2, pad="0"), str_pad(minute(start_time),2, pad="0"), str_pad(second(start_time),2, pad="0"),sep =":"),
segment_id_end_time = paste(str_pad(hour(ymd("1970-01-01") + end_time),2, pad="0"), str_pad(minute(ymd("1970-01-01") + end_time),2, pad="0"), str_pad(second(ymd("1970-01-01") + end_time),2, pad="0"),sep =":"), # add ymd("1970-01-01") to get a real time instead of duration
segment_start = as.numeric(start_time),
segment_end = as.numeric(end_time))
sensor_data <- sensor_data %>% mutate(local_time_obj = as.numeric(lubridate::hms(local_time)),
assigned_segments = pmap_chr(list(local_date, local_time_obj, local_timezone), find_segments_frequency, day_segments)) %>% select(-local_time_obj)
segment_start_ts = as.numeric(start_time),
segment_end_ts = as.numeric(end_time))
} else if (day_segments_type == "PERIODIC"){ #PERIODIC
sensor_data <- sensor_data %>% mutate(local_time_obj = as.numeric(lubridate::hms(local_time)),
assigned_segments = "")
sensor_data <- sensor_data %>%
group_by(local_timezone) %>%
nest() %>%
mutate(data = map2(data, local_timezone, assign_rows_to_segments_frequency, day_segments)) %>%
unnest(cols = data) %>%
arrange(timestamp) %>%
select(-local_time_obj)
return(sensor_data)
} else if (day_segments_type == "PERIODIC"){
# We need to take into account segment start dates that could include the first day of data
day_segments <- day_segments %>% mutate(length_duration = duration(length))
wday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "wday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
wday_delay <- if_else(is.na(wday_delay) | include_past_periodic_segments == FALSE, duration("0days"), wday_delay)
mday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "mday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
mday_delay <- if_else(is.na(mday_delay) | include_past_periodic_segments == FALSE, duration("0days"), mday_delay)
qday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "qday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
qday_delay <- if_else(is.na(qday_delay) | include_past_periodic_segments == FALSE, duration("0days"), qday_delay)
yday_delay <- day_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == "yday") %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
yday_delay <- if_else(is.na(yday_delay) | include_past_periodic_segments == FALSE, duration("0days"), yday_delay)
every_day_delay <- duration("0days")
wday_delay <- day_type_delay("wday", include_past_periodic_segments)
mday_delay <- day_type_delay("mday", include_past_periodic_segments)
qday_delay <- day_type_delay("qday", include_past_periodic_segments)
yday_delay <- day_type_delay("yday", include_past_periodic_segments)
sensor_data <- sensor_data %>%
# mutate(row_n = row_number()) %>%
group_by(local_timezone) %>%
nest() %>%
# get existent days that we need to start segments from
mutate(every_date = map(data, ~.x %>%
distinct(local_date) %>%
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
complete(local_date_obj = seq(min(local_date_obj), max(local_date_obj), by="days")) %>%
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
mutate(every_day = 0)),
week_dates = map(data, ~.x %>%
distinct(local_date) %>%
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
complete(local_date_obj = seq(date(min(local_date_obj) - wday_delay), max(local_date_obj), by="days")) %>%
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
mutate(wday = wday(local_date_obj, week_start = 1)) ),
month_dates = map(data, ~.x %>%
distinct(local_date) %>%
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
complete(local_date_obj = seq(date(min(local_date_obj) - mday_delay), max(local_date_obj), by="days")) %>%
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
mutate(mday = mday(local_date_obj))),
quarter_dates = map(data, ~.x %>%
distinct(local_date) %>%
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
complete(local_date_obj = seq(date(min(local_date_obj) - qday_delay), max(local_date_obj), by="days")) %>%
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
mutate(qday = qday(local_date_obj)) ),
year_dates = map(data, ~.x %>%
distinct(local_date) %>%
mutate(local_date_obj = date(lubridate::ymd(local_date, tz = local_timezone))) %>%
complete(local_date_obj = seq(date(min(local_date_obj) - yday_delay), max(local_date_obj), by="days")) %>%
mutate(local_date = replace_na(as.character(date(local_date_obj)))) %>%
mutate(yday = yday(local_date_obj)) ),
mutate(every_date = map2(data, local_timezone, get_segment_dates, "every_day", every_day_delay),
week_dates = map2(data, local_timezone, get_segment_dates, "wday", wday_delay),
month_dates = map2(data, local_timezone, get_segment_dates, "mday", mday_delay),
quarter_dates = map2(data, local_timezone, get_segment_dates, "qday", qday_delay),
year_dates = map2(data, local_timezone, get_segment_dates, "yday", yday_delay),
existent_dates = pmap(list(every_date, week_dates, month_dates, quarter_dates, year_dates),
function(every_date, week_dates, month_dates, quarter_dates, year_dates) reduce(list(every_date, week_dates,month_dates, quarter_dates, year_dates), .f=full_join)),
every_date = NULL,
week_dates = NULL,
month_dates = NULL,
quarter_dates = NULL,
year_dates = NULL,
# build the actual day segments taking into account the users requested leangth and repeat schedule
function(every_date, week_dates, month_dates, quarter_dates, year_dates) reduce(list(every_date, week_dates,month_dates, quarter_dates, year_dates), .f=full_join)),
# build the actual day segments taking into account the user's requested length and repeat schedule
inferred_day_segments = map(existent_dates,
~ crossing(day_segments, .x) %>%
pivot_longer(cols = c(every_day,wday, mday, qday, yday), names_to = "day_type", values_to = "day_value") %>%
filter(repeats_on == day_type & repeats_value == day_value) %>%
mutate(segment_id_start = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM")), # The segment ids (label#start#end) are computed in UTC to avoid having different labels for instances of a segment that happen in different timezones
# The segment ids (segment_id_start and segment_id_end) are computed in UTC to avoid having different labels for instances of a segment that happen in different timezones
mutate(segment_id_start = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM")),
segment_id_end = segment_id_start + lubridate::duration(length),
segment_start_ts = as.numeric(lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)) * 1000, # The actual segments are computed using timestamps taking into account the timezone
# The actual segments are computed using timestamps taking into account the timezone
segment_start_ts = as.numeric(lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = local_timezone)) * 1000,
segment_end_ts = segment_start_ts + as.numeric(lubridate::duration(length)) * 1000 + 999,
segment_id = paste0("[",
paste0(
label,"#",
paste0(lubridate::date(segment_id_start), " ",
paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",",
lubridate::date(segment_id_end), " ",
paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";",
paste0(segment_start_ts, ",", segment_end_ts)
),
paste0(label,"#",
paste0(lubridate::date(segment_id_start), " ",
paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",",
lubridate::date(segment_id_end), " ",
paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";",
paste0(segment_start_ts, ",", segment_end_ts)),
"]")) %>%
select(segment_start_ts, segment_end_ts, segment_id) %>%
drop_na(segment_start_ts, segment_end_ts)), # drop day segments with an invalid start or end time (mostly due to daylight saving changes, e.g. 2020-03-08 02:00:00 EST does not exist, clock jumps from 1am to 3am)
data = map2(data, inferred_day_segments, ~ .x %>% mutate(row_date_time = as.numeric(lubridate::ymd_hms(local_date_time, tz = local_timezone)) * 1000,
assigned_segments = map_chr(row_date_time, ~find_segments_periodic(.x, inferred_day_segments)),
row_date_time = NULL))
# drop day segments with an invalid start or end time (mostly due to daylight saving changes, e.g. 2020-03-08 02:00:00 EST does not exist, clock jumps from 01:59am to 03:00am)
drop_na(segment_start_ts, segment_end_ts)),
data = map2(data, inferred_day_segments, assign_rows_to_segments)
) %>%
select(-existent_dates, -inferred_day_segments) %>%
select(-existent_dates, -inferred_day_segments, -every_date, -week_dates, -month_dates, -quarter_dates, -year_dates) %>%
unnest(cols = data) %>%
arrange(timestamp)
} else if ( day_segments_type == "EVENT"){
sensor_data <- sensor_data %>%
group_by(local_timezone) %>%
nest() %>%
mutate(inferred_day_segments = map(local_timezone, ~ day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift),
segment_start_ts = event_timestamp + (as.integer(seconds(lubridate::duration(shift))) * ifelse(shift_direction >= 0, 1, -1) * 1000),
segment_end_ts = segment_start_ts + (as.integer(seconds(lubridate::duration(length))) * 1000),
segment_id_start = lubridate::as_datetime(segment_start_ts/1000, tz = .x), # these start and end datetime objects are for labeling only
segment_id_end = lubridate::as_datetime(segment_end_ts/1000, tz = .x),
segment_end_ts = segment_end_ts + 999,
segment_id = paste0("[",
paste0(
label,"#",
paste0(lubridate::date(segment_id_start), " ",
paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",",
lubridate::date(segment_id_end), " ",
paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";",
paste0(segment_start_ts, ",", segment_end_ts)
),
"]")) %>%
select(-segment_id_start, -segment_id_end)),
data = map2(data, inferred_day_segments, ~ .x %>% mutate(assigned_segments = map_chr(timestamp, ~find_segments_event(.x, inferred_day_segments))))) %>%
mutate(inferred_day_segments = map(local_timezone, ~ day_segments %>%
mutate(shift = ifelse(shift == "0", "0seconds", shift),
segment_start_ts = event_timestamp + (as.integer(seconds(lubridate::duration(shift))) * ifelse(shift_direction >= 0, 1, -1) * 1000),
segment_end_ts = segment_start_ts + (as.integer(seconds(lubridate::duration(length))) * 1000),
# these start and end datetime objects are for labeling only
segment_id_start = lubridate::as_datetime(segment_start_ts/1000, tz = .x),
segment_id_end = lubridate::as_datetime(segment_end_ts/1000, tz = .x),
segment_end_ts = segment_end_ts + 999,
segment_id = paste0("[",
paste0(label,"#",
paste0(lubridate::date(segment_id_start), " ",
paste(str_pad(hour(segment_id_start),2, pad="0"), str_pad(minute(segment_id_start),2, pad="0"), str_pad(second(segment_id_start),2, pad="0"),sep =":"), ",",
lubridate::date(segment_id_end), " ",
paste(str_pad(hour(segment_id_end),2, pad="0"), str_pad(minute(segment_id_end),2, pad="0"), str_pad(second(segment_id_end),2, pad="0"),sep =":")),";",
paste0(segment_start_ts, ",", segment_end_ts)),
"]"))),
data = map2(data, inferred_day_segments, assign_rows_to_segments)) %>%
select(-inferred_day_segments) %>%
unnest(data) %>%
arrange(timestamp)
}
return(sensor_data)
}

View File

@ -40,9 +40,9 @@ is_multiplaform_participant <- function(dbEngine, device_ids, platforms){
participant <- snakemake@input[[1]]
group <- snakemake@params[["group"]]
table <- snakemake@params[["table"]]
sensor <- snakemake@params[["sensor"]]
timezone <- snakemake@params[["timezone"]]
aware_multiplatform_tables <- str_split(snakemake@params[["aware_multiplatform_tables"]], ",")[[1]]
unifiable_tables = snakemake@params[["unifiable_sensors"]]
sensor_file <- snakemake@output[[1]]
device_ids <- strsplit(readLines(participant, n=1), ",")[[1]]
@ -58,30 +58,26 @@ end_datetime_utc = format(as.POSIXct(paste0(end_date, " 23:59:59"),format="%Y/%m
dbEngine <- dbConnect(MySQL(), default.file = "./.env", group = group)
# Get existent columns in table
available_columns <- colnames(dbGetQuery(dbEngine, paste0("SELECT * FROM ", table, " LIMIT 1")))
if("device_id" %in% available_columns){
if(is_multiplaform_participant(dbEngine, device_ids, platforms)){
sensor_data <- unify_raw_data(dbEngine, table, start_datetime_utc, end_datetime_utc, aware_multiplatform_tables, unifiable_tables, device_ids, platforms)
}else {
query <- paste0("SELECT * FROM ", table, " WHERE device_id IN ('", paste0(device_ids, collapse = "','"), "')")
if("timestamp" %in% available_columns && !(is.na(start_datetime_utc)) && !(is.na(end_datetime_utc)) && start_datetime_utc < end_datetime_utc)
query <- paste0(query, "AND timestamp BETWEEN 1000*UNIX_TIMESTAMP('", start_datetime_utc, "') AND 1000*UNIX_TIMESTAMP('", end_datetime_utc, "')")
sensor_data <- dbGetQuery(dbEngine, query)
if(is_multiplaform_participant(dbEngine, device_ids, platforms)){
sensor_data <- unify_raw_data(dbEngine, table, sensor, start_datetime_utc, end_datetime_utc, aware_multiplatform_tables, device_ids, platforms)
}else {
# table has two elements for conversation and activity recognition (they store data on a different table for ios and android)
if(length(table) > 1){
table <- table[[toupper(platforms[1])]]
}
if("timestamp" %in% available_columns)
sensor_data <- sensor_data %>% arrange(timestamp)
# Unify device_id
sensor_data <- sensor_data %>% mutate(device_id = unified_device_id)
# Dropping duplicates on all columns except for _id or id
sensor_data <- sensor_data %>% distinct(!!!syms(setdiff(names(sensor_data), c("_id", "id"))))
} else
stop(paste0("Table ", table, "does not have a device_id column (Aware ID) to link its data to a participant"))
query <- paste0("SELECT * FROM ", table, " WHERE device_id IN ('", paste0(device_ids, collapse = "','"), "')")
if(!(is.na(start_datetime_utc)) && !(is.na(end_datetime_utc)) && start_datetime_utc < end_datetime_utc)
query <- paste0(query, "AND timestamp BETWEEN 1000*UNIX_TIMESTAMP('", start_datetime_utc, "') AND 1000*UNIX_TIMESTAMP('", end_datetime_utc, "')")
sensor_data <- dbGetQuery(dbEngine, query)
}
sensor_data <- sensor_data %>% arrange(timestamp)
# Unify device_id
sensor_data <- sensor_data %>% mutate(device_id = unified_device_id)
# Dropping duplicates on all columns except for _id or id
sensor_data <- sensor_data %>% distinct(!!!syms(setdiff(names(sensor_data), c("_id", "id"))))
write_csv(sensor_data, sensor_file)
dbDisconnect(dbEngine)

View File

@ -4,11 +4,10 @@ source("src/data/unify_utils.R")
sensor_data <- read.csv(snakemake@input[["sensor_data"]], stringsAsFactors = FALSE)
participant_info <- snakemake@input[["participant_info"]]
sensor <- snakemake@params[["sensor"]]
unifiable_sensors = snakemake@params[["unifiable_sensors"]]
platforms <- strsplit(readLines(participant_info, n=2)[[2]], ",")[[1]]
platform <- ifelse(platforms[1] == "multiple" | (length(platforms) > 1 & "android" %in% platforms & "ios" %in% platforms), "android", platforms[1])
sensor_data <- unify_data(sensor_data, sensor, platform, unifiable_sensors)
sensor_data <- unify_data(sensor_data, sensor, platform)
write.csv(sensor_data, snakemake@output[[1]], row.names = FALSE)

View File

@ -101,7 +101,7 @@ clean_ios_activity_column <- function(ios_gar){
return(ios_gar)
}
unify_ios_gar <- function(ios_gar){
unify_ios_activity_recognition <- function(ios_gar){
# We only need to unify Google Activity Recognition data for iOS
# discard rows where activities column is blank
ios_gar <- ios_gar[-which(ios_gar$activities == ""), ]
@ -138,7 +138,7 @@ unify_ios_conversation <- function(conversation){
}
# This function is used in download_dataset.R
unify_raw_data <- function(dbEngine, table, start_datetime_utc, end_datetime_utc, aware_multiplatform_tables, unifiable_tables, device_ids, platforms){
unify_raw_data <- function(dbEngine, sensor_table, sensor, start_datetime_utc, end_datetime_utc, aware_multiplatform_tables, device_ids, platforms){
# If platforms is 'multiple', fetch each device_id's platform from aware_device, otherwise, use those given by the user
if(length(platforms) == 1 && platforms == "multiple")
devices_platforms <- dbGetQuery(dbEngine, paste0("SELECT device_id,brand FROM aware_device WHERE device_id IN ('", paste0(device_ids, collapse = "','"), "')")) %>%
@ -147,8 +147,9 @@ unify_raw_data <- function(dbEngine, table, start_datetime_utc, end_datetime_utc
devices_platforms <- data.frame(device_id = device_ids, platform = platforms)
# Get existent tables in database
available_tables_in_db <- dbGetQuery(dbEngine, paste0("SELECT table_name FROM information_schema.tables WHERE table_type = 'base table' AND table_schema='", dbGetInfo(dbEngine)$dbname,"'")) %>% pull(table_name)
available_tables_in_db <- dbGetQuery(dbEngine, paste0("SELECT table_name FROM information_schema.tables WHERE table_schema='", dbGetInfo(dbEngine)$dbname,"'"))[[1]]
if(!any(sensor_table %in% available_tables_in_db))
stop(paste0("You requested data from these table(s) ", paste0(sensor_table, collapse=", "), " but they don't exist in your database ", dbGetInfo(dbEngine)$dbname))
# Parse the table names for activity recognition and conversation plugins because they are different between android and ios
ar_tables <- setNames(aware_multiplatform_tables[1:2], c("android", "ios"))
conversation_tables <- setNames(aware_multiplatform_tables[3:4], c("android", "ios"))
@ -160,17 +161,19 @@ unify_raw_data <- function(dbEngine, table, start_datetime_utc, end_datetime_utc
platform <- row$platform
# Handle special cases when tables for the same sensor have different names for Android and iOS (AR and conversation)
if(table %in% ar_tables)
if(length(sensor_table) == 1)
table <- sensor_table
else if(all(sensor_table == ar_tables))
table <- ar_tables[[platform]]
else if(table %in% conversation_tables)
else if(all(sensor_table == conversation_tables))
table <- conversation_tables[[platform]]
if(table %in% available_tables_in_db){
query <- paste0("SELECT * FROM ", table, " WHERE device_id IN ('", device_id, "')")
if("timestamp" %in% available_columns && !(is.na(start_datetime_utc)) && !(is.na(end_datetime_utc)) && start_datetime_utc < end_datetime_utc){
if(!(is.na(start_datetime_utc)) && !(is.na(end_datetime_utc)) && start_datetime_utc < end_datetime_utc){
query <- paste0(query, "AND timestamp BETWEEN 1000*UNIX_TIMESTAMP('", start_datetime_utc, "') AND 1000*UNIX_TIMESTAMP('", end_datetime_utc, "')")
}
sensor_data <- unify_data(dbGetQuery(dbEngine, query), table, platform, unifiable_tables)
sensor_data <- unify_data(dbGetQuery(dbEngine, query), sensor, platform)
participants_sensordata <- append(participants_sensordata, list(sensor_data))
}else{
warning(paste0("Missing ", table, " table. We unified the data from ", paste0(devices_platforms$device_id, collapse = " and "), " but without records from this missing table for ", device_id))
@ -182,25 +185,16 @@ unify_raw_data <- function(dbEngine, table, start_datetime_utc, end_datetime_utc
}
# This function is used in unify_ios_android.R and unify_raw_data function
unify_data <- function(sensor_data, sensor, platform, unifiable_sensors){
if(sensor == unifiable_sensors$calls){
if(platform == "ios"){
sensor_data = unify_ios_calls(sensor_data)
}
# android calls remain unchanged
} else if(sensor == unifiable_sensors$battery){
if(platform == "ios"){
sensor_data = unify_ios_battery(sensor_data)
}
# android battery remains unchanged
} else if(sensor == unifiable_sensors$ios_activity_recognition){
sensor_data = unify_ios_gar(sensor_data)
} else if(sensor == unifiable_sensors$screen){
if(platform == "ios"){
sensor_data = unify_ios_screen(sensor_data)
}
# android screen remains unchanged
} else if(sensor == unifiable_sensors$ios_conversation){
unify_data <- function(sensor_data, sensor, platform){
if(sensor == "phone_calls" & platform == "ios"){
sensor_data = unify_ios_calls(sensor_data)
} else if(sensor == "phone_battery" & platform == "ios"){
sensor_data = unify_ios_battery(sensor_data)
} else if(sensor == "phone_activity_recognition" & platform == "ios"){
sensor_data = unify_ios_activity_recognition(sensor_data)
} else if(sensor == "phone_screen" & platform == "ios"){
sensor_data = unify_ios_screen(sensor_data)
} else if(sensor == "phone_conversation" & platform == "ios"){
sensor_data = unify_ios_conversation(sensor_data)
}
return(sensor_data)

View File

@ -3,7 +3,7 @@ library("dplyr")
library("stringr")
# Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility
file.sources = list.files(c("src/features/locations/barnett/library"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE)
file.sources = list.files(c("src/features/phone_locations/barnett/library"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE)
sapply(file.sources,source,.GlobalEnv)
create_empty_file <- function(requested_features){
@ -52,10 +52,13 @@ barnett_features <- function(sensor_data_files, day_segment, params){
if (nrow(location) > 1){
# Filter by segment and skipping any non-daily segment
location <- location %>% filter_data_by_segment(day_segment)
segment <- location %>% head(1) %>% pull(local_segment)
segment_data <- str_split(segment, "#")[[1]]
if(segment_data[[2]] != segment_data[[4]] || segment_data[[3]] != "00:00:00" || segment_data[[5]] != "23:59:59"){
warning(paste("Barnett's location features cannot be computed for day segmentes that are not daily (cover 00:00:00 to 23:59:59 of every day). Skipping for ", segment))
datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00"
datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59"
location <- location %>% mutate(is_daily = str_detect(local_segment, paste0(day_segment, "#", datetime_start_regex, ",", datetime_end_regex)))
if(!all(location$is_daily)){
message(paste("Barnett's location features cannot be computed for day segmentes that are not daily (cover 00:00:00 to 23:59:59 of every day). Skipping ", day_segment))
location_features <- create_empty_file(requested_features)
} else {
# Count how many minutes of data we use to get location features

View File

@ -0,0 +1,46 @@
library(dplyr)
# Compute a single wifi feature, one row per `local_segment`.
#
# data:        wifi rows; the code reads the `bssid` column and, after
#              filter_data_by_segment(), the `local_segment` column.
# feature:     one of "countscans", "uniquedevices" or
#              "countscansmostuniquedevice".
# day_segment: forwarded untouched to filter_data_by_segment()
#              (helper defined outside this file).
#
# Returns a data frame with `local_segment` plus one column named
# "wifi_rapids_<feature>". Any other feature name yields an invisible NULL.
compute_wifi_feature <- function(data, feature, day_segment){
  segment_rows <- filter_data_by_segment(data, day_segment)
  column_name <- paste("wifi_rapids", feature, sep = "_")

  if(feature %in% c("countscans", "uniquedevices")){
    by_segment <- group_by(segment_rows, local_segment)
    if(feature == "countscans"){
      # Number of rows (scans) in each segment
      result <- summarise(by_segment, !!column_name := n())
    } else {
      # Number of distinct bssid values in each segment
      result <- summarise(by_segment, !!column_name := n_distinct(bssid))
    }
    return(result)
  }

  if(feature != "countscansmostuniquedevice"){
    # Unsupported feature names produce no value
    return(invisible(NULL))
  }

  # Tag every row with how many rows share its bssid, keeping row order
  rows_with_counts <- ungroup(mutate(group_by(segment_rows, bssid), N = n()))
  top_rows <- filter(rows_with_counts, N == max(N))
  # If several devices tie for the most scans, only the first row's device is used
  most_scanned_bssid <- pull(head(top_rows, 1), bssid)

  # Count, per segment, the scans that belong to that device
  device_rows <- filter(segment_rows, bssid == most_scanned_bssid)
  device_counts <- summarise(group_by(device_rows, local_segment), !!column_name := n())
  return(replace(device_counts, is.na(device_counts), 0))
}
# Build the wifi feature table for the RAPIDS provider.
#
# sensor_data_files: list-like; its "sensor_data" entry is the path of the CSV to read.
# day_segment:       passed through to compute_wifi_feature().
# provider:          list-like; its "FEATURES" entry holds the requested feature names.
#
# Returns a data frame keyed by `local_segment` with one column per requested
# feature that this provider supports. If none are supported, the result is an
# empty data frame with only the `local_segment` column.
rapids_features <- function(sensor_data_files, day_segment, provider){
  wifi_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)

  # Features this function can compute; requested names outside this set are ignored
  supported_features <- c("countscans", "uniquedevices", "countscansmostuniquedevice")
  selected_features <- intersect(supported_features, provider[["FEATURES"]])

  # Seed of the fold: an empty table that only carries the join key
  empty_output <- data.frame(local_segment = character(), stringsAsFactors = FALSE)

  # Outer-join each computed feature onto the accumulated table, in order
  Reduce(
    function(merged, feature_name){
      feature_table <- compute_wifi_feature(wifi_data, feature_name, day_segment)
      merge(merged, feature_table, by = "local_segment", all = TRUE)
    },
    selected_features,
    empty_output
  )
}

View File

@ -1,6 +1,8 @@
source("renv/activate.R")
library("dplyr")
library("tidyr")
library("tibble")
options(scipen=999)
# Using mostly indexing instead of tidyr because it is faster
resampled_episodes <- read.csv(snakemake@input[[1]])

View File

@ -74,7 +74,7 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
sensor_features = pd.DataFrame(columns=["local_segment"])
day_segments_labels = pd.read_csv(day_segments_file, header=0)
if "FEATURES" not in provider:
raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key))
raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key.upper()))
if provider["COMPUTE"] == True: