Merge pull request #128 from carissalow/feature/multi_smartphone_app

Feature/multi smartphone app
pull/130/head
JulioV 2021-03-15 12:01:19 -04:00 committed by GitHub
commit f6ccc3c08c
247 changed files with 42213 additions and 37657 deletions


@ -67,7 +67,7 @@ jobs:
shell: bash -l {0}
run : |
conda activate rapidstests
bash tests/scripts/run_tests.sh all test
bash tests/scripts/run_tests.sh -t all
- name: Release tag
if: success() && startsWith(github.ref, 'refs/tags')
id: create_release

.gitignore (vendored): 1 change

@ -112,3 +112,4 @@ sn_profile_*/
settings.dcf
tests/fakedata_generation/
site/
credentials.yaml

Snakefile: 150 changes

@ -21,14 +21,14 @@ for provider in config["PHONE_DATA_YIELD"]["PROVIDERS"].keys():
if not (set(config["PHONE_DATA_YIELD"]["SENSORS"]) <= set(allowed_phone_sensors)):
raise ValueError('\nInvalid sensor(s) for PHONE_DATA_YIELD. config["PHONE_DATA_YIELD"]["SENSORS"] can have '
'one or more of the following phone sensors: {}.\nInstead you provided "{}".\n'
'Keep in mind that the sensors\' TABLE attribute must point to a valid database table'\
'Keep in mind that the sensors\' CONTAINER attribute must point to a valid database table or file'\
.format(', '.join(allowed_phone_sensors),
', '.join(set(config["PHONE_DATA_YIELD"]["SENSORS"]) - set(allowed_phone_sensors))))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_DATA_YIELD"]["SENSORS"])))
files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_data_yield_features/phone_data_yield_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_data_yield_features/phone_data_yield_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_data_yield.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
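Every provider rule in this Snakefile now derives the {language} wildcard from SRC_SCRIPT via get_script_language() instead of reading a separate SRC_LANGUAGE key. The helper itself is not part of this diff; what follows is only a minimal sketch, assuming it maps the script's file extension to a language name (the function name comes from the diff, the body is an assumption):

    def get_script_language(script_path):
        # Assumed behavior: infer the feature-script language from its file extension.
        extension = script_path.rsplit(".", 1)[-1].lower()
        if extension == "py":
            return "python"
        if extension == "r":
            return "r"
        raise ValueError("Unsupported feature script: {}".format(script_path))

Under this assumption, get_script_language("src/features/phone_calls/rapids/main.R") returns "r", the same value the old SRC_LANGUAGE.lower() lookup produced.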
@ -37,7 +37,7 @@ for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
if config["PHONE_MESSAGES"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_messages_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_messages_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_messages_features/phone_messages_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_MESSAGES"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_messages_features/phone_messages_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_MESSAGES"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_messages.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -46,8 +46,7 @@ for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime_unified.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -56,7 +55,7 @@ for provider in config["PHONE_BLUETOOTH"]["PROVIDERS"].keys():
if config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_bluetooth_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_bluetooth_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_bluetooth.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -65,11 +64,10 @@ for provider in config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"].keys():
if config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_with_datetime_unified.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_features/phone_activity_recognition_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_features/phone_activity_recognition_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_activity_recognition.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -80,7 +78,7 @@ for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_features/phone_battery_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BATTERY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_features/phone_battery_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_BATTERY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_battery.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -93,11 +91,10 @@ for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
# raise ValueError("Error: Add PHONE_SCREEN (and as many PHONE_SENSORS as you have in your database) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data)")
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime_unified.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_features/phone_screen_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_SCREEN"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_features/phone_screen_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_SCREEN"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_screen.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -106,7 +103,7 @@ for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys():
if config["PHONE_LIGHT"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_light_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_light_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_light_features/phone_light_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LIGHT"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_light_features/phone_light_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_LIGHT"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_light.csv", pid=config["PIDS"],))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -115,7 +112,7 @@ for provider in config["PHONE_ACCELEROMETER"]["PROVIDERS"].keys():
if config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_accelerometer_features/phone_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_accelerometer_features/phone_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_accelerometer.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -125,7 +122,7 @@ for provider in config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_foreground_features/phone_applications_foreground_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_foreground_features/phone_applications_foreground_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_foreground.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -134,7 +131,7 @@ for provider in config["PHONE_WIFI_VISIBLE"]["PROVIDERS"].keys():
if config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_visible_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_visible_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_visible_features/phone_wifi_visible_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_visible_features/phone_wifi_visible_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_visible.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -143,7 +140,7 @@ for provider in config["PHONE_WIFI_CONNECTED"]["PROVIDERS"].keys():
if config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_connected_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_connected_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_connected_features/phone_wifi_connected_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_connected_features/phone_wifi_connected_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_connected.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -152,8 +149,7 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys():
if config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_with_datetime_unified.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_conversation_features/phone_conversation_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_conversation_features/phone_conversation_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_conversation.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -165,7 +161,7 @@ if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_with_datetime_with_categories.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_crashes_features/phone_applications_crashes_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_crashes_features/phone_applications_crashes_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_crashes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -176,7 +172,7 @@ if isinstance(config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"], dict):
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_with_datetime_with_categories.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_notifications_features/phone_applications_notifications_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_notifications_features/phone_applications_notifications_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_notifications.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -186,18 +182,18 @@ if isinstance(config["PHONE_KEYBOARD"]["PROVIDERS"], dict):
if config["PHONE_KEYBOARD"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_keyboard_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_keyboard_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_keyboard_features/phone_keyboard_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_KEYBOARD"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_keyboard_features/phone_keyboard_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_KEYBOARD"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_keyboard.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
if isinstance(config["PHONE_AWARE_LOG"]["PROVIDERS"], dict):
for provider in config["PHONE_AWARE_LOG"]["PROVIDERS"].keys():
if config["PHONE_AWARE_LOG"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_aware_log_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_aware_log_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_aware_log_features/phone_aware_log_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_AWARE_LOG"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_aware_log.csv", pid=config["PIDS"]))
if isinstance(config["PHONE_LOG"]["PROVIDERS"], dict):
for provider in config["PHONE_LOG"]["PROVIDERS"].keys():
if config["PHONE_LOG"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_log_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_log_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_log_features/phone_log_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_LOG"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_log.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -213,7 +209,7 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -221,8 +217,7 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
for provider in config["FITBIT_DATA_YIELD"]["PROVIDERS"].keys():
if config["FITBIT_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_data_yield.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -230,9 +225,8 @@ for provider in config["FITBIT_DATA_YIELD"]["PROVIDERS"].keys():
for provider in config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"].keys():
if config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_summary_features/fitbit_heartrate_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_summary_features/fitbit_heartrate_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_summary.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -240,9 +234,8 @@ for provider in config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"].keys():
for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys():
if config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_intraday.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -250,9 +243,8 @@ for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys():
for provider in config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"].keys():
if config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_summary_features/fitbit_sleep_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_summary_features/fitbit_sleep_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_sleep_summary.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -260,10 +252,11 @@ for provider in config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"].keys():
for provider in config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"].keys():
if config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_features/fitbit_sleep_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_features/fitbit_sleep_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_sleep_intraday.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -271,9 +264,8 @@ for provider in config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"].keys():
for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys():
if config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_summary.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -281,43 +273,27 @@ for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys():
for provider in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].keys():
if config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_intraday.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
# for provider in config["FITBIT_CALORIES"]["PROVIDERS"].keys():
# if config["FITBIT_CALORIES"]["PROVIDERS"][provider]["COMPUTE"]:
# files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_{fitbit_data_type}_raw.csv", pid=config["PIDS"], fitbit_data_type=(["json"] if config["FITBIT_CALORIES"]["TABLE_FORMAT"] == "JSON" else ["summary", "intraday"])))
# files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_{fitbit_data_type}_parsed.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"]))
# files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_{fitbit_data_type}_parsed_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"]))
# files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
# files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["EMPATICA_ACCELEROMETER"]["PROVIDERS"].keys():
if config["EMPATICA_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]:
for pid in config["PIDS"]:
suffixes = get_zip_suffixes(pid)
files_to_compute.extend(expand("data/raw/{pid}/empatica_accelerometer_unzipped_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_accelerometer_raw_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_accelerometer_joined.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_accelerometer_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_accelerometer_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_accelerometer.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["EMPATICA_HEARTRATE"]["PROVIDERS"].keys():
if config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["COMPUTE"]:
for pid in config["PIDS"]:
suffixes = get_zip_suffixes(pid)
files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_unzipped_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_raw_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_joined.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_heartrate.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -325,52 +301,36 @@ for provider in config["EMPATICA_HEARTRATE"]["PROVIDERS"].keys():
for provider in config["EMPATICA_TEMPERATURE"]["PROVIDERS"].keys():
if config["EMPATICA_TEMPERATURE"]["PROVIDERS"][provider]["COMPUTE"]:
for pid in config["PIDS"]:
suffixes = get_zip_suffixes(pid)
files_to_compute.extend(expand("data/raw/{pid}/empatica_temperature_unzipped_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_temperature_raw_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_temperature_joined.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_temperature_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_temperature_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_temperature_features/empatica_temperature_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_TEMPERATURE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/empatica_temperature_features/empatica_temperature_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_TEMPERATURE"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_temperature.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"].keys():
if config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["COMPUTE"]:
for pid in config["PIDS"]:
suffixes = get_zip_suffixes(pid)
files_to_compute.extend(expand("data/raw/{pid}/empatica_electrodermal_activity_unzipped_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_electrodermal_activity_raw_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_electrodermal_activity_joined.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_electrodermal_activity_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_electrodermal_activity_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_electrodermal_activity.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys():
if config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["COMPUTE"]:
for pid in config["PIDS"]:
suffixes = get_zip_suffixes(pid)
files_to_compute.extend(expand("data/raw/{pid}/empatica_blood_volume_pulse_unzipped_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_blood_volume_pulse_raw_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_blood_volume_pulse_joined.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_blood_volume_pulse_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_blood_volume_pulse_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_blood_volume_pulse.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
if config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["COMPUTE"]:
for pid in config["PIDS"]:
suffixes = get_zip_suffixes(pid)
files_to_compute.extend(expand("data/raw/{pid}/empatica_inter_beat_interval_unzipped_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_inter_beat_interval_raw_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_inter_beat_interval_joined.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_inter_beat_interval_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_inter_beat_interval_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_inter_beat_interval.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -378,13 +338,9 @@ for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
if isinstance(config["EMPATICA_TAGS"]["PROVIDERS"], dict):
for provider in config["EMPATICA_TAGS"]["PROVIDERS"].keys():
if config["EMPATICA_TAGS"]["PROVIDERS"][provider]["COMPUTE"]:
for pid in config["PIDS"]:
suffixes = get_zip_suffixes(pid)
files_to_compute.extend(expand("data/raw/{pid}/empatica_tags_unzipped_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_tags_raw_{suffix}.csv", pid=pid, suffix=suffixes))
files_to_compute.extend(expand("data/raw/{pid}/empatica_tags_joined.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_tags_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_tags_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_tags_features/empatica_tags_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_TAGS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/empatica_tags_features/empatica_tags_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_TAGS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_tags.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")


@ -1,31 +1,25 @@
# See https://www.rapids.science/latest/setup/configuration/#database-credentials
DATABASE_GROUP: &database_group
MY_GROUP
# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
TIMEZONE: &timezone
America/New_York
########################################################################################################################
# GLOBAL CONFIGURATION #
########################################################################################################################
# See https://www.rapids.science/latest/setup/configuration/#participant-files
PIDS: [test01]
# See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files
CREATE_PARTICIPANT_FILES:
SOURCE:
TYPE: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE
DATABASE_GROUP: *database_group
CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format
TIMEZONE: *timezone
CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format
PHONE_SECTION:
ADD: TRUE
ADD: True
DEVICE_ID_COLUMN: device_id # column name
IGNORED_DEVICE_IDS: []
FITBIT_SECTION:
ADD: FALSE
DEVICE_ID_COLUMN: device_id # column name
ADD: True
DEVICE_ID_COLUMN: fitbit_id # column name
IGNORED_DEVICE_IDS: []
EMPATICA_SECTION:
ADD: FALSE
ADD: True
DEVICE_ID_COLUMN: empatica_id # column name
IGNORED_DEVICE_IDS: []
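CREATE_PARTICIPANT_FILES now reads each device id from its own column (device_id, fitbit_id, empatica_id) and adds phone, Fitbit, and Empatica sections by default. A hedged sketch of what a CSV at CSV_FILE_PATH could look like under these defaults; the exact columns RAPIDS requires are described in the linked docs, so treat these headers and values as assumptions:

    device_id,fitbit_id,empatica_id,pid,label,platform,start_date,end_date
    a748ee1a-1d0b-4ae9-9074-279a2b6ba524,fitbit01,empatica01,p01,test01,android,2020-01-01,2021-01-01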
# See https://www.rapids.science/latest/setup/configuration/#time-segments
TIME_SEGMENTS: &time_segments
@ -33,33 +27,47 @@ TIME_SEGMENTS: &time_segments
FILE: "data/external/timesegments_periodic.csv"
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs
# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
TIMEZONE:
TYPE: SINGLE
SINGLE:
TZCODE: America/New_York
MULTIPLE:
TZCODES_FILE: data/external/multiple_timezones_example.csv
IF_MISSING_TZCODE: STOP
DEFAULT_TZCODE: America/New_York
FITBIT:
ALLOW_MULTIPLE_TZ_PER_DEVICE: False
INFER_FROM_SMARTPHONE_TZ: False
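The new MULTIPLE mode reads per-device timezones from TZCODES_FILE. As a hedged illustration, assuming each row assigns a tz database code to a device from a given timestamp onward (column names and values are assumptions; see the timezone docs linked above):

    device_id,tzcode,timestamp
    a748ee1a-1d0b-4ae9-9074-279a2b6ba524,America/New_York,0
    a748ee1a-1d0b-4ae9-9074-279a2b6ba524,America/Los_Angeles,1611673600000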
########################################################################################################################
# PHONE #
########################################################################################################################
# See https://www.rapids.science/latest/setup/configuration/#device-data-source-configuration
PHONE_DATA_CONFIGURATION:
SOURCE:
TYPE: DATABASE
DATABASE_GROUP: *database_group
DEVICE_ID_COLUMN: device_id # column name
TIMEZONE:
TYPE: SINGLE
VALUE: *timezone
# See https://www.rapids.science/latest/setup/configuration/#data-stream-configuration
PHONE_DATA_STREAMS:
USE: aware_mysql
# AVAILABLE:
aware_mysql:
DATABASE_GROUP: MY_GROUP
aware_csv:
FOLDER: data/external/aware_csv
aware_influxdb:
DATABASE_GROUP: MY_GROUP
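The aware_mysql and aware_influxdb streams point DATABASE_GROUP at a group defined in credentials.yaml, the file newly added to .gitignore above. A minimal sketch of such a group, assuming the usual connection keys; the exact field names are an assumption, so check the data-stream docs:

    MY_GROUP:
      database: rapids_example
      host: 127.0.0.1
      port: 3306
      user: rapids
      password: change-me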
# Sensors ------
# https://www.rapids.science/latest/features/phone-accelerometer/
PHONE_ACCELEROMETER:
TABLE: accelerometer
CONTAINER: accelerometer
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
SRC_FOLDER: "rapids" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_accelerometer/rapids/main.py
PANDA:
COMPUTE: False
@ -67,12 +75,11 @@ PHONE_ACCELEROMETER:
FEATURES:
exertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
nonexertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
SRC_FOLDER: "panda" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_accelerometer/panda/main.py
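TABLE becomes CONTAINER throughout the config because, with data streams, a sensor's source can be either a database table (aware_mysql, aware_influxdb) or a file (aware_csv). A hedged sketch of the two cases for this sensor; the csv filename and its resolution relative to the stream's FOLDER are assumptions:

    # USE: aware_mysql -> CONTAINER is a table name
    PHONE_ACCELEROMETER:
      CONTAINER: accelerometer
    # USE: aware_csv -> CONTAINER is a csv file inside data/external/aware_csv (assumption)
    PHONE_ACCELEROMETER:
      CONTAINER: accelerometer.csv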
# See https://www.rapids.science/latest/features/phone-activity-recognition/
PHONE_ACTIVITY_RECOGNITION:
TABLE:
CONTAINER:
ANDROID: plugin_google_activity_recognition
IOS: plugin_ios_activity_recognition
EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same AR episode.
@ -84,12 +91,11 @@ PHONE_ACTIVITY_RECOGNITION:
STATIONARY: ["still", "tilting"]
MOBILE: ["on_foot", "walking", "running", "on_bicycle"]
VEHICLE: ["in_vehicle"]
SRC_FOLDER: "rapids" # inside src/features/phone_activity_recognition
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_activity_recognition/rapids/main.py
# See https://www.rapids.science/latest/features/phone-applications-crashes/
PHONE_APPLICATIONS_CRASHES:
TABLE: applications_crashes
CONTAINER: applications_crashes
APPLICATION_CATEGORIES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
@ -99,7 +105,7 @@ PHONE_APPLICATIONS_CRASHES:
# See https://www.rapids.science/latest/features/phone-applications-foreground/
PHONE_APPLICATIONS_FOREGROUND:
TABLE: applications_foreground
CONTAINER: applications_foreground
APPLICATION_CATEGORIES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
@ -116,12 +122,11 @@ PHONE_APPLICATIONS_FOREGROUND:
EXCLUDED_CATEGORIES: []
EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"]
FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
SRC_FOLDER: "rapids" # inside src/features/phone_applications_foreground
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_applications_foreground/rapids/main.py
# See https://www.rapids.science/latest/features/phone-applications-notifications/
PHONE_APPLICATIONS_NOTIFICATIONS:
TABLE: applications_notifications
CONTAINER: applications_notifications
APPLICATION_CATEGORIES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
@ -129,31 +134,25 @@ PHONE_APPLICATIONS_NOTIFICATIONS:
SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD
# See https://www.rapids.science/latest/features/phone-aware-log/
PHONE_AWARE_LOG:
TABLE: aware_log
PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD
# See https://www.rapids.science/latest/features/phone-battery/
PHONE_BATTERY:
TABLE: battery
CONTAINER: battery
EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
SRC_FOLDER: "rapids" # inside src/features/phone_battery
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_battery/rapids/main.py
# See https://www.rapids.science/latest/features/phone-bluetooth/
PHONE_BLUETOOTH:
TABLE: bluetooth
CONTAINER: bluetooth
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth
SRC_LANGUAGE: "r"
SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R
DORYAB:
COMPUTE: False
FEATURES:
@ -169,12 +168,11 @@ PHONE_BLUETOOTH:
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
SRC_FOLDER: "doryab" # inside src/features/phone_bluetooth
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_bluetooth/doryab/main.py
# See https://www.rapids.science/latest/features/phone-calls/
PHONE_CALLS:
TABLE: calls
CONTAINER: calls
PROVIDERS:
RAPIDS:
COMPUTE: False
@ -183,12 +181,11 @@ PHONE_CALLS:
missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_calls
SRC_SCRIPT: src/features/phone_calls/rapids/main.R
# See https://www.rapids.science/latest/features/phone-conversation/
PHONE_CONVERSATION:
TABLE:
CONTAINER:
ANDROID: plugin_studentlife_audio_android
IOS: plugin_studentlife_audio
PROVIDERS:
@ -202,8 +199,7 @@ PHONE_CONVERSATION:
"unknownexpectedfraction","countconversation"]
RECORDING_MINUTES: 1
PAUSED_MINUTES : 3
SRC_FOLDER: "rapids" # inside src/features/phone_conversation
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_conversation/rapids/main.py
# See https://www.rapids.science/latest/features/phone-data-yield/
PHONE_DATA_YIELD:
@ -213,27 +209,25 @@ PHONE_DATA_YIELD:
COMPUTE: False
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_data_yield
SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R
# See https://www.rapids.science/latest/features/phone-keyboard/
PHONE_KEYBOARD:
TABLE: keyboard
CONTAINER: keyboard
PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD
# See https://www.rapids.science/latest/features/phone-light/
PHONE_LIGHT:
TABLE: light
CONTAINER: light
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
SRC_FOLDER: "rapids" # inside src/features/phone_light
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_light/rapids/main.py
# See https://www.rapids.science/latest/features/phone-locations/
PHONE_LOCATIONS:
TABLE: locations
CONTAINER: locations
LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED
FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
@ -247,7 +241,7 @@ PHONE_LOCATIONS:
DORYAB:
COMPUTE: False
FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"]
ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
ACCURACY_LIMIT: 100 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
DBSCAN_EPS: 10 # meters
DBSCAN_MINSAMPLES: 5
THRESHOLD_STATIC : 1 # km/h
@ -257,21 +251,26 @@ PHONE_LOCATIONS:
CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT
CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS
RADIUS_FOR_HOME: 100
SRC_FOLDER: "doryab" # inside src/features/phone_locations
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_locations/doryab/main.py
BARNETT:
COMPUTE: False
FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
TIMEZONE: *timezone
ACCURACY_LIMIT: 100 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
MINUTES_DATA_USED: False # Use this for quality control purposes; it reports how many minutes of data (location coordinates grouped by minute) were used to compute features
SRC_FOLDER: "barnett" # inside src/features/phone_locations
SRC_LANGUAGE: "r"
SRC_SCRIPT: src/features/phone_locations/barnett/main.R
# See https://www.rapids.science/latest/features/phone-log/
PHONE_LOG:
CONTAINER:
ANDROID: aware_log
IOS: ios_aware_log
PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD
# See https://www.rapids.science/latest/features/phone-messages/
PHONE_MESSAGES:
TABLE: messages
CONTAINER: messages
PROVIDERS:
RAPIDS:
COMPUTE: False
@ -279,12 +278,11 @@ PHONE_MESSAGES:
FEATURES:
received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_messages
SRC_SCRIPT: src/features/phone_messages/rapids/main.R
# See https://www.rapids.science/latest/features/phone-screen/
PHONE_SCREEN:
TABLE: screen
CONTAINER: screen
PROVIDERS:
RAPIDS:
COMPUTE: False
@ -293,28 +291,25 @@ PHONE_SCREEN:
IGNORE_EPISODES_LONGER_THAN: 0 # in minutes, set to 0 to disable
FEATURES: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration", "firstuseafter"] # "episodepersensedminutes" needs to be added later
EPISODE_TYPES: ["unlock"]
SRC_FOLDER: "rapids" # inside src/features/phone_screen
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_screen/rapids/main.py
# See https://www.rapids.science/latest/features/phone-wifi-connected/
PHONE_WIFI_CONNECTED:
TABLE: "sensor_wifi"
CONTAINER: sensor_wifi
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_wifi_connected
SRC_LANGUAGE: "r"
SRC_SCRIPT: src/features/phone_wifi_connected/rapids/main.R
# See https://www.rapids.science/latest/features/phone-wifi-visible/
PHONE_WIFI_VISIBLE:
TABLE: "wifi"
CONTAINER: wifi
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_wifi_visible
SRC_LANGUAGE: "r"
SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R
@ -322,16 +317,26 @@ PHONE_WIFI_VISIBLE:
# FITBIT #
########################################################################################################################
# See https://www.rapids.science/latest/setup/configuration/#device-data-source-configuration
FITBIT_DATA_CONFIGURATION:
SOURCE:
TYPE: DATABASE # DATABASE or FILES (set each [FITBIT_SENSOR][TABLE] attribute with a table name or a file path accordingly)
COLUMN_FORMAT: JSON # JSON or PLAIN_TEXT
DATABASE_GROUP: *database_group
DEVICE_ID_COLUMN: device_id # column name
TIMEZONE:
TYPE: SINGLE # Fitbit devices don't support time zones so we read this data in the timezone indicated by VALUE
VALUE: *timezone
# See https://www.rapids.science/latest/setup/configuration/#data-stream-configuration
FITBIT_DATA_STREAMS:
USE: fitbitjson_mysql
# AVAILABLE:
fitbitjson_mysql:
DATABASE_GROUP: MY_GROUP
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
fitbitparsed_mysql:
DATABASE_GROUP: MY_GROUP
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
fitbitjson_csv:
FOLDER: data/external/fitbit_csv
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
fitbitparsed_csv:
FOLDER: data/external/fitbit_csv
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
# Sensors ------
@ -343,45 +348,40 @@ FITBIT_DATA_YIELD:
COMPUTE: False
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/fitbit_data_yield
SRC_SCRIPT: src/features/fitbit_data_yield/rapids/main.R
# See https://www.rapids.science/latest/features/fitbit-heartrate-summary/
FITBIT_HEARTRATE_SUMMARY:
TABLE: heartrate_summary
CONTAINER: heartrate_summary
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["maxrestinghr", "minrestinghr", "avgrestinghr", "medianrestinghr", "moderestinghr", "stdrestinghr", "diffmaxmoderestinghr", "diffminmoderestinghr", "entropyrestinghr"] # calories features' accuracy depend on the accuracy of the participants fitbit profile (e.g. height, weight) use these with care: ["sumcaloriesoutofrange", "maxcaloriesoutofrange", "mincaloriesoutofrange", "avgcaloriesoutofrange", "mediancaloriesoutofrange", "stdcaloriesoutofrange", "entropycaloriesoutofrange", "sumcaloriesfatburn", "maxcaloriesfatburn", "mincaloriesfatburn", "avgcaloriesfatburn", "mediancaloriesfatburn", "stdcaloriesfatburn", "entropycaloriesfatburn", "sumcaloriescardio", "maxcaloriescardio", "mincaloriescardio", "avgcaloriescardio", "mediancaloriescardio", "stdcaloriescardio", "entropycaloriescardio", "sumcaloriespeak", "maxcaloriespeak", "mincaloriespeak", "avgcaloriespeak", "mediancaloriespeak", "stdcaloriespeak", "entropycaloriespeak"]
SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_summary
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/fitbit_heartrate_summary/rapids/main.py
# See https://www.rapids.science/latest/features/fitbit-heartrate-intraday/
FITBIT_HEARTRATE_INTRADAY:
TABLE: heartrate_intraday
CONTAINER: heartrate_intraday
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "minutesonoutofrangezone", "minutesonfatburnzone", "minutesoncardiozone", "minutesonpeakzone"]
SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_intraday
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/fitbit_heartrate_intraday/rapids/main.py
# See https://www.rapids.science/latest/features/fitbit-sleep-summary/
FITBIT_SLEEP_SUMMARY:
TABLE: sleep_summary
SLEEP_EPISODE_TIMESTAMP: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
CONTAINER: sleep_summary
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["countepisode", "avgefficiency", "sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgdurationafterwakeup", "avgdurationasleep", "avgdurationawake", "avgdurationtofallasleep", "avgdurationinbed"]
SLEEP_TYPES: ["main", "nap", "all"]
SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_summary
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/fitbit_sleep_summary/rapids/main.py
# See https://www.rapids.science/latest/features/fitbit-sleep-intraday/
FITBIT_SLEEP_INTRADAY:
TABLE: sleep_intraday
CONTAINER: sleep_intraday
PROVIDERS:
RAPIDS:
COMPUTE: False
@ -398,8 +398,7 @@ FITBIT_SLEEP_INTRADAY:
SLEEP_TYPES: [main, nap]
INCLUDE_SLEEP_LATER_THAN: 0 # a number ranged from 0 (midnight) to 1439 (23:59)
REFERENCE_TIME: MIDNIGHT # chosen from "MIDNIGHT" and "START_OF_THE_SEGMENT"
SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_intraday
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/fitbit_sleep_intraday/rapids/main.py
PRICE:
COMPUTE: False
@ -412,22 +411,20 @@ FITBIT_SLEEP_INTRADAY:
GROUP_EPISODES_WITHIN: # by default: today's 6pm to tomorrow's noon
START_TIME: 1080 # number of minutes after the midnight (18:00) 18*60
LENGTH: 1080 # in minutes (18 hours) 18*60
SRC_FOLDER: "price" # inside src/features/fitbit_sleep_intraday
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/fitbit_sleep_intraday/price/main.py
# See https://www.rapids.science/latest/features/fitbit-steps-summary/
FITBIT_STEPS_SUMMARY:
TABLE: steps_summary
CONTAINER: steps_summary
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["maxsumsteps", "minsumsteps", "avgsumsteps", "mediansumsteps", "stdsumsteps"]
SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_summary
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/fitbit_steps_summary/rapids/main.py
# See https://www.rapids.science/latest/features/fitbit-steps-intraday/
FITBIT_STEPS_INTRADAY:
TABLE: steps_intraday
CONTAINER: steps_intraday
PROVIDERS:
RAPIDS:
COMPUTE: False
@ -437,92 +434,78 @@ FITBIT_STEPS_INTRADAY:
ACTIVE_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
THRESHOLD_ACTIVE_BOUT: 10 # steps
INCLUDE_ZERO_STEP_ROWS: False
SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_intraday
SRC_LANGUAGE: "python"
# FITBIT_CALORIES:
# TABLE_FORMAT: JSON # JSON or CSV. If your JSON or CSV data are files change [DEVICE_DATA][FITBIT][SOURCE][TYPE] to FILES
# TABLE:
# JSON: fitbit_calories
# CSV:
# SUMMARY: calories_summary
# INTRADAY: calories_intraday
# PROVIDERS:
# RAPIDS:
# COMPUTE: False
# FEATURES: []
SRC_SCRIPT: src/features/fitbit_steps_intraday/rapids/main.py
########################################################################################################################
# EMPATICA #
########################################################################################################################
EMPATICA_DATA_CONFIGURATION:
SOURCE:
TYPE: ZIP_FILE
EMPATICA_DATA_STREAMS:
USE: empatica_zip
# AVAILABLE:
empatica_zip:
FOLDER: data/external/empatica
TIMEZONE:
TYPE: SINGLE # Empatica devices don't support time zones so we read this data in the timezone indicated by VALUE
VALUE: *timezone
# Sensors ------
# See https://www.rapids.science/latest/features/empatica-accelerometer/
EMPATICA_ACCELEROMETER:
TABLE: ACC
CONTAINER: ACC
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
SRC_FOLDER: "dbdp" # inside src/features/empatica_accelerometer
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-heartrate/
EMPATICA_HEARTRATE:
TABLE: HR
CONTAINER: HR
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr"]
SRC_FOLDER: "dbdp" # inside src/features/empatica_heartrate
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/empatica_heartrate/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-temperature/
EMPATICA_TEMPERATURE:
TABLE: TEMP
CONTAINER: TEMP
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
SRC_FOLDER: "dbdp" # inside src/features/empatica_heartrate
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-electrodermal-activity/
EMPATICA_ELECTRODERMAL_ACTIVITY:
TABLE: EDA
CONTAINER: EDA
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
SRC_FOLDER: "dbdp" # inside src/features/empatica_electrodermal_activity
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-blood-volume-pulse/
EMPATICA_BLOOD_VOLUME_PULSE:
TABLE: BVP
CONTAINER: BVP
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxbvp", "minbvp", "avgbvp", "medianbvp", "modebvp", "stdbvp", "diffmaxmodebvp", "diffminmodebvp", "entropybvp"]
SRC_FOLDER: "dbdp" # inside src/features/empatica_blood_volume_pulse
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/empatica_blood_volume_pulse/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-inter-beat-interval/
EMPATICA_INTER_BEAT_INTERVAL:
TABLE: IBI
CONTAINER: IBI
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
SRC_FOLDER: "dbdp" # inside src/features/inter_beat_interval
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-tags/
EMPATICA_TAGS:
TABLE: TAGS
CONTAINER: TAGS
PROVIDERS: # None implemented yet
@ -530,21 +513,28 @@ EMPATICA_TAGS:
# PLOTS #
########################################################################################################################
# Data quality
# Data quality ------
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#1-histograms-of-phone-data-yield
HISTOGRAM_PHONE_DATA_YIELD:
PLOT: False
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#2-heatmaps-of-overall-data-yield
HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:
PLOT: False
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#3-heatmap-of-recorded-phone-sensors
HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT:
PLOT: False
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#4-heatmap-of-sensor-row-count
HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:
PLOT: False
SENSORS: [PHONE_ACCELEROMETER, PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE]
# Features
# Features ------
# See https://www.rapids.science/latest/visualizations/feature-visualizations/#1-heatmap-correlation-matrix
HEATMAP_FEATURE_CORRELATION_MATRIX:
PLOT: False
MIN_ROWS_RATIO: 0.5

View File

View File

View File

@ -1,11 +1,20 @@
# Change Log
## Next release
- Add support for Empatica devices (all sensors)
- Add logo
- Move Citation page to the Setup section
- Add `config.yaml` validation schema and documentation.
- Add time at home Doryab location feature and home coordinates to location file
## v1.0.0
- Add a new [Overview](../setup/overview/) page.
- You can [extend](../datastreams/add-new-data-streams/) RAPIDS with your own [data streams](../datastreams/data-streams-introduction/). Data streams are data collected with other sensing apps besides AWARE (like Beiwe, mindLAMP), and stored in other data containers (databases, files) besides MySQL.
- Support to analyze Empatica wearable data (thanks to Joe Kim and Brinnae Bent from the [DBDP](https://dbdp.org/))
- Support to analyze AWARE data stored in [CSV files](../datastreams/aware-csv/) and [InfluxDB](../datastreams/aware-influxdb/) databases
- Support to analyze data collected over [multiple time zones](../setup/configuration/#multiple-timezones)
- Support for [sleep intraday features](../features/fitbit-sleep-intraday/) from the core team and also from the community (thanks to Stephen Price)
- Users can comment on the documentation (powered by utterances).
- `SRC_FOLDER` and `SRC_LANGUAGE` are replaced by `SRC_SCRIPT`.
- Add RAPIDS new logo
- Move Citation and Minimal Example page to the Setup section
- Add `config.yaml` validation schema and documentation. Now it's more difficult to modify the `config.yaml` file with invalid values.
- Add new `time at home` Doryab location feature
- Add home coordinates to the location data file so location providers can build features based on them.
- If you are migrating from RAPIDS 0.4.3 or older, check this [guide](../migrating-from-old-versions/#migrating-from-rapids-04x-or-older)
## v0.4.3
- Fix bug when any of the rows from any sensor do not belong to a time segment
## v0.4.2

View File

@ -1,4 +1,4 @@
# Frequently Asked Questions
# Common Errors
## Cannot connect to your MySQL server
@ -41,7 +41,7 @@
## Every time I force the download_dataset rule all rules are executed
???+ failure "Problem"
When running `snakemake -j1 -R download_phone_data` or `./rapids -j1 -R download_phone_data` all the rules and files are re-computed
When running `snakemake -j1 -R pull_phone_data` or `./rapids -j1 -R pull_phone_data` all the rules and files are re-computed
???+ done "Solution"
This is expected behavior. The advantage of using `snakemake` under the hood is that every time a file containing data is modified, every rule that depends on that file will be re-executed to update its results. In this case, since `download_dataset` updates all the raw data and you are forcing the rule with the flag `-R`, every single rule that depends on those raw files will be executed.
@ -58,7 +58,7 @@
```
???+ done "Solution"
Please make sure the sensors listed in `[PHONE_VALID_SENSED_BINS][PHONE_SENSORS]` and the `[TABLE]` of each sensor you activated in `config.yaml` match your database tables.
Please make sure the sensors listed in `[PHONE_VALID_SENSED_BINS][PHONE_SENSORS]` and the `[CONTAINER]` of each sensor you activated in `config.yaml` match your database tables or files.
---
## How do I install RAPIDS on Ubuntu 16.04
@ -215,7 +215,7 @@
```bash
R -e 'renv::install("RMySQL")'
```
- Go to `src/data/download_phone_data.R` or `src/data/download_fitbit_data.R` and replace `library(RMariaDB)` with `library(RMySQL)`
- Go to `src/data/streams/pull_phone_data.R` or `src/data/streams/pull_fitbit_data.R` and replace `library(RMariaDB)` with `library(RMySQL)`
- In the same file(s) replace `dbEngine <- dbConnect(MariaDB(), default.file = "./.env", group = group)` with `dbEngine <- dbConnect(MySQL(), default.file = "./.env", group = group)`
## There is no package called `RMariaDB`

View File

@ -0,0 +1,350 @@
# Add New Data Streams
A data stream is a set of sensor data collected using a specific type of **device** with a specific **format** and stored in a specific **container**. RAPIDS is agnostic to data streams' formats and containers; see the [Data Streams Introduction](../data-streams-introduction) for a list of supported streams.
**A container** is queried with an R or Python script that connects to the database, API or file where your stream's raw data is stored.
**A format** is described using a `format.yaml` file that specifies how to map and mutate your stream's raw data to match the data and format RAPIDS needs.
The most common cases when you would want to implement a new data stream are:
- You collected data with a mobile sensing app RAPIDS does not support yet. For example, [Beiwe](https://www.beiwe.org/) data stored in MySQL. You will need to define a new format file and a new container script.
- You collected data with a mobile sensing app RAPIDS supports, but this data is stored in a container that RAPIDS can't connect to yet. For example, AWARE data stored in PostgreSQL. In this case, you can reuse the format file of the `aware_mysql` stream, but you will need to implement a new container script.
!!! hint
Both the `container.[R|py]` and the `format.yaml` are stored in `./src/data/streams/[stream_name]` where `[stream_name]` can be `aware_mysql` for example.
## Implement a Container
The `container` script of a data stream can be implemented in R (strongly recommended) or Python. This script must have two functions if you are implementing a stream for phone data, or one function otherwise. The script can contain other auxiliary functions.
First of all, add any parameters your script might need in `config.yaml` under `(device)_DATA_STREAMS`. These parameters will be available in the `stream_parameters` argument of the one or two functions you implement. For example, if you are adding support for `Beiwe` data stored in `PostgreSQL` and your container needs a set of credentials to connect to a database, your new data stream configuration would be:
```yaml hl_lines="7 8"
PHONE_DATA_STREAMS:
USE: aware_python
# AVAILABLE:
aware_mysql:
DATABASE_GROUP: MY_GROUP
beiwe_postgresql:
DATABASE_GROUP: MY_GROUP # users define this group (user, password, host, etc.) in credentials.yaml
```
Then implement one or both of the following functions:
=== "pull_data"
This function returns the data columns for a specific sensor and participant. It has the following parameters:
| Param | Description |
|--------------------|-------------------------------------------------------------------------------------------------------|
| stream_parameters | Any parameters (keys/values) set by the user in any `[DEVICE_DATA_STREAMS][stream_name]` key of `config.yaml`. For example, `[DATABASE_GROUP]` inside `[FITBIT_DATA_STREAMS][fitbitjson_mysql]` |
| sensor_container | The value set by the user in any `[DEVICE_SENSOR][CONTAINER]` key of `config.yaml`. It can be a table, file path, or whatever data source you want to support that contains the **data from a single sensor for all participants**. For example, `[PHONE_ACCELEROMETER][CONTAINER]`|
| device | The device id that you need to get the data for (this is set by the user in the [participant files](../../setup/configuration/#participant-files)). For example, in AWARE this device id is a uuid|
| columns | A list of the columns that you need to get from `sensor_container`. You specify these columns in your stream's `format.yaml`|
!!! example
This is the `pull_data` function we implemented for `aware_mysql`. Note that we can `message`, `warn` or `stop` the user during execution.
```r
pull_data <- function(stream_parameters, device, sensor_container, columns){
# get_db_engine is an auxiliary function not shown here for brevity but can be found in src/data/streams/aware_mysql/container.R
dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
query <- paste0("SELECT ", paste(columns, collapse = ",")," FROM ", sensor_container, " WHERE device_id = '", device,"'")
# Letting the user know what we are doing
message(paste0("Executing the following query to download data: ", query))
sensor_data <- dbGetQuery(dbEngine, query)
dbDisconnect(dbEngine)
if(nrow(sensor_data) == 0)
warning(paste("The device '", device,"' did not have data in ", sensor_container))
return(sensor_data)
}
```
=== "infer_device_os"
!!! warning
This function is only necessary for phone data streams.
RAPIDS allows users to use the keyword `infer` (previously `multiple`) to [automatically infer](../../setup/configuration/#structure-of-participants-files) the mobile operating system a phone was running.
If you have a way to infer the OS of a device id, implement this function. For example, for AWARE data we use the `aware_device` table.
If you don't have a way to infer the OS, call `stop("Error Message")` so other users know they can't use `infer` or the inference failed, and they have to assign the OS manually in the participant file.
This function returns the operating system (`android` or `ios`) for a specific phone device id. It has the following parameters:
| Param | Description |
|--------------------|-------------------------------------------------------------------------------------------------------|
| stream_parameters | Any parameters (keys/values) set by the user in any `[DEVICE_DATA_STREAMS][stream_name]` key of `config.yaml`. For example, `[DATABASE_GROUP]` inside `[FITBIT_DATA_STREAMS][fitbitjson_mysql]` |
| device | The device id that you need to infer the OS for (this is set by the user in the [participant files](../../setup/configuration/#participant-files)). For example, in AWARE this device id is a uuid|
!!! example
This is the `infer_device_os` function we implemented for `aware_mysql`. Note that we can `message`, `warn` or `stop` the user during execution.
```r
infer_device_os <- function(stream_parameters, device){
# get_db_engine is an auxiliary function not shown here for brevity but can be found in src/data/streams/aware_mysql/container.R
group <- stream_parameters$DATABASE_GROUP
dbEngine <- dbConnect(MariaDB(), default.file = "./.env", group = group)
query <- paste0("SELECT device_id,brand FROM aware_device WHERE device_id = '", device, "'")
message(paste0("Executing the following query to infer phone OS: ", query))
os <- dbGetQuery(dbEngine, query)
dbDisconnect(dbEngine)
if(nrow(os) > 0)
return(os %>% mutate(os = ifelse(brand == "iPhone", "ios", "android")) %>% pull(os))
else
stop(paste("We cannot infer the OS of the following device id because it does not exist in the aware_device table:", device))
return(os)
}
```
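Container scripts can also be written in Python (R is still the recommended option). The following is only a rough sketch of what a `pull_data` function could look like for the hypothetical `beiwe_postgresql` stream used above; the `CONNECTION_STRING` parameter and the use of SQLAlchemy and pandas are assumptions for illustration, not part of RAPIDS.
```python
import pandas as pd
from sqlalchemy import create_engine

def pull_data(stream_parameters, device, sensor_container, columns):
    # stream_parameters holds whatever keys you define under
    # [PHONE_DATA_STREAMS][beiwe_postgresql] in config.yaml;
    # CONNECTION_STRING is a hypothetical parameter used only in this sketch
    engine = create_engine(stream_parameters["CONNECTION_STRING"])
    query = "SELECT {} FROM {} WHERE device_id = '{}'".format(
        ", ".join(columns), sensor_container, device)
    # Letting the user know what we are doing
    print("Executing the following query to download data: " + query)
    sensor_data = pd.read_sql(query, engine)
    if sensor_data.empty:
        print("The device '{}' did not have data in {}".format(device, sensor_container))
    return sensor_data
```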
## Implement a Format
A format file `format.yaml` describes the mapping between your stream's raw data and the data that RAPIDS needs. This file has a section per sensor (e.g. `PHONE_ACCELEROMETER`), and each section has two attributes (keys):
1. `RAPIDS_COLUMN_MAPPINGS` are mappings between the columns RAPIDS needs and the columns your raw data already has.
    1. The reserved keyword `FLAG_TO_MUTATE` flags columns that RAPIDS requires but that are not initially present in your container (database, CSV file). These columns have to be created by your mutation scripts.
2. `MUTATION`. Sometimes your raw data needs to be transformed to match the format RAPIDS can handle (including creating columns marked as `FLAG_TO_MUTATE`)
    1. `COLUMN_MAPPINGS` are mappings between the columns a mutation `SCRIPT` needs and the columns your raw data has.
    2. `SCRIPTS` are a collection of R or Python scripts that transform one or more raw data columns into the format RAPIDS needs.
!!! hint
`[RAPIDS_COLUMN_MAPPINGS]` and `[MUTATE][COLUMN_MAPPINGS]` have a `key` (left-hand side string) and a `value` (right-hand side string). The `values` are the names used to pull columns from a container (e.g., columns in a database table). All `values` are renamed to their `keys` in lower case. The renamed columns are sent to every mutation script within the `data` argument, and the final output is the input that RAPIDS processes further.
For example, let's assume we are implementing `beiwe_mysql` and defining the following format for `PHONE_FAKESENSOR`:
```yaml
PHONE_FAKESENSOR:
ANDROID:
RAPIDS_COLUMN_MAPPINGS:
TIMESTAMP: beiwe_timestamp
DEVICE_ID: beiwe_deviceID
MAGNITUDE_SQUARED: FLAG_TO_MUTATE
MUTATE:
COLUMN_MAPPINGS:
MAGNITUDE: beiwe_value
SCRIPTS:
- src/data/streams/mutations/phone/square_magnitude.py
```
RAPIDS will:
1. Download `beiwe_timestamp`, `beiwe_deviceID`, and `beiwe_value` from the container of `beiwe_mysql` (MySQL DB)
2. Rename these columns to `timestamp`, `device_id`, and `magnitude`, respectively.
3. Execute `square_magnitude.py` with a data frame as an argument containing the renamed columns. This script will square `magnitude` and rename it to `magnitude_squared`
4. Verify the data frame returned by `square_magnitude.py` has the columns RAPIDS needs `timestamp`, `device_id`, and `magnitude_squared`.
5. Use this data frame as the input to be processed in the pipeline.
Note that although `RAPIDS_COLUMN_MAPPINGS` and `[MUTATE][COLUMN_MAPPINGS]` keys are in capital letters for readability (e.g. `MAGNITUDE_SQUARED`), the names of the final columns you mutate in your scripts should be lower case.
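As a reference, a minimal sketch of the hypothetical `square_magnitude.py` script used in this example could look like the block below (the column names follow the mapping above).
```python
import pandas as pd

def main(data, stream_parameters):
    # 'magnitude' is the renamed beiwe_value column (see the mapping above)
    data["magnitude_squared"] = data["magnitude"] ** 2
    # RAPIDS only needs timestamp, device_id, and magnitude_squared
    return data.drop(columns=["magnitude"])
```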
Let's explain in more depth this column mapping with examples.
### Name mapping
The mapping for some sensors is straightforward. For example, accelerometer data most of the time has a timestamp, three axes (x,y,z), and a device id that produced it. AWARE and a different sensing app like Beiwe likely logged accelerometer data in the same way but with different column names. In this case, we only need to match Beiwe data columns to RAPIDS columns one-to-one:
```yaml hl_lines="4 5 6 7 8"
PHONE_ACCELEROMETER:
ANDROID:
RAPIDS_COLUMN_MAPPINGS:
TIMESTAMP: beiwe_timestamp
DEVICE_ID: beiwe_deviceID
DOUBLE_VALUES_0: beiwe_x
DOUBLE_VALUES_1: beiwe_y
DOUBLE_VALUES_2: beiwe_z
MUTATE:
COLUMN_MAPPINGS:
SCRIPTS: # it's ok if this is empty
```
### Value mapping
For some sensors, we need to map column names and values. For example, screen data has ON and OFF events; let's suppose Beiwe represents an ON event with the number `1`, but RAPIDS identifies ON events with the number `2`. In this case, we need to mutate the raw data coming from Beiwe and replace all `1`s with `2`s.
We do this by listing one or more R or Python scripts in `MUTATION_SCRIPTS` that will be executed in order. We usually store all mutation scripts under `src/data/streams/mutations/[device]/[platform]/` and they can be reused across data streams.
```yaml hl_lines="10"
PHONE_SCREEN:
ANDROID:
RAPIDS_COLUMN_MAPPINGS:
TIMESTAMP: beiwe_timestamp
DEVICE_ID: beiwe_deviceID
EVENT: beiwe_event
MUTATE:
COLUMN_MAPPINGS:
SCRIPTS:
- src/data/streams/mutations/phone/beiwe/beiwe_screen_map.py
```
!!! hint
- A `MUTATION_SCRIPT` can also be used to clean/preprocess your data before extracting behavioral features.
- A mutation script has to have a `main` function that receives two arguments, `data` and `stream_parameters`.
- The `stream_parameters` argument contains the `config.yaml` key/values of your data stream (this is the same argument that your `container.[py|R]` script receives, see [Implement a Container](#implement-a-container)).
=== "python"
Example of a python mutation script
```python
import pandas as pd
def main(data, stream_parameters):
# mutate data
return(data)
```
=== "R"
Example of an R mutation script
```r
source("renv/activate.R") # needed to use RAPIDS renv environment
library(dplyr)
main <- function(data, stream_parameters){
# mutate data
return(data)
}
```
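Following the screen example above, a sketch of `beiwe_screen_map.py` could look like this; the `1` to `2` replacement is only the mapping assumed in that example.
```python
import pandas as pd

def main(data, stream_parameters):
    # 'event' is the renamed beiwe_event column (see the value-mapping example)
    # replace Beiwe's assumed ON code (1) with the code RAPIDS expects (2)
    data["event"] = data["event"].replace({1: 2})
    return data
```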
### Complex mapping
Sometimes, your raw data doesn't even have the same columns RAPIDS expects for a sensor. For example, let's pretend Beiwe stores `PHONE_ACCELEROMETER` axis data in a single column called `acc_col` instead of three. You have to create a `MUTATION_SCRIPT` to split `acc_col` into three columns `x`, `y`, and `z`.
For this, you mark the three axes columns RAPIDS needs in `[RAPIDS_COLUMN_MAPPINGS]` with the word `FLAG_TO_MUTATE`, map `acc_col` in `[MUTATION][COLUMN_MAPPINGS]`, and list a Python script under `[MUTATION][SCRIPTS]` with the code to split `acc_col`. See an example below.
RAPIDS expects that every column mapped as `FLAG_TO_MUTATE` will be generated by your mutation script, so it won't try to retrieve them from your container (database, CSV file, etc.).
In our example, `acc_col` will be fetched from the stream's container and renamed to `JOINED_AXES` because `beiwe_split_acc.py` will split it into `double_values_0`, `double_values_1`, and `double_values_2`.
```yaml hl_lines="6 7 8 11 13"
PHONE_ACCELEROMETER:
ANDROID:
RAPIDS_COLUMN_MAPPINGS:
TIMESTAMP: beiwe_timestamp
DEVICE_ID: beiwe_deviceID
DOUBLE_VALUES_0: FLAG_TO_MUTATE
DOUBLE_VALUES_1: FLAG_TO_MUTATE
DOUBLE_VALUES_2: FLAG_TO_MUTATE
MUTATE:
COLUMN_MAPPINGS:
JOINED_AXES: acc_col
SCRIPTS:
- src/data/streams/mutations/phone/beiwe/beiwe_split_acc.py
```
This is a draft of `beiwe_split_acc.py` `MUTATION_SCRIPT`:
```python
import pandas as pd

def main(data, stream_parameters):
    # data has acc_col (renamed to joined_axes, see the mapping above); we assume
    # here, only for illustration, that it stores the three axes as a string like "x,y,z"
    # split it into double_values_0, double_values_1, double_values_2 to match RAPIDS format
    axes = data["joined_axes"].str.split(",", expand=True).astype(float)
    data[["double_values_0", "double_values_1", "double_values_2"]] = axes
    # remove the joined column since we don't need it anymore
    return data.drop(columns=["joined_axes"])
```
### OS complex mapping
There is a special case for a complex mapping scenario for smartphone data streams. The Android and iOS sensor APIs return data in different formats for certain sensors (like screen, activity recognition, battery, among others).
In case you didn't notice, the examples we have used so far are grouped under an `ANDROID` key, which means they will be applied to data collected by Android phones. Additionally, each sensor has an `IOS` key for a similar purpose. We use the complex mapping described above to transform iOS data into an Android format (it's always iOS to Android and any new phone data stream must do the same).
For example, this is the `format.yaml` key for `PHONE_ACTIVITY_RECOGNITION`. Note that the `ANDROID` mapping is simple (one-to-one) but the `IOS` mapping is complex with three `FLAG_TO_MUTATE` columns, two `[MUTATION][COLUMN_MAPPINGS]` mappings, and one `[MUTATION][SCRIPTS]` script.
```yaml hl_lines="16 17 18 21 22 24"
PHONE_ACTIVITY_RECOGNITION:
ANDROID:
RAPIDS_COLUMN_MAPPINGS:
TIMESTAMP: timestamp
DEVICE_ID: device_id
ACTIVITY_TYPE: activity_type
ACTIVITY_NAME: activity_name
CONFIDENCE: confidence
MUTATION:
COLUMN_MAPPINGS:
SCRIPTS:
IOS:
RAPIDS_COLUMN_MAPPINGS:
TIMESTAMP: timestamp
DEVICE_ID: device_id
ACTIVITY_TYPE: FLAG_TO_MUTATE
ACTIVITY_NAME: FLAG_TO_MUTATE
CONFIDENCE: FLAG_TO_MUTATE
MUTATION:
COLUMN_MAPPINGS:
ACTIVITIES: activities
CONFIDENCE: confidence
SCRIPTS:
- "src/data/streams/mutations/phone/aware/activity_recogniton_ios_unification.R"
```
??? "Example activity_recogniton_ios_unification.R"
In this `MUTATION_SCRIPT` we create `ACTIVITY_NAME` and `ACTIVITY_TYPE` based on `activities`, and map `confidence` iOS values to Android values.
```R
source("renv/activate.R")
library("dplyr", warn.conflicts = F)
library(stringr)
clean_ios_activity_column <- function(ios_gar){
ios_gar <- ios_gar %>%
mutate(activities = str_replace_all(activities, pattern = '("|\\[|\\])', replacement = ""))
existent_multiple_activities <- ios_gar %>%
filter(str_detect(activities, ",")) %>%
group_by(activities) %>%
summarise(mutiple_activities = unique(activities), .groups = "drop_last") %>%
pull(mutiple_activities)
known_multiple_activities <- c("stationary,automotive")
unkown_multiple_actvities <- setdiff(existent_multiple_activities, known_multiple_activities)
if(length(unkown_multiple_actvities) > 0){
stop(paste0("There are unkwown combinations of ios activities, you need to implement the decision of the ones to keep: ", unkown_multiple_actvities))
}
ios_gar <- ios_gar %>%
mutate(activities = str_replace_all(activities, pattern = "stationary,automotive", replacement = "automotive"))
return(ios_gar)
}
unify_ios_activity_recognition <- function(ios_gar){
# We only need to unify Google Activity Recognition data for iOS
# discard rows where activities column is blank
ios_gar <- ios_gar[-which(ios_gar$activities == ""), ]
# clean "activities" column of ios_gar
ios_gar <- clean_ios_activity_column(ios_gar)
# make it compatible with android version: generate "activity_name" and "activity_type" columns
ios_gar <- ios_gar %>%
mutate(activity_name = case_when(activities == "automotive" ~ "in_vehicle",
activities == "cycling" ~ "on_bicycle",
activities == "walking" ~ "walking",
activities == "running" ~ "running",
activities == "stationary" ~ "still"),
activity_type = case_when(activities == "automotive" ~ 0,
activities == "cycling" ~ 1,
activities == "walking" ~ 7,
activities == "running" ~ 8,
activities == "stationary" ~ 3,
activities == "unknown" ~ 4),
confidence = case_when(confidence == 0 ~ 0,
confidence == 1 ~ 50,
confidence == 2 ~ 100)
) %>%
select(-activities)
return(ios_gar)
}
main <- function(data, stream_parameters){
    return(unify_ios_activity_recognition(data))
}
```

View File

@ -0,0 +1,32 @@
# `aware_csv`
This [data stream](../../datastreams/data-streams-introduction) handles iOS and Android sensor data collected with the [AWARE Framework](https://awareframework.com/) and stored in CSV files.
!!! warning
The CSV files have to use `,` as separator, `\` as escape character (do not escape `"` with `""`), and wrap any string columns with `"`.
See examples in the CSV files inside [rapids_example_csv.zip](https://osf.io/wbg23/)
??? example "Example of a valid CSV file"
```csv
"_id","timestamp","device_id","activities","confidence","stationary","walking","running","automotive","cycling","unknown","label"
1,1587528000000,"13dbc8a3-dae3-4834-823a-4bc96a7d459d","[\"stationary\"]",2,1,0,0,0,0,0,""
2,1587528060000,"13dbc8a3-dae3-4834-823a-4bc96a7d459d","[\"stationary\"]",2,1,0,0,0,0,0,"supplement"
3,1587528120000,"13dbc8a3-dae3-4834-823a-4bc96a7d459d","[\"stationary\"]",2,1,0,0,0,0,0,"supplement"
4,1587528180000,"13dbc8a3-dae3-4834-823a-4bc96a7d459d","[\"stationary\"]",2,1,0,0,0,0,0,"supplement"
5,1587528240000,"13dbc8a3-dae3-4834-823a-4bc96a7d459d","[\"stationary\"]",2,1,0,0,0,0,0,"supplement"
6,1587528300000,"13dbc8a3-dae3-4834-823a-4bc96a7d459d","[\"stationary\"]",2,1,0,0,0,0,0,"supplement"
7,1587528360000,"13dbc8a3-dae3-4834-823a-4bc96a7d459d","[\"stationary\"]",2,1,0,0,0,0,0,"supplement"
```
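If you export your own data with pandas, a call along the following lines should produce files in this format; the dataframe contents and the file name are placeholders for illustration only.
```python
import csv
import pandas as pd

# placeholder rows; in practice this would be one full sensor table for all participants
df = pd.DataFrame({
    "timestamp": [1587528000000],
    "device_id": ["13dbc8a3-dae3-4834-823a-4bc96a7d459d"],
    "activities": ['["stationary"]'],
})

df.to_csv(
    "plugin_ios_activity_recognition.csv",  # hypothetical file name
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,  # wrap string columns with "
    doublequote=False,             # do not escape " with ""
    escapechar="\\",               # escape " with \ instead
)
```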
## Container
A CSV file per sensor, each containing the data for all participants.
The script to connect and download data from this container is at:
```bash
src/data/streams/aware_csv/container.R
```
## Format
--8<---- "docs/snippets/aware_format.md"

View File

@ -0,0 +1,18 @@
# `aware_influxdb` (beta)
!!! warning
This data stream is being released in beta while we test it thoroughly.
This [data stream](../../datastreams/data-streams-introduction) handles iOS and Android sensor data collected with the [AWARE Framework](https://awareframework.com/) and stored in an InfluxDB database.
## Container
An InfluxDB database with a table per sensor, each containing the data for all participants.
The script to connect and download data from this container is at:
```bash
src/data/streams/aware_influxdb/container.R
```
## Format
--8<---- "docs/snippets/aware_format.md"

View File

@ -0,0 +1,15 @@
# `aware_mysql`
This [data stream](../../datastreams/data-streams-introduction) handles iOS and Android sensor data collected with the [AWARE Framework](https://awareframework.com/) and stored in a MySQL database.
## Container
A MySQL database with a table per sensor, each containing the data for all participants. This is the default database created by the old PHP AWARE server (as opposed to the new JavaScript Micro server).
The script to connect and download data from this container is at:
```bash
src/data/streams/aware_mysql/container.R
```
## Format
--8<---- "docs/snippets/aware_format.md"

View File

@ -0,0 +1,25 @@
# Data Streams Introduction
A data stream is a set of sensor data collected using a specific type of **device** with a specific **format** and stored in a specific **container**.
For example, the `aware_mysql` data stream handles smartphone data (**device**) collected with the [AWARE Framework](https://awareframework.com/) (**format**) stored in a MySQL database (**container**). Similarly, smartphone data collected with [Beiwe](https://www.beiwe.org/) will have a different format and could be stored in a container like a PostgreSQL database or a CSV file.
If you want to process a data stream using RAPIDS, make sure that your data is stored in a supported **format** and **container** (see table below).
If RAPIDS doesn't support your data stream yet (e.g. Beiwe data stored in PostgreSQL, or AWARE data stored in SQLite), you can always [implement a new data stream](../add-new-data-streams). If it's something you think other people might be interested in, we will be happy to include your new data stream in RAPIDS, so get in touch!
!!! hint
Currently, you can add new data streams for smartphones, Fitbit, and Empatica devices. If you need RAPIDS to process data from **other devices**, like Oura Rings or Actigraph wearables, get in touch. It is a more complicated process that could take a couple of days to implement for someone familiar with R or Python, but we would be happy to work on it together.
For reference, these are the data streams we currently support:
| Data Stream | Device | Format | Container | Docs
|--|--|--|--|--|
| `aware_mysql`| Phone | AWARE app | MySQL | [link](../aware-mysql)
| `aware_csv`| Phone | AWARE app | CSV files | [link](../aware-csv)
| `aware_influxdb` (beta)| Phone | AWARE app | InfluxDB | [link](../aware-influxdb)
| `fitbitjson_mysql`| Fitbit | JSON (per [Fitbit's API](https://dev.fitbit.com/build/reference/web-api/)) | MySQL | [link](../fitbitjson-mysql)
| `fitbitjson_csv`| Fitbit | JSON (per [Fitbit's API](https://dev.fitbit.com/build/reference/web-api/)) | CSV files | [link](../fitbitjson-csv)
| `fitbitparsed_mysql`| Fitbit | Parsed (parsed API data) | MySQL | [link](../fitbitparsed-mysql)
| `fitbitparsed_csv`| Fitbit | Parsed (parsed API data) | CSV files | [link](../fitbitparsed-csv)
| `empatica_zip`| Empatica | [E4 Connect](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) | ZIP files | [link](../empatica-zip)

View File

@ -0,0 +1,136 @@
# `empatica_zip`
This [data stream](../../datastreams/data-streams-introduction) handles Empatica sensor data downloaded as zip files using the [E4 Connect](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-).
## Container
You need to create a subfolder for every participant named after their `device id` inside the folder specified by `[EMPATICA_DATA_STREAMS][empatica_zip][FOLDER]`. You can add one or more Empatica zip files to any subfolder.
The script to connect and download data from this container is at:
```bash
src/data/streams/empatica_zip/container.R
```
## Format
The `format.yaml` maps and transforms columns in your raw data stream to the [mandatory columns RAPIDS needs for Empatica sensors](../mandatory-empatica-format). This file is at:
```bash
src/data/streams/empatica_zip/format.yaml
```
All columns are mutated from the raw data in the zip files so you don't need to modify any column mappings.
??? info "EMPATICA_ACCELEROMETER"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP | timestamp|
| DEVICE_ID | device_id|
| DOUBLE_VALUES_0 | double_values_0|
| DOUBLE_VALUES_1 | double_values_1|
| DOUBLE_VALUES_2 | double_values_2|
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
??? info "EMPATICA_HEARTRATE"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
|TIMESTAMP | timestamp|
|DEVICE_ID | device_id|
|HEARTRATE | heartrate|
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
??? info "EMPATICA_TEMPERATURE"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
|TIMESTAMP | timestamp|
|DEVICE_ID | device_id|
|TEMPERATURE | temperature|
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
??? info "EMPATICA_ELECTRODERMAL_ACTIVITY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
|TIMESTAMP | timestamp|
|DEVICE_ID | device_id|
|ELECTRODERMAL_ACTIVITY | electrodermal_activity|
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
??? info "EMPATICA_BLOOD_VOLUME_PULSE"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
|TIMESTAMP | timestamp|
|DEVICE_ID | device_id|
|BLOOD_VOLUME_PULSE | blood_volume_pulse|
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
??? info "EMPATICA_INTER_BEAT_INTERVAL"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
|TIMESTAMP | timestamp|
|DEVICE_ID | device_id|
|INTER_BEAT_INTERVAL | inter_beat_interval|
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
??? info "EMPATICA_EMPATICA_TAGS"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
|TIMESTAMP | timestamp|
|DEVICE_ID | device_id|
|TAGS | tags|
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)

View File

@ -0,0 +1,23 @@
# `fitbitjson_csv`
This [data stream](../../datastreams/data-streams-introduction) handles Fitbit sensor data downloaded using the [Fitbit Web API](https://dev.fitbit.com/build/reference/web-api/) and stored in a CSV file. Please note that RAPIDS cannot query the API directly; you need to use other available tools or implement your own. Once you have your sensor data in a CSV file, RAPIDS can process it.
!!! warning
The CSV files have to use `,` as separator, `\` as escape character (do not escape `"` with `""`), and wrap any string columns with `"`.
??? example "Example of a valid CSV file"
```csv
"timestamp","device_id","label","fitbit_id","fitbit_data_type","fitbit_data"
1587614400000,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524","5S","5ZKN9B","steps","{\"activities-steps\":[{\"dateTime\":\"2020-04-23\",\"value\":\"7881\"}]"
```
## Container
The container should be a CSV file per Fitbit sensor, each containing all participants' data.
The script to connect and download data from this container is at:
```bash
src/data/streams/fitbitjson_csv/container.R
```
## Format
--8<---- "docs/snippets/jsonfitbit_format.md"

View File

@ -0,0 +1,14 @@
# `fitbitjson_mysql`
This [data stream](../../datastreams/data-streams-introduction) handles Fitbit sensor data downloaded using the [Fitbit Web API](https://dev.fitbit.com/build/reference/web-api/) and stored in a MySQL database. Please note that RAPIDS cannot query the API directly; you need to use other available tools or implement your own. Once you have your sensor data in a MySQL database, RAPIDS can process it.
## Container
The container should be a MySQL database with a table per sensor, each containing all participants' data.
The script to connect and download data from this container is at:
```bash
src/data/streams/fitbitjson_mysql/container.R
```
## Format
--8<---- "docs/snippets/jsonfitbit_format.md"

View File

@ -0,0 +1,29 @@
# `fitbitparsed_csv`
This [data stream](../../datastreams/data-streams-introduction) handles Fitbit sensor data downloaded using the [Fitbit Web API](https://dev.fitbit.com/build/reference/web-api/), **parsed**, and stored in a CSV file. Please note that RAPIDS cannot query the API directly; you need to use other available tools or implement your own. Once you have your parsed sensor data in a CSV file, RAPIDS can process it.
!!! info "What is the difference between JSON and plain data streams"
Most people will only need `fitbitjson_*` because they downloaded and stored their data directly from Fitbit's API. However, if, for some reason, you don't have access to that JSON data and instead only have the parsed data (columns and rows), you can use this data stream.
!!! warning
The CSV files have to use `,` as separator, `\` as escape character (do not escape `"` with `""`), and wrap any string columns with `"`.
??? example "Example of a valid CSV file"
```csv
"device_id","heartrate","heartrate_zone","local_date_time","timestamp"
"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",69,"outofrange","2020-04-23 00:00:00",0
"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",69,"outofrange","2020-04-23 00:01:00",0
"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",67,"outofrange","2020-04-23 00:02:00",0
"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",69,"outofrange","2020-04-23 00:03:00",0
```
## Container
The container should be a CSV file per sensor, each containing all participants' data.
The script to connect and download data from this container is at:
```bash
src/data/streams/fitbitparsed_csv/container.R
```
## Format
--8<---- "docs/snippets/parsedfitbit_format.md"

View File

@ -0,0 +1,17 @@
# `fitbitparsed_mysql`
This [data stream](../../datastreams/data-streams-introduction) handles Fitbit sensor data downloaded using the [Fitbit Web API](https://dev.fitbit.com/build/reference/web-api/), **parsed**, and stored in a MySQL database. Please note that RAPIDS cannot query the API directly; you need to use other available tools or implement your own. Once you have your parsed sensor data in a MySQL database, RAPIDS can process it.
!!! info "What is the difference between JSON and plain data streams"
Most people will only need `fitbitjson_*` because they downloaded and stored their data directly from Fitbit's API. However, if, for some reason, you don't have access to that JSON data and instead only have the parsed data (columns and rows), you can use this data stream.
## Container
The container should be a MySQL database with a table per sensor, each containing all participants' data.
The script to connect and download data from this container is at:
```bash
src/data/streams/fitbitparsed_mysql/container.R
```
## Format
--8<---- "docs/snippets/parsedfitbit_format.md"

View File

@ -0,0 +1,61 @@
# Mandatory Empatica Format
This is a description of the format RAPIDS needs to process data for the following Empatica sensors.
??? info "EMPATICA_ACCELEROMETER"
| RAPIDS column | Description |
|-----------------|--------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| DOUBLE_VALUES_0 | x axis of acceleration |
| DOUBLE_VALUES_1 | y axis of acceleration |
| DOUBLE_VALUES_2 | z axis of acceleration |
??? info "EMPATICA_HEARTRATE"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS) |
| DEVICE_ID | A string that uniquely identifies a device |
| HEARTRATE | Intraday heartrate |
??? info "EMPATICA_TEMPERATURE"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS) |
| DEVICE_ID | A string that uniquely identifies a device |
| TEMPERATURE | temperature |
??? info "EMPATICA_ELECTRODERMAL_ACTIVITY"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS) |
| DEVICE_ID | A string that uniquely identifies a device |
| ELECTRODERMAL_ACTIVITY | electrical conductance |
??? info "EMPATICA_BLOOD_VOLUME_PULSE"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS) |
| DEVICE_ID | A string that uniquely identifies a device |
| BLOOD_VOLUME_PULSE | blood volume pulse |
??? info "EMPATICA_INTER_BEAT_INTERVAL"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS) |
| DEVICE_ID | A string that uniquely identifies a device |
| INTER_BEAT_INTERVAL | inter beat interval |
??? info "EMPATICA_TAGS"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS) |
| DEVICE_ID | A string that uniquely identifies a device |
| TAGS | tags |

View File

@ -0,0 +1,75 @@
# Mandatory Fitbit Format
This is a description of the format RAPIDS needs to process data for the following Fitbit sensors.
??? info "FITBIT_HEARTRATE_SUMMARY"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS) |
| LOCAL_DATE_TIME | Date time string with format `yyyy-mm-dd hh:mm:ss` |
| DEVICE_ID | A string that uniquely identifies a device |
| HEARTRATE_DAILY_RESTINGHR | Daily resting heartrate |
| HEARTRATE_DAILY_CALORIESOUTOFRANGE | Calories spent while heartrate was oustide a heartrate [zone](https://help.fitbit.com/articles/en_US/Help_article/1565.htm#) |
| HEARTRATE_DAILY_CALORIESFATBURN | Calories spent while heartrate was inside the fat burn [zone](https://help.fitbit.com/articles/en_US/Help_article/1565.htm#) |
| HEARTRATE_DAILY_CALORIESCARDIO | Calories spent while heartrate was inside the cardio [zone](https://help.fitbit.com/articles/en_US/Help_article/1565.htm#) |
| HEARTRATE_DAILY_CALORIESPEAK | Calories spent while heartrate was inside the peak [zone](https://help.fitbit.com/articles/en_US/Help_article/1565.htm#) |
??? info "FITBIT_HEARTRATE_INTRADAY"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS) |
| LOCAL_DATE_TIME | Date time string with format `yyyy-mm-dd hh:mm:ss` |
| DEVICE_ID | A string that uniquely identifies a device |
| HEARTRATE | Intraday heartrate |
| HEARTRATE_ZONE | Heartrate [zone](https://help.fitbit.com/articles/en_US/Help_article/1565.htm#) that HEARTRATE belongs to. It is based on the heartrate zone ranges of each device |
??? info "FITBIT_SLEEP_SUMMARY"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS) |
| LOCAL_DATE_TIME | Date time string with format `yyyy-mm-dd hh:mm:ss`; this is a copy of either LOCAL_START_DATE_TIME or LOCAL_END_DATE_TIME, depending on which column is used to assign an episode to a specific day|
| LOCAL_START_DATE_TIME | Date time string with format `yyyy-mm-dd hh:mm:ss` representing the start of a daily sleep episode |
| LOCAL_END_DATE_TIME | Date time string with format `yyyy-mm-dd hh:mm:ss` representing the end of a daily sleep episode|
| DEVICE_ID | A string that uniquely identifies a device |
| EFFICIENCY | Sleep efficiency computed by fitbit as time asleep / (total time in bed - time to fall asleep)|
| MINUTES_AFTER_WAKEUP | Minutes the participant spent in bed after waking up|
| MINUTES_ASLEEP | Minutes the participant was asleep |
| MINUTES_AWAKE | Minutes the participant was awake |
| MINUTES_TO_FALL_ASLEEP | Minutes the participant spent in bed before falling asleep|
| MINUTES_IN_BED | Minutes the participant spent in bed across the sleep episode|
| IS_MAIN_SLEEP | 0 if this episode is a nap, or 1 if it is a main sleep episode|
| TYPE | stages or classic [sleep data](https://dev.fitbit.com/build/reference/web-api/sleep/)|
??? info "FITBIT_SLEEP_INTRADAY"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS)|
| LOCAL_DATE_TIME | Date time string with format `yyyy-mm-dd hh:mm:ss`, this either is a copy of LOCAL_START_DATE_TIME or LOCAL_END_DATE_TIME depending on which column is used to assign an episode to a specific day|
| DEVICE_ID | A string that uniquely identifies a device |
| TYPE_EPISODE_ID | An id for each unique main or nap episode. Main and nap episodes are composed of levels; each row in this table is one such level, so multiple rows can have the same TYPE_EPISODE_ID|
| DURATION | Duration of the episode level in minutes|
| IS_MAIN_SLEEP | 0 if this episode level belongs to a nap, or 1 if it belongs to a main sleep episode|
| TYPE | type of level: stages or classic [sleep data](https://dev.fitbit.com/build/reference/web-api/sleep/)|
| LEVEL | For stages levels one of `wake`, `deep`, `light`, or `rem`. For classic levels one of `awake`, `restless`, and `asleep`|
??? info "FITBIT_STEPS_SUMMARY"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS) |
| LOCAL_DATE_TIME | Date time string with format `yyyy-mm-dd hh:mm:ss` |
| DEVICE_ID | A string that uniquely identifies a device |
| STEPS | Daily step count |
??? info "FITBIT_STEPS_INTRADAY"
| RAPIDS column | Description |
|-----------------|-----------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged (automatically created by RAPIDS) |
| LOCAL_DATE_TIME | Date time string with format `yyyy-mm-dd hh:mm:ss` |
| DEVICE_ID | A string that uniquely identifies a device |
| STEPS | Intraday step count (usually every minute)|
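To make these column tables concrete, below is a minimal sketch (hypothetical values, written with pandas) of a few `FITBIT_STEPS_INTRADAY` rows in this mandatory format; the lowercase column names follow the convention of the plain-text CSV examples used elsewhere in the docs.
```python
import pandas as pd

# Hypothetical FITBIT_STEPS_INTRADAY rows in the mandatory format (all values are made up).
# TIMESTAMP is a 13-digit UNIX timestamp in milliseconds.
steps_intraday = pd.DataFrame({
    "timestamp": [1587661220000, 1587661280000, 1587661340000],
    "local_date_time": ["2020-04-23 17:00:20", "2020-04-23 17:01:20", "2020-04-23 17:02:20"],
    "device_id": ["a748ee1a-1d0b-4ae9-9074-279a2b6ba524"] * 3,
    "steps": [12, 0, 67],
})
steps_intraday.to_csv("fitbit_steps_intraday.csv", index=False)  # hypothetical file name
```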

View File

@ -0,0 +1,202 @@
# Mandatory Phone Format
This is a description of the format RAPIDS needs to process data for the following PHONE sensors.
See examples in the CSV files inside [rapids_example_csv.zip](https://osf.io/wbg23/)
??? info "PHONE_ACCELEROMETER"
| RAPIDS column | Description |
|-----------------|--------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| DOUBLE_VALUES_0 | x axis of acceleration |
| DOUBLE_VALUES_1 | y axis of acceleration |
| DOUBLE_VALUES_2 | z axis of acceleration |
??? info "PHONE_ACTIVITY_RECOGNITION"
| RAPIDS column | Description |
|-----------------|---------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| ACTIVITY_NAME | A string that denotes the current activity name: `in_vehicle`, `on_bicycle`, `on_foot`, `still`, `unknown`, `tilting`, `walking` or `running` |
| ACTIVITY_TYPE | An integer (ranging from 0 to 8) that denotes the current activity type |
| CONFIDENCE | An integer (ranging from 0 to 100) that denotes the prediction accuracy |
??? info "PHONE_APPLICATIONS_CRASHES"
| RAPIDS column | Description |
|--------------------|---------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| PACKAGE_NAME | Application's package name |
| APPLICATION_NAME | Application's localized name |
| APPLICATION_VERSION| Application's version code |
| ERROR_SHORT | Short description of the error |
| ERROR_LONG | More verbose version of the error description |
| ERROR_CONDITION | 1 = code error; 2 = non-responsive (ANR error) |
| IS_SYSTEM_APP | Device's pre-installed application |
??? info "PHONE_APPLICATIONS_FOREGROUND"
| RAPIDS column | Description |
|--------------------|---------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| PACKAGE_NAME | Application's package name |
| APPLICATION_NAME | Application's localized name |
| IS_SYSTEM_APP | Device's pre-installed application |
??? info "PHONE_APPLICATIONS_NOTIFICATIONS"
| RAPIDS column | Description |
|--------------------|---------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| PACKAGE_NAME | Application's package name |
| APPLICATION_NAME | Application's localized name |
| TEXT | Notification's header text, not the content |
| SOUND | Notification's sound source (if applicable) |
| VIBRATE | Notification's vibration pattern (if applicable) |
| DEFAULTS | Whether the notification was delivered according to the device's default settings |
| FLAGS | An integer that denotes [Android notification flag](https://developer.android.com/reference/android/app/Notification.html) |
??? info "PHONE_BATTERY"
| RAPIDS column | Description |
|----------------------|------------------------------------------------------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| BATTERY_STATUS | An integer that denotes battery status: 0 or 1 = unknown, 2 = charging, 3 = discharging, 4 = not charging, 5 = full |
| BATTERY_LEVEL | An integer that denotes battery level, between 0 and `BATTERY_SCALE` |
| BATTERY_SCALE | An integer that denotes the maximum battery level |
??? info "PHONE_BLUETOOTH"
| RAPIDS column | Description |
|--------------------|---------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| BT_ADDRESS | MAC address of the device's Bluetooth sensor |
| BT_NAME | User assigned name of the device's Bluetooth sensor |
| BT_RSSI | The RSSI dB to the scanned device |
??? info "PHONE_CALLS"
| RAPIDS column | Description |
|--------------------|---------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| CALL_TYPE | An integer that denotes call type: 1 = incoming, 2 = outgoing, 3 = missed |
| CALL_DURATION | Length of the call session |
| TRACE | SHA-1 one-way source/target of the call |
??? info "PHONE_CONVERSATION"
| RAPIDS column | Description |
|----------------------|--------------------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| DOUBLE_ENERGY | A number that denotes the amplitude of an audio sample (L2-norm of the audio frame) |
| INFERENCE | An integer (ranged from 0 to 3) that denotes the type of an audio sample: 0 = silence, 1 = noise, 2 = voice, 3 = unknown |
| DOUBLE_CONVO_START | UNIX timestamp (13 digits) of the beginning of a conversation |
| DOUBLE_CONVO_END | UNIX timestamp (13 digits) of the end of a conversation |
??? info "PHONE_KEYBOARD"
| RAPIDS column | Description |
|--------------------|---------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| PACKAGE_NAME | The application's package name of keyboard interaction |
| BEFORE_TEXT | The previous keyboard input (empty if password) |
| CURRENT_TEXT | The current keyboard input (empty if password) |
| IS_PASSWORD | An integer: 0 = not password; 1 = password |
??? info "PHONE_LIGHT"
| RAPIDS column | Description |
|--------------------|----------------------------------------------------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| DOUBLE_LIGHT_LUX | The ambient luminance in lux units |
| ACCURACY | An integer that denotes the sensor's accuracy level: 3 = maximum accuracy, 2 = medium accuracy, 1 = low accuracy |
??? info "PHONE_LOCATIONS"
| RAPIDS column | Description |
|--------------------|---------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| DOUBLE_LATITUDE | The location's latitude, in degrees |
| DOUBLE_LONGITUDE | The location's longitude, in degrees |
| DOUBLE_BEARING | The location's bearing, in degrees |
| DOUBLE_SPEED | The speed if available, in meters/second over ground |
| DOUBLE_ALTITUDE | The altitude if available, in meters above sea level |
| PROVIDER | A string that denotes the provider: `gps`, `fused` or `network` |
| ACCURACY | The estimated location accuracy |
??? info "PHONE_LOG"
| RAPIDS column | Description |
|--------------------|---------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| LOG_MESSAGE | A string that denotes the log message |
??? info "PHONE_MESSAGES"
| RAPIDS column | Description |
|--------------------|---------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| MESSAGE_TYPE | An integer that denotes message type: 1 = received, 2 = sent |
| TRACE | SHA-1 one-way source/target of the message |
??? info "PHONE_SCREEN"
| RAPIDS column | Description |
|--------------------|-----------------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| SCREEN_STATUS | An integer that denotes screen status: 0 = off, 1 = on, 2 = locked, 3 = unlocked |
??? info "PHONE_WIFI_CONNECTED"
| RAPIDS column | Description |
|--------------------|-----------------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| MAC_ADDRESS | Device's MAC address |
| SSID | Currently connected access point network name |
| BSSID | Currently connected access point MAC address |
??? info "PHONE_WIFI_VISIBLE"
| RAPIDS column | Description |
|--------------------|-----------------------------------------------------------------------------------|
| TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged |
| DEVICE_ID | A string that uniquely identifies a device |
| SSID | Detected access point network name |
| BSSID | Detected access point MAC address |
| SECURITY | Active security protocols |
| FREQUENCY | Wi-Fi band frequency (e.g., 2427, 5180), in MHz |
| RSSI | RSSI dB to the scanned device |

View File

@ -1,26 +1,37 @@
# Testing
The following is a simple guide to testing RAPIDS. All files necessary for testing are stored in the `/tests` directory
The following is a simple guide to run RAPIDS' tests. All files necessary for testing are stored in the `./tests/` directory
## Steps for Testing
1. To begin testing RAPIDS place the fake raw input data `csv` files of each fake participant in
`tests/data/raw/`. The fake participant files should be placed in
`tests/data/external/participant_files`. The expected output files of RAPIDS after
processing the input data should be placed in `tests/data/processesd/frequency` and `tests/data/processesd/periodic` for frequency and periodic respectively.
2. Edit `tests/settings/frequency/config.yaml` and `tests/settings/periodic/config.yaml` to add and/or remove the rules
to be run for testing from the `forcerun` list.
3. Edit `tests/settings/frequency/testing_config.yaml` and `tests/settings/frequency/testing_config.yaml` to configure the settings and enable/disable sensors to be tested.
4. Add any additional testscripts in `tests/scripts`.
5. Run the testing shell script with
```bash
tests/scripts/run_tests.sh
run_test.sh [-l] [all | periodic | frequency] [test]
1. **Add raw data.**
1. Add the raw data to the corresponding sensor CSV file in `tests/data/external/aware_csv`. Create the CSV if it does not exist.
2. **Link raw data.**
1. Make sure that you link the new raw data to a participant by using the same `device_id` in the data and in `[DEVICE_IDS]` inside their participant file (`tests/data/external/participant_files/testXX.yaml`).
2. Create the participant file if it does not exist, and don't forget to edit `[PIDS]` in the config file of the time segments you are testing (see below). For simplicity, we use a participant's id (`testXX`) as their `device_id`.
3. **Edit the config file.**
1. Activate the sensor provider you are testing if it isn't already. Set `[SENSOR][PROVIDER][COMPUTE]` to `TRUE` in the `config.yaml` of the time segments you are testing:
```yaml
- tests/settings/frequency_config.yaml # For frequency time segments
- tests/settings/periodic_config.yaml # For periodic time segments
# We have not tested events time segments yet
```
4. **Run the pipeline and tests.**
1. You can run all time segments' pipelines and their tests
```bash
tests/scripts/run_tests.sh -t all
```
2. You can run only the pipeline of a specific time segment and its tests
```bash
tests/scripts/run_tests.sh -t frequency -a both
```
3. Or, if you are working on your tests and you want to run a pipeline and its tests independently
```bash
tests/scripts/run_tests.sh -t frequency -a run
tests/scripts/run_tests.sh -t frequency -a test
```
`[-l]` will delete all the existing files in `/data` before running tests.
`[all | periodic | frequency]` will generate feature data for all or specific type of features and save in `data/processed`.
`[test]` will compare the features generated with the precomputed and verified features in `/tests/data/processed`.
## Output example
The following is a snippet of the output you should see after running your test.
```bash
@ -35,8 +46,16 @@ test_sensors_features_calculations (test_sensor_features.TestSensorFeatures) ...
FAIL
```
The results above show that the for periodic both `test_sensors_files_exist` and `test_sensors_features_calculations` passed while for frequency first test `test_sensors_files_exist` passed while `test_sensors_features_calculations` failed. In addition you should get the traceback of the failure (not shown here). For more information on how to implement test scripts and use unittest please see [Unittest Documentation](https://docs.python.org/3.7/library/unittest.html#command-line-interface)
The results above show that, for periodic, both `test_sensors_files_exist` and `test_sensors_features_calculations` passed, while for frequency the first test `test_sensors_files_exist` passed and `test_sensors_features_calculations` failed. Additionally, you should get the traceback of the failure (not shown here). For more information on how to implement test scripts and use unittest, please see the [Unittest Documentation](https://docs.python.org/3.7/library/unittest.html#command-line-interface)
Testing of the RAPIDS sensors and features is a work-in-progress. Please see `test-cases`{.interpreted-text role="ref"} for a list of sensors and features that have testing currently available.
Testing of the RAPIDS sensors and features is a work-in-progress. Please see [Test Cases](../test-cases) for a list of sensors and features that have testing currently available.
Currently, the repository is set up to test a number of sensors out of the box by simply running the `tests/scripts/run_tests.sh` command once the RAPIDS Python environment is active.
## How do we execute the tests?
This bash script `tests/scripts/run_tests.sh` executes one or all pipelines for different time segment types (`frequency`, `periodic`, and `events`) as well as their tests (see below).
This Python script `tests/scripts/run_tests.py` runs the tests. It parses the involved participants and active sensor providers in the `config.yaml` file of the time segment type being tested. We test that the output file we expect exists and that its content matches the expected values.
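As a rough illustration of that check (this is not the actual `run_tests.py`, and the file paths are hypothetical), a feature file produced by the pipeline can be compared against a pre-computed, verified file with `unittest` and pandas:
```python
import os
import unittest
import pandas as pd

class TestSensorFeatures(unittest.TestCase):
    # Hypothetical paths; the real tests derive them from the participants
    # and providers that are active in config.yaml
    OUTPUT = "data/processed/features/test01/phone_accelerometer.csv"
    EXPECTED = "tests/data/processed/frequency/test01/phone_accelerometer.csv"

    def test_sensors_files_exist(self):
        # the pipeline must have produced the feature file
        self.assertTrue(os.path.exists(self.OUTPUT))

    def test_sensors_features_calculations(self):
        # the computed features must match the pre-computed, verified features
        output = pd.read_csv(self.OUTPUT)
        expected = pd.read_csv(self.EXPECTED)
        pd.testing.assert_frame_equal(output, expected, check_like=True)

if __name__ == "__main__":
    unittest.main()
```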
??? example "Example of raw data for PHONE_APPLICATIONS_FOREGROUND testing"
```json hl_lines="1 2 4" linenums="1"
--8<---- "tests/data/external/aware_csv/phone_applications_foreground_raw.csv"
```

View File

@ -25,7 +25,7 @@ The schema has three main sections `required`, `definitions`, and `properties`.
### definitions
`definitions` lists key/values that are common to different `properties` so we can reuse them. You can define a key/value under `definitions` and use `$ref` to refer to it in any `property`.
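Before looking at RAPIDS' own provider template below, the following minimal sketch shows what `definitions` + `$ref` reuse looks like mechanically. It uses the generic `jsonschema` Python package and a trimmed-down schema, so it is an illustration rather than the exact validator call RAPIDS makes:
```python
import jsonschema  # pip install jsonschema

schema = {
    "definitions": {
        "PROVIDER": {  # reusable template
            "type": "object",
            "required": ["COMPUTE", "SRC_SCRIPT", "FEATURES"],
            "properties": {
                "COMPUTE": {"type": "boolean"},
                "FEATURES": {"type": ["array", "object"]},
                "SRC_SCRIPT": {"type": "string", "pattern": "^.*\\.(py|R)$"},
            },
        }
    },
    "properties": {
        "PHONE_ACCELEROMETER": {
            "type": "object",
            "properties": {
                # every provider entry reuses the PROVIDER template via $ref
                "PROVIDERS": {"type": "object", "additionalProperties": {"$ref": "#/definitions/PROVIDER"}},
            },
        }
    },
}

config = {
    "PHONE_ACCELEROMETER": {
        "PROVIDERS": {
            "RAPIDS": {
                "COMPUTE": False,
                "FEATURES": ["maxmagnitude"],
                "SRC_SCRIPT": "src/features/phone_accelerometer/rapids/main.py",
            }
        }
    }
}

jsonschema.validate(instance=config, schema=schema)  # raises ValidationError if the config is invalid
print("config is valid")
```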
For example, every sensor like `[PHONE_ACCELEROMETER]` has one or more providers like `RAPIDS` and `PANDA`, these providers have some common properties like the `COMPUTE` flag or the `SRC_FOLDER` string, therefore we define a common provider "template" that is used by every provider and extended with properties exclusive to each one of them. For example:
For example, every sensor like `[PHONE_ACCELEROMETER]` has one or more providers like `RAPIDS` and `PANDA`; these providers have some common properties like the `COMPUTE` flag or the `SRC_SCRIPT` string. Therefore, we define a shared provider "template" that is used by every provider and extended with properties exclusive to each one of them. For example:
=== "provider definition (template)"
The `PROVIDER` definition will be used later on different `properties`.
@ -33,21 +33,19 @@ For example, every sensor like `[PHONE_ACCELEROMETER]` has one or more providers
```yaml
PROVIDER:
type: object
required: [COMPUTE, SRC_FOLDER, SRC_LANGUAGE, FEATURES]
required: [COMPUTE, SRC_SCRIPT, FEATURES]
properties:
COMPUTE:
type: boolean
FEATURES:
type: [array, object]
SRC_FOLDER:
SRC_SCRIPT:
type: string
SRC_LANGUAGE:
type: string
enum: [python, r]
pattern: "^.*\\.(py|R)$"
```
=== "provider reusing and extending the template"
Notice that in this example `RAPIDS` (a provider) is using and extending the `PROVIDER` template. The `FEATURES` key is overriding the `FEATURES` key from the `#/definitions/PROVIDER` template but is keeping the validation for `COMPUTE`, `SRC_FOLDER`, and `SRC_LANGUAGE`. For more details about reusing properties go to this [link](http://json-schema.org/understanding-json-schema/structuring.html#reuse)
Notice that `RAPIDS` (a provider) uses and extends the `PROVIDER` template in this example. The `FEATURES` key is overriding the `FEATURES` key from the `#/definitions/PROVIDER` template but is keeping the validation for `COMPUTE` and `SRC_SCRIPT`. For more details about reusing properties, go to this [link](http://json-schema.org/understanding-json-schema/structuring.html#reuse)
```yaml hl_lines="9 10"
PHONE_ACCELEROMETER:
@ -128,7 +126,7 @@ You can validate different aspects of each key/value in our `config.yaml` file:
enum: ["received", "sent"]
```
=== "object"
`PARENT` is an object that has two properties. `KID1` is one of those properties that is in turn another object that will reuse the `"#/definitions/PROVIDER"` `definition` **AND** also include (extend) two extra properties `GRAND_KID1` of type `array` and `GRAND_KID2` of type `number`. `KID2` is another property of `PARENT` of type `boolean`.
`PARENT` is an object that has two properties. `KID1` is one of those properties; it is, in turn, another object that will reuse the `"#/definitions/PROVIDER"` `definition` **AND** also include (extend) two extra properties, `GRAND_KID1` of type `array` and `GRAND_KID2` of type `number`. `KID2` is another property of `PARENT` of type `boolean`.
The schema validation looks like this
```yaml
@ -155,8 +153,7 @@ You can validate different aspects of each key/value in our `config.yaml` file:
# These three come from the `PROVIDER` definition (template)
COMPUTE: False
FEATURES: [x, y] # an array
SRC_FOLDER: "any string"
SRC_LANGUAGE: "any string"
SRC_SCRIPT: "a path to a py or R script"
# These two come from the extension
GRAND_KID1: [a, b] # an array

View File

@ -3,33 +3,37 @@
!!! hint
- We recommend reading the [Behavioral Features Introduction](../feature-introduction/) before reading this page.
- You can implement new features in Python or R scripts.
- You won't have to deal with time zones, dates, times, data cleaning or preprocessing. The data that RAPIDS pipes to your feature extraction code is ready to process.
- You won't have to deal with time zones, dates, times, data cleaning, or preprocessing. The data that RAPIDS pipes to your feature extraction code are ready to process.
## New Features for Existing Sensors
You can add new features to any existing sensors (see list below) by adding a new provider in three steps:
1. [Modify](#modify-the-configyaml-file) the `config.yaml` file
2. [Create](#create-a-provider-folder-script-and-function) a provider folder, script and function
2. [Create](#create-a-feature-provider-script) your feature provider script
3. [Implement](#implement-your-feature-extraction-code) your features extraction code
As a tutorial, we will add a new provider for `PHONE_ACCELEROMETER` called `VEGA` that extracts `feature1`, `feature2`, `feature3` in Python and that it requires a parameter from the user called `MY_PARAMETER`.
As a tutorial, we will add a new provider for `PHONE_ACCELEROMETER` called `VEGA` that extracts `feature1`, `feature2`, `feature3` with a Python script that requires a parameter from the user called `MY_PARAMETER`.
??? info "Existing Sensors"
An existing sensor is any of the phone or Fitbit sensors with a configuration entry in `config.yaml`:
An existing sensor is any sensor of any supported device with a configuration entry in `config.yaml`:
Smartphone (AWARE)
- Phone Accelerometer
- Phone Activity Recognition
- Phone Applications Crashes
- Phone Applications Foreground
- Phone Applications Notifications
- Phone Battery
- Phone Bluetooth
- Phone Calls
- Phone Conversation
- Phone Data Yield
- Phone Keyboard
- Phone Light
- Phone Locations
- Phone Log
- Phone Messages
- Phone Screen
- Phone WiFi Connected
@ -58,26 +62,26 @@ As a tutorial, we will add a new provider for `PHONE_ACCELEROMETER` called `VEGA
### Modify the `config.yaml` file
In this step you need to add your provider configuration section under the relevant sensor in `config.yaml`. See our example for our tutorial's `VEGA` provider for `PHONE_ACCELEROMETER`:
In this step, you need to add your provider configuration section under the relevant sensor in `config.yaml`. See our example for our tutorial's `VEGA` provider for `PHONE_ACCELEROMETER`:
??? example "Example configuration for a new accelerometer provider `VEGA`"
```yaml
```yaml hl_lines="12 13 14 15 16"
PHONE_ACCELEROMETER:
TABLE: accelerometer
CONTAINER: accelerometer
PROVIDERS:
RAPIDS:
RAPIDS: # this is a feature provider
COMPUTE: False
...
PANDA:
PANDA: # this is another feature provider
COMPUTE: False
...
VEGA:
VEGA: # this is our new feature provider
COMPUTE: False
FEATURES: ["feature1", "feature2", "feature3"]
MY_PARAMETER: a_string
SRC_FOLDER: "vega"
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_accelerometer/vega/main.py
```
@ -85,31 +89,33 @@ In this step you need to add your provider configuration section under the relev
|---|---|
|`[COMPUTE]`| Flag to activate/deactivate your provider
|`[FEATURES]`| List of features your provider supports. Your provider code should only return the features on this list
|`[MY_PARAMTER]`| An arbitrary parameter that our example provider `VEGA` needs. This can be a boolean, integer, float, string or an array of any of such types.
|`[SRC_LANGUAGE]`| The programming language of your provider script, it can be `python` or `r`, in our example `python`
|`[SRC_FOLDER]`| The name of your provider in lower case, in our example `vega` (this will be the name of your folder in the next step)
|`[MY_PARAMETER]`| An arbitrary parameter that our example provider `VEGA` needs. This can be a boolean, integer, float, string, or an array of any of these types.
|`[SRC_SCRIPT]`| The relative path from RAPIDS' root folder to a script that computes the features for this provider. It can be implemented in R or Python.
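Note that the config schema only accepts `[SRC_SCRIPT]` paths ending in `.py` or `.R`, so the extension is what distinguishes a Python provider from an R provider. A minimal sketch of that idea (the helper name is hypothetical, not necessarily what RAPIDS uses internally):
```python
from pathlib import Path

def get_script_language(src_script: str) -> str:
    # Hypothetical helper: infer the runner from the SRC_SCRIPT extension
    suffix = Path(src_script).suffix.lower()
    if suffix == ".py":
        return "python"
    if suffix == ".r":
        return "r"
    raise ValueError(f"SRC_SCRIPT must end in .py or .R, got: {src_script}")

print(get_script_language("src/features/phone_accelerometer/vega/main.py"))  # python
```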
### Create a provider folder, script and function
### Create a feature provider script
In this step you need to add a folder, script and function for your provider.
5. Create your provider **folder** under `src/feature/DEVICE_SENSOR/YOUR_PROVIDER`, in our example `src/feature/phone_accelerometer/vega` (same as `[SRC_FOLDER]` in the step above).
6. Create your provider **script** inside your provider folder, it can be a Python file called `main.py` or an R file called `main.R`.
7. Add your provider **function** in your provider script. The name of such function should be `[providername]_features`, in our example `vega_features`
!!! info "Python function"
```python
def [providername]_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
```
!!! info "R function"
```r
[providername]_features <- function(sensor_data, time_segment, provider)
```
Create your feature Python or R script called `main.py` or `main.R` in the correct folder, `src/features/[sensorname]/[providername]/`. RAPIDS automatically loads and executes it based on the config key `[SRC_SCRIPT]` you added in the last step. For our example, this script is:
```bash
src/features/phone_accelerometer/vega/main.py
```
### Implement your feature extraction code
Every feature script (`main.[py|R]`) needs a `[providername]_features` function with specific parameters. RAPIDS calls this function with the sensor data ready to process and with other functions and arguments you will need.
The provider function that you created in the step above will receive the following parameters:
=== "Python function"
```python
def [providername]_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
# empty for now
return(your_features_df)
```
=== "R function"
```r
[providername]_features <- function(sensor_data, time_segment, provider){
# empty for now
return(your_features_df)
}
```
| Parameter&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description
|---|---|
@ -121,32 +127,32 @@ The provider function that you created in the step above will receive the follow
|`**kwargs`| Python only. Not used for now
The code to extract your behavioral features should be implemented in your provider function and in general terms it will have three stages:
The next step is to implement the code that computes your behavioral features in your provider script's function. As with any other script, this function can call other auxiliary methods, but in general terms, it should have three stages:
??? info "1. Read a participant's data by loading the CSV data stored in the file pointed by `sensor_data_files`"
``` python
acc_data = pd.read_csv(sensor_data_files["sensor_data"])
```
Note that phone's battery, screen, and activity recognition data is given as episodes instead of event rows (for example, start and end timestamps of the periods the phone screen was on)
Note that the phone's battery, screen, and activity recognition data are given as episodes instead of event rows (for example, start and end timestamps of the periods the phone screen was on)
??? info "2. Filter your data to process only those rows that belong to `time_segment`"
This step is only one line of code, but to undersand why we need it, keep reading.
This step is only one line of code, but keep reading to understand why we need it.
```python
acc_data = filter_data_by_segment(acc_data, time_segment)
```
You should use the `filter_data_by_segment()` function to process and group those rows that belong to each of the [time segments RAPIDS could be configured with](../../setup/configuration/#time-segments).
Let's understand the `filter_data_by_segment()` function with an example. A RAPIDS user can extract features on any arbitrary [time segment](../../setup/configuration/#time-segments). A time segment is a period of time that has a label and one or more instances. For example, the user (or you) could have requested features on a daily, weekly, and week-end basis for `p01`. The labels are arbritrary and the instances depend on the days a participant was monitored for:
Let's understand the `filter_data_by_segment()` function with an example. A RAPIDS user can extract features on any arbitrary [time segment](../../setup/configuration/#time-segments). A time segment is a period that has a label and one or more instances. For example, the user (or you) could have requested features on a daily, weekly, and weekend basis for `p01`. The labels are arbitrary, and the instances depend on the days a participant was monitored for:
- the daily segment could be named `my_days` and if `p01` was monitored for 14 days, it would have 14 instances
- the weekly segment could be named `my_weeks` and if `p01` was monitored for 14 days, it would have 2 instances.
- the weekend segment could be named `my_weekends` and if `p01` was monitored for 14 days, it would have 2 instances.
For this example, RAPIDS will call your provider function three times for `p01`, once where `time_segment` is `my_days`, once where `time_segment` is `my_weeks` and once where `time_segment` is `my_weekends`. In this example not every row in `p01`'s data needs to take part in the feature computation for either segment **and** the rows need to be grouped differently.
For this example, RAPIDS will call your provider function three times for `p01`, once where `time_segment` is `my_days`, once where `time_segment` is `my_weeks`, and once where `time_segment` is `my_weekends`. In this example, not every row in `p01`'s data needs to take part in the feature computation for either segment **and** the rows need to be grouped differently.
Thus, `filter_data_by_segment()` comes in handy: it will return a data frame that contains the rows that were logged during a time segment plus an extra column called `local_segment`. This new column will have as many unique values as time segment instances exist (14, 2, and 2 for our `p01`'s `my_days`, `my_weeks`, and `my_weekends` examples). After filtering, **you should group the data frame by this column and compute any desired features**, for example:
@ -154,52 +160,22 @@ The code to extract your behavioral features should be implemented in your provi
acc_features["maxmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].max()
```
The reason RAPIDS does not filter the participant's data set for you is because your code might need to compute something based on a participant's complete dataset before computing their features. For example, you might want to identify the number that called a participant the most throughout the study before computing a feature with the number of calls the participant received from this number.
RAPIDS does not filter the participant's dataset for you because your code might need to compute something based on the participant's complete dataset before computing their features. For example, you might want to identify the number that called a participant the most throughout the study before computing a feature with the number of calls the participant received from that number (see the sketch below).
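Here is a rough sketch of that calls example (the function and feature names are made up; column names follow the mandatory `PHONE_CALLS` format, and `1` is the incoming call type):
```python
import pandas as pd

def example_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    calls = pd.read_csv(sensor_data_files["sensor_data"])

    # Something you can only learn from the complete dataset:
    # the trace (hashed number) that called this participant the most during the whole study
    incoming = calls[calls["call_type"] == 1]
    top_caller = incoming["trace"].value_counts().idxmax() if not incoming.empty else None

    # Only now restrict the data to the current time segment and compute the feature
    calls = filter_data_by_segment(calls, time_segment)
    features = pd.DataFrame()
    features["callsfromtopcaller"] = calls[calls["trace"] == top_caller].groupby("local_segment")["trace"].count()
    return features.reset_index()
```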
??? info "3. Return a data frame with your features"
After filtering, grouping your data, and computing your features, your provider function should return a data frame that has:
- One row per time segment instance (e.g. 14 our `p01`'s `my_days` example)
- One row per time segment instance (e.g., 14 in our `p01` `my_days` example)
- The `local_segment` column added by `filter_data_by_segment()`
- One column per feature. By convention the name of your features should only contain letters or numbers (`feature1`). RAPIDS will automatically add the right sensor and provider prefix (`phone_accelerometr_vega_`)
- One column per feature. By convention, the name of your features should only contain letters or numbers (`feature1`). RAPIDS automatically adds the correct sensor and provider prefix; in our example, this prefix is `phone_accelerometer_vega_` (see the sketch below).
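As a minimal sketch (hypothetical values, and an illustrative `local_segment` label format), the returned data frame for two `my_days` instances could look like this:
```python
import pandas as pd

# Hypothetical return value of your provider function: one row per time segment instance,
# the local_segment column, and one column per feature (names and values are illustrative).
your_features_df = pd.DataFrame({
    "local_segment": ["my_days#2020-10-07 00:00:00,2020-10-07 23:59:59",
                      "my_days#2020-10-08 00:00:00,2020-10-08 23:59:59"],
    "feature1": [0.42, 0.37],
    "feature2": [11, 8],
    "feature3": [3.1, 2.9],
})
# RAPIDS will prefix the feature columns, e.g., phone_accelerometer_vega_feature1
```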
??? example "`PHONE_ACCELEROMETER` Provider Example"
For your reference, this a short example of our own provider (`RAPIDS`) for `PHONE_ACCELEROMETER` that computes five acceleration features
For your reference, this is our own provider (`RAPIDS`) for `PHONE_ACCELEROMETER`, which computes five acceleration features:
```python
def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
acc_data = pd.read_csv(sensor_data_files["sensor_data"])
requested_features = provider["FEATURES"]
# name of the features this function can compute
base_features_names = ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
# the subset of requested features this function can compute
features_to_compute = list(set(requested_features) & set(base_features_names))
--8<---- "src/features/phone_accelerometer/rapids/main.py"
acc_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
if not acc_data.empty:
acc_data = filter_data_by_segment(acc_data, time_segment)
if not acc_data.empty:
acc_features = pd.DataFrame()
# get magnitude related features: magnitude = sqrt(x^2+y^2+z^2)
magnitude = acc_data.apply(lambda row: np.sqrt(row["double_values_0"] ** 2 + row["double_values_1"] ** 2 + row["double_values_2"] ** 2), axis=1)
acc_data = acc_data.assign(magnitude = magnitude.values)
if "maxmagnitude" in features_to_compute:
acc_features["maxmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].max()
if "minmagnitude" in features_to_compute:
acc_features["minmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].min()
if "avgmagnitude" in features_to_compute:
acc_features["avgmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].mean()
if "medianmagnitude" in features_to_compute:
acc_features["medianmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].median()
if "stdmagnitude" in features_to_compute:
acc_features["stdmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].std()
acc_features = acc_features.reset_index()
return acc_features
```
## New Features for Non-Existing Sensors

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[EMPATICA_ACCELEROMETER]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Name of the CSV file containing accelerometer data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
|`[CONTAINER]`| Name of the CSV file containing accelerometer data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
## DBDP provider
@ -13,9 +13,7 @@ Sensor parameters description for `[EMPATICA_ACCELEROMETER]`:
!!! info "File Sequence"
```bash
- data/raw/{pid}/empatica_accelerometer_unzipped_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_accelerometer_raw_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_accelerometer_joined.csv
- data/raw/{pid}/empatica_accelerometer_raw.csv
- data/raw/{pid}/empatica_accelerometer_with_datetime.csv
- data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_{language}_{provider_key}.csv
- data/processed/features/{pid}/empatica_accelerometer.csv

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[EMPATICA_BLOOD_VOLUME_PULSE]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Name of the CSV file containing blood volume pulse data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
|`[CONTAINER]`| Name of the CSV file containing blood volume pulse data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
## DBDP provider
@ -13,9 +13,7 @@ Sensor parameters description for `[EMPATICA_BLOOD_VOLUME_PULSE]`:
!!! info "File Sequence"
```bash
- data/raw/{pid}/empatica_blood_volume_pulse_unzipped_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_blood_volume_pulse_raw_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_blood_volume_pulse_joined.csv
- data/raw/{pid}/empatica_blood_volume_pulse_raw.csv
- data/raw/{pid}/empatica_blood_volume_pulse_with_datetime.csv
- data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_{language}_{provider_key}.csv
- data/processed/features/{pid}/empatica_blood_volume_pulse.csv

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[EMPATICA_ELECTRODERMAL_ACTIVITY]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Name of the CSV file containing electrodermal activity data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
|`[CONTAINER]`| Name of the CSV file containing electrodermal activity data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
## DBDP provider
@ -13,9 +13,7 @@ Sensor parameters description for `[EMPATICA_ELECTRODERMAL_ACTIVITY]`:
!!! info "File Sequence"
```bash
- data/raw/{pid}/empatica_electrodermal_activity_unzipped_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_electrodermal_activity_raw_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_electrodermal_activity_joined.csv
- data/raw/{pid}/empatica_electrodermal_activity_raw.csv
- data/raw/{pid}/empatica_electrodermal_activity_with_datetime.csv
- data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_{language}_{provider_key}.csv
- data/processed/features/{pid}/empatica_electrodermal_activity.csv

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[EMPATICA_HEARTRATE]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Name of the CSV file containing heart rate data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
|`[CONTAINER]`| Name of the CSV file containing heart rate data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
## DBDP provider
@ -13,9 +13,7 @@ Sensor parameters description for `[EMPATICA_HEARTRATE]`:
!!! info "File Sequence"
```bash
- data/raw/{pid}/empatica_heartrate_unzipped_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_heartrate_raw_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_heartrate_joined.csv
- data/raw/{pid}/empatica_heartrate_raw.csv
- data/raw/{pid}/empatica_heartrate_with_datetime.csv
- data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_{language}_{provider_key}.csv
- data/processed/features/{pid}/empatica_heartrate.csv

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[EMPATICA_INTER_BEAT_INTERVAL]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Name of the CSV file containing inter beat interval data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
|`[CONTAINER]`| Name of the CSV file containing inter beat interval data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
## DBDP provider
@ -13,9 +13,7 @@ Sensor parameters description for `[EMPATICA_INTER_BEAT_INTERVAL]`:
!!! info "File Sequence"
```bash
- data/raw/{pid}/empatica_inter_beat_interval_unzipped_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_inter_beat_interval_raw_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_inter_beat_interval_joined.csv
- data/raw/{pid}/empatica_inter_beat_interval_raw.csv
- data/raw/{pid}/empatica_inter_beat_interval_with_datetime.csv
- data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_{language}_{provider_key}.csv
- data/processed/features/{pid}/empatica_inter_beat_interval.csv

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[EMPATICA_TAGS]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Name of the CSV file containing tags data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
|`[CONTAINER]`| Name of the CSV file containing tags data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
!!! Note
- No feature providers have been implemented for this sensor yet; however, you can [implement your own features](../add-new-features).

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[EMPATICA_TEMPERATURE]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Name of the CSV file containing temperature data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
|`[CONTAINER]`| Name of the CSV file containing temperature data that is compressed inside an Empatica zip file. Since these zip files are created [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) by Empatica, there is no need to change the value of this attribute.
## DBDP provider
@ -13,9 +13,7 @@ Sensor parameters description for `[EMPATICA_TEMPERATURE]`:
!!! info "File Sequence"
```bash
- data/raw/{pid}/empatica_temperature_unzipped_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_temperature_raw_{zip-file}.csv # one per zip file
- data/raw/{pid}/empatica_temperature_joined.csv
- data/raw/{pid}/empatica_temperature_raw.csv
- data/raw/{pid}/empatica_temperature_with_datetime.csv
- data/interim/{pid}/empatica_temperature_features/empatica_temperature_{language}_{provider_key}.csv
- data/processed/features/{pid}/empatica_temperature.csv

View File

@ -6,52 +6,38 @@ Every device sensor has a corresponding config section in `config.yaml`, these s
- We recommend reading this page if you are using RAPIDS for the first time
- All computed sensor features are stored under `/data/processed/features` in files per sensor, per participant, and per study (all participants).
- Every time you change any sensor parameters, provider parameters or provider features, all the necessary files will be updated as soon as you execute RAPIDS.
- In short, to extract features offered by a provider, you need to set its `[COMPUTE]` flag to `TRUE`, configure any of its parameters, and [execute](../../setup/execution) RAPIDS.
!!! example "Config section example for `PHONE_ACCELEROMETER`"
### Explaining the config.yaml sensor sections with an example
```yaml
# 1) Config section
PHONE_ACCELEROMETER:
# 2) Parameters for PHONE_ACCELEROMETER
TABLE: accelerometer
Each sensor section follows the same structure. Click on the numbered markers to know more.
# 3) Providers for PHONE_ACCELEROMETER
PROVIDERS:
# 4) RAPIDS provider
RAPIDS:
# 4.1) Parameters of RAPIDS provider of PHONE_ACCELEROMETER
COMPUTE: False
# 4.2) Features of RAPIDS provider of PHONE_ACCELEROMETER
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
SRC_FOLDER: "rapids" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
``` { .yaml .annotate }
PHONE_ACCELEROMETER: # (1)
# 5) PANDA provider
PANDA:
# 5.1) Parameters of PANDA provider of PHONE_ACCELEROMETER
COMPUTE: False
VALID_SENSED_MINUTES: False
# 5.2) Features of PANDA provider of PHONE_ACCELEROMETER
FEATURES:
exertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
nonexertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
SRC_FOLDER: "panda" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
```
CONTAINER: accelerometer # (2)
## Sensor Parameters
Each sensor configuration section has a "parameters" subsection (see `#2` in the example). These are parameters that affect different aspects of how the raw data is downloaded, and processed. The `TABLE` parameter exists for every sensor, but some sensors will have extra parameters like [`[PHONE_LOCATIONS]`](../phone-locations/). We explain these parameters in a table at the top of each sensor documentation page.
PROVIDERS: # (3)
RAPIDS:
COMPUTE: False # (4)
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
## Sensor Providers
Each sensor configuration section can have zero, one or more behavioral feature **providers** (see `#3` in the example). A provider is a script created by the core RAPIDS team or other researchers that extracts behavioral features for that sensor. In this example, accelerometer has two providers: RAPIDS (see `#4`) and PANDA (see `#5`).
SRC_SCRIPT: src/features/phone_accelerometer/rapids/main.py
### Provider Parameters
Each provider has parameters that affect the computation of the behavioral features it offers (see `#4.1` or `#5.1` in the example). These parameters will include at least a `[COMPUTE]` flag that you switch to `True` to extract a provider's behavioral features.
PANDA:
COMPUTE: False
VALID_SENSED_MINUTES: False
FEATURES: # (5)
exertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
nonexertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
We explain every provider's parameter in a table under the `Parameters description` heading on each provider documentation page.
# (6)
SRC_SCRIPT: src/features/phone_accelerometer/panda/main.py
```
### Provider Features
Each provider offers a set of behavioral features (see `#4.2` or `#5.2` in the example). For some providers these features are grouped in an array (like those for the `RAPIDS` provider in `#4.2`), but for others they are grouped in a collection of arrays depending on the meaning and purpose of those features (like those for the `PANDA` provider in `#5.2`). In either case, you can delete the features you are not interested in and they will not be included in the sensor's output feature file.
--8<--- "docs/snippets/feature_introduction_example.md"
We explain each behavioral feature in a table under the `Features description` heading on each provider documentation page.
These are descriptions of each marker for accessibility:
--8<--- "docs/snippets/feature_introduction_example.md"

View File

@ -1,6 +1,6 @@
# Fitbit Data Yield
We use Fitbit heart rate intraday data to extract data yield features. Fitbit data yield features can be used to remove rows ([time segments](../../setup/configuration/#time-segments)) that do not contain enough Fitbit data. You should decide what is your "enough" threshold depending on the time a participant was supposed to be wearing their Fitbit, the length of your study, and the rates of missing data that your analysis could handle.
We use Fitbit **heart rate intraday** data to extract data yield features. Fitbit data yield features can be used to remove rows ([time segments](../../setup/configuration/#time-segments)) that do not contain enough Fitbit data. You should decide what is your "enough" threshold depending on the time a participant was supposed to be wearing their Fitbit, the length of your study, and the rates of missing data that your analysis could handle.
!!! hint "Why is Fitbit data yield important?"
Imagine that you want to extract `FITBIT_STEPS_SUMMARY` features on daily segments (`00:00` to `23:59`). Let's say that on day 1 the Fitbit logged 6k as the total step count and the heart rate sensor logged 24 hours of data, while on day 2 the Fitbit logged 101 as the total step count and the heart rate sensor logged only 2 hours of data. It's very likely that on day 2 you walked during the other 22 hours, so including this day in your analysis could bias your results.
@ -8,7 +8,7 @@ Sensor parameters description for `[FITBIT_DATA_YIELD]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[SENSORS]`| The Fitbit sensor we considered for calculating the Fitbit data yield features.
|`[SENSORS]`| The Fitbit sensor to consider when calculating the Fitbit data yield features. We only support `FITBIT_HEARTRATE_INTRADAY` since sleep data is commonly collected only overnight and step counts are logged as 0 even when the device is not being worn.
## RAPIDS provider
@ -23,8 +23,7 @@ Before explaining the data yield features, let's define the following relevant c
!!! info "File Sequence"
```bash
- data/raw/{pid}/fitbit_heartrate_intraday_raw.csv
- data/raw/{pid}/fitbit_heartrate_intraday_parsed.csv
- data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv
- data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv
- data/interim/{pid}/fitbit_data_yield_features/fitbit_data_yield_{language}_{provider_key}.csv
- data/processed/features/{pid}/fitbit_data_yield.csv
```

View File

@ -4,30 +4,7 @@ Sensor parameters description for `[FITBIT_HEARTRATE_INTRADAY]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table name or file path where the heart rate intraday data is stored. The configuration keys in [Device Data Source Configuration](../../setup/configuration/#device-data-source-configuration) control whether this parameter is interpreted as table or file.
The format of the column(s) containing the Fitbit sensor data can be `JSON` or `PLAIN_TEXT`. The data in `JSON` format is obtained directly from the Fitbit API. We support `PLAIN_TEXT` in case you already parsed your data and don't have access to your participants' Fitbit accounts anymore. If your data is in `JSON` format then summary and intraday data come packed together.
We provide examples of the input format that RAPIDS expects, note that both examples for `JSON` and `PLAIN_TEXT` are tabular and the actual format difference comes in the `fitbit_data` column (we truncate the `JSON` example for brevity).
??? example "Example of the structure of source data"
=== "JSON"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-07","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":1200.6102,"max":88,"min":31,"minutes":1058,"name":"Out of Range"},{"caloriesOut":760.3020,"max":120,"min":86,"minutes":366,"name":"Fat Burn"},{"caloriesOut":15.2048,"max":146,"min":120,"minutes":2,"name":"Cardio"},{"caloriesOut":0,"max":221,"min":148,"minutes":0,"name":"Peak"}],"restingHeartRate":72}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":68},{"time":"00:01:00","value":67},{"time":"00:02:00","value":67},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-08","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":1100.1120,"max":89,"min":30,"minutes":921,"name":"Out of Range"},{"caloriesOut":660.0012,"max":118,"min":82,"minutes":361,"name":"Fat Burn"},{"caloriesOut":23.7088,"max":142,"min":108,"minutes":3,"name":"Cardio"},{"caloriesOut":0,"max":221,"min":148,"minutes":0,"name":"Peak"}],"restingHeartRate":70}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":77},{"time":"00:01:00","value":75},{"time":"00:02:00","value":73},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-09","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":750.3615,"max":77,"min":30,"minutes":851,"name":"Out of Range"},{"caloriesOut":734.1516,"max":107,"min":77,"minutes":550,"name":"Fat Burn"},{"caloriesOut":131.8579,"max":130,"min":107,"minutes":29,"name":"Cardio"},{"caloriesOut":0,"max":220,"min":130,"minutes":0,"name":"Peak"}],"restingHeartRate":69}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":90},{"time":"00:01:00","value":89},{"time":"00:02:00","value":88},...],"datasetInterval":1,"datasetType":"minute"}}
=== "PLAIN_TEXT"
All columns are mandatory, however, all except `device_id` and `local_date_time` can be empty if you don't have that data. Just have in mind that some features will be empty if some of these columns are empty.
|device_id |local_date_time |heartrate |heartrate_zone |
|-------------------------------------- |---------------------- |--------- |--------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:00:00 |68 |outofrange |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:01:00 |67 |outofrange |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:02:00 |67 |outofrange |
|`[CONTAINER]`| Container where your heart rate intraday data is stored; depending on the data stream you are using, this can be a database table, a CSV file, etc. |
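As a minimal sketch, and assuming your data stream reads from a database table, this key could be set in `config.yaml` as shown below (the container name is illustrative); the same pattern applies to the other Fitbit sensors that only require a `[CONTAINER]` key:

```yaml
FITBIT_HEARTRATE_INTRADAY:
  CONTAINER: fitbit_heartrate_intraday  # or a CSV file path, depending on your data stream
```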
## RAPIDS provider
@ -38,8 +15,7 @@ We provide examples of the input format that RAPIDS expects, note that both exam
!!! info "File Sequence"
```bash
- data/raw/{pid}/fitbit_heartrate_intraday_raw.csv
- data/raw/{pid}/fitbit_heartrate_intraday_parsed.csv
- data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv
- data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv
- data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv
- data/processed/features/{pid}/fitbit_heartrate_intraday.csv
```

View File

@ -4,30 +4,7 @@ Sensor parameters description for `[FITBIT_HEARTRATE_SUMMARY]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table name or file path where the heart rate summary data is stored. The configuration keys in [Device Data Source Configuration](../../setup/configuration/#device-data-source-configuration) control whether this parameter is interpreted as table or file.
The format of the column(s) containing the Fitbit sensor data can be `JSON` or `PLAIN_TEXT`. The data in `JSON` format is obtained directly from the Fitbit API. We support `PLAIN_TEXT` in case you already parsed your data and don't have access to your participants' Fitbit accounts anymore. If your data is in `JSON` format then summary and intraday data come packed together.
We provide examples of the input format that RAPIDS expects, note that both examples for `JSON` and `PLAIN_TEXT` are tabular and the actual format difference comes in the `fitbit_data` column (we truncate the `JSON` example for brevity).
??? example "Example of the structure of source data"
=== "JSON"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-07","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":1200.6102,"max":88,"min":31,"minutes":1058,"name":"Out of Range"},{"caloriesOut":760.3020,"max":120,"min":86,"minutes":366,"name":"Fat Burn"},{"caloriesOut":15.2048,"max":146,"min":120,"minutes":2,"name":"Cardio"},{"caloriesOut":0,"max":221,"min":148,"minutes":0,"name":"Peak"}],"restingHeartRate":72}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":68},{"time":"00:01:00","value":67},{"time":"00:02:00","value":67},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-08","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":1100.1120,"max":89,"min":30,"minutes":921,"name":"Out of Range"},{"caloriesOut":660.0012,"max":118,"min":82,"minutes":361,"name":"Fat Burn"},{"caloriesOut":23.7088,"max":142,"min":108,"minutes":3,"name":"Cardio"},{"caloriesOut":0,"max":221,"min":148,"minutes":0,"name":"Peak"}],"restingHeartRate":70}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":77},{"time":"00:01:00","value":75},{"time":"00:02:00","value":73},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-09","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":750.3615,"max":77,"min":30,"minutes":851,"name":"Out of Range"},{"caloriesOut":734.1516,"max":107,"min":77,"minutes":550,"name":"Fat Burn"},{"caloriesOut":131.8579,"max":130,"min":107,"minutes":29,"name":"Cardio"},{"caloriesOut":0,"max":220,"min":130,"minutes":0,"name":"Peak"}],"restingHeartRate":69}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":90},{"time":"00:01:00","value":89},{"time":"00:02:00","value":88},...],"datasetInterval":1,"datasetType":"minute"}}
=== "PLAIN_TEXT"
All columns are mandatory, however, all except `device_id` and `local_date_time` can be empty if you don't have that data. Just have in mind that some features will be empty if some of these columns are empty.
|device_id |local_date_time |heartrate_daily_restinghr |heartrate_daily_caloriesoutofrange |heartrate_daily_caloriesfatburn |heartrate_daily_caloriescardio |heartrate_daily_caloriespeak |
|-------------------------------------- |----------------- |------- |-------------- |------------- |------------ |-------|
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 |72 |1200.6102 |760.3020 |15.2048 |0 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-08 |70 |1100.1120 |660.0012 |23.7088 |0 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-09 |69 |750.3615 |734.1516 |131.8579 |0 |
|`[CONTAINER]`| Container where your heart rate summary data is stored; depending on the data stream you are using, this can be a database table, a CSV file, etc. |
## RAPIDS provider
@ -38,8 +15,7 @@ We provide examples of the input format that RAPIDS expects, note that both exam
!!! info "File Sequence"
```bash
- data/raw/{pid}/fitbit_heartrate_summary_raw.csv
- data/raw/{pid}/fitbit_heartrate_summary_parsed.csv
- data/raw/{pid}/fitbit_heartrate_summary_parsed_with_datetime.csv
- data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv
- data/interim/{pid}/fitbit_heartrate_summary_features/fitbit_heartrate_summary_{language}_{provider_key}.csv
- data/processed/features/{pid}/fitbit_heartrate_summary.csv
```

View File

@ -4,55 +4,7 @@ Sensor parameters description for `[FITBIT_SLEEP_INTRADAY]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table name or file path where the sleep intraday data is stored. The configuration keys in [Device Data Source Configuration](../../setup/configuration/#device-data-source-configuration) control whether this parameter is interpreted as table or file.
The format of the column(s) containing the Fitbit sensor data can be `JSON` or `PLAIN_TEXT`. The data in `JSON` format is obtained directly from the Fitbit API. We support `PLAIN_TEXT` in case you already parsed your data and don't have access to your participants' Fitbit accounts anymore. If your data is in `JSON` format then summary and intraday data come packed together.
We provide examples of the input format that RAPIDS expects, note that both examples for `JSON` and `PLAIN_TEXT` are tabular and the actual format difference comes in the `fitbit_data` column (we truncate the `JSON` example for brevity).
??? example "Example of the structure of source data with Fitbits sleep API Version 1"
=== "JSON"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep": [{"awakeCount": 2, "awakeDuration": 3, "awakeningsCount": 10, "dateOfSleep": "2020-10-07", "duration": 8100000, "efficiency": 91, "endTime": "2020-10-07T18:10:00.000", "isMainSleep": true, "logId": 14147921940, "minuteData": [{"dateTime": "15:55:00", "value": "3"}, {"dateTime": "15:56:00", "value": "3"}, {"dateTime": "15:57:00", "value": "2"},...], "minutesAfterWakeup": 0, "minutesAsleep": 123, "minutesAwake": 12, "minutesToFallAsleep": 0, "restlessCount": 8, "restlessDuration": 9, "startTime": "2020-10-07T15:55:00.000", "timeInBed": 135}, {"awakeCount": 0, "awakeDuration": 0, "awakeningsCount": 1, "dateOfSleep": "2020-10-07", "duration": 3780000, "efficiency": 100, "endTime": "2020-10-07T10:52:30.000", "isMainSleep": false, "logId": 14144903977, "minuteData": [{"dateTime": "09:49:00", "value": "1"}, {"dateTime": "09:50:00", "value": "1"}, {"dateTime": "09:51:00", "value": "1"},...], "minutesAfterWakeup": 1, "minutesAsleep": 62, "minutesAwake": 0, "minutesToFallAsleep": 0, "restlessCount": 1, "restlessDuration": 1, "startTime": "2020-10-07T09:49:00.000", "timeInBed": 63}], "summary": {"totalMinutesAsleep": 185, "totalSleepRecords": 2, "totalTimeInBed": 198}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep": [{"awakeCount": 3, "awakeDuration": 21, "awakeningsCount": 16, "dateOfSleep": "2020-10-08", "duration": 19260000, "efficiency": 89, "endTime": "2020-10-08T06:01:30.000", "isMainSleep": true, "logId": 14150613895, "minuteData": [{"dateTime": "00:40:00", "value": "3"}, {"dateTime": "00:41:00", "value": "3"}, {"dateTime": "00:42:00", "value": "3"},...], "minutesAfterWakeup": 0, "minutesAsleep": 275, "minutesAwake": 33, "minutesToFallAsleep": 0, "restlessCount": 13, "restlessDuration": 25, "startTime": "2020-10-08T00:40:00.000", "timeInBed": 321}], "summary": {"totalMinutesAsleep": 275, "totalSleepRecords": 1, "totalTimeInBed": 321}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep": [{"awakeCount": 1, "awakeDuration": 3, "awakeningsCount": 8, "dateOfSleep": "2020-10-09", "duration": 19320000, "efficiency": 96, "endTime": "2020-10-09T05:57:30.000", "isMainSleep": true, "logId": 14161136803, "minuteData": [{"dateTime": "00:35:30", "value": "2"}, {"dateTime": "00:36:30", "value": "1"}, {"dateTime": "00:37:30", "value": "1"},...], "minutesAfterWakeup": 0, "minutesAsleep": 309, "minutesAwake": 13, "minutesToFallAsleep": 0, "restlessCount": 7, "restlessDuration": 10, "startTime": "2020-10-09T00:35:30.000", "timeInBed": 322}], "summary": {"totalMinutesAsleep": 309, "totalSleepRecords": 1, "totalTimeInBed": 322}}
=== "PLAIN_TEXT"
All columns are mandatory, however, all except `device_id`, `local_date_time` and `duration` can be empty if you don't have that data. Just have in mind that some features might be inaccurate or empty as `type_episode_id`, `level`, `is_main_sleep`, and `type` are used for sleep episodes extraction. `type_episode_id` is based on where it is extracted: if it is extracted from the 1st "minutesData" block, the `type_episode_id` field will be 0. Similarly, the kth block will be k-1. Actually, you only need to make sure rows extracted from the same "minutesData" block are assigned with the same unique `type_episode_id` value.
|device_id |type_episode_id |local_date_time |duration |level |is_main_sleep |type |
|------------------------------------ |---------------- |------------------- |--------- |---------- |-------------- |-------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |0 |2020-10-07 15:55:00 |60 |awake |0 |classic |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |0 |2020-10-07 15:56:00 |60 |awake |0 |classic |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |0 |2020-10-07 15:57:00 |60 |restless |0 |classic |
??? example "Example of the structure of source data with Fitbits sleep API Version 1.2"
=== "JSON"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-10","duration":3600000,"efficiency":92,"endTime":"2020-10-10T16:37:00.000","infoCode":2,"isMainSleep":false,"levels":{"data":[{"dateTime":"2020-10-10T15:36:30.000","level":"restless","seconds":60},{"dateTime":"2020-10-10T15:37:30.000","level":"asleep","seconds":660},{"dateTime":"2020-10-10T15:48:30.000","level":"restless","seconds":60},...], "summary":{"asleep":{"count":0,"minutes":56},"awake":{"count":0,"minutes":0},"restless":{"count":3,"minutes":4}}},"logId":26315914306,"minutesAfterWakeup":0,"minutesAsleep":55,"minutesAwake":5,"minutesToFallAsleep":0,"startTime":"2020-10-10T15:36:30.000","timeInBed":60,"type":"classic"},{"dateOfSleep":"2020-10-10","duration":22980000,"efficiency":88,"endTime":"2020-10-10T08:10:00.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-10T01:46:30.000","level":"light","seconds":420},{"dateTime":"2020-10-10T01:53:30.000","level":"deep","seconds":1230},{"dateTime":"2020-10-10T02:14:00.000","level":"light","seconds":360},...], "summary":{"deep":{"count":3,"minutes":92,"thirtyDayAvgMinutes":0},"light":{"count":29,"minutes":193,"thirtyDayAvgMinutes":0},"rem":{"count":4,"minutes":33,"thirtyDayAvgMinutes":0},"wake":{"count":28,"minutes":65,"thirtyDayAvgMinutes":0}}},"logId":26311786557,"minutesAfterWakeup":0,"minutesAsleep":318,"minutesAwake":65,"minutesToFallAsleep":0,"startTime":"2020-10-10T01:46:30.000","timeInBed":383,"type":"stages"}],"summary":{"stages":{"deep":92,"light":193,"rem":33,"wake":65},"totalMinutesAsleep":373,"totalSleepRecords":2,"totalTimeInBed":443}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-11","duration":41640000,"efficiency":89,"endTime":"2020-10-11T11:47:00.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-11T00:12:30.000","level":"wake","seconds":450},{"dateTime":"2020-10-11T00:20:00.000","level":"light","seconds":870},{"dateTime":"2020-10-11T00:34:30.000","level":"wake","seconds":780},...], "summary":{"deep":{"count":4,"minutes":52,"thirtyDayAvgMinutes":62},"light":{"count":32,"minutes":442,"thirtyDayAvgMinutes":364},"rem":{"count":6,"minutes":68,"thirtyDayAvgMinutes":58},"wake":{"count":29,"minutes":132,"thirtyDayAvgMinutes":94}}},"logId":26589710670,"minutesAfterWakeup":1,"minutesAsleep":562,"minutesAwake":132,"minutesToFallAsleep":0,"startTime":"2020-10-11T00:12:30.000","timeInBed":694,"type":"stages"}],"summary":{"stages":{"deep":52,"light":442,"rem":68,"wake":132},"totalMinutesAsleep":562,"totalSleepRecords":1,"totalTimeInBed":694}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-12","duration":28980000,"efficiency":93,"endTime":"2020-10-12T09:34:30.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-12T01:31:00.000","level":"wake","seconds":600},{"dateTime":"2020-10-12T01:41:00.000","level":"light","seconds":60},{"dateTime":"2020-10-12T01:42:00.000","level":"deep","seconds":2340},...], "summary":{"deep":{"count":4,"minutes":63,"thirtyDayAvgMinutes":59},"light":{"count":27,"minutes":257,"thirtyDayAvgMinutes":364},"rem":{"count":5,"minutes":94,"thirtyDayAvgMinutes":58},"wake":{"count":24,"minutes":69,"thirtyDayAvgMinutes":95}}},"logId":26589710673,"minutesAfterWakeup":0,"minutesAsleep":415,"minutesAwake":68,"minutesToFallAsleep":0,"startTime":"2020-10-12T01:31:00.000","timeInBed":483,"type":"stages"}],"summary":{"stages":{"deep":63,"light":257,"rem":94,"wake":69},"totalMinutesAsleep":415,"totalSleepRecords":1,"totalTimeInBed":483}}
=== "PLAIN_TEXT"
All columns are mandatory, however, all except `device_id`, `local_date_time` and `duration` can be empty if you don't have that data. Just have in mind that some features might be inaccurate or empty as `type_episode_id`, `level`, `is_main_sleep`, and `type` are used for sleep episodes extraction. `type_episode_id` is based on where it is extracted: if it is extracted from the 1st "data" and "shortData" block, the `type_episode_id` field will be 0. Similarly, the kth block will be k-1. Actually, you only need to make sure rows extracted from the same "minutesData" block are assigned with the same unique `type_episode_id` value.
|device_id |type_episode_id |local_date_time |duration |level |is_main_sleep |type |
|------------------------------------ |---------------- |------------------- |--------- |---------- |-------------- |-------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |0 |2020-10-10 15:36:30 |60 |restless |0 |classic |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |0 |2020-10-10 15:37:30 |660 |asleep |0 |classic |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |0 |2020-10-10 15:48:30 |60 |restless |0 |classic |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |... |... |... |... |... |... |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |1 |2020-10-10 01:46:30 |420 |light |1 |stages |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |1 |2020-10-10 01:53:30 |1230 |deep |1 |stages |
|`[CONTAINER]`| Container where your sleep intraday data is stored; depending on the data stream you are using, this can be a database table, a CSV file, etc. |
## RAPIDS provider
@ -62,7 +14,8 @@ We provide examples of the input format that RAPIDS expects, note that both exam
!!! info "File Sequence"
```bash
- data/raw/{pid}/fitbit_sleep_intraday_raw.csv
- data/raw/{pid}/fitbit_sleep_intraday_parsed.csv
- data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv
- data/interim/{pid}/fitbit_sleep_intraday_episodes.csv
- data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled.csv
- data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled_with_datetime.csv
- data/interim/{pid}/fitbit_sleep_intraday_features/fitbit_sleep_intraday_{language}_{provider_key}.csv

View File

@ -4,52 +4,7 @@ Sensor parameters description for `[FITBIT_SLEEP_SUMMARY]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table name or file path where the sleep summary data is stored. The configuration keys in [Device Data Source Configuration](../../setup/configuration/#device-data-source-configuration) control whether this parameter is interpreted as table or file.
The format of the column(s) containing the Fitbit sensor data can be `JSON` or `PLAIN_TEXT`. The data in `JSON` format is obtained directly from the Fitbit API. We support `PLAIN_TEXT` in case you already parsed your data and don't have access to your participants' Fitbit accounts anymore. If your data is in `JSON` format then summary and intraday data come packed together.
We provide examples of the input format that RAPIDS expects, note that both examples for `JSON` and `PLAIN_TEXT` are tabular and the actual format difference comes in the `fitbit_data` column (we truncate the `JSON` example for brevity).
??? example "Example of the structure of source data with Fitbits sleep API Version 1"
=== "JSON"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep": [{"awakeCount": 2, "awakeDuration": 3, "awakeningsCount": 10, "dateOfSleep": "2020-10-07", "duration": 8100000, "efficiency": 91, "endTime": "2020-10-07T18:10:00.000", "isMainSleep": true, "logId": 14147921940, "minuteData": [{"dateTime": "15:55:00", "value": "3"}, {"dateTime": "15:56:00", "value": "3"}, {"dateTime": "15:57:00", "value": "2"},...], "minutesAfterWakeup": 0, "minutesAsleep": 123, "minutesAwake": 12, "minutesToFallAsleep": 0, "restlessCount": 8, "restlessDuration": 9, "startTime": "2020-10-07T15:55:00.000", "timeInBed": 135}, {"awakeCount": 0, "awakeDuration": 0, "awakeningsCount": 1, "dateOfSleep": "2020-10-07", "duration": 3780000, "efficiency": 100, "endTime": "2020-10-07T10:52:30.000", "isMainSleep": false, "logId": 14144903977, "minuteData": [{"dateTime": "09:49:00", "value": "1"}, {"dateTime": "09:50:00", "value": "1"}, {"dateTime": "09:51:00", "value": "1"},...], "minutesAfterWakeup": 1, "minutesAsleep": 62, "minutesAwake": 0, "minutesToFallAsleep": 0, "restlessCount": 1, "restlessDuration": 1, "startTime": "2020-10-07T09:49:00.000", "timeInBed": 63}], "summary": {"totalMinutesAsleep": 185, "totalSleepRecords": 2, "totalTimeInBed": 198}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep": [{"awakeCount": 3, "awakeDuration": 21, "awakeningsCount": 16, "dateOfSleep": "2020-10-08", "duration": 19260000, "efficiency": 89, "endTime": "2020-10-08T06:01:30.000", "isMainSleep": true, "logId": 14150613895, "minuteData": [{"dateTime": "00:40:00", "value": "3"}, {"dateTime": "00:41:00", "value": "3"}, {"dateTime": "00:42:00", "value": "3"},...], "minutesAfterWakeup": 0, "minutesAsleep": 275, "minutesAwake": 33, "minutesToFallAsleep": 0, "restlessCount": 13, "restlessDuration": 25, "startTime": "2020-10-08T00:40:00.000", "timeInBed": 321}], "summary": {"totalMinutesAsleep": 275, "totalSleepRecords": 1, "totalTimeInBed": 321}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep": [{"awakeCount": 1, "awakeDuration": 3, "awakeningsCount": 8, "dateOfSleep": "2020-10-09", "duration": 19320000, "efficiency": 96, "endTime": "2020-10-09T05:57:30.000", "isMainSleep": true, "logId": 14161136803, "minuteData": [{"dateTime": "00:35:30", "value": "2"}, {"dateTime": "00:36:30", "value": "1"}, {"dateTime": "00:37:30", "value": "1"},...], "minutesAfterWakeup": 0, "minutesAsleep": 309, "minutesAwake": 13, "minutesToFallAsleep": 0, "restlessCount": 7, "restlessDuration": 10, "startTime": "2020-10-09T00:35:30.000", "timeInBed": 322}], "summary": {"totalMinutesAsleep": 309, "totalSleepRecords": 1, "totalTimeInBed": 322}}
=== "PLAIN_TEXT"
All columns are mandatory, however, all except `device_id` and `local_date_time` can be empty if you don't have that data. Just have in mind that some features will be empty if some of these columns are empty.
|device_id |local_start_date_time |local_end_date_time |efficiency |minutes_after_wakeup |minutes_asleep |minutes_awake |minutes_to_fall_asleep |minutes_in_bed |is_main_sleep |type |count_awake |duration_awake |count_awakenings |count_restless |duration_restless |
|-------------------------------------- |---------------------- |---------------------- |----------- |--------------------- |--------------- |-------------- |----------------------- |--------------- |-------------- |-------- |----------- |--------------- |----------------- |--------------- |------------------ |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 15:55:00 |2020-10-07 18:10:00 |91 |0 |123 |12 |0 |135 |1 |classic |2 |3 |10 |8 |9 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 09:49:00 |2020-10-07 10:52:30 |100 |1 |62 |0 |0 |63 |0 |classic |0 |0 |1 |1 |1 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-08 00:40:00 |2020-10-08 06:01:30 |89 |0 |275 |33 |0 |321 |1 |classic |3 |21 |16 |13 |25 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-09 00:35:30 |2020-10-09 05:57:30 |96 |0 |309 |13 |0 |322 |1 |classic |1 |3 |8 |7 |10 |
??? example "Example of the structure of source data with Fitbits sleep API Version 1.2"
=== "JSON"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-10","duration":3600000,"efficiency":92,"endTime":"2020-10-10T16:37:00.000","infoCode":2,"isMainSleep":false,"levels":{"data":[{"dateTime":"2020-10-10T15:36:30.000","level":"restless","seconds":60},{"dateTime":"2020-10-10T15:37:30.000","level":"asleep","seconds":660},{"dateTime":"2020-10-10T15:48:30.000","level":"restless","seconds":60},...], "summary":{"asleep":{"count":0,"minutes":56},"awake":{"count":0,"minutes":0},"restless":{"count":3,"minutes":4}}},"logId":26315914306,"minutesAfterWakeup":0,"minutesAsleep":55,"minutesAwake":5,"minutesToFallAsleep":0,"startTime":"2020-10-10T15:36:30.000","timeInBed":60,"type":"classic"},{"dateOfSleep":"2020-10-10","duration":22980000,"efficiency":88,"endTime":"2020-10-10T08:10:00.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-10T01:46:30.000","level":"light","seconds":420},{"dateTime":"2020-10-10T01:53:30.000","level":"deep","seconds":1230},{"dateTime":"2020-10-10T02:14:00.000","level":"light","seconds":360},...], "summary":{"deep":{"count":3,"minutes":92,"thirtyDayAvgMinutes":0},"light":{"count":29,"minutes":193,"thirtyDayAvgMinutes":0},"rem":{"count":4,"minutes":33,"thirtyDayAvgMinutes":0},"wake":{"count":28,"minutes":65,"thirtyDayAvgMinutes":0}}},"logId":26311786557,"minutesAfterWakeup":0,"minutesAsleep":318,"minutesAwake":65,"minutesToFallAsleep":0,"startTime":"2020-10-10T01:46:30.000","timeInBed":383,"type":"stages"}],"summary":{"stages":{"deep":92,"light":193,"rem":33,"wake":65},"totalMinutesAsleep":373,"totalSleepRecords":2,"totalTimeInBed":443}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-11","duration":41640000,"efficiency":89,"endTime":"2020-10-11T11:47:00.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-11T00:12:30.000","level":"wake","seconds":450},{"dateTime":"2020-10-11T00:20:00.000","level":"light","seconds":870},{"dateTime":"2020-10-11T00:34:30.000","level":"wake","seconds":780},...], "summary":{"deep":{"count":4,"minutes":52,"thirtyDayAvgMinutes":62},"light":{"count":32,"minutes":442,"thirtyDayAvgMinutes":364},"rem":{"count":6,"minutes":68,"thirtyDayAvgMinutes":58},"wake":{"count":29,"minutes":132,"thirtyDayAvgMinutes":94}}},"logId":26589710670,"minutesAfterWakeup":1,"minutesAsleep":562,"minutesAwake":132,"minutesToFallAsleep":0,"startTime":"2020-10-11T00:12:30.000","timeInBed":694,"type":"stages"}],"summary":{"stages":{"deep":52,"light":442,"rem":68,"wake":132},"totalMinutesAsleep":562,"totalSleepRecords":1,"totalTimeInBed":694}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-12","duration":28980000,"efficiency":93,"endTime":"2020-10-12T09:34:30.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-12T01:31:00.000","level":"wake","seconds":600},{"dateTime":"2020-10-12T01:41:00.000","level":"light","seconds":60},{"dateTime":"2020-10-12T01:42:00.000","level":"deep","seconds":2340},...], "summary":{"deep":{"count":4,"minutes":63,"thirtyDayAvgMinutes":59},"light":{"count":27,"minutes":257,"thirtyDayAvgMinutes":364},"rem":{"count":5,"minutes":94,"thirtyDayAvgMinutes":58},"wake":{"count":24,"minutes":69,"thirtyDayAvgMinutes":95}}},"logId":26589710673,"minutesAfterWakeup":0,"minutesAsleep":415,"minutesAwake":68,"minutesToFallAsleep":0,"startTime":"2020-10-12T01:31:00.000","timeInBed":483,"type":"stages"}],"summary":{"stages":{"deep":63,"light":257,"rem":94,"wake":69},"totalMinutesAsleep":415,"totalSleepRecords":1,"totalTimeInBed":483}}
=== "PLAIN_TEXT"
All columns are mandatory, however, all except `device_id` and `local_date_time` can be empty if you don't have that data. Just have in mind that some features will be empty if some of these columns are empty.
|device_id |local_start_date_time |local_end_date_time |efficiency |minutes_after_wakeup |minutes_asleep |minutes_awake |minutes_to_fall_asleep |minutes_in_bed |is_main_sleep |type |
|-------------------------------------- |---------------------- |---------------------- |----------- |--------------------- |--------------- |-------------- |----------------------- |--------------- |-------------- |-------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-10 15:36:30 |2020-10-10 16:37:00 |92 |0 |55 |5 |0 |60 |0 |classic |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-10 01:46:30 |2020-10-10 08:10:00 |88 |0 |318 |65 |0 |383 |1 |stages |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-11 00:12:30 |2020-10-11 11:47:00 |89 |1 |562 |132 |0 |694 |1 |stages |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-12 01:31:00 |2020-10-12 09:34:30 |93 |0 |415 |68 |0 |483 |1 |stages |
|`[CONTAINER]`| Container where your sleep summary data is stored; depending on the data stream you are using, this can be a database table, a CSV file, etc. |
## RAPIDS provider
@ -60,8 +15,7 @@ We provide examples of the input format that RAPIDS expects, note that both exam
!!! info "File Sequence"
```bash
- data/raw/{pid}/fitbit_sleep_summary_raw.csv
- data/raw/{pid}/fitbit_sleep_summary_parsed.csv
- data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv
- data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv
- data/interim/{pid}/fitbit_sleep_summary_features/fitbit_sleep_summary_{language}_{provider_key}.csv
- data/processed/features/{pid}/fitbit_sleep_summary.csv
```

View File

@ -4,30 +4,7 @@ Sensor parameters description for `[FITBIT_STEPS_INTRADAY]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table name or file path where the steps intraday data is stored. The configuration keys in [Device Data Source Configuration](../../setup/configuration/#device-data-source-configuration) control whether this parameter is interpreted as table or file.
The format of the column(s) containing the Fitbit sensor data can be `JSON` or `PLAIN_TEXT`. The data in `JSON` format is obtained directly from the Fitbit API. We support `PLAIN_TEXT` in case you already parsed your data and don't have access to your participants' Fitbit accounts anymore. If your data is in `JSON` format then summary and intraday data come packed together.
We provide examples of the input format that RAPIDS expects, note that both examples for `JSON` and `PLAIN_TEXT` are tabular and the actual format difference comes in the `fitbit_data` column (we truncate the `JSON` example for brevity).
??? example "Example of the structure of source data"
=== "JSON"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |"activities-steps":[{"dateTime":"2020-10-07","value":"1775"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":5},{"time":"00:01:00","value":3},{"time":"00:02:00","value":0},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |"activities-steps":[{"dateTime":"2020-10-08","value":"3201"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":14},{"time":"00:01:00","value":11},{"time":"00:02:00","value":10},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |"activities-steps":[{"dateTime":"2020-10-09","value":"998"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":0},{"time":"00:01:00","value":0},{"time":"00:02:00","value":0},...],"datasetInterval":1,"datasetType":"minute"}}
=== "PLAIN_TEXT"
All columns are mandatory.
|device_id |local_date_time |steps |
|-------------------------------------- |---------------------- |--------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:00:00 |5 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:01:00 |3 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:02:00 |0 |
|`[CONTAINER]`| Container where your steps intraday data is stored; depending on the data stream you are using, this can be a database table, a CSV file, etc. |
## RAPIDS provider
@ -38,8 +15,7 @@ We provide examples of the input format that RAPIDS expects, note that both exam
!!! info "File Sequence"
```bash
- data/raw/{pid}/fitbit_steps_intraday_raw.csv
- data/raw/{pid}/fitbit_steps_intraday_parsed.csv
- data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv
- data/raw/{pid}/fitbit_steps_intraday_with_datetime.csv
- data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv
- data/processed/features/{pid}/fitbit_steps_intraday.csv
```

View File

@ -4,30 +4,7 @@ Sensor parameters description for `[FITBIT_STEPS_SUMMARY]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table name or file path where the steps summary data is stored. The configuration keys in [Device Data Source Configuration](../../setup/configuration/#device-data-source-configuration) control whether this parameter is interpreted as table or file.
The format of the column(s) containing the Fitbit sensor data can be `JSON` or `PLAIN_TEXT`. The data in `JSON` format is obtained directly from the Fitbit API. We support `PLAIN_TEXT` in case you already parsed your data and don't have access to your participants' Fitbit accounts anymore. If your data is in `JSON` format then summary and intraday data come packed together.
We provide examples of the input format that RAPIDS expects, note that both examples for `JSON` and `PLAIN_TEXT` are tabular and the actual format difference comes in the `fitbit_data` column (we truncate the `JSON` example for brevity).
??? example "Example of the structure of source data"
=== "JSON"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |"activities-steps":[{"dateTime":"2020-10-07","value":"1775"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":5},{"time":"00:01:00","value":3},{"time":"00:02:00","value":0},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |"activities-steps":[{"dateTime":"2020-10-08","value":"3201"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":14},{"time":"00:01:00","value":11},{"time":"00:02:00","value":10},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |"activities-steps":[{"dateTime":"2020-10-09","value":"998"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":0},{"time":"00:01:00","value":0},{"time":"00:02:00","value":0},...],"datasetInterval":1,"datasetType":"minute"}}
=== "PLAIN_TEXT"
All columns are mandatory.
|device_id |local_date_time |steps |
|-------------------------------------- |---------------------- |--------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 |1775 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-08 |3201 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-09 |998 |
|`[CONTAINER]`| Container where your steps summary data is stored; depending on the data stream you are using, this can be a database table, a CSV file, etc. |
## RAPIDS provider
@ -38,8 +15,7 @@ We provide examples of the input format that RAPIDS expects, note that both exam
!!! info "File Sequence"
```bash
- data/raw/{pid}/fitbit_steps_summary_raw.csv
- data/raw/{pid}/fitbit_steps_summary_parsed.csv
- data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv
- data/raw/{pid}/fitbit_steps_summary_with_datetime.csv
- data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_{language}_{provider_key}.csv
- data/processed/features/{pid}/fitbit_steps_summary.csv
```

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_ACCELEROMETER]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the accelerometer data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the accelerometer data is stored
## RAPIDS provider

View File

@ -4,8 +4,8 @@ Sensor parameters description for `[PHONE_ACTIVITY_RECOGNITION]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE][ANDROID]`| Database table where the activity data from Android devices is stored (the AWARE client saves this data on different tables for Android and iOS)
|`[TABLE][IOS]`| Database table where the activity data from iOS devices is stored (the AWARE client saves this data on different tables for Android and iOS)
|`[CONTAINER][ANDROID]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the activity data from Android devices is stored (the AWARE client saves this data in different tables for Android and iOS)
|`[CONTAINER][IOS]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the activity data from iOS devices is stored (the AWARE client saves this data in different tables for Android and iOS)
|`[EPISODE_THRESHOLD_BETWEEN_ROWS]` | Difference in minutes between any two rows for them to be considered part of the same activity episode
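A hedged sketch of how these keys might look in `config.yaml`; the container names and the threshold value below are illustrative, not prescriptive:

```yaml
PHONE_ACTIVITY_RECOGNITION:
  CONTAINER:
    ANDROID: plugin_google_activity_recognition  # illustrative table name
    IOS: plugin_ios_activity_recognition         # illustrative table name
  EPISODE_THRESHOLD_BETWEEN_ROWS: 5  # minutes; illustrative value
```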
## RAPIDS provider
@ -18,7 +18,6 @@ Sensor parameters description for `[PHONE_ACTIVITY_RECOGNITION]`:
```bash
- data/raw/{pid}/phone_activity_recognition_raw.csv
- data/raw/{pid}/phone_activity_recognition_with_datetime.csv
- data/raw/{pid}/phone_activity_recognition_with_datetime_unified.csv
- data/interim/{pid}/phone_activity_recognition_episodes.csv
- data/interim/{pid}/phone_activity_recognition_episodes_resampled.csv
- data/interim/{pid}/phone_activity_recognition_episodes_resampled_with_datetime.csv

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_APPLICATIONS_CRASHES]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the applications crashes data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the applications crashes data is stored
|`[APPLICATION_CATEGORIES][CATALOGUE_SOURCE]` | `FILE` or `GOOGLE`. If `FILE`, app categories (genres) are read from `[CATALOGUE_FILE]`. If `GOOGLE`, app categories (genres) are scraped from the Play Store
|`[APPLICATION_CATEGORIES][CATALOGUE_FILE]` | CSV file with a `package_name` and `genre` column. By default we provide the catalogue created by [Stachl et al](../../citation#stachl-applications-crashes) in `data/external/stachl_application_genre_catalogue.csv`
|`[APPLICATION_CATEGORIES][UPDATE_CATALOGUE_FILE]` | If `[CATALOGUE_SOURCE]` is equal to `FILE`, this flag signals whether or not to update `[CATALOGUE_FILE]`; if `[CATALOGUE_SOURCE]` is equal to `GOOGLE`, all scraped genres will be saved to `[CATALOGUE_FILE]`
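A hedged sketch of how these keys could be set in `config.yaml` (the container name is hypothetical; the catalogue path is the default mentioned above):

```yaml
PHONE_APPLICATIONS_CRASHES:
  CONTAINER: applications_crashes  # hypothetical table or file name
  APPLICATION_CATEGORIES:
    CATALOGUE_SOURCE: FILE  # FILE or GOOGLE
    CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
    UPDATE_CATALOGUE_FILE: False
```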

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_APPLICATIONS_FOREGROUND]` (these param
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the applications foreground data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the applications foreground data is stored
|`[APPLICATION_CATEGORIES][CATALOGUE_SOURCE]` | `FILE` or `GOOGLE`. If `FILE`, app categories (genres) are read from `[CATALOGUE_FILE]`. If `GOOGLE`, app categories (genres) are scraped from the Play Store
|`[APPLICATION_CATEGORIES][CATALOGUE_FILE]` | CSV file with a `package_name` and `genre` column. By default we provide the catalogue created by [Stachl et al](../../citation#stachl-applications-foreground) in `data/external/stachl_application_genre_catalogue.csv`
|`[APPLICATION_CATEGORIES][UPDATE_CATALOGUE_FILE]` | If `[CATALOGUE_SOURCE]` is equal to `FILE`, this flag signals whether or not to update `[CATALOGUE_FILE]`; if `[CATALOGUE_SOURCE]` is equal to `GOOGLE`, all scraped genres will be saved to `[CATALOGUE_FILE]`

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_APPLICATIONS_NOTIFICATIONS]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the applications notifications data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the applications notifications data is stored
|`[APPLICATION_CATEGORIES][CATALOGUE_SOURCE]` | `FILE` or `GOOGLE`. If `FILE`, app categories (genres) are read from `[CATALOGUE_FILE]`. If `GOOGLE`, app categories (genres) are scraped from the Play Store
|`[APPLICATION_CATEGORIES][CATALOGUE_FILE]` | CSV file with a `package_name` and `genre` column. By default we provide the catalogue created by [Stachl et al](../../citation#stachl-applications-notifications) in `data/external/stachl_application_genre_catalogue.csv`
|`[APPLICATION_CATEGORIES][UPDATE_CATALOGUE_FILE]` | If `[CATALOGUE_SOURCE]` is equal to `FILE`, this flag signals whether or not to update `[CATALOGUE_FILE]`; if `[CATALOGUE_SOURCE]` is equal to `GOOGLE`, all scraped genres will be saved to `[CATALOGUE_FILE]`

View File

@ -1,10 +0,0 @@
# Phone Aware
Sensor parameters description for `[PHONE_AWARE_LOG]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the aware data is stored
!!! note
No feature providers have been implemented for this sensor yet, however you can use its key (`PHONE_AWARE_LOG`) to improve [`PHONE_DATA_YIELD`](../phone-data-yield) or you can [implement your own features](../add-new-features).

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_BATTERY]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the battery data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the battery data is stored
|`[EPISODE_THRESHOLD_BETWEEN_ROWS]` | Difference in minutes between any two rows for them to be considered part of the same battery charge or discharge episode
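As a rough sketch (the container name and threshold below are illustrative), the battery keys could be configured as:

```yaml
PHONE_BATTERY:
  CONTAINER: battery  # hypothetical table or file name
  EPISODE_THRESHOLD_BETWEEN_ROWS: 30  # minutes; illustrative value
```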
## RAPIDS provider

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_BLUETOOTH]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the bluetooth data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the bluetooth data is stored
## RAPIDS provider

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_CALLS]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the calls data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the calls data is stored
## RAPIDS Provider
@ -16,7 +16,6 @@ Sensor parameters description for `[PHONE_CALLS]`:
```bash
- data/raw/{pid}/phone_calls_raw.csv
- data/raw/{pid}/phone_calls_with_datetime.csv
- data/raw/{pid}/phone_calls_with_datetime_unified.csv
- data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv
- data/processed/features/{pid}/phone_calls.csv
```

View File

@ -4,8 +4,8 @@ Sensor parameters description for `[PHONE_CONVERSATION]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE][ANDROID]`| Database table where the conversation data from Android devices is stored (the AWARE client saves this data on different tables for Android and iOS)
|`[TABLE][IOS]`| Database table where the conversation data from iOS devices is stored (the AWARE client saves this data on different tables for Android and iOS)
|`[CONTAINER][ANDROID]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the conversation data from Android devices is stored (the AWARE client saves this data in different tables for Android and iOS)
|`[CONTAINER][IOS]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the conversation data from iOS devices is stored (the AWARE client saves this data in different tables for Android and iOS)
## RAPIDS provider
@ -17,7 +17,6 @@ Sensor parameters description for `[PHONE_CONVERSATION]`:
```bash
- data/raw/{pid}/phone_conversation_raw.csv
- data/raw/{pid}/phone_conversation_with_datetime.csv
- data/raw/{pid}/phone_conversation_with_datetime_unified.csv
- data/interim/{pid}/phone_conversation_features/phone_conversation_{language}_{provider_key}.csv
- data/processed/features/{pid}/phone_conversation.csv
```

View File

@ -18,18 +18,18 @@ Sensor parameters description for `[PHONE_DATA_YIELD]`:
PHONE_APPLICATIONS_CRASHES
PHONE_APPLICATIONS_FOREGROUND
PHONE_APPLICATIONS_NOTIFICATIONS
PHONE_AWARE_LOG
PHONE_BATTERY
PHONE_BLUETOOTH
PHONE_CALLS
PHONE_CONVERSATION
PHONE_MESSAGES
PHONE_KEYBOARD
PHONE_LIGHT
PHONE_LOCATIONS
PHONE_LOG
PHONE_MESSAGES
PHONE_SCREEN
PHONE_WIFI_VISIBLE
PHONE_WIFI_CONNECTED
PHONE_WIFI_VISIBLE
```
## RAPIDS provider

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_KEYBOARD]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the keyboard data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the keyboard data is stored
!!! note
No feature providers have been implemented for this sensor yet, however you can use its key (`PHONE_KEYBOARD`) to improve [`PHONE_DATA_YIELD`](../phone-data-yield) or you can [implement your own features](../add-new-features).

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_LIGHT]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the light data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the light data is stored
## RAPIDS provider

View File

@ -4,16 +4,28 @@ Sensor parameters description for `[PHONE_LOCATIONS]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the location data is stored
|`[LOCATIONS_TO_USE]`| Type of location data to use, one of `ALL`, `GPS`, `ALL_RESAMPLED` or `FUSED_RESAMPLED`. This filter is based on the `provider` column of the AWARE locations table, `ALL` includes every row, `GPS` only includes rows where provider is gps, `ALL_RESAMPLED` includes all rows after being resampled, and `FUSED_RESAMPLED` only includes rows where provider is fused after being resampled.
|`[FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD]`| if `ALL_RESAMPLED` or `FUSED_RESAMPLED` is used, the original fused data has to be resampled, a location row will be resampled to the next valid timestamp (see the Assumptions/Observations below) only if the time difference between them is less or equal than this threshold (in minutes).
|`[FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION]`| if `ALL_RESAMPLED` or `FUSED_RESAMPLED` is used, the original fused data has to be resampled, a location row will be resampled at most for this long (in minutes)
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the location data is stored
|`[LOCATIONS_TO_USE]`| Type of location data to use, one of `ALL`, `GPS`, `ALL_RESAMPLED` or `FUSED_RESAMPLED`. This filter is based on the `provider` column of the locations table, `ALL` includes every row, `GPS` only includes rows where the provider is gps, `ALL_RESAMPLED` includes all rows after being resampled, and `FUSED_RESAMPLED` only includes rows where the provider is fused after being resampled.
|`[FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD]`| If `ALL_RESAMPLED` or `FUSED_RESAMPLED` is used, the original fused data has to be resampled; a location row is resampled to the next valid timestamp (see the Assumptions/Observations below) only if the time difference between them is less than or equal to this threshold (in minutes).
|`[FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION]`| If `ALL_RESAMPLED` or `FUSED_RESAMPLED` is used, the original fused data has to be resampled; a location row is resampled for at most this long (in minutes)
!!! note "Assumptions/Observations"
**Types of location data to use**
AWARE Android and iOS clients can collect location coordinates through the phone\'s GPS, the network cellular towers around the phone, or Google\'s fused location API. If you want to use only the GPS provider set `[LOCATIONS_TO_USE]` to `GPS`, if you want to use all providers set `[LOCATIONS_TO_USE]` to `ALL`, if you collected location data from different providers including the fused API use `ALL_RESAMPLED`, if your AWARE client was configured to use fused location only or want to focus only on this provider, set `[LOCATIONS_TO_USE]` to `RESAMPLE_FUSED`. `ALL_RESAMPLED` and `RESAMPLE_FUSED` take the original location coordinates and replicate each pair forward in time as long as the phone was sensing data as indicated by the joined timestamps of [`[PHONE_DATA_YIELD][SENSORS]`](../phone-data-yield/), this is done because Google\'s API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one and because GPS and network providers can log data at variable rates.
Android and iOS clients can collect location coordinates through the phone's GPS, the network cellular towers around the phone, or Google's fused location API.
There are two parameters associated with resampling fused location. `FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, participant A\'s phone did not collect data between 10.30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second, in other words, we assume that we cannot longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). `FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION` (in minutes, default 720 or 12 hours) stops the last known fused location from being replicated longer that this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this resampling, let us know.
- If you want to use only the GPS provider, set `[LOCATIONS_TO_USE]` to `GPS`
- If you want to use all providers, set `[LOCATIONS_TO_USE]` to `ALL`
- If you collected location data from different providers, including the fused API, use `ALL_RESAMPLED`
- If your mobile client was configured to use fused location only, or you want to focus only on this provider, set `[LOCATIONS_TO_USE]` to `FUSED_RESAMPLED`.
`ALL_RESAMPLED` and `FUSED_RESAMPLED` take the original location coordinates and replicate each pair forward in time as long as the phone was sensing data as indicated by the joined timestamps of [`[PHONE_DATA_YIELD][SENSORS]`](../phone-data-yield/). This is done because Google's API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one and because GPS and network providers can log data at variable rates.
There are two parameters associated with resampling fused location.
1. `FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair. For example, if participant A's phone did not collect data between 10:30 am and 10:50 am and between 11:05 am and 11:40 am, the last known coordinate pair is replicated during the first period but not the second. In other words, we assume we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes.
2. `FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION` (in minutes, default 720 or 12 hours) stops the last known fused location from being replicated longer than this threshold even if the phone was sensing data continuously. For example, if participant A went home at 9 pm and their phone was sensing data without gaps until 11 am the next morning, the last known location is replicated only until 9 am.
If you have suggestions to modify or improve this resampling, let us know.
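The sketch below illustrates this resampling logic under simplifying assumptions (millisecond timestamps, a pandas DataFrame with `timestamp`, `double_latitude`, and `double_longitude` columns); it is not RAPIDS' actual resampling script, and the function and column names are only illustrative.
```python
# Illustrative sketch only; RAPIDS' actual resampling script handles more edge cases.
import pandas as pd

def resample_fused(locations, sensed_timestamps,
                   consecutive_threshold_min=30, time_since_valid_min=720):
    """Carry each known coordinate pair forward over the timestamps (ms) at which the
    phone sensed any data, stopping at sensing gaps longer than consecutive_threshold_min
    and never replicating a fix for longer than time_since_valid_min (both in minutes)."""
    fixes = locations.sort_values("timestamp").to_dict("records")
    fix_idx, last_fix, last_ts, rows = 0, None, None, []
    for ts in sorted(sensed_timestamps):
        new_fix = False
        # take any new real location fixes logged up to this sensed timestamp
        while fix_idx < len(fixes) and fixes[fix_idx]["timestamp"] <= ts:
            last_fix, fix_idx, new_fix = fixes[fix_idx], fix_idx + 1, True
        if last_ts is not None and (ts - last_ts) / 60000 > consecutive_threshold_min and not new_fix:
            last_fix = None  # sensing gap too long and no fresh fix arrived: stop replicating
        if last_fix is not None and (ts - last_fix["timestamp"]) / 60000 <= time_since_valid_min:
            rows.append({"timestamp": ts,
                         "double_latitude": last_fix["double_latitude"],
                         "double_longitude": last_fix["double_longitude"]})
        last_ts = ts
    return pd.DataFrame(rows)
```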
## BARNETT provider
@ -36,13 +48,13 @@ These features are based on the original open-source implementation by [Barnett
Parameters description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[COMPUTE]`| Set to `True` to extract `PHONE_LOCATIONS` features from the `BARNETT` provider|
|`[FEATURES]` | Features to be computed, see table below
|`[ACCURACY_LIMIT]` | An integer in meters, any location rows with an accuracy higher than this will be dropped. This number means there's a 68% probability the true location is within this radius
|`[TIMEZONE]` | Timezone where the location data was collected. By default points to the one defined in the [Configuration](../../setup/configuration#timezone-of-your-study)
|`[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
|`[ACCURACY_LIMIT]` | An integer in meters, any location rows with an accuracy higher than this is dropped. This number means there's a 68% probability the actual location is within this radius
|`[IF_MULTIPLE_TIMEZONES]` | Currently, `USE_MOST_COMMON` is the only value supported. If the location data for a participant belongs to multiple time zones, we select the most common because Barnett's algorithm can only handle one time zone
|`[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes; the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
@ -50,9 +62,9 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]` adapted from [B
|Feature |Units |Description|
|-------------------------- |---------- |---------------------------|
|hometime |minutes | Time at home. Time spent at home in minutes. Home is the most visited significant location between 8 pm and 8 am including any pauses within a 200-meter radius.
|disttravelled |meters | Total distance travelled over a day (flights).
|rog |meters | The Radius of Gyration (rog) is a measure in meters of the area covered by a person over a day. A centroid is calculated for all the places (pauses) visited during a day and a weighted distance between all the places and that centroid is computed. The weights are proportional to the time spent in each place.
|hometime |minutes | Time at home. Time spent at home in minutes. Home is the most visited significant location between 8 pm and 8 am, including any pauses within a 200-meter radius.
|disttravelled |meters | Total distance traveled over a day (flights).
|rog |meters | The Radius of Gyration (rog) is a measure in meters of the area covered by a person over a day. A centroid is calculated for all the places (pauses) visited during a day, and a weighted distance between all the places and that centroid is computed. The weights are proportional to the time spent in each place.
|maxdiam |meters | The maximum diameter is the largest distance between any two pauses.
|maxhomedist |meters | The maximum distance from home in meters.
|siglocsvisited |locations | The number of significant locations visited during the day. Significant locations are computed using k-means clustering over pauses found in the whole monitoring period. The number of clusters is found by iterating k from 1 to 200, stopping when the centroids of two significant locations are within 400 meters of one another.
@ -61,16 +73,16 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]` adapted from [B
|avgflightdur |seconds | Mean duration of all flights.
|stdflightdur |seconds | The standard deviation of the duration of all flights.
|probpause | - | The fraction of a day spent in a pause (as opposed to a flight)
|siglocentropy |nats | Shannons entropy measurement based on the proportion of time spent at each significant location visited during a day.
|circdnrtn | - | A continuous metric quantifying a persons circadian routine that can take any value between 0 and 1, where 0 represents a daily routine completely different from any other sensed days and 1 a routine the same as every other sensed day.
|siglocentropy |nats | Shannon's entropy measurement is based on the proportion of time spent at each significant location visited during a day.
|circdnrtn | - | A continuous metric quantifying a person's circadian routine that can take any value between 0 and 1, where 0 represents a daily routine completely different from any other sensed days and 1 a routine the same as every other sensed day.
|wkenddayrtn | - | Same as circdnrtn but computed separately for weekends and weekdays.
!!! note "Assumptions/Observations"
**Barnett et al.'s features**
These features are based on a Pause-Flight model. A pause is defined as a mobiity trace (location pings) within a certain duration and distance (by default 300 seconds and 60 meters). A flight is any mobility trace between two pauses. Data is resampled and imputed before the features are computed. See [Barnett et al](../../citation#barnett-locations) for more information. In RAPIDS we only expose two parameters for these features (timezone and accuracy limit). You can change other parameters in `src/features/phone_locations/barnett/library/MobilityFeatures.R`.
These features are based on a Pause-Flight model. A pause is defined as a mobility trace (location pings) within a certain duration and distance (by default, 300 seconds and 60 meters). A flight is any mobility trace between two pauses. Data is resampled and imputed before the features are computed. See [Barnett et al](../../citation#barnett-locations) for more information. In RAPIDS, we only expose one parameter for these features (accuracy limit). You can change other parameters in `src/features/phone_locations/barnett/library/MobilityFeatures.R`.
**Significant Locations**
Significant locations are determined using K-means clustering on pauses longer than 10 minutes. The number of clusters (K) is increased until no two clusters are within 400 meters from each other. After this, pauses within a certain range of a cluster (200 meters by default) will count as a visit to that significant location. This description was adapted from the Supplementary Materials of [Barnett et al](../../citation#barnett-locations).
Significant locations are determined using K-means clustering on pauses longer than 10 minutes. The number of clusters (K) is increased until no two clusters are within 400 meters from each other. After this, pauses within a certain range of a cluster (200 meters by default) count as a visit to that significant location. This description was adapted from the Supplementary Materials of [Barnett et al](../../citation#barnett-locations).
**The Circadian Calculation**
For a detailed description of how this is calculated, see [Canzian et al](../../citation#barnett-locations).
@ -105,13 +117,13 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
| `[DBSCAN_EPS]` | The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
| `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself.
| `[THRESHOLD_STATIC]` | It is the threshold value in km/hr which labels a row as Static or Moving.
| `[MAXIMUM_ROW_GAP]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the the phone was not sensing.
| `[MAXIMUM_ROW_DURATION]` | The time difference between any two consecutive rows `A` and `B` is considered as the time a participant spent in `A`. If this difference is bigger than MAXIMUM_ROW_GAP we will substitute it with `MAXIMUM_ROW_DURATION`.
| `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
| `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations.
| `[MAXIMUM_ROW_GAP]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the phone was not sensing.
| `[MAXIMUM_ROW_DURATION]` | The time difference between any two consecutive rows `A` and `B` is considered as the time a participant spent in `A`. If this difference is bigger than MAXIMUM_ROW_GAP we substitute it with `MAXIMUM_ROW_DURATION`.
| `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes; the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
| `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations.
| `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings).
| `[CLUSTERING_ALGORITHM]` | The original Doryab et al implementation uses `DBSCAN`, `OPTICS` is also available with similar (but not identical) clustering results and lower memory consumption.
| `[RADIUS_FOR_HOME]` | All location coordinates within this distance (meters) from the home location coordinates are considered a home stay (see `timeathome` feature).
| `[CLUSTERING_ALGORITHM]` | The original Doryab et al. implementation uses `DBSCAN`; `OPTICS` is also available with similar (but not identical) clustering results and lower memory consumption.
| `[RADIUS_FOR_HOME]` | All location coordinates within this distance (meters) from the home location coordinates are considered a homestay (see `timeathome` feature).
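A minimal `config.yaml` sketch showing where these keys live; the values below are only illustrative, not necessarily the defaults shipped with RAPIDS.
```yaml
PHONE_LOCATIONS:
  PROVIDERS:
    DORYAB:
      COMPUTE: True
      FEATURES: ["locationvariance", "totaldistance", "radiusgyration", "timeathome"]
      DBSCAN_EPS: 10 # meters
      DBSCAN_MINSAMPLES: 5
      THRESHOLD_STATIC: 1 # km/hr
      MAXIMUM_ROW_GAP: 300 # seconds
      MAXIMUM_ROW_DURATION: 60 # seconds
      MINUTES_DATA_USED: False
      SAMPLING_FREQUENCY: 0 # 0 = infer from the data
      CLUSTER_ON: PARTICIPANT_DATASET # or TIME_SEGMENT
      CLUSTERING_ALGORITHM: DBSCAN # or OPTICS
      RADIUS_FOR_HOME: 100 # meters
```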
Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
@ -120,24 +132,24 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
|-------------------------- |---------- |---------------------------|
|locationvariance |$meters^2$ |The sum of the variances of the latitude and longitude columns.
|loglocationvariance | - | Log of the sum of the variances of the latitude and longitude columns.
|totaldistance |meters |Total distance travelled in a time segment using the haversine formula.
|totaldistance |meters |Total distance traveled in a time segment using the haversine formula.
|averagespeed |km/hr |Average speed in a time segment considering only the instances labeled as Moving.
|varspeed |km/hr |Speed variance in a time segment considering only the instances labeled as Moving.
|{--circadianmovement--} |- | Not suggested for use at the moment, see Observations below. \"It encodes the extent to which a person's location patterns follow a 24-hour circadian cycle.\" [Doryab et al.](../../citation#doryab-locations).
|{--circadianmovement--} |- | Not suggested for use now; see Observations below. \"It encodes the extent to which a person's location patterns follow a 24-hour circadian cycle.\" [Doryab et al.](../../citation#doryab-locations).
|numberofsignificantplaces |places |Number of significant locations visited. It is calculated using the DBSCAN/OPTICS clustering algorithm which takes in EPS and MIN_SAMPLES as parameters to identify clusters. Each cluster is a significant place.
|numberlocationtransitions |transitions |Number of movements between any two clusters in a time segment.
|radiusgyration |meters |Quantifies the area covered by a participant
|timeattop1location |minutes |Time spent at the most significant location.
|timeattop2location |minutes |Time spent at the 2nd most significant location.
|timeattop3location |minutes |Time spent at the 3rd most significant location.
|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labelled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine. These times are computed using timeInSeconds feature.
|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labeled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine. These times are computed using timeInSeconds feature.
|outlierstimepercent | - | Ratio between the time spent in non-significant clusters divided by the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. These times are computed using timeInSeconds feature.
|maxlengthstayatclusters |minutes |Maximum time spent in a cluster (significant location).
|minlengthstayatclusters |minutes |Minimum time spent in a cluster (significant location).
|meanlengthstayatclusters |minutes |Average time spent in a cluster (significant location).
|stdlengthstayatclusters |minutes |Standard deviation of time spent in a cluster (significant location).
|locationentropy |nats |Shannon Entropy computed over the row count of each cluster (significant location), it will be higher the more rows belong to a cluster (i.e. the more time a participant spent at a significant location).
|normalizedlocationentropy |nats |Shannon Entropy computed over the row count of each cluster (significant location) divided by the number of clusters, it will be higher the more rows belong to a cluster (i.e. the more time a participant spent at a significant location).
|locationentropy |nats |Shannon Entropy computed over the row count of each cluster (significant location), it is higher the more rows belong to a cluster (i.e., the more time a participant spent at a significant location).
|normalizedlocationentropy |nats |Shannon Entropy computed over the row count of each cluster (significant location) divided by the number of clusters; it is higher the more rows belong to a cluster (i.e., the more time a participant spent at a significant location).
|timeathome |minutes | Time spent at home (see Observations below for a description on how we compute home).
@ -146,13 +158,13 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
Significant locations are determined using DBSCAN clustering on the locations that a participant visits over the course of the data collection period.
**Circadian Movement Calculation**
Note Feb 3 2021. It seems the implementation of this feature is not correct, we suggest not to use this feature until a fix is in place. For a detailed description of how this should be calculated, see [Saeb et al](https://pubmed.ncbi.nlm.nih.gov/28344895/).
Note Feb 3 2021. It seems the implementation of this feature is not correct; we suggest not to use this feature until a fix is in place. For a detailed description of how this should be calculated, see [Saeb et al](https://pubmed.ncbi.nlm.nih.gov/28344895/).
**Fine Tuning Clustering Parameters**
Based on an experiment where we collected fused location data for 7 days with a mean accuracy of 86 & SD of 350.874635, we determined that `EPS/MAX_EPS`=100 produced closer clustering results to reality. Higher values (>100) missed out some significant places like a short grocery visit while lower values (<100) picked up traffic lights and stop signs while driving as significant locations. We recommend you set `EPS` based on the accuracy of your location data (the more accurate your data is, the lower you should be able to set EPS).
**Fine-Tuning Clustering Parameters**
Based on an experiment where we collected fused location data for 7 days with a mean accuracy of 86 & SD of 350.874635, we determined that `EPS/MAX_EPS`=100 produced closer clustering results to reality. Higher values (>100) missed out on some significant places, like a short grocery visit, while lower values (<100) picked up traffic lights and stop signs while driving as significant locations. We recommend you set `EPS` based on your location data's accuracy (the more accurate your data is, the lower you should be able to set EPS).
**Duration Calculation**
To calculate the time duration component for our features, we compute the difference between the timestamps of consecutive rows to take into account sampling rate variability. If this time difference is larger than a threshold (300 seconds by default) we replace it with a maximum duration (60 seconds by default, i.e. we assume a participant spent at least 60 seconds in their last known location)
To calculate the time duration component for our features, we compute the difference between consecutive rows' timestamps to take into account sampling rate variability. If this time difference is larger than a threshold (300 seconds by default), we replace it with a maximum duration (60 seconds by default, i.e., we assume a participant spent at least 60 seconds in their last known location)
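A minimal sketch of this duration rule, assuming millisecond timestamps and the default thresholds; it is not the exact RAPIDS implementation.
```python
# Illustrative sketch of the duration rule described above.
import pandas as pd

def row_durations(timestamps_ms, maximum_row_gap=300, maximum_row_duration=60):
    """Seconds attributed to each location row: the gap to the next row, except that
    gaps longer than maximum_row_gap are replaced with maximum_row_duration."""
    ts = pd.Series(sorted(timestamps_ms))
    gaps = ts.shift(-1).sub(ts).div(1000)      # seconds until the next row
    gaps = gaps.where(gaps <= maximum_row_gap, maximum_row_duration)
    return gaps.fillna(maximum_row_duration)   # assume a minimum stay at the last known row
```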
**Home location**
Home is calculated using all location data of a participant between 12 am and 6 am, then applying a clustering algorithm (`DB_SCAN` or `OPTICS`), and considering the center of the biggest cluster as the home coordinates for that participant.
Home is calculated using all location data of a participant between 12 am and 6 am, then applying a clustering algorithm (`DB_SCAN` or `OPTICS`) and considering the center of the biggest cluster as the home coordinates for that participant.
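A rough sketch of that home detection, assuming a DataFrame with `local_hour`, `double_latitude`, and `double_longitude` columns and using scikit-learn's DBSCAN with an approximate meters-to-degrees conversion; the parameter values and column names are assumptions, not RAPIDS' exact code.
```python
# Illustrative sketch only; eps and min_samples values are assumptions.
import pandas as pd
from sklearn.cluster import DBSCAN

def home_coordinates(locations, eps_meters=100, min_samples=5):
    night = locations[(locations["local_hour"] >= 0) & (locations["local_hour"] < 6)]
    if night.empty:
        return None
    coords = night[["double_latitude", "double_longitude"]].to_numpy()
    # rough conversion: 1 degree of latitude is about 111,320 meters
    labels = DBSCAN(eps=eps_meters / 111_320, min_samples=min_samples).fit_predict(coords)
    clustered = pd.Series(labels[labels != -1])
    if clustered.empty:
        return None
    biggest = clustered.mode().iloc[0]             # label of the largest cluster
    return coords[labels == biggest].mean(axis=0)  # (latitude, longitude) of its center
```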

View File

@ -0,0 +1,11 @@
# Phone Log
Sensor parameters description for `[PHONE_LOG]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[CONTAINER][ANDROID]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where a data log is stored for Android devices
|`[CONTAINER][IOS]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where a data log is stored for iOS devices
!!! note
No feature providers have been implemented for this sensor yet; however, you can use its key (`PHONE_LOG`) to improve [`PHONE_DATA_YIELD`](../phone-data-yield) or [implement your own features](../add-new-features).
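If you do add this sensor to `[PHONE_DATA_YIELD][SENSORS]`, its `config.yaml` entry might look like the sketch below; the container names are placeholders that you should point to your own tables or files.
```yaml
PHONE_LOG:
  CONTAINER:
    ANDROID: aware_log     # placeholder container name
    IOS: ios_aware_log     # placeholder container name
```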

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_MESSAGES]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the messages data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the messages data is stored
## RAPIDS provider

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_SCREEN]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the screen data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the screen data is stored
## RAPIDS provider
@ -16,7 +16,6 @@ Sensor parameters description for `[PHONE_SCREEN]`:
```bash
- data/raw/{pid}/phone_screen_raw.csv
- data/raw/{pid}/phone_screen_with_datetime.csv
- data/raw/{pid}/phone_screen_with_datetime_unified.csv
- data/interim/{pid}/phone_screen_episodes.csv
- data/interim/{pid}/phone_screen_episodes_resampled.csv
- data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_WIFI_CONNECTED]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the wifi (connected) data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the wifi (connected) data is stored
## RAPIDS provider

View File

@ -4,7 +4,7 @@ Sensor parameters description for `[PHONE_WIFI_VISIBLE]`:
|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|`[TABLE]`| Database table where the wifi (visible) data is stored
|`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the wifi (visible) data is stored
## RAPIDS provider

View File

@ -1,20 +0,0 @@
# File Structure
!!! tip
- Read this page if you want to learn more about how RAPIDS is structured. If you want to start using it go to [Installation](../setup/installation/), then to [Configuration](../setup/configuration/), and then to [Execution](../setup/execution/)
- All paths mentioned in this page are relative to RAPIDS' root folder.
If you want to extract the behavioral features that RAPIDS offers, you will only have to create or modify the [`.env` file](../setup/configuration/#database-credentials), [participants files](../setup/configuration/#participant-files), [time segment files](../setup/configuration/#time-segments), and the `config.yaml` file as instructed in the [Configuration page](../setup/configuration). The `config.yaml` file is the heart of RAPIDS and includes parameters to manage participants, data sources, sensor data, visualizations and more.
All data is saved in `data/`. The `data/external/` folder stores any data imported or created by the user, `data/raw/` stores sensor data as imported from your database, `data/interim/` has intermediate files necessary to compute behavioral features from raw data, and `data/processed/` has all the final files with the behavioral features in folders per participant and sensor.
RAPIDS source code is saved in `src/`. The `src/data/` folder stores scripts to download, clean and pre-process sensor data, `src/features` has scripts to extract behavioral features organized in their respective sensor subfolders , `src/models/` can host any script to create models or statistical analyses with the behavioral features you extract, and `src/visualization/` has scripts to create plots of the raw and processed data. There are other files and folders but only relevant if you are interested in extending RAPIDS (e.g. virtual env files, docs, tests, Dockerfile, the Snakefile, etc.).
In the figure below, we represent the interactions between users and files. After a user modifies the configuration files mentioned above, the `Snakefile` file will search for and execute the Snakemake rules that contain the Python or R scripts necessary to generate or update the required output files (behavioral features, plots, etc.).
<figure>
<img src="../img/files.png" max-width="100%" />
<figcaption>Interaction diagram between the user, and important files in RAPIDS</figcaption>
</figure>

Binary file not shown.

After

Width:  |  Height:  |  Size: 337 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 314 KiB

After

Width:  |  Height:  |  Size: 198 KiB

View File

@ -1,8 +1,10 @@
# Welcome to RAPIDS documentation
Reproducible Analysis Pipeline for Data Streams (RAPIDS) allows you to process smartphone and wearable data to [extract](features/feature-introduction.md) and [create](features/add-new-features.md) **behavioral features** (a.k.a. digital biomarkers), [visualize](visualizations/data-quality-visualizations.md) mobile sensor data and [structure](workflow-examples/analysis.md) your analysis into reproducible workflows.
Reproducible Analysis Pipeline for Data Streams (RAPIDS) allows you to process smartphone and wearable data to [extract](features/feature-introduction.md) and [create](features/add-new-features.md) **behavioral features** (a.k.a. digital biomarkers), [visualize](visualizations/data-quality-visualizations.md) mobile sensor data, and [structure](workflow-examples/analysis.md) your analysis into reproducible workflows.
RAPIDS is open source, documented, modular, tested, and reproducible. At the moment we support smartphone data collected with [AWARE](https://awareframework.com/), wearable data from Fitbit devices, and wearable data from Empatica devices (in collaboration with the [DBDP](https://dbdp.org/)).
RAPIDS is open source, documented, modular, tested, and reproducible. At the moment, we support [data streams](../../datastreams/data-streams-introduction) logged by smartphones, Fitbit wearables, and, in collaboration with the [DBDP](https://dbdp.org/), Empatica wearables (but you can [add your own](../../datastreams/add-new-data-streams) too).
**If you want to know more head over to [Overview](../setup/overview/)**
!!! tip
:material-slack: Questions or feedback can be posted on the \#rapids channel in AWARE Framework\'s [slack](http://awareframework.com:3000/).
@ -11,27 +13,20 @@ RAPIDS is open source, documented, modular, tested, and reproducible. At the mom
:fontawesome-solid-tasks: Join our discussions on our algorithms and assumptions for feature [processing](https://github.com/carissalow/rapids/discussions).
:fontawesome-solid-play: Ready to start? Go to [Installation](setup/installation/), then to [Configuration](setup/configuration/), and then to [Execution](setup/execution/)
:fontawesome-solid-sync-alt: Are you upgrading from RAPIDS `0.4.x` or older? Follow this [guide](migrating-from-old-versions)
:fontawesome-solid-sync-alt: Are you upgrading from RAPIDS [beta](https://rapidspitt.readthedocs.io/en/latest/)? Follow this [guide](migrating-from-old-versions)
## How does it work?
RAPIDS is formed by R and Python scripts orchestrated by [Snakemake](https://snakemake.readthedocs.io/en/stable/). We suggest you read Snakemake's docs but in short: every link in the analysis chain is atomic and has files as input and output. Behavioral features are processed per sensor and per participant.
:fontawesome-solid-play: Ready? Go to [Overview](../setup/overview/).
## What are the benefits of using RAPIDS?
1. **Consistent analysis**. Every participant sensor dataset is analyzed in the exact same way and isolated from each other.
2. **Efficient analysis**. Every analysis step is executed only once. Whenever your data or configuration changes only the affected files are updated.
1. **Consistent analysis**. Every participant sensor dataset is analyzed in the same way and isolated from each other.
2. **Efficient analysis**. Every analysis step is executed only once. Whenever your data or configuration changes, only the affected files are updated.
5. **Parallel execution**. Thanks to Snakemake, your analysis can be executed over multiple cores without changing your code.
6. **Code-free features**. Extract any of the behavioral features offered by RAPIDS without writing any code.
7. **Extensible code**. You can easily add your own behavioral features in R or Python, share them with the community, and keep authorship and citations.
8. **Timezone aware**. Your data is adjusted to the specified timezone (multiple timezones suport *coming soon*).
9. **Flexible time segments**. You can extract behavioral features on time windows of any length (e.g. 5 minutes, 3 hours, 2 days), on every day or particular days (e.g. weekends, Mondays, the 1st of each month, etc.) or around events of interest (e.g. surveys or clinical relapses).
10. **Tested code**. We are constantly adding tests to make sure our behavioral features are correct.
11. **Reproducible code**. If you structure your analysis within RAPIDS, you can be sure your code will run in other computers as intended thanks to R and Python virtual environments. You can share your analysis code along your publications without any overhead.
7. **Extensible code**. You can easily add your own data streams or behavioral features in R or Python, share them with the community, and keep authorship and citations.
8. **Timezone aware**. Your data is adjusted to one or more time zones per participant.
9. **Flexible time segments**. You can extract behavioral features on time windows of any length (e.g., 5 minutes, 3 hours, 2 days), on every day or particular days (e.g., weekends, Mondays, the 1st of each month, etc.), or around events of interest (e.g., surveys or clinical relapses).
10. **Tested code**. We are continually adding tests to make sure our behavioral features are correct.
11. **Reproducible code**. If you structure your analysis within RAPIDS, you can be sure your code will run in other computers as intended, thanks to R and Python virtual environments. You can share your analysis code along with your publications without any overhead.
12. **Private**. All your data is processed locally.
## How is it organized?
In broad terms the `config.yaml`, [`.env` file](setup/configuration/#database-credentials), [participants files](setup/configuration/#participant-files), and [time segment files](setup/configuration/#time-segments) are the only ones that you will have to modify. All data is stored in `data/` and all scripts are stored in `src/`. For more information see RAPIDS' [File Structure](file-structure.md).

View File

@ -1,19 +1,87 @@
# Migrating from RAPIDS beta
# Migration guides
## Migrating from RAPIDS 0.4.x or older
There are four actions that you need to take if you were using RAPIDS `0.4.3` or older ([before Feb 9th, 2021](https://github.com/carissalow/rapids/releases/tag/v0.4.3)):
??? check "Check the new Overview page"
Check the new [Overview](../setup/overview/) page. Hopefully, it is a better overview of RAPIDS and provides answers to Frequently Asked Questions.
??? check "Deploy RAPIDS in a new folder"
- Clone RAPIDS 1.x in a new folder (do not pull the updates in your current folder)
- Install renv again `snakemake -j1 renv_install` (for Ubuntu, take advantage of the [platform-specific R `renv` instructions](../setup/installation))
- Restore renv packages `snakemake -j1 renv_restore` (for Ubuntu, take advantage of the [platform-specific R `renv` instructions](../setup/installation))
- Activate your conda environment
- Move your participant files `pxx.yaml` to the new folder
- Move your time segment files to the new folder
- Move your `.env` file to the new folder
??? check "Migrate your `.env` file to the new `credentials.yaml` format"
The `.env` file is not used anymore; the same credential groups are now stored in `credentials.yaml`. Migrate your `.env` file by running:
```bash
python tools/update_format_env.py
```
??? check "Reconfigure your `config.yaml`"
Reconfigure your `config.yaml` file by hand (don't copy and paste the old one). Some keys and values changed, but the defaults should be compatible with what you know from RAPIDS 0.x (see below).
The most relevant changes to RAPIDS that you need to know about are:
??? danger "We introduced the concept of data streams"
RAPIDS abstracts sensor data logged by different devices and platforms and stored in different data containers as [data streams](../datastreams/data-streams-introduction/).
The default data stream for `PHONE` is [`aware_mysql`](../datastreams/aware-mysql/), and the default for `FITBIT` is [`fitbitjson_mysql`](../datastreams/fitbitjson-mysql/). This is compatible with the old functionality (AWARE and JSON Fitbit data stored in MySQL). These values are set in `[PHONE_DATA_STREAMS][USE]` and `[FITBIT_DATA_STREAMS][USE]`.
You can [add new data stream](../datastreams/add-new-data-streams/) formats (sensing apps) and containers (database engines, file types, etc.).
If you were processing your Fitbit data either in JSON or plain text (parsed) format, and it was stored in MySQL or CSV files, the changes that you made to your raw data will be compatible. Just choose [`fitbitjson_mysql`](../datastreams/fitbitjson-mysql/), [`fitbitparsed_mysql`](../datastreams/fitbitparsed-mysql/), [`fitbitjson_csv`](../datastreams/fitbitjson-csv/), [`fitbitparsed_csv`](../datastreams/fitbitparsed-csv/) accordingly and set it in `[FITBIT_DATA_STREAMS][USE]`.
In the future, you will not have to change your raw data; you will be able to just change column mappings/values in the data stream's `format.yaml` file.
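For example, a `config.yaml` sketch selecting a parsed CSV Fitbit stream (the label is one of the streams listed above):
```yaml
FITBIT_DATA_STREAMS:
  USE: fitbitparsed_csv   # or fitbitjson_mysql, fitbitparsed_mysql, fitbitjson_csv
```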
??? danger "We introduced multiple time zones"
You can now process data from participants that visited multiple time zones. The default is still a single time zone (America/New_York). See how to handle [multiple time zones](../setup/configuration/#multiple-timezones)
??? danger "The keyword `multiple` is now `infer`"
When processing data from smartphones, RAPIDS allows you to [infer](../setup/configuration/#participant-files) the OS of a smartphone by using the keyword `multiple` in the `[PLATFORM]` key of participant files. Now RAPIDS uses `infer` instead of `multiple`. Nonetheless, `multiple` still works for backward compatibility.
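A participant-file sketch using the new keyword (the device id, label, and dates are illustrative):
```yaml
PHONE:
  DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]
  PLATFORMS: [infer]   # RAPIDS infers android/ios from the data stream
  LABEL: test01
  START_DATE: 2020-04-23
  END_DATE: 2020-10-28
```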
??? danger "A global `DATABASE_GROUP` does not exist anymore"
There is no global `DATABASE_GROUP` anymore. Each data stream that needs credentials to connect to a database has its own [`DATABASE_GROUP` config key](../setup/configuration/#data-stream-configuration). The groups are defined in `credentials.yaml` instead of the `.env`.
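A sketch of a `credentials.yaml` group mirroring the fields of the old `.env` format (the group name and values are placeholders):
```yaml
MY_GROUP:
  database: MY_DATABASE
  host: MY_HOST
  password: MY_PASSWORD
  port: 3306
  user: MY_USER
```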
??? danger "`[DEVICE_SENSOR][TABLE]` is now `[DEVICE_SENSOR][CONTAINER]`"
We renamed the keys `[DEVICE_SENSOR][TABLE]` to `[DEVICE_SENSOR][CONTAINER]` to reflect that, with the introduction of data streams, they can point to a database table, file, or any other data container.
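For example (the container name `messages` is only illustrative):
```yaml
# RAPIDS 0.4.x and older
PHONE_MESSAGES:
  TABLE: messages
# RAPIDS 1.x
PHONE_MESSAGES:
  CONTAINER: messages
```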
??? danger "Creating participant files from the AWARE_DEVICE_TABLE is deprecated"
In previous versions of RAPIDS, you could create participant files automatically using the `aware_device` table. We deprecated this option, but you can still achieve the same results if you export the output of the following SQL query as a CSV file and follow the instructions to [create participant files from CSV files](../setup/configuration/#automatic-creation-of-participant-files):
```sql
SELECT device_id, device_id as fitbit_id, CONCAT("p", _id) as empatica_id, CONCAT("p", _id) as pid, if(brand = "iPhone", "ios", "android") as platform, CONCAT("p", _id) as label, DATE_FORMAT(FROM_UNIXTIME((timestamp/1000)- 86400), "%Y-%m-%d") as start_date, CURRENT_DATE as end_date from aware_device order by _id;
```
??? danger "`SCR_SCRIPT` and `SRC_LANGUAGE` are replaced by `SRC_SCRIPT`"
The attributes `SCR_SCRIPT` and `SRC_LANGUAGE` of every sensor `PROVIDER` are replaced by `SRC_SCRIPT`. `SRC_SCRIPT` is a relative path from the RAPIDS root folder to that provider's feature script. We did this to simplify and clarify where the feature scripts are stored.
There are no actions to take unless you created your own feature provider; update it with your feature script path.
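A sketch of what a provider entry looks like with the new key (the script path is illustrative):
```yaml
PHONE_MESSAGES:
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      SRC_SCRIPT: src/features/phone_messages/rapids/main.py
```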
## Migrating from RAPIDS beta
If you were relying on the [old docs](https://rapidspitt.readthedocs.io/en/latest/) and the most recent version of RAPIDS you are working with is from or before [Oct 13, 2020](https://github.com/carissalow/rapids/commit/640890c7b49492d150accff5c87b1eb25bd97a49), you are using the beta version of RAPIDS.
You can start using the new RAPIDS (we are starting with `v0.1.0`) right away, just take into account the following:
You can start using RAPIDS `0.1.0` right away; just take into account the following:
1. [Install](setup/installation.md) a new copy of RAPIDS (the R and Python virtual environments didn't change so the cached versions will be reused)
1. Make sure you don't skip a new Installation step to give execution permissions to the RAPIDS script: `chmod +x rapids`
2. Follow the new [Configuration](setup/configuration.md) guide.
1. You can copy and paste your old `.env` file
2. You can migrate your old participant files:
```
??? check "Deploy RAPIDS in a new folder"
- [Install](setup/installation.md) a new copy of RAPIDS (the R and Python virtual environments didn't change so the cached versions will be reused)
- Make sure you don't skip a new Installation step to give execution permissions to the RAPIDS script: `chmod +x rapids`
- Move your old `.env` file
- Move your participant files
??? check "Migrate your participant files"
You can migrate your old participant files to the new YAML format:
```bash
python tools/update_format_participant_files.py
```
3. Get familiar with the new way of [Executing](setup/execution.md) RAPIDS
3. You can proceed to reconfigure your `config.yaml`; its structure is more consistent and should be familiar to you.
!!! info
If you have any questions, reach out to us on [Slack](http://awareframework.com:3000/).
??? check "Follow the new Configuration guide"
Follow the new [Configuration](https://www.rapids.science/0.1/setup/configuration/) guide
??? check "Learn more about the new way to run RAPIDS"
Get familiar with the new way of [Executing](https://www.rapids.science/0.1/setup/execution) RAPIDS

View File

@ -0,0 +1,6 @@
{% extends "base.html" %}
{% block disqus %}
<!-- Add custom comment system integration here -->
{% include "partials/integrations/utterances.html" %}
{% endblock %}

View File

@ -0,0 +1,38 @@
<!-- Utterances integration -->
{% if not page.is_homepage %}
<h2 id="__comments">{{ lang.t("meta.comments") }}</h2>
<script type="text/javascript">
var rapids_utterances_theme = false
document.onreadystatechange = function () {
if (document.readyState == "interactive") {
// wait for utterances to load and send its first message.
addEventListener('message', event => {
if (event.origin !== 'https://utteranc.es' || rapids_utterances_theme == true) {
return;
}
rapids_utterances_theme = true
if(document.body.getAttribute("data-md-color-scheme") == "default")
document.querySelector("iframe.utterances-frame").contentWindow.postMessage({ type: "set-theme", theme: "github-light" },"https://utteranc.es/")
else
document.querySelector("iframe.utterances-frame").contentWindow.postMessage({ type: "set-theme", theme: "photon-dark" },"https://utteranc.es/")
});
document.getElementById('__palette_1').onclick = function(){
document.querySelector("iframe.utterances-frame").contentWindow.postMessage({ type: "set-theme", theme: "github-light" },"https://utteranc.es/")
}
document.getElementById('__palette_2').onclick = function(){
document.querySelector("iframe.utterances-frame").contentWindow.postMessage({ type: "set-theme", theme: "photon-dark" },"https://utteranc.es/")
}
}
}
</script>
<script src="https://utteranc.es/client.js"
repo="carissalow/rapids"
issue-term="pathname"
label="docs comments"
theme="github-light"
crossorigin="anonymous"
async>
</script>
{% endif %}

View File

@ -3,11 +3,11 @@
You need to follow these steps to configure your RAPIDS deployment before you can extract behavioral features
1. Add your [database credentials](#database-credentials)
2. Choose the [timezone of your study](#timezone-of-your-study)
0. Verify RAPIDS can process your [data streams](#supported-data-streams)
3. Create your [participants files](#participant-files)
4. Select what [time segments](#time-segments) you want to extract features on
5. Modify your [device data source configuration](#device-data-source-configuration)
2. Choose the [timezone of your study](#timezone-of-your-study)
5. Configure your [data streams](#data-stream-configuration)
6. Select what [sensors and features](#sensor-and-features-to-process) you want to process
When you are done with this configuration, go to [executing RAPIDS](../execution).
@ -16,59 +16,12 @@ When you are done with this configuration, go to [executing RAPIDS](../execution
Every time you see `config["KEY"]` or `[KEY]` in these docs we are referring to the corresponding key in the `config.yaml` file.
---
## Database credentials
Only follow this step if you are processing smartphone or Fitbit data stored in a database. For reference, we list below the data sources RAPIDS support for each type of device.
## Supported data streams
1. Create an empty file called `#!bash .env` in your RAPIDS root directory
2. Add the following lines and replace your database-specific credentials (user, password, host, and database):
A data stream refers to sensor data collected using a specific type of **device** with a specific **format** and stored in a specific **container**. For example, the `aware_mysql` data stream handles smartphone data (**device**) collected with the [AWARE Framework](https://awareframework.com/) (**format**) stored in a MySQL database (**container**).
``` yaml
[MY_GROUP]
user=MY_USER
password=MY_PASSWORD
host=MY_HOST
port=3306
database=MY_DATABASE
```
??? warning "What is `[MY_GROUP]`?"
The label `[MY_GROUP]` is arbitrary but it has to match the following `config.yaml` key:
```yaml
DATABASE_GROUP: &database_group
MY_GROUP
```
??? hint "Connecting to localhost (host machine) from inside our docker container"
If you are using RAPIDS' docker container and Docker-for-mac or Docker-for-Windows 18.03+, you can connect to a MySQL database in your host machine using `host.docker.internal` instead of `127.0.0.1` or `localhost`. In a Linux host you need to run our docker container using `docker run --network="host" -d moshiresearch/rapids:latest` and then `127.0.0.1` will point to your host machine.
??? hint "Data sources supported for each device type"
| Device | Database | CSV Files | Zip files
|--|--|--|--|
| Smartphone| Yes (MySQL) | No | No |
| Fitbit| Yes (MySQL) | Yes | No |
| Empatica| No | No | Yes |
- RAPIDS only supports MySQL/MariaDB databases. If you would like to add support for a different database engine get in touch and we can discuss how to implement it.
- Fitbit data can be processed as the JSON object produced by Fitbit's API (recommended) or in a parsed tabular fashion.
- Empatica devices produce a zip file with a CSV file per sensor which can be processed directly in RAPIDS.
---
## Timezone of your study
### Single timezone
If your study only happened in a single time zone, select the appropriate code form this [list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) and change the following config key. Double check your timezone code pick, for example US Eastern Time is `America/New_York` not `EST`
``` yaml
TIMEZONE: &timezone
America/New_York
```
### Multiple timezones
Support coming soon.
Check the table in [introduction to data streams](../../datastreams/data-streams-introduction) to know what data streams we support. If your data stream is supported, continue to the next configuration section; **you will use its label later in this guide** (e.g. `aware_mysql`). If your stream is not supported but you want to implement it, follow the tutorial to [add support for new data streams](../../datastreams/add-new-data-streams) and get in touch by email or in Slack if you have any questions.
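For example, if your smartphone data is stored in a MySQL database populated by AWARE, your `config.yaml` would point to that stream's label (shown here as a sketch):
```yaml
PHONE_DATA_STREAMS:
  USE: aware_mysql   # the data stream label you verified above
```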
---
@ -109,7 +62,8 @@ Participant files link together multiple devices (smartphones and wearables) to
LABEL: test01
START_DATE: 2020-04-23
END_DATE: 2020-10-28
EMPATICA: # Empatica doesn't have a device_id because the devices produce zip files per participant
EMPATICA:
DEVICE_IDS: [empatica1]
LABEL: test01
START_DATE: 2020-04-23
END_DATE: 2020-10-28
@ -120,107 +74,89 @@ Participant files link together multiple devices (smartphones and wearables) to
| Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `[DEVICE_IDS]` | An array of the strings that uniquely identify each smartphone. You can have more than one for when participants changed phones in the middle of the study; in this case, data from all their devices will be joined and relabeled with the last one on this list. |
| `[PLATFORMS]` | An array that specifies the OS of each smartphone in `[DEVICE_IDS]` , use a combination of `android` or `ios` (we support participants that changed platforms in the middle of your study!). If you have an `aware_device` table in your database you can set `[PLATFORMS]: [multiple]` and RAPIDS will infer them automatically. |
| `[LABEL]` | A string that is used in reports and visualizations. |
| `[START_DATE]` | A string with format `YYY-MM-DD` . Only data collected *after* this date will be included in the analysis |
| `[END_DATE]` | A string with format `YYY-MM-DD` . Only data collected *before* this date will be included in the analysis |
| `[PLATFORMS]` | An array that specifies the OS of each smartphone in `[DEVICE_IDS]`; use a combination of `android` or `ios` (we support participants that changed platforms in the middle of your study!). You can set `[PLATFORMS]: [infer]` and RAPIDS will infer them automatically (each phone data stream infers this differently, e.g. `aware_mysql` uses the `aware_device` table). |
| `[LABEL]` | A string that is used in reports and visualizations. |
| `[START_DATE]` | A string with format `YYYY-MM-DD` or `YYYY-MM-DD HH:MM:SS`. Only data collected *after* this date time will be included in the analysis. By default, `YYYY-MM-DD` is interpreted as `YYYY-MM-DD 00:00:00`. |
| `[END_DATE]` | A string with format `YYYY-MM-DD` or `YYYY-MM-DD HH:MM:SS`. Only data collected *before* this date time will be included in the analysis. By default, `YYYY-MM-DD` is interpreted as `YYYY-MM-DD 00:00:00`. |
=== "[FITBIT]"
| Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|------------------|-----------------------------------------------------------------------------------------------------------|
| `[DEVICE_IDS]` | An array of the strings that uniquely identify each Fitbit. You can have more than one in case the participant changed devices in the middle of the study; in this case, data from all devices will be joined and relabeled with the last `device_id` on this list. |
| `[LABEL]` | A string that is used in reports and visualizations. |
| `[START_DATE]` | A string with format `YYY-MM-DD` . Only data collected *after* this date will be included in the analysis |
| `[END_DATE]` | A string with format `YYY-MM-DD` . Only data collected *before* this date will be included in the analysis |
| `[LABEL]` | A string that is used in reports and visualizations. |
| `[START_DATE]` | A string with format `YYYY-MM-DD` or `YYYY-MM-DD HH:MM:SS`. Only data collected *after* this date time will be included in the analysis. By default, `YYYY-MM-DD` is interpreted as `YYYY-MM-DD 00:00:00`. |
| `[END_DATE]` | A string with format `YYYY-MM-DD` or `YYYY-MM-DD HH:MM:SS`. Only data collected *before* this date time will be included in the analysis. By default, `YYYY-MM-DD` is interpreted as `YYYY-MM-DD 00:00:00`. |
=== "[EMPATICA]"
| Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|------------------|-----------------------------------------------------------------------------------------------------------|
| `[LABEL]` | A string that is used in reports and visualizations. |
| `[START_DATE]` | A string with format `YYY-MM-DD` . Only data collected *after* this date will be included in the analysis |
| `[END_DATE]` | A string with format `YYY-MM-DD` . Only data collected *before* this date will be included in the analysis
| `[DEVICE_IDS]` | An array of the strings that uniquely identify each Empatica device used by this participant. Since the most common use case involves having multiple zip files from a single device for each person, set this device id to an arbitrary string (we usually use their `pid`) |
| `[LABEL]` | A string that is used in reports and visualizations. |
| `[START_DATE]` | A string with format `YYYY-MM-DD` or `YYYY-MM-DD HH:MM:SS`. Only data collected *after* this date time will be included in the analysis. By default, `YYYY-MM-DD` is interpreted as `YYYY-MM-DD 00:00:00`. |
| `[END_DATE]` | A string with format `YYYY-MM-DD` or `YYYY-MM-DD HH:MM:SS`. Only data collected *before* this date time will be included in the analysis. By default, `YYYY-MM-DD` is interpreted as `YYYY-MM-DD 00:00:00`. |
### Automatic creation of participant files
You have two options a) use the `aware_device` table in your database or b) use a CSV file. In either case, in your `config.yaml`, set the devices (`PHONE`, `FITBIT`, `EMPATICA`) `[ADD]` flag to `TRUE` depending on what devices you used in your study. Set `[DEVICE_ID_COLUMN]` to the name of the column that uniquely identifies each device (only for `PHONE` and `FITBIT`).
You can use a CSV file with a row per participant to automatically create participant files.
=== "aware_device table"
??? "`AWARE_DEVICE_TABLE` was deprecated"
In previous versions of RAPIDS, you could create participant files automatically using the `aware_device` table. We deprecated this option but you can still achieve the same results if you export the output of the following SQL query as a CSV file and follow the instructions below:
Set the following keys in your `config.yaml`
```yaml
CREATE_PARTICIPANT_FILES:
SOURCE:
TYPE: AWARE_DEVICE_TABLE
DATABASE_GROUP: *database_group
CSV_FILE_PATH: ""
TIMEZONE: *timezone
PHONE_SECTION:
ADD: TRUE # or FALSE
DEVICE_ID_COLUMN: device_id # column name
IGNORED_DEVICE_IDS: []
FITBIT_SECTION:
ADD: FALSE # or FALSE
DEVICE_ID_COLUMN: fitbit_id # column name
IGNORED_DEVICE_IDS: []
EMPATICA_SECTION: # Empatica doesn't have a device_id column because the devices produce zip files per participant
ADD: FALSE # or FALSE
```sql
SELECT device_id, device_id as fitbit_id, CONCAT("p", _id) as empatica_id, CONCAT("p", _id) as pid, if(brand = "iPhone", "ios", "android") as platform, CONCAT("p", _id) as label, DATE_FORMAT(FROM_UNIXTIME((timestamp/1000)- 86400), "%Y-%m-%d") as start_date, CURRENT_DATE as end_date from aware_device order by _id;
```
Then run
In your `config.yaml`:
```bash
snakemake -j1 create_participants_files
1. Set `CSV_FILE_PATH` to a CSV file path that complies with the specs described below
2. Set the devices (`PHONE`, `FITBIT`, `EMPATICA`) `[ADD]` flag to `TRUE` depending on what devices you used in your study.
3. Set `[DEVICE_ID_COLUMN]` to the column's name in your CSV file that uniquely identifies each device.
```yaml
CREATE_PARTICIPANT_FILES:
CSV_FILE_PATH: "your_path/to_your.csv"
PHONE_SECTION:
ADD: TRUE # or FALSE
DEVICE_ID_COLUMN: device_id # column name
IGNORED_DEVICE_IDS: []
FITBIT_SECTION:
ADD: FALSE # or FALSE
DEVICE_ID_COLUMN: fitbit_id # column name
IGNORED_DEVICE_IDS: []
EMPATICA_SECTION:
ADD: FALSE
DEVICE_ID_COLUMN: empatica_id # column name
IGNORED_DEVICE_IDS: []
```
Your CSV file (`[CSV_FILE_PATH]`) should have the following columns (headers) but the values within each column can be empty:
| Column | Description |
|------------------|-----------------------------------------------------------------------------------------------------------|
| phone device id | The name of this column has to match `[PHONE_SECTION][DEVICE_ID_COLUMN]`. Separate multiple ids with `;` |
| fitbit device id | The name of this column has to match `[FITBIT_SECTION][DEVICE_ID_COLUMN]`. Separate multiple ids with `;` |
| empatica device id | The name of this column has to match `[EMPATICA_SECTION][DEVICE_ID_COLUMN]`. Since the most common use case involves having multiple zip files from a single device for each person, set this device id to an arbitrary string (we usually use their `pid`) |
| pid | Unique identifiers with the format pXXX (your participant files will be named with this string) |
| platform | Use `android`, `ios` or `infer` as explained above, separate values with `;` |
| label | A human-readable string that is used in reports and visualizations. |
| start_date | A string with format `YYYY-MM-DD`. |
| end_date | A string with format `YYYY-MM-DD`. |
!!! example
We added white spaces to this example to make it easy to read, but you don't have to.
```csv
device_id ,fitbit_id, empatica_id ,pid ,label ,platform ,start_date ,end_date
a748ee1a-1d0b-4ae9-9074-279a2b6ba524;dsadas-2324-fgsf-sdwr-gdfgs4rfsdf43 ,fitbit1 , p01 ,p01 ,julio ,android;ios ,2020-01-01 ,2021-01-01
4c4cf7a1-0340-44bc-be0f-d5053bf7390c ,fitbit2 , p02 ,p02 ,meng ,ios ,2021-01-01 ,2022-01-01
```
=== "CSV file"
Then run
Set the following keys in your `config.yaml`.
```yaml
CREATE_PARTICIPANT_FILES:
SOURCE:
TYPE: CSV_FILE
DATABASE_GROUP: ""
CSV_FILE_PATH: "your_path/to_your.csv"
TIMEZONE: *timezone
PHONE_SECTION:
ADD: TRUE # or FALSE
DEVICE_ID_COLUMN: device_id # column name
IGNORED_DEVICE_IDS: []
FITBIT_SECTION:
ADD: FALSE # or FALSE
DEVICE_ID_COLUMN: fitbit_id # column name
IGNORED_DEVICE_IDS: []
EMPATICA_SECTION: # Empatica doesn't have a device_id column because the devices produce zip files per participant
ADD: FALSE # or FALSE
```
Your CSV file (`[SOURCE][CSV_FILE_PATH]`) should have the following columns but you can omit any values you don't have on each column:
| Column | Description |
|------------------|-----------------------------------------------------------------------------------------------------------|
| phone device id | The name of this column has to match `[PHONE_SECTION][DEVICE_ID_COLUMN]`. Separate multiple ids with `;` |
| fitbit device id | The name of this column has to match `[FITBIT_SECTION][DEVICE_ID_COLUMN]`. Separate multiple ids with `;` |
| pid | Unique identifiers with the format pXXX (your participant files will be named with this string |
| platform | Use `android`, `ios` or `multiple` as explained above, separate values with `;` |
| label | A human readable string that is used in reports and visualizations. |
| start_date | A string with format `YYYY-MM-DD`. |
| end_date | A string with format `YYYY-MM-DD`. |
!!! example
```csv
device_id,pid,label,platform,start_date,end_date,fitbit_id
a748ee1a-1d0b-4ae9-9074-279a2b6ba524;dsadas-2324-fgsf-sdwr-gdfgs4rfsdf43,p01,julio,android;ios,2020-01-01,2021-01-01,fitbit1
4c4cf7a1-0340-44bc-be0f-d5053bf7390c,p02,meng,ios,2021-01-01,2022-01-01,fitbit2
```
Then run
```bash
snakemake -j1 create_participants_files
```
---
@ -394,103 +330,256 @@ Time segments (or epochs) are the time windows on which you want to extract beha
survey2,1584291600000,2H,1H,-1,klj34oi2-8frk-2343-21kk-324ljklewlr3
```
---
## Device Data Source Configuration
You might need to modify the following config keys in your `config.yaml` depending on what devices your participants used and where you are storing your data (ignore the sections of devices you did not use).
## Timezone of your study
### Single timezone
If your study only happened in a single time zone, or you want to ignore short trips of your participants to different time zones, select the appropriate code from this [list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) and change the following config key. Double-check your time zone code; for example, US Eastern Time is `America/New_York`, not `EST`.
``` yaml
TIMEZONE:
TYPE: SINGLE
TZCODE: America/New_York
```
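As a quick way to double-check a code before putting it in `TZCODE`, you can look it up with Python's built-in `zoneinfo` module (Python 3.9+). This is only a convenience check outside of RAPIDS:
```python
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError, available_timezones

code = "America/New_York"  # the value you plan to use for TZCODE
try:
    ZoneInfo(code)  # raises ZoneInfoNotFoundError if the code is not in the tz database
    print(code, "is a valid tz database code")
except ZoneInfoNotFoundError:
    print(code, "is not a valid tz database code")

# You can also search the full list, for example by city name
print([tz for tz in available_timezones() if "New_York" in tz])
```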
### Multiple timezones
If your participants lived in different time zones, or they travelled across time zones, and you know when participants' devices were in a specific time zone, RAPIDS can use this data to process your data streams with the correct date-time. You need to provide RAPIDS with the time zone data in a CSV file (`[TZCODES_FILE]`) in the format described below.
``` yaml
TIMEZONE:
TYPE: MULTIPLE
SINGLE:
TZCODE: America/New_York
MULTIPLE:
TZCODES_FILE: path_to/time_zones_csv.file
IF_MISSING_TZCODE: STOP
DEFAULT_TZCODE: America/New_York
FITBIT:
ALLOW_MULTIPLE_TZ_PER_DEVICE: False
INFER_FROM_SMARTPHONE_TZ: False
```
Parameters for `[TIMEZONE]`
|Parameter &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Description |
|--|--|
|`[TYPE]`| Either `SINGLE` or `MULTIPLE` as explained above |
|`[SINGLE][TZCODE]`| The time zone code from this [list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) to be used across all devices |
|`[MULTIPLE][TZCODES_FILE]`| A CSV file containing the time zone codes (from this [list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)) visited by each device in the study and when each device entered them. Multiple devices can be linked to the same person; read more in [Participant Files](#participant-files) |
|`[MULTIPLE][IF_MISSING_TZCODE]`| What to do when a device is missing from `[TZCODES_FILE]`. Set this flag to `STOP` to stop RAPIDS' execution and show an error, or to `USE_DEFAULT` to assign the time zone specified in `[DEFAULT_TZCODE]` to any such devices |
|`[MULTIPLE][FITBIT][ALLOW_MULTIPLE_TZ_PER_DEVICE]`| You only need to care about this flag if one or more Fitbit devices sensed data in one or more time zones, and you want RAPIDS to take into account this in its feature computation. Read more in "How does RAPIDS handle Fitbit devices?" below. |
|`[MULTIPLE][FITBIT][INFER_FROM_SMARTPHONE_TZ]`| You only need to care about this flag if one or more Fitbit devices sensed data in one or more time zones, and you want RAPIDS to take into account this in its feature computation. Read more in "How does RAPIDS handle Fitbit devices?" below. |
??? info "Format of `TZCODES_FILE`"
`TZCODES_FILE` has three columns and a row for each time zone a device visited (a device can be a smartphone or a wearable such as a Fitbit or Empatica):
|Column | Description |
|--|--|
|`device_id`|A string that uniquely identifies a smartphone or wearable|
|`tzcode`| A string with the appropriate code from this [list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) that represents the time zone where the `device` sensed data|
|`timestamp`| A UNIX timestamp indicating the first time this `device_id` sensed data in `tzcode`|
```csv
device_id, tzcode, timestamp
13dbc8a3-dae3-4834-823a-4bc96a7d459d, America/New_York, 1587500000000
13dbc8a3-dae3-4834-823a-4bc96a7d459d, America/Mexico_City, 1587600000000
13dbc8a3-dae3-4834-823a-4bc96a7d459d, America/Los_Angeles, 1587700000000
65sa66a5-2d2d-4524-946v-44ascbv4sad7, Europe/Amsterdam, 1587100000000
65sa66a5-2d2d-4524-946v-44ascbv4sad7, Europe/Berlin, 1587200000000
65sa66a5-2d2d-4524-946v-44ascbv4sad7, Europe/Amsterdam, 1587300000000
```
Using this file, RAPIDS will create time zone intervals per device, for example for `13dbc8a3-dae3-4834-823a-4bc96a7d459d`:
- Interval 1 `[1587500000000, 1587599999999]` for `America/New_York`
- Interval 2 `[1587600000000, 1587699999999]` for `America/Mexico_City`
- Interval 3 `[1587700000000, now]` for `America/Los_Angeles`
Any sensor data row from a device will be assigned a time zone if it falls within one of these intervals, for example (see the sketch after these examples):
- A screen row sensed at `1587533333333` will be assigned to `America/New_York` because it falls within Interval 1
- A screen row sensed at `1587400000000` will be discarded because it was logged outside any interval.
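As a rough illustration of the interval logic above (not RAPIDS' actual implementation), the following Python sketch assigns a tz code to a sensor row using the `TZCODES_FILE` rows of one device:
```python
import bisect

# Time zone changes for device 13dbc8a3-..., taken from the TZCODES_FILE example above,
# sorted by the timestamp when the device entered each zone.
tz_changes = [
    (1587500000000, "America/New_York"),
    (1587600000000, "America/Mexico_City"),
    (1587700000000, "America/Los_Angeles"),
]

def tzcode_for(timestamp_ms):
    """Return the tz code whose interval contains timestamp_ms, or None if the row
    was logged before the first known time zone (RAPIDS discards those rows)."""
    starts = [start for start, _ in tz_changes]
    i = bisect.bisect_right(starts, timestamp_ms) - 1
    return None if i < 0 else tz_changes[i][1]

print(tzcode_for(1587533333333))  # America/New_York (falls within Interval 1)
print(tzcode_for(1587400000000))  # None (outside any interval, would be discarded)
```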
??? note "What happens if participant X lives in Los Angeles but participant Y lives in Amsterdam and they both stayed there during my study?"
Add a row per participant and set timestamp to `0`:
```csv
device_id, tzcode, timestamp
13dbc8a3-dae3-4834-823a-4bc96a7d459d, America/Los_Angeles, 0
65sa66a5-2d2d-4524-946v-44ascbv4sad7, Europe/Amsterdam, 0
```
??? note "What happens if I forget to add a timezone for one or more devices?"
It depends on `[IF_MISSING_TZCODE]`.
If `[IF_MISSING_TZCODE]` is set to `STOP`, RAPIDS will stop its execution and show you an error message.
If `[IF_MISSING_TZCODE]` is set to `USE_DEFAULT`, it will assign the time zone specified in `[DEFAULT_TZCODE]` to any devices with missing time zone information in `[TZCODES_FILE]`. This is helpful if only a few of your participants had multiple timezones and you don't want to specify the same time zone for the rest.
??? note "How does RAPIDS handle Fitbit devices?"
Fitbit devices are not time zone aware and they always log data with a local date-time string.
- When none of the Fitbit devices in your study changed time zones (e.g., `p01` was always in New York and `p02` was always in Amsterdam), you can set a single time zone per Fitbit device id along with a timestamp 0 (you can still assign multiple time zones to smartphone device ids)
```csv
device_id, tzcode, timestamp
fitbit123, America/New_York, 0
fitbit999, Europe/Amsterdam, 0
```
- On the other hand, when at least one of your Fitbit devices changed time zones **AND** you want RAPIDS to take into account these changes, you need to set `[ALLOW_MULTIPLE_TZ_PER_DEVICE]` to `True`. **You have to manually allow this option because you need to be aware it can produce inaccurate features around the times when time zones changed**. This is because we cannot know exactly when the Fitbit device detected and processed the time zone change.
If you want to `ALLOW_MULTIPLE_TZ_PER_DEVICE`, you will need to add any time zone changes per device to the `TZCODES_FILE` as explained above. You could obtain this data by hand, but if your participants also used a smartphone during your study, you can use their time zone logs. Recall that in RAPIDS every participant is represented with a participant file `pXX.yaml`; this file links together multiple devices, and we use it to know what smartphone time zone data should be applied to Fitbit devices. Thus, set `INFER_FROM_SMARTPHONE_TZ` to `TRUE` if you have included smartphone time zone data in your `TZCODES_FILE` and you want to make a participant's Fitbit data time zone aware with their respective smartphone data.
---
## Data Stream Configuration
Modify the following keys in your `config.yaml` depending on the [data stream](../../datastreams/data-streams-introduction) you want to process.
=== "Phone"
The relevant `config.yaml` section looks like this by default:
Set `[PHONE_DATA_STREAMS][USE]` to the smartphone data stream you want to process (e.g. `aware_mysql`) and configure its parameters (e.g. `[DATABASE_GROUP]`). Ignore the parameters of streams you are not using (e.g. `[FOLDER]` of `aware_csv`).
```yaml
PHONE_DATA_CONFIGURATION:
SOURCE:
TYPE: DATABASE
DATABASE_GROUP: *database_group
DEVICE_ID_COLUMN: device_id # column name
TIMEZONE:
TYPE: SINGLE # SINGLE (MULTIPLE support coming soon)
VALUE: *timezone
PHONE_DATA_STREAMS:
USE: aware_mysql
# AVAILABLE:
aware_mysql:
DATABASE_GROUP: MY_GROUP
aware_csv:
FOLDER: data/external/aware_csv
```
**Parameters for `[PHONE_DATA_CONFIGURATION]`**
=== "aware_mysql"
| Key | Description |
|---------------------|----------------------------------------------------------------------------------------------------------------------------|
| `[DATABASE_GROUP]` | A database credentials group. Read the instructions below to set it up |
--8<---- "docs/snippets/database.md"
=== "aware_csv"
| Key | Description |
|---------------------|----------------------------------------------------------------------------------------------------------------------------|
| `[FOLDER]` | Folder where you have to place a CSV file **per** phone sensor. Each file has to contain all the data from every participant you want to process. |
| Key | Description |
|---------------------|----------------------------------------------------------------------------------------------------------------------------|
| `[SOURCE] [TYPE]` | Only `DATABASE` is supported (phone data will be pulled from a database) |
| `[SOURCE] [DATABASE_GROUP]` | `*database_group` points to the value defined before in [Database credentials](#database-credentials) |
| `[SOURCE] [DEVICE_ID_COLUMN]` | A column that contains strings that uniquely identify smartphones. For data collected with AWARE this is usually `device_id` |
| `[TIMEZONE] [TYPE]` | Only `SINGLE` is supported for now |
| `[TIMEZONE] [VALUE]` | `*timezone` points to the value defined before in [Timezone of your study](#timezone-of-your-study) |
=== "Fitbit"
The relevant `config.yaml` section looks like this by default:
Set `[FITBIT_DATA_STREAMS][USE]` to the Fitbit data stream you want to process (e.g. `fitbitjson_mysql`) and configure its parameters (e.g. `[DATABASE_GROUP]`). Ignore the parameters of the streams you are not using (e.g. `[FOLDER]` of `fitbitjson_csv`).
```yaml
FITBIT_DATA_CONFIGURATION:
SOURCE:
TYPE: DATABASE # DATABASE or FILES (set each [FITBIT_SENSOR][TABLE] attribute with a table name or a file path accordingly)
COLUMN_FORMAT: JSON # JSON or PLAIN_TEXT
DATABASE_GROUP: *database_group
DEVICE_ID_COLUMN: device_id # column name
TIMEZONE:
TYPE: SINGLE # Fitbit devices don't support time zones so we read this data in the timezone indicated by VALUE
VALUE: *timezone
```
!!! warning
You will probably have to tell RAPIDS the name of the columns where you stored your Fitbit data. To do this, modify your chosen stream's `format.yaml` column mappings to match your raw data column names.
**Parameters for `[FITBIT_DATA_CONFIGURATION]`**
| Key | Description |
|------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `[SOURCE]` `[TYPE]` | `DATABASE` or `FILES` (set each `[FITBIT_SENSOR]` `[TABLE]` attribute accordingly with a table name or a file path) |
| `[SOURCE]` `[COLUMN_FORMAT]` | `JSON` or `PLAIN_TEXT`. Column format of the source data. If you pulled your data directly from the Fitbit API the column containing the sensor data will be in `JSON` format |
| `[SOURCE]` `[DATABASE_GROUP]` | `*database_group` points to the value defined before in [Database credentials](#database-credentials). Only used if `[TYPE]` is `DATABASE` . |
| `[SOURCE]` `[DEVICE_ID_COLUMN]` | A column that contains strings that uniquely identify Fitbit devices. |
| `[TIMEZONE]` `[TYPE]` | Only `SINGLE` is supported (Fitbit devices always store data in local time). |
| `[TIMEZONE]` `[VALUE]` | `*timezone` points to the value defined before in [Timezone of your study](#timezone-of-your-study) |
=== "Empatica"
The relevant `config.yaml` section looks like this by default:
```yaml
SOURCE:
TYPE: ZIP_FILE
FOLDER: data/external/empatica
TIMEZONE:
TYPE: SINGLE # Empatica devices don't support time zones so we read this data in the timezone indicated by VALUE
VALUE: *timezone
FITBIT_DATA_STREAMS:
USE: fitbitjson_mysql
# AVAILABLE:
fitbitjson_mysql:
DATABASE_GROUP: MY_GROUP
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: False
fitbitjson_csv:
FOLDER: data/external/fitbit_csv
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: False
fitbitparsed_mysql:
DATABASE_GROUP: MY_GROUP
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: False
fitbitparsed_csv:
FOLDER: data/external/fitbit_csv
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: False
```
**Parameters for `[EMPATICA_DATA_CONFIGURATION]`**
=== "fitbitjson_mysql"
| Key | Description |
|---------------------|----------------------------------------------------------------------------------------------------------------------------|
| `[SOURCE] [TYPE]` | Only `ZIP_FILE` is supported (Empatica devices save sensor data in CSV files that are zipped together).|
| `[SOURCE] [FOLDER]` | The relative path to a folder containing one folder per participant. The name of a participant folder should match their pid in `config[PIDS]`, for example `p01`. Each participant folder can have one or more zip files with any name; in other words, the sensor data contained in those zip files belongs to a single participant. The zip files are [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) generated by Empatica and have a CSV file per sensor (`ACC`, `HR`, `TEMP`, `EDA`, `BVP`, `TAGS`). All CSV files of the same type contained in one or more zip files are uncompressed, parsed, sorted by timestamp, and joined together.|
| `[TIMEZONE] [TYPE]` | Only `SINGLE` is supported for now |
| `[TIMEZONE] [VALUE]` | `*timezone` points to the value defined before in [Timezone of your study](#timezone-of-your-study) |
This data stream processes Fitbit data inside a JSON column as obtained from the Fitbit API and stored in a MySQL database. Read more about its column mappings and mutations in [`fitbitjson_mysql`](../../datastreams/fitbitjson-mysql#format).
??? example "Example of an EMPATICA FOLDER"
In the file tree below, we want to process the data of three participants: `p01`, `p02`, and `p03`. `p01` has two zip files, `p02` has only one zip file, and `p03` has three zip files. Each zip file has a CSV file per sensor; these are joined together and processed by RAPIDS. These zip files are generated by Empatica.
```bash
data/ # this folder exists in the root RAPIDS folder
external/
empatica/
p01/
file1.zip
file2.zip
p02/
aaaa.zip
p03/
t1.zip
t2.zip
t3.zip
```
| Key | Description |
|---------------------|----------------------------------------------------------------------------------------------------------------------------|
| `[DATABASE_GROUP]` | A database credentials group. Read the instructions below to set it up |
| `[SLEEP_SUMMARY_EPISODE_DAY_ANCHOR]` | One of `start` or `end`. Summary sleep episodes are considered as events based on either the start timestamp or end timestamp (they will belong to the day where they start or end). |
--8<---- "docs/snippets/database.md"
=== "fitbitjson_csv"
This data stream processes Fitbit data inside a JSON column as obtained from the Fitbit API and stored in a CSV file. Read more about its column mappings and mutations in [`fitbitjson_csv`](../../datastreams/fitbitjson-csv#format).
| Key | Description |
|---------------------|----------------------------------------------------------------------------------------------------------------------------|
| `[FOLDER]` | Folder where you have to place a CSV file **per** Fitbit sensor. Each file has to contain all the data from every participant you want to process. |
| `[SLEEP_SUMMARY_EPISODE_DAY_ANCHOR]` | One of `start` or `end`. Summary sleep episodes are considered as events based on either the start timestamp or end timestamp (they will belong to the day where they start or end). |
=== "fitbitparsed_mysql"
This data stream processes Fitbit data stored in multiple columns after being parsed from the JSON column returned by the Fitbit API and stored in a MySQL database. Read more about its column mappings and mutations in [`fitbitparsed_mysql`](../../datastreams/fitbitparsed-mysql#format).
| Key | Description |
|---------------------|----------------------------------------------------------------------------------------------------------------------------|
| `[DATABASE_GROUP]` | A database credentials group. Read the instructions below to set it up |
| `[SLEEP_SUMMARY_EPISODE_DAY_ANCHOR]` | One of `start` or `end`. Summary sleep episodes are considered as events based on either the start timestamp or end timestamp (they will belong to the day where they start or end). |
--8<---- "docs/snippets/database.md"
=== "fitbitparsed_csv"
This data stream processes Fitbit data stored in multiple columns (plain text) after being parsed from the JSON column returned by the Fitbit API and stored in a CSV file. Read more about its column mappings and mutations in [`fitbitparsed_csv`](../../datastreams/fitbitparsed-csv#format).
| Key | Description |
|---------------------|----------------------------------------------------------------------------------------------------------------------------|
| `[FOLDER]` | Folder where you have to place a CSV file **per** Fitbit sensor. Each file has to contain all the data from every participant you want to process. |
| `[SLEEP_SUMMARY_EPISODE_DAY_ANCHOR]` | One of `start` or `end`. Summary sleep episodes are considered as events based on either the start timestamp or end timestamp (they will belong to the day where they start or end). |
=== "Empatica"
Set `[USE]` to the Empatica data stream you want to use; see the table in [introduction to data streams](../../datastreams/data-streams-introduction). Configure any parameters as indicated below.
```yaml
EMPATICA_DATA_STREAMS:
USE: empatica_zip
# AVAILABLE:
empatica_zip:
FOLDER: data/external/empatica
```
=== "empatica_zip"
| Key | Description |
|---------------------|----------------------------------------------------------------------------------------------------------------------------|
| `[FOLDER]` | The relative path to a folder containing one subfolder per participant. The name of a participant folder should match their device_id assigned in their participant file. Each participant folder can have one or more zip files with any name; in other words, the sensor data in those zip files belong to a single participant. The zip files are [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) generated by Empatica and have a CSV file per sensor (`ACC`, `HR`, `TEMP`, `EDA`, `BVP`, `TAGS`). All CSV files of the same type contained in one or more zip files are uncompressed, parsed, sorted by timestamp, and joined together.|
??? example "Example of an EMPATICA FOLDER"
In the file tree below, we want to process three participants' data: `p01`, `p02`, and `p03`. `p01` has two zip files, `p02` has only one zip file, and `p03` has three zip files. Each zip has a CSV file per sensor; these are joined together and processed by RAPIDS.
```bash
data/ # this folder exists in the root RAPIDS folder
external/
empatica/
p01/
file1.zip
file2.zip
p02/
aaaa.zip
p03/
t1.zip
t2.zip
t3.zip
```
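If you want to double-check this layout before running RAPIDS, a small sketch like the one below (assuming the default `data/external/empatica` folder) lists the sensor CSV files found inside each participant's zip files. It is only a convenience check, not part of the pipeline:
```python
from pathlib import Path
from zipfile import ZipFile

empatica_folder = Path("data/external/empatica")  # [FOLDER] in your config.yaml

for participant_dir in sorted(p for p in empatica_folder.iterdir() if p.is_dir()):
    for zip_path in sorted(participant_dir.glob("*.zip")):
        with ZipFile(zip_path) as zf:
            csvs = [name for name in zf.namelist() if name.endswith(".csv")]
        print(participant_dir.name, zip_path.name, csvs)
```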
---

View File

@ -16,19 +16,19 @@ After you have [installed](../installation) and [configured](../configuration) R
Any changes to the `config.yaml` file will be applied automatically and only the relevant files will be updated. This means that after modifying the features list for `PHONE_MESSAGE` for example, RAPIDS will execute the script that computes `MESSAGES` features and update its output file.
!!! hint "Multi-core"
You can run RAPIDS over multiple cores by modifying the `-j` argument (e.g. use `-j8` to use 8 cores). **However**, take into account that this means multiple sensor datasets for different participants will be loaded in memory at the same time. If RAPIDS crashes because it ran out of memory reduce the number of cores and try again.
You can run RAPIDS over multiple cores by modifying the `-j` argument (e.g. use `-j8` to use 8 cores). **However**, take into account that this means multiple sensor datasets for different participants will be loaded in memory at the same time. If RAPIDS crashes because it ran out of memory, reduce the number of cores and try again.
As reference, we have run RAPIDS over 12 cores and 32 Gb of RAM without problems for a study with 200 participants with 14 days of low-frequency smartphone data (no accelerometer, gyroscope, or magnetometer).
!!! hint "Deleting RAPIDS output"
If you want to delete all the output files RAPIDS produces you can execute the following command:
If you want to delete all the output files RAPIDS produces, you can execute the following command:
```bash
./rapids -j1 --delete-all-output
```
!!! hint "Forcing a complete rerun"
If you want to update your raw data or rerun the whole pipeline from scratch run the following commands:
!!! hint "Forcing a complete rerun or updating your raw data in RAPIDS"
If you want to update your raw data or rerun the whole pipeline from scratch, run the following commands:
```bash
./rapids -j1 --delete-all-output

View File

@ -0,0 +1,136 @@
# Overview
Let's review some key concepts we use throughout these docs:
|Definition&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;| Description|
|--|--|
|Data Stream|Set of sensor data collected using a specific type of **device** with a specific **format** and stored in a specific **container**. For example, smartphone (device) data collected with the [AWARE Framework](https://awareframework.com/) (format) and stored in a MySQL database (container).|
|Device| A mobile or wearable device, like smartphones, Fitbit wrist bands, Oura Rings, etc.|
|Sensor| A physical or digital module built into a device that produces a data stream. For example, a smartphone's accelerometer or screen.|
|Format| A file in RAPIDS that describes how sensor data from a device matches RAPIDS data representation.|
|Container|An electronic repository of data, it can be a database, a file, a Web API, etc. RAPIDS connects to containers through container scripts.|
|Participant|A person that took part in a monitoring study|
|Behavioral feature| A metric computed from raw sensor data quantifying the behavior of a participant. For example, time spent at home computed from location data. These are also known as digital biomarkers|
|Time segment| Time segments (or epochs) are the time windows on which RAPIDS extracts behavioral features. For example, you might want to compute participants' time at home every morning or only during weekends. You define time segments in a CSV file that RAPIDS processes.|
|Time zone| A string code like `America/New_York` that represents a time zone where a device logged data. You can process data collected in single or multiple time zones.|
|Provider| A script that creates behavioral features for a specific sensor. Providers are created by the core RAPIDS team or by the community and are named after their first author, like [[PHONE_LOCATIONS][DORYAB]](../../features/phone-locations/#doryab-provider).|
|config.yaml| A YAML file where you can modify parameters to process data streams and behavioral features. This is the heart of RAPIDS and the file that you will modify the most.|
|credentials.yaml| A YAML file where you can define credential groups (user, password, host, etc.) if your data stream needs to connect to a database or Web API|
|Participant file| A YAML file that links one or more smartphone or wearable devices that a single participant used. RAPIDS needs one file per participant. |
You can do one or more of these things with RAPIDS:
1. [Extract behavioral features](../../features/feature-introduction/) from smartphone, Fitbit, and Empatica's [supported data streams](../../datastreams/data-streams-introduction/)
1. [Add your own behavioral features](../../features/add-new-features/) (we can include them in RAPIDS if you want to share them with the community)
1. [Add support for new data streams](../../datastreams/add-new-data-streams/) if yours cannot be processed by RAPIDS yet
1. Create visualizations for [data quality control](../../visualizations/data-quality-visualizations/) and [feature inspection](../../visualizations/feature-visualizations/)
1. [Extend RAPIDS to organize your analysis](../../workflow-examples/analysis/) and publish a code repository along with your paper
**In order to follow any of the previous tutorials, you will have to [Install](../installation/), [Configure](../configuration/), and learn how to [Execute](../execution/) RAPIDS.**
!!! hint
- We recommend you follow the [Minimal Example](../../workflow-examples/minimal/) tutorial to get familiar with RAPIDS
- [Email us](../../team), create a [Github issue](https://github.com/carissalow/rapids/issues) or text us in [Slack](http://awareframework.com:3000/) if you have any questions
## Frequently Asked Questions
### General
??? info "What exactly is RAPIDS?"
RAPIDS is a group of configuration files and R and Python scripts that are executed by [Snakemake](https://snakemake.github.io/). You can get a copy of RAPIDS by cloning our Github repository.
RAPIDS is not a web application or server; all the processing is done in your laptop, server, or computer cluster.
??? info "How does RAPIDS work?"
Most of the time you will only have to modify configuration files in YAML format (`config.yaml`, `credentials.yaml`, and participant files `pxx.yaml`) and in CSV format (time zones and time segments).
RAPIDS pulls data from different data containers and processes it in steps. The input/output of each step is saved as a CSV file for inspection; you can check what files are created for each sensor provider on their documentation page. All data is stored in `data/`, and all processing Python and R scripts are stored in `src/`.
??? example "User and File interactions in RAPIDS"
In the figure below, we represent the interactions between users and files. After a user modifies the configuration files mentioned above, the `Snakefile` file will search for and execute the Snakemake rules that contain the Python or R scripts necessary to generate or update the required output files (behavioral features, plots, etc.).
<figure>
<img src="../../img/files.png" max-width="50%" />
<figcaption>Interaction diagram between the user, and important files in RAPIDS</figcaption>
</figure>
??? example "Data flow in RAPIDS"
In the figure below, we represent the flow of data in RAPIDS. In broad terms, smartphone and wearable devices log [data streams](../../datastreams/data-streams-introduction/) with a certain format to a data container (database, file, etc.).
RAPIDS can connect to these containers if it has a `format.yaml` and a `container.[R|py]` script used to pull the correct data and mutate it to comply with RAPIDS' internal data representation. Once the data stream is in RAPIDS, it goes through some basic transformations (scripts): one that assigns a time segment and a time zone to each data row, and another one that creates "episodes" of data for some sensors that need it (like screen, battery, activity recognition, and sleep intraday). After this, RAPIDS executes the requested provider script that computes behavioral features per time segment instance. After every feature is computed, they are joined per sensor, per participant, and per study. Visualizations are built based on raw data with date-time information or based on computed features.
<figure>
<img src="../../img/dataflow.png" max-width="50%" />
<figcaption>Data stream flow in RAPIDS</figcaption>
</figure>
??? info "Is my data private?"
Absolutely, you are processing your data with your own copy of RAPIDS in your laptop, server, or computer cluster, so neither we nor anyone else can have access to your datasets.
??? info "Do I need to have coding skills to use RAPIDS?"
If you want to extract the behavioral features or visualizations that RAPIDS offers out of the box, the answer is no. However, you need to be comfortable running commands in your terminal and familiar with editing YAML files and CSV files.
If you want to add support for new data streams or behavioral features, you need to be familiar with R or Python.
??? info "Is RAPIDS open-source or free?"
Yes, RAPIDS is both open-source and free.
??? info "How do I cite RAPIDS?"
Please refer to our [Citation guide](../../citation/); depending on what parts of RAPIDS you used, we also ask you to cite the work of other authors that shared their work.
??? info "I have a lot of data, can RAPIDS handle it/ is RAPIDS fast enough?"
Yes, we use Snakemake under the hood, so you can automatically distribute RAPIDS execution over multiple [cores](../execution/) or [clusters](https://snakemake.readthedocs.io/en/stable/executing/cluster.html). RAPIDS processes data per sensor and participant, so it can take advantage of this parallel processing.
??? info "What are the advantages of using RAPIDS over implementing my own analysis code?"
We believe RAPIDS can benefit your analysis in several ways:
- RAPIDS has more than 250 [behavioral features](../../features/add-new-features/) available, many of them tested and used by other researchers.
- RAPIDS can extract features in dynamic [time segments](../../setup/configuration/#time-segments) (for example, every x minutes, x hours, x days, x weeks, x months, etc.). This is handy because you don't have to deal with time zones, daylight saving changes, or date arithmetic.
- Your analysis is less prone to errors. Every participant sensor dataset is analyzed in the same way and isolated from each other.
- If you have lots of data, out-of-the-box parallel execution will speed up your analysis, and if your computer crashes, RAPIDS will start from where it left off.
- You can publish your analysis code along with your papers and be sure it will run exactly as it does in your computer.
- You can still add your own [behavioral features](../../features/add-new-features/) and [data streams](../../datastreams/add-new-data-streams/) if you need to, and the community will be able to reuse your work.
### Data Streams
??? info "Can I process smartphone data collected with Beiwe, PurpleRobot, or app X?"
Yes, but you need to add a new data stream to RAPIDS (a new `format.yaml` and container script in R or Python). Follow this [tutorial](../../datastreams/add-new-data-streams/). [Email us](../../team), create a [Github issue](https://github.com/carissalow/rapids/issues) or text us in [Slack](http://awareframework.com:3000/) if you have any questions.
If you do so, let us know so we can integrate your work into RAPIDS.
??? info "Can I process data from Oura Rings, Actigraphs, or wearable X?"
The only wearables we support at the moment are Empatica and Fitbit. However, get in touch if you need to process data from a different wearable. We have limited resources so we add support for different devices on an as-needed basis, but we would be happy to collaborate with you to add new wearables. [Email us](../../team), create a [Github issue](https://github.com/carissalow/rapids/issues) or text us in [Slack](http://awareframework.com:3000/) if you have any questions.
??? info "Can I process smartphone or wearable data stored in PostgreSQL, Oracle, SQLite, CSV files, or data container X?"
Yes, but you need to add a new data stream to RAPIDS (a new `format.yaml` and container script in R or Python). Follow this [tutorial](../../datastreams/add-new-data-streams/). If you are processing data streams we already support, like AWARE, Fitbit, or Empatica, and are just connecting to a different container, you can reuse their `format.yaml` and only implement a new container script. [Email us](../../team), create a [Github issue](https://github.com/carissalow/rapids/issues) or text us in [Slack](http://awareframework.com:3000/) if you have any questions.
If you do so, let us know so we can integrate your work into RAPIDS.
??? info "I have participants that live in different time zones and some that travel; can RAPIDS handle this?"
Yes, RAPIDS can handle [single or multiple timezones](../../setup/configuration/#timezone-of-your-study) per participant. You can use time zone data collected by smartphones or collected by hand.
??? info "Some of my participants used more than one device during my study; can RAPIDS handle this?"
Yes, you can link more than one smartphone or wearable device to a single participant. RAPIDS will merge them and sort them automatically.
??? info "Some of my participants switched from Android to iOS or vice-versa during my study; can RAPIDS handle this?"
Yes, data from multiple smartphones can be linked to a single participant. All iOS data is converted to Android data before merging it.
### Extending RAPIDS
??? info "Can I add my own behavioral features/digital biomarkers?"
Yes, you can implement your own features in R or Python following this [tutorial](../../features/add-new-features/)
??? info "Can I extract behavioral features based on two or more sensors?"
Yes, we do this for `PHONE_DATA_YIELD` (combines all phone sensors), `PHONE_LOCATIONS` (combines location and data yield data), `PHONE_APPLICATIONS_BACKGROUND` (combines screen and app usage data), and `FITBIT_INTRADAY_STEPS` (combines Fitbit sleep and step data).
However, we haven't come up with a user-friendly way to configure this, and currently, we join sensors on a case-by-case basis. This is mainly because not enough users have needed this functionality so far. Get in touch, and we can set it up together; the more use cases we are aware of, the easier it will be to integrate this into RAPIDS.
??? info "I know how to program in Python or R but not both. Can I still use or extend RAPIDS?"
Yes, you don't need to write any code to use RAPIDS out of the box. If you need to add support for new [data streams](../../datastreams/add-new-data-streams/) or [behavioral features](../../features/add-new-features/) you can use scripts in either language.
??? info "I have scripts that clean raw data from X sensor, can I use them with RAPIDS?"
Yes, you can add them as a [`[MUTATION][SCRIPT]`](../../datastreams/add-new-data-streams/#complex-mapping) in the `format.yaml` of the [data stream](../../datastreams/data-streams-introduction/) you are using. You will add a `main` function that receives a data frame with the raw data for that sensor, which in turn will be used to compute behavioral features.
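As a rough sketch of what such a script looks like (the exact function signature is described in the add-new-data-streams tutorial; the argument names below are assumptions for illustration), a Python mutation script exposes a `main` function that receives the raw data frame and returns the cleaned one:
```python
import pandas as pd

def main(data: pd.DataFrame, stream_parameters: dict) -> pd.DataFrame:
    # `data` is the raw sensor data frame RAPIDS pulled from your container;
    # clean it here and return a data frame with the columns your format.yaml expects.
    data = data.drop_duplicates(subset=["device_id", "timestamp"])
    data = data[data["timestamp"] > 0]
    return data
```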

View File

@ -0,0 +1,606 @@
If you collected sensor data with the vanilla (original) AWARE mobile clients, you shouldn't need to modify this format (described below).
Remember that a format maps and transforms columns in your raw data stream to the [mandatory columns RAPIDS needs](../mandatory-phone-format).
The yaml file that describes the format of this data stream is at:
```bash
src/data/streams/aware_csv/format.yaml
```
For some sensors, we need to transform iOS data into Android format; you can refer to [OS complex mapping](../../datastreams/add-new-data-streams/#os-complex-mapping) to learn how this works.
!!! hint
The RAPIDS and Stream columns in this stream's mappings share the same names because AWARE was the first stream RAPIDS supported; RAPIDS considers AWARE column names the default.
??? info "PHONE_ACCELEROMETER"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| DOUBLE_VALUES_0 | double_values_0 |
| DOUBLE_VALUES_1 | double_values_1 |
| DOUBLE_VALUES_2 | double_values_2 |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
Same as ANDROID
??? info "PHONE_ACTIVITY_RECOGNITION"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| ACTIVITY_NAME | activity_name |
| ACTIVITY_TYPE | activity_type |
| CONFIDENCE | confidence |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| ACTIVITY_NAME | FLAG_TO_MUTATE |
| ACTIVITY_TYPE | FLAG_TO_MUTATE |
| CONFIDENCE | FLAG_TO_MUTATE |
**MUTATION**
- **COLUMN_MAPPINGS**
| Script column | Stream column |
|-----------------|-----------------|
| ACTIVITIES | activities |
| CONFIDENCE | confidence |
- **SCRIPTS**
```bash
src/data/streams/mutations/phone/aware/activity_recogniton_ios_unification.R
```
!!! note
For RAPIDS columns of `ACTIVITY_NAME` and `ACTIVITY_TYPE`:
- if stream's `activities` field is automotive, set `ACTIVITY_NAME` = in_vehicle and `ACTIVITY_TYPE` = 0
- if stream's `activities` field is cycling, set `ACTIVITY_NAME` = on_bicycle and `ACTIVITY_TYPE` = 1
- if stream's `activities` field is walking, set `ACTIVITY_NAME` = walking and `ACTIVITY_TYPE` = 7
- if stream's `activities` field is running, set `ACTIVITY_NAME` = running and `ACTIVITY_TYPE` = 8
- if stream's `activities` field is stationary, set `ACTIVITY_NAME` = still and `ACTIVITY_TYPE` = 3
- if stream's `activities` field is unknown, set `ACTIVITY_NAME` = unknown and `ACTIVITY_TYPE` = 4
For RAPIDS `CONFIDENCE` column:
- if stream's `confidence` field is 0, set `CONFIDENCE` = 0
- if stream's `confidence` field is 1, set `CONFIDENCE` = 50
- if stream's `confidence` field is 2, set `CONFIDENCE` = 100
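The conversion itself is implemented by the R script above; purely to summarize the rules, an equivalent lookup table in Python would be:
```python
# iOS `activities` value -> (Android ACTIVITY_NAME, ACTIVITY_TYPE), per the rules above
IOS_TO_ANDROID_ACTIVITY = {
    "automotive": ("in_vehicle", 0),
    "cycling": ("on_bicycle", 1),
    "walking": ("walking", 7),
    "running": ("running", 8),
    "stationary": ("still", 3),
    "unknown": ("unknown", 4),
}

# iOS confidence (0, 1, 2) -> Android-style confidence percentage
IOS_TO_ANDROID_CONFIDENCE = {0: 0, 1: 50, 2: 100}

name, activity_type = IOS_TO_ANDROID_ACTIVITY["automotive"]
print(name, activity_type, IOS_TO_ANDROID_CONFIDENCE[2])  # in_vehicle 0 100
```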
??? info "PHONE_APPLICATIONS_CRASHES"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|--------------------|--------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| PACKAGE_NAME | package_name |
| APPLICATION_NAME | application_name |
| APPLICATION_VERSION| application_version|
| ERROR_SHORT | error_short |
| ERROR_LONG | error_long |
| ERROR_CONDITION | error_condition |
| IS_SYSTEM_APP | is_system_app |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
This sensor is not supported by iOS devices.
??? info "PHONE_APPLICATIONS_FOREGROUND"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|--------------------|--------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| PACKAGE_NAME | package_name |
| APPLICATION_NAME | application_name |
| IS_SYSTEM_APP | is_system_app |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
This sensor is not supported by iOS devices.
??? info "PHONE_APPLICATIONS_NOTIFICATIONS"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|--------------------|--------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| PACKAGE_NAME | package_name |
| APPLICATION_NAME | application_name |
| TEXT | text |
| SOUND | sound |
| VIBRATE | vibrate |
| DEFAULTS | defaults |
| FLAGS | flags |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
This sensor is not supported by iOS devices.
??? info "PHONE_BATTERY"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| BATTERY_STATUS | battery_status |
| BATTERY_LEVEL | battery_level |
| BATTERY_SCALE | battery_scale |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS Client V1"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| BATTERY_STATUS | FLAG_TO_MUTATE |
| BATTERY_LEVEL | battery_level |
| BATTERY_SCALE | battery_scale |
**MUTATION**
- **COLUMN_MAPPINGS**
| Script column | Stream column |
|----------------------|---------------------|
| BATTERY_STATUS | battery_status |
- **SCRIPTS**
```bash
src/data/streams/mutations/phone/aware/battery_ios_unification.R
```
!!! note
For RAPIDS `BATTERY_STATUS` column:
- if stream's `battery_status` field is 3, set `BATTERY_STATUS` = 5 (full status)
- if stream's `battery_status` field is 1, set `BATTERY_STATUS` = 3 (discharge)
=== "IOS Client V2"
Same as ANDROID
??? info "PHONE_BLUETOOTH"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| BT_ADDRESS | bt_address |
| BT_NAME | bt_name |
| BT_RSSI | bt_rssi |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
This sensor is not supported by iOS devices.
??? info "PHONE_CALLS"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| CALL_TYPE | call_type |
| CALL_DURATION | call_duration |
| TRACE | trace |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| CALL_TYPE | FLAG_TO_MUTATE |
| CALL_DURATION | call_duration |
| TRACE | trace |
**MUTATION**
- **COLUMN_MAPPINGS**
| Script column | Stream column |
|----------------------|---------------------|
| CALL_TYPE | call_type |
- **SCRIPTS**
```bash
src/data/streams/mutations/phone/aware/calls_ios_unification.R
```
!!! note
We transform iOS call logs into Android's format. iOS stores call status: 1=incoming, 2=connected, 3=dialing, 4=disconnected, as opposed to Android's events: 1=incoming, 2=outgoing, 3=missed.
We follow this algorithm to convert iOS call data (there are some inaccuracies in the way we handle sequences, see new rules below):
- Search for the disconnected (4) status as it is common to all calls
- Group all events that preceded every status 4
- We convert every 1,2,4 (or 2,1,4) sequence to an incoming call
- We convert every 3,2,4 (or 2,3,4) sequence to an outgoing call
- We convert every 1,4 or 3,4 sequence to a missed call (either incoming or outgoing)
- We set the duration of the call to be the sum of every status (dialing/ringing to hangup) as opposed to the duration of the last status (pick up to hang up)
**Tested with an Android (OnePlus 7T) and an iPhone XR**
|Call type | Android (duration) | iOS (duration) | New Rule|
|---------|----------|--------|------|
|Outgoing missed ended by me | 2 (0) | 3,4 (0,X) | 3,4 is converted to 2 with duration 0|
|Outgoing missed ended by them|2(0)|3,2,4 (0,X,X2)| 3,2,4 is converted to 2 with duration X2*|
|Incoming missed ended by me|NA**|1,4 (0,X)|1,4 is converted to 3 with duration 0|
|Incoming missed ended by them|3(0)|1,4 (0,X)|1,4 is converted to 3 with duration 0|
|Outgoing answered|2(X excluding dialing time)|3,2,4 (0,X,X2)|3,2,4 is converted to 2 with duration X2|
|Incoming answered|1(X excluding dialing time)|1,2,4 (0,X,X2)|1,2,4 is converted to 1 with duration X2|
.* There is no way to differentiate an outgoing missed call ended by them from an outgoing answered call because the phone goes directly to voice mail and it counts as call time (essentially the voice mail answered).
.** Android does not record incoming missed calls ended by the participant, just those ended by the person calling or ignored by the participant.
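The conversion is done by the R script listed above; to make the sequence rules concrete, here is a simplified Python sketch that classifies a single call from its iOS statuses. It follows the "New Rule" column of the table and ignores the edge cases discussed in the footnotes:
```python
def convert_ios_call(statuses, durations):
    """Classify one iOS call. `statuses` is the ordered list of iOS status events for
    the call (always ending in 4 = disconnected) and `durations` the duration logged
    with each event. Returns an Android-style call type (1=incoming, 2=outgoing,
    3=missed) and a duration, following the "New Rule" column above."""
    events = set(statuses[:-1])   # events that preceded the final 4
    duration = durations[-1]      # duration logged with the disconnected event
    if events == {1, 2}:
        return 1, duration        # incoming answered
    if events == {2, 3}:
        return 2, duration        # outgoing answered (or outgoing missed ended by them)
    if events == {3}:
        return 2, 0               # outgoing missed ended by me
    if events == {1}:
        return 3, 0               # incoming missed
    return None, None             # sequence not covered by the rules above

print(convert_ios_call([3, 2, 4], [0, 35, 40]))  # (2, 40) outgoing answered
print(convert_ios_call([1, 4], [0, 12]))         # (3, 0) incoming missed
```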
??? info "PHONE_CONVERSATION"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| DOUBLE_ENERGY | double_energy |
| INFERENCE | inference |
| DOUBLE_CONVO_START | double_convo_start |
| DOUBLE_CONVO_END | double_convo_end |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| DOUBLE_ENERGY | double_energy |
| INFERENCE | inference |
| DOUBLE_CONVO_START | FLAG_TO_MUTATE |
| DOUBLE_CONVO_END | FLAG_TO_MUTATE |
**MUTATION**
- **COLUMN_MAPPINGS**
| Script column | Stream column |
|----------------------|---------------------|
| DOUBLE_CONVO_START | double_convo_start |
| DOUBLE_CONVO_END | double_convo_end |
- **SCRIPTS**
```bash
src/data/streams/mutations/phone/aware/conversation_ios_timestamp.R
```
!!! note
For RAPIDS columns of `DOUBLE_CONVO_START` and `DOUBLE_CONVO_END`:
- if stream's `double_convo_start` field is smaller than 9999999999, it is in seconds instead of milliseconds. Set `DOUBLE_CONVO_START` = 1000 * `double_convo_start`.
- if stream's `double_convo_end` field is smaller than 9999999999, it is in seconds instead of milliseconds. Set `DOUBLE_CONVO_END` = 1000 * `double_convo_end`.
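The conversion is done by the R script above; conceptually it is a unit normalization, sketched here in Python/pandas for illustration only:
```python
import pandas as pd

data = pd.DataFrame({"double_convo_start": [1587500000, 1587600000000],
                     "double_convo_end": [1587500120, 1587600120000]})

# Values smaller than 9999999999 were logged in seconds; convert them to milliseconds
for column in ["double_convo_start", "double_convo_end"]:
    in_seconds = data[column] < 9999999999
    data.loc[in_seconds, column] = data.loc[in_seconds, column] * 1000

print(data)
```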
??? info "PHONE_KEYBOARD"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| PACKAGE_NAME | package_name |
| BEFORE_TEXT | before_text |
| CURRENT_TEXT | current_text |
| IS_PASSWORD | is_password |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
This sensor is not supported by iOS devices.
??? info "PHONE_LIGHT"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| DOUBLE_LIGHT_LUX | double_light_lux |
| ACCURACY | accuracy |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
This sensor is not supported by iOS devices.
??? info "PHONE_LOCATIONS"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| DOUBLE_LATITUDE | double_latitude |
| DOUBLE_LONGITUDE | double_longitude |
| DOUBLE_BEARING | double_bearing |
| DOUBLE_SPEED | double_speed |
| DOUBLE_ALTITUDE | double_altitude |
| PROVIDER | provider |
| ACCURACY | accuracy |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
Same as ANDROID
??? info "PHONE_LOG"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| LOG_MESSAGE | log_message |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
Same as ANDROID
??? info "PHONE_MESSAGES"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| MESSAGE_TYPE | message_type |
| TRACE | trace |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
This sensor is not supported by iOS devices.
??? info "PHONE_SCREEN"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| SCREEN_STATUS | screen_status |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| SCREEN_STATUS | FLAG_TO_MUTATE |
**MUTATION**
- **COLUMN_MAPPINGS**
| Script column | Stream column |
|----------------------|---------------------|
| SCREEN_STATUS | screen_status |
- **SCRIPTS**
```bash
src/data/streams/mutations/phone/aware/screen_ios_unification.R
```
!!! note
For `SCREEN_STATUS` RAPIDS column:
- if stream's `screen_status` field is 2 (lock episode), set `SCREEN_STATUS` = 0 (off episode).
??? info "PHONE_WIFI_CONNECTED"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| MAC_ADDRESS | mac_address |
| SSID | ssid |
| BSSID | bssid |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
Same as ANDROID
??? info "PHONE_WIFI_VISIBLE"
=== "ANDROID"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|----------------------|---------------------|
| TIMESTAMP | timestamp |
| DEVICE_ID | device_id |
| SSID | ssid |
| BSSID | bssid |
| SECURITY | security |
| FREQUENCY | frequency |
| RSSI | rssi |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS** (None)
=== "IOS"
This sensor is not supported by iOS devices.

View File

@ -0,0 +1,58 @@
??? info "Setting up a DATABASE_GROUP and its connection credentials."
1. If you haven't done so, create an empty file called `#!bash credentials.yaml` in your RAPIDS root directory:
2. Add the following lines to `credentials.yaml` and replace your database-specific credentials (user, password, host, and database):
``` yaml
MY_GROUP:
database: MY_DATABASE
host: MY_HOST
password: MY_PASSWORD
port: 3306
user: MY_USER
```
3. Notes
1. The label `[MY_GROUP]` is arbitrary but it has to match the `[DATABASE_GROUP]` attribute of the data stream you choose to use.
2. Indentation matters
3. You can have more than one credentials group in `credentials.yaml`
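RAPIDS reads `credentials.yaml` on its own, but if you want to verify that the credentials work before running the pipeline, a quick standalone check (assuming `pymysql` and `PyYAML` are available in your environment; this is not part of RAPIDS) could be:
```python
import pymysql
import yaml

# Load the MY_GROUP credentials group from credentials.yaml
with open("credentials.yaml") as f:
    creds = yaml.safe_load(f)["MY_GROUP"]

connection = pymysql.connect(host=creds["host"], port=creds["port"],
                             user=creds["user"], password=creds["password"],
                             database=creds["database"])
with connection.cursor() as cursor:
    cursor.execute("SHOW TABLES;")
    print(cursor.fetchall())
connection.close()
```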
??? hint "Upgrading from `./.env` from RAPIDS 0.x"
In RAPIDS versions 0.x, database credentials were stored in a `./.env` file. If you are migrating from that type of file, you have two options:
1. Migrate your credentials by hand:
=== "change .env format"
``` yaml
[MY_GROUP]
user=MY_USER
password=MY_PASSWORD
host=MY_HOST
port=3306
database=MY_DATABASE
```
=== "to credentials.yaml format"
``` yaml
MY_GROUP:
user: MY_USER
password: MY_PASSWORD
host: MY_HOST
port: 3306
database: MY_DATABASE
```
2. Use the migration script we provide (make sure your conda environment is active):
```bash
python tools/update_format_env.py
```
??? hint "Connecting to localhost (host machine) from inside our docker container."
If you are using RAPIDS' docker container and Docker-for-mac or Docker-for-Windows 18.03+, you can connect to a MySQL database in your host machine using `host.docker.internal` instead of `127.0.0.1` or `localhost`. In a Linux host, you need to run our docker container using `docker run --network="host" -d moshiresearch/rapids:latest` and then `127.0.0.1` will point to your host machine.

View File

@ -0,0 +1,43 @@
1. **Sensor section**
Each sensor (accelerometer, screen, etc.) of every supported device (smartphone, Fitbit, etc.) has a section in the `config.yaml` with `parameters` and feature `PROVIDERS`.
2. **Sensor Parameters.**
Each sensor section has one or more parameters. These are parameters that affect different aspects of how the raw data is pulled and processed.
The `CONTAINER` parameter exists for every sensor, but some sensors will have extra parameters like [`[PHONE_LOCATIONS]`](../phone-locations/).
We explain these parameters in a table at the top of each sensor documentation page.
3. **Sensor Providers**
Each object in this list represents a feature `PROVIDER`. Each sensor can have zero, one, or more providers.
A `PROVIDER` is a script that creates behavioral features for a specific sensor. Providers are created by the core RAPIDS team or by the community and are named after their first author, like [[PHONE_LOCATIONS][DORYAB]](../../features/phone-locations/#doryab-provider).
In this example, there are two accelerometer feature providers `RAPIDS` and `PANDA`.
4. **`PROVIDER` Parameters**
Each `PROVIDER` has parameters that affect the computation of the behavioral features it offers.
These parameters include at least a `[COMPUTE]` flag that you switch to `True` to extract a provider's behavioral features.
We explain every provider's parameter in a table under the `Parameters description` heading on each provider documentation page.
5. **`PROVIDER` Features**
Each `PROVIDER` offers a set of behavioral features.
These features are grouped in an array for some providers, like those for the `RAPIDS` provider. For others, they are grouped in a collection of arrays, like those for the `PANDA` provider.
In either case, you can delete the features you are not interested in, and they will not be included in the sensor's output feature file.
We explain each behavioral feature in a table under the `Features description` heading on each provider documentation page.
6. **`PROVIDER` script**
Each `PROVIDER` has a `SRC_SCRIPT` that points to the script implementing its behavioral features.
It has to be a relative path from RAPIDS' root folder and the script's parent folder should be named after the provider, e.g. `panda`.

View File

@ -0,0 +1,252 @@
The `format.yaml` maps and transforms columns in your raw data stream to the [mandatory columns RAPIDS needs for Fitbit sensors](../mandatory-fitbit-format). This file is at:
```bash
src/data/streams/fitbitjson_csv/format.yaml
```
If you want RAPIDS to process Fitbit sensor data using this stream, you will need to map `DEVICE_ID` and `JSON_FITBIT_COLUMN` to your own raw data columns inside **each sensor** section in `format.yaml`.
??? info "FITBIT_HEARTRATE_SUMMARY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| LOCAL_DATE_TIME | FLAG_TO_MUTATE |
| DEVICE_ID | device_id |
| HEARTRATE_DAILY_RESTINGHR | FLAG_TO_MUTATE |
| HEARTRATE_DAILY_CALORIESOUTOFRANGE | FLAG_TO_MUTATE |
| HEARTRATE_DAILY_CALORIESFATBURN | FLAG_TO_MUTATE |
| HEARTRATE_DAILY_CALORIESCARDIO | FLAG_TO_MUTATE |
| HEARTRATE_DAILY_CALORIESPEAK | FLAG_TO_MUTATE |
**MUTATION**
- **COLUMN_MAPPINGS**
| Script column | Stream column |
|-----------------|-----------------|
| JSON_FITBIT_COLUMN | fitbit_data |
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/parse_heartrate_summary_json.py
```
!!! note
All columns except `DEVICE_ID` are parsed from `JSON_FITBIT_COLUMN`. `JSON_FITBIT_COLUMN` is a string column containing the JSON objects returned by Fitbit's API. See an example of the raw data RAPIDS expects for this data stream:
??? example "Example of the raw data RAPIDS expects for this data stream"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-07","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":1200.6102,"max":88,"min":31,"minutes":1058,"name":"Out of Range"},{"caloriesOut":760.3020,"max":120,"min":86,"minutes":366,"name":"Fat Burn"},{"caloriesOut":15.2048,"max":146,"min":120,"minutes":2,"name":"Cardio"},{"caloriesOut":0,"max":221,"min":148,"minutes":0,"name":"Peak"}],"restingHeartRate":72}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":68},{"time":"00:01:00","value":67},{"time":"00:02:00","value":67},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-08","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":1100.1120,"max":89,"min":30,"minutes":921,"name":"Out of Range"},{"caloriesOut":660.0012,"max":118,"min":82,"minutes":361,"name":"Fat Burn"},{"caloriesOut":23.7088,"max":142,"min":108,"minutes":3,"name":"Cardio"},{"caloriesOut":0,"max":221,"min":148,"minutes":0,"name":"Peak"}],"restingHeartRate":70}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":77},{"time":"00:01:00","value":75},{"time":"00:02:00","value":73},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-09","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":750.3615,"max":77,"min":30,"minutes":851,"name":"Out of Range"},{"caloriesOut":734.1516,"max":107,"min":77,"minutes":550,"name":"Fat Burn"},{"caloriesOut":131.8579,"max":130,"min":107,"minutes":29,"name":"Cardio"},{"caloriesOut":0,"max":220,"min":130,"minutes":0,"name":"Peak"}],"restingHeartRate":69}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":90},{"time":"00:01:00","value":89},{"time":"00:02:00","value":88},...],"datasetInterval":1,"datasetType":"minute"}}
??? info "FITBIT_HEARTRATE_INTRADAY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| LOCAL_DATE_TIME | FLAG_TO_MUTATE |
| DEVICE_ID | device_id |
| HEARTRATE | FLAG_TO_MUTATE |
| HEARTRATE_ZONE | FLAG_TO_MUTATE |
**MUTATION**
- **COLUMN_MAPPINGS**
| Script column | Stream column |
|-----------------|-----------------|
| JSON_FITBIT_COLUMN | fitbit_data |
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/parse_heartrate_intraday_json.py
```
!!! note
All columns except `DEVICE_ID` are parsed from `JSON_FITBIT_COLUMN`. `JSON_FITBIT_COLUMN` is a string column containing the JSON objects returned by Fitbit's API. See an example of the raw data RAPIDS expects for this data stream:
??? example "Example of the raw data RAPIDS expects for this data stream"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-07","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":1200.6102,"max":88,"min":31,"minutes":1058,"name":"Out of Range"},{"caloriesOut":760.3020,"max":120,"min":86,"minutes":366,"name":"Fat Burn"},{"caloriesOut":15.2048,"max":146,"min":120,"minutes":2,"name":"Cardio"},{"caloriesOut":0,"max":221,"min":148,"minutes":0,"name":"Peak"}],"restingHeartRate":72}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":68},{"time":"00:01:00","value":67},{"time":"00:02:00","value":67},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-08","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":1100.1120,"max":89,"min":30,"minutes":921,"name":"Out of Range"},{"caloriesOut":660.0012,"max":118,"min":82,"minutes":361,"name":"Fat Burn"},{"caloriesOut":23.7088,"max":142,"min":108,"minutes":3,"name":"Cardio"},{"caloriesOut":0,"max":221,"min":148,"minutes":0,"name":"Peak"}],"restingHeartRate":70}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":77},{"time":"00:01:00","value":75},{"time":"00:02:00","value":73},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-heart":[{"dateTime":"2020-10-09","value":{"customHeartRateZones":[],"heartRateZones":[{"caloriesOut":750.3615,"max":77,"min":30,"minutes":851,"name":"Out of Range"},{"caloriesOut":734.1516,"max":107,"min":77,"minutes":550,"name":"Fat Burn"},{"caloriesOut":131.8579,"max":130,"min":107,"minutes":29,"name":"Cardio"},{"caloriesOut":0,"max":220,"min":130,"minutes":0,"name":"Peak"}],"restingHeartRate":69}}],"activities-heart-intraday":{"dataset":[{"time":"00:00:00","value":90},{"time":"00:01:00","value":89},{"time":"00:02:00","value":88},...],"datasetInterval":1,"datasetType":"minute"}}
??? info "FITBIT_SLEEP_SUMMARY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP | FLAG_TO_MUTATE |
| LOCAL_DATE_TIME | FLAG_TO_MUTATE |
| LOCAL_START_DATE_TIME | FLAG_TO_MUTATE |
| LOCAL_END_DATE_TIME | FLAG_TO_MUTATE |
| DEVICE_ID | device_id |
| EFFICIENCY | FLAG_TO_MUTATE |
| MINUTES_AFTER_WAKEUP | FLAG_TO_MUTATE |
| MINUTES_ASLEEP | FLAG_TO_MUTATE |
| MINUTES_AWAKE | FLAG_TO_MUTATE |
| MINUTES_TO_FALL_ASLEEP | FLAG_TO_MUTATE |
| MINUTES_IN_BED | FLAG_TO_MUTATE |
| IS_MAIN_SLEEP | FLAG_TO_MUTATE |
| TYPE | FLAG_TO_MUTATE |
**MUTATION**
- **COLUMN_MAPPINGS**
| Script column | Stream column |
|-----------------|-----------------|
| JSON_FITBIT_COLUMN | fitbit_data |
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/parse_sleep_summary_json.py
```
!!! note
The Fitbit API has two versions of sleep data, v1 and v1.2. We support both but ignore v1's `count_awake`, `duration_awake`, `count_awakenings`, `count_restless`, and `duration_restless` columns.
All columns except `DEVICE_ID` are parsed from `JSON_FITBIT_COLUMN`. `JSON_FITBIT_COLUMN` is a string column containing the JSON objects returned by Fitbit's API. See an example of the raw data RAPIDS expects for this data stream:
??? example "Example of the expected raw data"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-10","duration":3600000,"efficiency":92,"endTime":"2020-10-10T16:37:00.000","infoCode":2,"isMainSleep":false,"levels":{"data":[{"dateTime":"2020-10-10T15:36:30.000","level":"restless","seconds":60},{"dateTime":"2020-10-10T15:37:30.000","level":"asleep","seconds":660},{"dateTime":"2020-10-10T15:48:30.000","level":"restless","seconds":60},...], "summary":{"asleep":{"count":0,"minutes":56},"awake":{"count":0,"minutes":0},"restless":{"count":3,"minutes":4}}},"logId":26315914306,"minutesAfterWakeup":0,"minutesAsleep":55,"minutesAwake":5,"minutesToFallAsleep":0,"startTime":"2020-10-10T15:36:30.000","timeInBed":60,"type":"classic"},{"dateOfSleep":"2020-10-10","duration":22980000,"efficiency":88,"endTime":"2020-10-10T08:10:00.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-10T01:46:30.000","level":"light","seconds":420},{"dateTime":"2020-10-10T01:53:30.000","level":"deep","seconds":1230},{"dateTime":"2020-10-10T02:14:00.000","level":"light","seconds":360},...], "summary":{"deep":{"count":3,"minutes":92,"thirtyDayAvgMinutes":0},"light":{"count":29,"minutes":193,"thirtyDayAvgMinutes":0},"rem":{"count":4,"minutes":33,"thirtyDayAvgMinutes":0},"wake":{"count":28,"minutes":65,"thirtyDayAvgMinutes":0}}},"logId":26311786557,"minutesAfterWakeup":0,"minutesAsleep":318,"minutesAwake":65,"minutesToFallAsleep":0,"startTime":"2020-10-10T01:46:30.000","timeInBed":383,"type":"stages"}],"summary":{"stages":{"deep":92,"light":193,"rem":33,"wake":65},"totalMinutesAsleep":373,"totalSleepRecords":2,"totalTimeInBed":443}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-11","duration":41640000,"efficiency":89,"endTime":"2020-10-11T11:47:00.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-11T00:12:30.000","level":"wake","seconds":450},{"dateTime":"2020-10-11T00:20:00.000","level":"light","seconds":870},{"dateTime":"2020-10-11T00:34:30.000","level":"wake","seconds":780},...], "summary":{"deep":{"count":4,"minutes":52,"thirtyDayAvgMinutes":62},"light":{"count":32,"minutes":442,"thirtyDayAvgMinutes":364},"rem":{"count":6,"minutes":68,"thirtyDayAvgMinutes":58},"wake":{"count":29,"minutes":132,"thirtyDayAvgMinutes":94}}},"logId":26589710670,"minutesAfterWakeup":1,"minutesAsleep":562,"minutesAwake":132,"minutesToFallAsleep":0,"startTime":"2020-10-11T00:12:30.000","timeInBed":694,"type":"stages"}],"summary":{"stages":{"deep":52,"light":442,"rem":68,"wake":132},"totalMinutesAsleep":562,"totalSleepRecords":1,"totalTimeInBed":694}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-12","duration":28980000,"efficiency":93,"endTime":"2020-10-12T09:34:30.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-12T01:31:00.000","level":"wake","seconds":600},{"dateTime":"2020-10-12T01:41:00.000","level":"light","seconds":60},{"dateTime":"2020-10-12T01:42:00.000","level":"deep","seconds":2340},...], "summary":{"deep":{"count":4,"minutes":63,"thirtyDayAvgMinutes":59},"light":{"count":27,"minutes":257,"thirtyDayAvgMinutes":364},"rem":{"count":5,"minutes":94,"thirtyDayAvgMinutes":58},"wake":{"count":24,"minutes":69,"thirtyDayAvgMinutes":95}}},"logId":26589710673,"minutesAfterWakeup":0,"minutesAsleep":415,"minutesAwake":68,"minutesToFallAsleep":0,"startTime":"2020-10-12T01:31:00.000","timeInBed":483,"type":"stages"}],"summary":{"stages":{"deep":63,"light":257,"rem":94,"wake":69},"totalMinutesAsleep":415,"totalSleepRecords":1,"totalTimeInBed":483}}
??? info "FITBIT_SLEEP_INTRADAY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP | FLAG_TO_MUTATE |
| LOCAL_DATE_TIME | FLAG_TO_MUTATE |
| DEVICE_ID | device_id |
| TYPE_EPISODE_ID | FLAG_TO_MUTATE |
| DURATION | FLAG_TO_MUTATE |
| IS_MAIN_SLEEP | FLAG_TO_MUTATE |
| TYPE | FLAG_TO_MUTATE |
| LEVEL | FLAG_TO_MUTATE |
**MUTATION**
- **COLUMN_MAPPINGS**
| Script column | Stream column |
|-----------------|-----------------|
| JSON_FITBIT_COLUMN | fitbit_data |
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/parse_sleep_intraday_json.py
```
!!! note
The Fitbit API has two versions of sleep data, v1 and v1.2; we support both.
All columns except `DEVICE_ID` are parsed from `JSON_FITBIT_COLUMN`. `JSON_FITBIT_COLUMN` is a string column containing the JSON objects returned by Fitbit's API. See an example of the raw data RAPIDS expects for this data stream:
??? example "Example of the expected raw data"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-10","duration":3600000,"efficiency":92,"endTime":"2020-10-10T16:37:00.000","infoCode":2,"isMainSleep":false,"levels":{"data":[{"dateTime":"2020-10-10T15:36:30.000","level":"restless","seconds":60},{"dateTime":"2020-10-10T15:37:30.000","level":"asleep","seconds":660},{"dateTime":"2020-10-10T15:48:30.000","level":"restless","seconds":60},...], "summary":{"asleep":{"count":0,"minutes":56},"awake":{"count":0,"minutes":0},"restless":{"count":3,"minutes":4}}},"logId":26315914306,"minutesAfterWakeup":0,"minutesAsleep":55,"minutesAwake":5,"minutesToFallAsleep":0,"startTime":"2020-10-10T15:36:30.000","timeInBed":60,"type":"classic"},{"dateOfSleep":"2020-10-10","duration":22980000,"efficiency":88,"endTime":"2020-10-10T08:10:00.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-10T01:46:30.000","level":"light","seconds":420},{"dateTime":"2020-10-10T01:53:30.000","level":"deep","seconds":1230},{"dateTime":"2020-10-10T02:14:00.000","level":"light","seconds":360},...], "summary":{"deep":{"count":3,"minutes":92,"thirtyDayAvgMinutes":0},"light":{"count":29,"minutes":193,"thirtyDayAvgMinutes":0},"rem":{"count":4,"minutes":33,"thirtyDayAvgMinutes":0},"wake":{"count":28,"minutes":65,"thirtyDayAvgMinutes":0}}},"logId":26311786557,"minutesAfterWakeup":0,"minutesAsleep":318,"minutesAwake":65,"minutesToFallAsleep":0,"startTime":"2020-10-10T01:46:30.000","timeInBed":383,"type":"stages"}],"summary":{"stages":{"deep":92,"light":193,"rem":33,"wake":65},"totalMinutesAsleep":373,"totalSleepRecords":2,"totalTimeInBed":443}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-11","duration":41640000,"efficiency":89,"endTime":"2020-10-11T11:47:00.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-11T00:12:30.000","level":"wake","seconds":450},{"dateTime":"2020-10-11T00:20:00.000","level":"light","seconds":870},{"dateTime":"2020-10-11T00:34:30.000","level":"wake","seconds":780},...], "summary":{"deep":{"count":4,"minutes":52,"thirtyDayAvgMinutes":62},"light":{"count":32,"minutes":442,"thirtyDayAvgMinutes":364},"rem":{"count":6,"minutes":68,"thirtyDayAvgMinutes":58},"wake":{"count":29,"minutes":132,"thirtyDayAvgMinutes":94}}},"logId":26589710670,"minutesAfterWakeup":1,"minutesAsleep":562,"minutesAwake":132,"minutesToFallAsleep":0,"startTime":"2020-10-11T00:12:30.000","timeInBed":694,"type":"stages"}],"summary":{"stages":{"deep":52,"light":442,"rem":68,"wake":132},"totalMinutesAsleep":562,"totalSleepRecords":1,"totalTimeInBed":694}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"sleep":[{"dateOfSleep":"2020-10-12","duration":28980000,"efficiency":93,"endTime":"2020-10-12T09:34:30.000","infoCode":0,"isMainSleep":true,"levels":{"data":[{"dateTime":"2020-10-12T01:31:00.000","level":"wake","seconds":600},{"dateTime":"2020-10-12T01:41:00.000","level":"light","seconds":60},{"dateTime":"2020-10-12T01:42:00.000","level":"deep","seconds":2340},...], "summary":{"deep":{"count":4,"minutes":63,"thirtyDayAvgMinutes":59},"light":{"count":27,"minutes":257,"thirtyDayAvgMinutes":364},"rem":{"count":5,"minutes":94,"thirtyDayAvgMinutes":58},"wake":{"count":24,"minutes":69,"thirtyDayAvgMinutes":95}}},"logId":26589710673,"minutesAfterWakeup":0,"minutesAsleep":415,"minutesAwake":68,"minutesToFallAsleep":0,"startTime":"2020-10-12T01:31:00.000","timeInBed":483,"type":"stages"}],"summary":{"stages":{"deep":63,"light":257,"rem":94,"wake":69},"totalMinutesAsleep":415,"totalSleepRecords":1,"totalTimeInBed":483}}
??? info "FITBIT_STEPS_SUMMARY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP | FLAG_TO_MUTATE |
| DEVICE_ID | device_id |
| LOCAL_DATE_TIME | FLAG_TO_MUTATE |
| STEPS | FLAG_TO_MUTATE |
**MUTATION**
- **COLUMN_MAPPINGS**
| Script column | Stream column |
|-----------------|-----------------|
| JSON_FITBIT_COLUMN | fitbit_data |
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/parse_steps_summary_json.py
```
!!! note
`TIMESTAMP`, `LOCAL_DATE_TIME`, and `STEPS` are parsed from `JSON_FITBIT_COLUMN`. `JSON_FITBIT_COLUMN` is a string column containing the JSON objects returned by Fitbit's API. See an example of the raw data RAPIDS expects for this data stream:
??? example "Example of the expected raw data"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-steps":[{"dateTime":"2020-10-07","value":"1775"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":5},{"time":"00:01:00","value":3},{"time":"00:02:00","value":0},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-steps":[{"dateTime":"2020-10-08","value":"3201"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":14},{"time":"00:01:00","value":11},{"time":"00:02:00","value":10},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-steps":[{"dateTime":"2020-10-09","value":"998"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":0},{"time":"00:01:00","value":0},{"time":"00:02:00","value":0},...],"datasetInterval":1,"datasetType":"minute"}}
??? info "FITBIT_STEPS_INTRADAY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP | FLAG_TO_MUTATE |
| DEVICE_ID | device_id |
| LOCAL_DATE_TIME | FLAG_TO_MUTATE |
| STEPS | FLAG_TO_MUTATE |
**MUTATION**
- **COLUMN_MAPPINGS**
| Script column | Stream column |
|-----------------|-----------------|
| JSON_FITBIT_COLUMN | fitbit_data |
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/parse_steps_intraday_json.py
```
!!! note
`TIMESTAMP`, `LOCAL_DATE_TIME`, and `STEPS` are parsed from `JSON_FITBIT_COLUMN`. `JSON_FITBIT_COLUMN` is a string column containing the JSON objects returned by [Fitbit's API](https://dev.fitbit.com/build/reference/web-api/activity/#get-activity-intraday-time-series). See an example of the raw data RAPIDS expects for this data stream below; a minimal parsing sketch follows it:
??? example "Example of the expected raw data"
|device_id |fitbit_data |
|---------------------------------------- |--------------------------------------------------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-steps":[{"dateTime":"2020-10-07","value":"1775"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":5},{"time":"00:01:00","value":3},{"time":"00:02:00","value":0},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-steps":[{"dateTime":"2020-10-08","value":"3201"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":14},{"time":"00:01:00","value":11},{"time":"00:02:00","value":10},...],"datasetInterval":1,"datasetType":"minute"}}
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |{"activities-steps":[{"dateTime":"2020-10-09","value":"998"}],"activities-steps-intraday":{"dataset":[{"time":"00:00:00","value":0},{"time":"00:01:00","value":0},{"time":"00:02:00","value":0},...],"datasetInterval":1,"datasetType":"minute"}}
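For illustration, the sketch below shows one way the intraday step counts could be flattened into per-minute rows; this is an assumption about the approach, not the actual code of `parse_steps_intraday_json.py`:
```python
import json
import pandas as pd

def parse_steps_intraday(raw: pd.DataFrame) -> pd.DataFrame:
    # raw has two columns: device_id and fitbit_data (one JSON string per row)
    rows = []
    for record in raw.itertuples(index=False):
        data = json.loads(record.fitbit_data)
        date = data["activities-steps"][0]["dateTime"]
        for entry in data["activities-steps-intraday"]["dataset"]:
            rows.append({
                "device_id": record.device_id,
                "local_date_time": date + " " + entry["time"],
                "steps": entry["value"],
            })
    return pd.DataFrame(rows)
```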

View File

@ -0,0 +1,233 @@
The `format.yaml` maps and transforms columns in your raw data stream to the [mandatory columns RAPIDS needs for Fitbit sensors](../mandatory-fitbit-format). This file is at:
```bash
src/data/streams/fitbitparsed_mysql/format.yaml
```
If you want to use this stream with your data, modify every sensor in `format.yaml` to map all columns except `TIMESTAMP` in `[RAPIDS_COLUMN_MAPPINGS]` to your raw data column names.
All columns are mandatory; however, all except `device_id` and `local_date_time` can be empty if you don't have that data. Just keep in mind that some features will be empty if some of these columns are empty.
??? info "FITBIT_HEARTRATE_SUMMARY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP| FLAG_TO_MUTATE |
| LOCAL_DATE_TIME | local_date_time |
| DEVICE_ID | device_id |
| HEARTRATE_DAILY_RESTINGHR | heartrate_daily_restinghr |
| HEARTRATE_DAILY_CALORIESOUTOFRANGE | heartrate_daily_caloriesoutofrange |
| HEARTRATE_DAILY_CALORIESFATBURN | heartrate_daily_caloriesfatburn |
| HEARTRATE_DAILY_CALORIESCARDIO | heartrate_daily_caloriescardio |
| HEARTRATE_DAILY_CALORIESPEAK | heartrate_daily_caloriespeak |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/add_zero_timestamp.py
```
!!! note
`add_zero_timestamp` adds an all-zero column called `timestamp` that is filled in later in the pipeline by `readable_time.R`, which converts `LOCAL_DATE_TIME` to a Unix timestamp taking single or multiple time zones into account (a minimal sketch follows the example below).
??? example "Example of the raw data RAPIDS expects for this data stream"
|device_id |local_date_time |heartrate_daily_restinghr |heartrate_daily_caloriesoutofrange |heartrate_daily_caloriesfatburn |heartrate_daily_caloriescardio |heartrate_daily_caloriespeak |
|-------------------------------------- |----------------- |------- |-------------- |------------- |------------ |-------|
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 |72 |1200.6102 |760.3020 |15.2048 |0 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-08 |70 |1100.1120 |660.0012 |23.7088 |0 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-09 |69 |750.3615 |734.1516 |131.8579 |0 |
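As a minimal sketch of what this mutation does (the real script may differ), the placeholder column can be added with a couple of lines of pandas:
```python
import pandas as pd

def add_zero_timestamp(data: pd.DataFrame) -> pd.DataFrame:
    # Placeholder column; readable_time.R later replaces it with real Unix
    # timestamps derived from local_date_time and the study's time zone(s).
    data["timestamp"] = 0
    return data
```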
??? info "FITBIT_HEARTRATE_INTRADAY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP| FLAG_TO_MUTATE |
| LOCAL_DATE_TIME | local_date_time |
| DEVICE_ID | device_id |
| HEARTRATE | heartrate |
| HEARTRATE_ZONE | heartrate_zone |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/add_zero_timestamp.py
```
!!! note
`add_zero_timestamp` adds an all-zero column called `timestamp` that is filled in later in the pipeline by `readable_time.R`, which converts `LOCAL_DATE_TIME` to a Unix timestamp taking single or multiple time zones into account.
??? example "Example of the raw data RAPIDS expects for this data stream"
|device_id |local_date_time |heartrate |heartrate_zone |
|-------------------------------------- |---------------------- |--------- |--------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:00:00 |68 |outofrange |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:01:00 |67 |outofrange |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:02:00 |67 |outofrange |
??? info "FITBIT_SLEEP_SUMMARY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP| FLAG_TO_MUTATE |
| LOCAL_DATE_TIME| local_date_time |
| LOCAL_START_DATE_TIME| local_start_date_time |
| LOCAL_END_DATE_TIME| local_end_date_time |
| DEVICE_ID| device_id |
| EFFICIENCY| efficiency |
| MINUTES_AFTER_WAKEUP| minutes_after_wakeup |
| MINUTES_ASLEEP| minutes_asleep |
| MINUTES_AWAKE| minutes_awake |
| MINUTES_TO_FALL_ASLEEP| minutes_to_fall_asleep |
| MINUTES_IN_BED| minutes_in_bed |
| IS_MAIN_SLEEP| is_main_sleep |
| TYPE| type |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/add_zero_timestamp.py
```
!!! note
`add_zero_timestamp` adds an all-zero column called `timestamp` that is filled in later in the pipeline by `readable_time.R`, which converts `LOCAL_DATE_TIME` to a Unix timestamp taking single or multiple time zones into account.
The Fitbit API has two versions of sleep data, v1 and v1.2. We support both but ignore v1's `count_awake`, `duration_awake`, `count_awakenings`, `count_restless`, and `duration_restless` columns.
??? example "Example of the expected raw data"
|device_id |local_start_date_time |local_end_date_time |efficiency |minutes_after_wakeup |minutes_asleep |minutes_awake |minutes_to_fall_asleep |minutes_in_bed |is_main_sleep |type |
|-------------------------------------- |---------------------- |---------------------- |----------- |--------------------- |--------------- |-------------- |----------------------- |--------------- |-------------- |-------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-10 15:36:30 |2020-10-10 16:37:00 |92 |0 |55 |5 |0 |60 |0 |classic |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-10 01:46:30 |2020-10-10 08:10:00 |88 |0 |318 |65 |0 |383 |1 |stages |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-11 00:12:30 |2020-10-11 11:47:00 |89 |1 |562 |132 |0 |694 |1 |stages |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-12 01:31:00 |2020-10-12 09:34:30 |93 |0 |415 |68 |0 |483 |1 |stages |
??? info "FITBIT_SLEEP_INTRADAY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP | FLAG_TO_MUTATE |
| LOCAL_DATE_TIME | local_date_time |
| DEVICE_ID | device_id |
| TYPE_EPISODE_ID | type_episode_id |
| DURATION | duration |
| IS_MAIN_SLEEP | is_main_sleep |
| TYPE | type |
| LEVEL | level |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/add_zero_timestamp.py
```
!!! note
`add_zero_timestamp` adds an all-zero column called `timestamp` that is filled in later in the pipeline by `readable_time.R`, which converts `LOCAL_DATE_TIME` to a Unix timestamp taking single or multiple time zones into account.
The Fitbit API has two versions of sleep data, v1 and v1.2; we support both.
??? example "Example of the expected raw data"
|device_id |type_episode_id |local_date_time |duration |level |is_main_sleep |type |
|------------------------------------ |---------------- |------------------- |--------- |---------- |-------------- |-------------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |0 |2020-10-10 15:36:30 |60 |restless |0 |classic |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |0 |2020-10-10 15:37:30 |660 |asleep |0 |classic |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |0 |2020-10-10 15:48:30 |60 |restless |0 |classic |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |... |... |... |... |... |... |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |1 |2020-10-10 01:46:30 |420 |light |1 |stages |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |1 |2020-10-10 01:53:30 |1230 |deep |1 |stages |
??? info "FITBIT_STEPS_SUMMARY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP | FLAG_TO_MUTATE |
| DEVICE_ID | device_id |
| LOCAL_DATE_TIME | local_date_time |
| STEPS | steps |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/add_zero_timestamp.py
```
!!! note
`add_zero_timestamp` adds an all-zero column called `timestamp` that is filled in later in the pipeline by `readable_time.R`, which converts `LOCAL_DATE_TIME` to a Unix timestamp taking single or multiple time zones into account.
??? example "Example of the expected raw data"
|device_id |local_date_time |steps |
|-------------------------------------- |---------------------- |--------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 |1775 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-08 |3201 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-09 |998 |
??? info "FITBIT_STEPS_INTRADAY"
**RAPIDS_COLUMN_MAPPINGS**
| RAPIDS column | Stream column |
|-----------------|-----------------|
| TIMESTAMP | FLAG_TO_MUTATE |
| DEVICE_ID | device_id |
| LOCAL_DATE_TIME | local_date_time |
| STEPS | steps |
**MUTATION**
- **COLUMN_MAPPINGS** (None)
- **SCRIPTS**
```bash
src/data/streams/mutations/fitbit/add_zero_timestamp.py
```
!!! note
`add_zero_timestamp` adds an all-zero column called `timestamp` that is filled in later in the pipeline by `readable_time.R`, which converts `LOCAL_DATE_TIME` to a Unix timestamp taking single or multiple time zones into account.
??? example "Example of the expected raw data"
|device_id |local_date_time |steps |
|-------------------------------------- |---------------------- |--------- |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:00:00 |5 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:01:00 |3 |
|a748ee1a-1d0b-4ae9-9074-279a2b6ba524 |2020-10-07 00:02:00 |0 |

View File

@ -78,6 +78,16 @@ If you are interested in contributing feel free to submit a pull request or cont
??? abstract "About"
- [Personal Website](https://runsdata.org/)
### Stephen Price
??? abstract "About"
Carnegie Mellon University
### Neil Singh
??? abstract "About"
University of Virginia
## Advisors
### Afsaneh Doryab

View File

@ -27,7 +27,7 @@ Our example is based on a hypothetical study that recruited 2 participants that
The goal of this workflow is to find out if we can predict the daily symptom burden score of a participant. Thus, we framed this question as a binary classification problem with two classes, high and low symptom burden, based on scores above and below each participant's average. We also want to compare the performance of individual (personalized) models against a population model.
In total, our example workflow has nine steps that are in charge of sensor data preprocessing, feature extraction, feature cleaning, machine learning model training and model evaluation (see figure below). We ship this workflow with RAPIDS and share a database with [test data](https://osf.io/skqfv/files/) in an Open Science Framework repository.
In total, our example workflow has nine steps that are in charge of sensor data preprocessing, feature extraction, feature cleaning, machine learning model training and model evaluation (see figure below). We ship this workflow with RAPIDS and share files with [test data](https://osf.io/wbg23/) in an Open Science Framework repository.
<figure>
<img src="../../img/analysis_workflow.png" max-width="100%" />
@ -37,33 +37,31 @@ In total, our example workflow has nine steps that are in charge of sensor data
## Configure and run the analysis workflow example
1. [Install](../../setup/installation) RAPIDS
2. Configure the [user credentials](../../setup/configuration/#database-credentials) of a local or remote MySQL server with writing permissions in your `.env` file. The config file where you need to modify the `DATABASE_GROUP` is at `example_profile/example_config.yaml`.
3. *Skip this step if you are using RAPIDS docker container*. Unzip the [test database](https://osf.io/skqfv/files/) to `data/external/rapids_example.sql` and run:
```bash
./rapids -j1 restore_sql_file --profile example_profile
```
4. Create the participant files for this example by running:
2. Unzip the CSV files inside [rapids_example_csv.zip](https://osf.io/wbg23/) into `data/external/example_workflow/` so they match `data/external/example_workflow/*.csv`.
3. Create the participant files for this example by running:
```bash
./rapids -j1 create_example_participant_files
```
5. Run the example pipeline with:
4. Run the example pipeline with:
```bash
./rapids -j1 --profile example_profile
```
Note that you will see a lot of warning messages; you can ignore them since they appear because we run ML algorithms with a small fake dataset.
## Modules of our analysis workflow example
??? info "1. Feature extraction"
We extract daily behavioral features for data yield, received and sent messages, missed, incoming, and outgoing calls, resampled fused location data using the Doryab provider, activity recognition, battery, Bluetooth, screen, light, applications foreground, conversations, Wi-Fi connected, Wi-Fi visible, Fitbit heart rate summary and intraday data, Fitbit sleep summary data, and Fitbit step summary and intraday data (without excluding sleep periods and with an active bout threshold of 10 steps). In total, we obtained 237 daily sensor features over 12 days per participant.
??? info "2. Extract demographic data."
It is common to have demographic data in addition to mobile and target (ground truth) data. In this example we include participants age, gender and the number of days they spent in hospital after their surgery as features in our model. We extract these three columns from the participant_info table of our test database . As these three features remain the same within participants, they are used only on the population model. Refer to the `demographic_features` rule in `rules/models.smk`.
It is common to have demographic data in addition to mobile and target (ground truth) data. In this example we include participants' age, gender, and the number of days they spent in the hospital after their surgery as features in our model. We extract these three columns from the `data/external/example_workflow/participant_info.csv` file. As these three features remain the same within participants, they are used only in the population model. Refer to the `demographic_features` rule in `rules/models.smk`.
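As an illustration only, these columns could be read with pandas as below; the column names (`pid`, `age`, `gender`, `inpatientdays`) are assumptions and may not match the actual file:
```python
import pandas as pd

# Hypothetical column names; check participant_info.csv for the real schema.
participant_info = pd.read_csv("data/external/example_workflow/participant_info.csv")
demographic_features = participant_info[["pid", "age", "gender", "inpatientdays"]].set_index("pid")
```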
??? info "3. Create target labels."
The two classes for our machine learning binary classification problem are high and low symptom burden. Target values are already stored in the `participant_target` table of our test database and transferred to a CSV file. A new rule/script can be created if further manipulation is necessary. Refer to the `parse_targets` rule in `rules/models.smk`.
The two classes for our machine learning binary classification problem are high and low symptom burden. Target values are already stored in the `data/external/example_workflow/participant_target.csv` file. A new rule/script can be created if further manipulation is necessary. Refer to the `parse_targets` rule in `rules/models.smk`.
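To make the two classes concrete, a sketch like the following would label each day as high (1) or low (0) burden relative to that participant's own average score; the column names `pid` and `symptom_burden` are assumptions:
```python
import pandas as pd

targets = pd.read_csv("data/external/example_workflow/participant_target.csv")
# 1 = high burden (above the participant's mean score), 0 = low burden
targets["target"] = (
    targets.groupby("pid")["symptom_burden"]
    .transform(lambda scores: (scores > scores.mean()).astype(int))
)
```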
??? info "4. Feature merging."
These daily features are stored on a CSV file per sensor, a CSV file per participant, and a CSV file including all features from all participants (in every case each column represents a feature and each row represents a day). Refer to the `merge_sensor_features_for_individual_participants` and `merge_features_for_population_model` rules in `rules/features.smk`.
These daily features are stored in a CSV file per sensor, a CSV file per participant, and a CSV file including all features from all participants (in every case, each column represents a feature and each row represents a day). Refer to the `merge_sensor_features_for_individual_participants` and `merge_sensor_features_for_all_participants` rules in `rules/features.smk`.
??? info "5. Data visualization."
At this point the user can use the five plots RAPIDS provides (or implement new ones) to explore and understand the quality of the raw data and extracted features and decide what sensors, days, or participants to include and exclude. Refer to `rules/reports.smk` to find the rules that generate these plots.
@ -71,7 +69,7 @@ In total, our example workflow has nine steps that are in charge of sensor data
??? info "6. Feature cleaning."
In this stage we perform four steps to clean our sensor feature file. First, we discard days with a data yield hour ratio less than or equal to 0.75, i.e. we include days with at least 18 hours of data. Second, we drop columns (features) with more than 30% of missing rows. Third, we drop columns with zero variance. Fourth, we drop rows (days) with more than 30% of missing columns (features). In this cleaning stage several parameters are created and exposed in `example_profile/example_config.yaml`.
After this step, we kept 162 features over 11 days for the individual model of p01, 107 features over 12 days for the individual model of p02 and 101 features over 20 days for the population model. Note that the difference in the number of features between p01 and p02 is mostly due to iOS restrictions that stops researchers from collecting the same number of sensors than in Android phones.
After this step, we kept 161 features over 11 days for the individual model of p01, 101 features over 12 days for the individual model of p02, and 107 features over 20 days for the population model. Note that the difference in the number of features between p01 and p02 is mostly due to iOS restrictions that stop researchers from collecting the same number of sensors as on Android phones.
Feature cleaning for the individual models is done in the `clean_sensor_features_for_individual_participants` rule and for the population model in the `clean_sensor_features_for_all_participants` rule in `rules/models.smk`.
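The four cleaning steps above can be summarized with a pandas sketch; the thresholds mirror the description, but the data yield column name is an assumption about RAPIDS' feature naming:
```python
import pandas as pd

features = pd.read_csv("data/processed/features/all_participants/all_sensor_features.csv")

# 1. Keep days with a data yield ratio above 0.75 (i.e. at least 18 hours of data).
features = features[features["phone_data_yield_rapids_ratiovalidyieldedhours"] > 0.75]
# 2. Drop features (columns) with more than 30% missing rows.
features = features.loc[:, features.isna().mean() <= 0.30]
# 3. Drop features with zero variance (a single unique value).
features = features.loc[:, features.nunique(dropna=True) > 1]
# 4. Drop days (rows) with more than 30% missing features.
features = features[features.isna().mean(axis=1) <= 0.30]
```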

View File

@ -1,27 +1,117 @@
Minimal Working Example
=======================
This is a quick guide for creating and running a simple pipeline to extract missing, outgoing, and incoming `call` features for `daily` (`00:00:00` to `23:59:59`) and `night` (`00:00:00` to `05:59:59`) epochs of every day of data of one participant monitored on the US East coast with an Android smartphone.
!!! hint
If you don't have `call` data that you can use to try this example you can restore this [CSV file](../img/calls.csv) as a table in a MySQL database.
This is a quick guide for creating and running a simple pipeline to extract missing, outgoing, and incoming `call` features for `24 hr` (`00:00:00` to `23:59:59`) and `night` (`00:00:00` to `05:59:59`) time segments for every day of data from one participant who was monitored on the US East Coast with an Android smartphone.
1. Install RAPIDS and make sure your `conda` environment is active (see [Installation](../../setup/installation))
3. Download this [CSV file](../img/calls.csv) and save it as `data/external/aware_csv/calls.csv`
2. Make the changes listed below for the corresponding [Configuration](../../setup/configuration) step (we provide an example of what the relevant sections in your `config.yml` will look like after you are done)
??? info "Required configuration changes"
1. **Add your [database credentials](../../setup/configuration#database-credentials).**
??? info "Required configuration changes (*click to expand*)"
1. **Supported [data streams](../../setup/configuration#supported-data-streams).**
Setup your database connection credentials in `.env`, we assume your credentials group in the `.env` file is called `MY_GROUP`.
Based on the docs, we decided to use the `aware_csv` data stream because we are processing AWARE data saved in a CSV file. We will use this label in a later step; there's no need to type it or save it anywhere yet.
3. **Create your [participants file](../../setup/configuration#participant-files).**
Since we are processing data from a single participant, you only need to create a single participant file called `p01.yaml` in `data/external/participant_files`. This participant file only has a `PHONE` section because this hypothetical participant was only monitored with a smartphone. Note that for a real analysis, you can do this [automatically with a CSV file](../../setup/configuration#automatic-creation-of-participant-files).
1. Add `p01` to `[PIDS]` in `config.yaml`
1. Create a file in `data/external/participant_files/p01.yaml` with the following content:
```yaml
PHONE:
DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524] # the participant's AWARE device id
PLATFORMS: [android] # or ios
LABEL: MyTestP01 # any string
START_DATE: 2020-01-01 # this can also be empty
END_DATE: 2021-01-01 # this can also be empty
```
4. **Select what [time segments](../../setup/configuration#time-segments) you want to extract features on.**
1. Set `[TIME_SEGMENTS][FILE]` to `data/external/timesegments_periodic.csv`
1. Create a file in `data/external/timesegments_periodic.csv` with the following content
```csv
label,start_time,length,repeats_on,repeats_value
daily,00:00:00,23H 59M 59S,every_day,0
night,00:00:00,5H 59M 59S,every_day,0
```
2. **Choose the [timezone of your study](../../setup/configuration#timezone-of-your-study).**
Since this example is processing data collected on the US East cost, `America/New_York` should be the configured timezone, change this according to your data.
We will use the default time zone settings since this example is processing data collected on the US East Coast (`America/New_York`).
3. **Create your [participants files](../../setup/configuration#participant-files).**
```yaml
TIMEZONE:
TYPE: SINGLE
SINGLE:
TZCODE: America/New_York
```
Since we are processing data from a single participant, you only need to create a single participant file called `p01.yaml`. This participant file only has a `PHONE` section because this hypothetical participant was only monitored with an smartphone. You also need to add `p01` to `[PIDS]` in `config.yaml`. The following would be the content of your `p01.yaml` participant file:
5. **Modify your [device data stream configuration](../../setup/configuration#data-stream-configuration)**
1. Set `[PHONE_DATA_STREAMS][USE]` to `aware_csv`.
2. We will use the default value for `[PHONE_DATA_STREAMS][aware_csv][FOLDER]` since we already stored the test calls CSV file there.
6. **Select what [sensors and features](../../setup/configuration#sensor-and-features-to-process) you want to process.**
1. Set `[PHONE_CALLS][CONTAINER]` to `calls.csv` in the `config.yaml` file.
1. Set `[PHONE_CALLS][PROVIDERS][RAPIDS][COMPUTE]` to `True` in the `config.yaml` file.
!!! example "Example of the `config.yaml` sections after the changes outlined above"
This will be your `config.yaml` after following the instructions above. Click on the numbered markers to learn more.
``` { .yaml .annotate }
PIDS: [p01] # (1)
TIMEZONE:
TYPE: SINGLE # (2)
SINGLE:
TZCODE: America/New_York
# ... other irrelevant sections
TIME_SEGMENTS: &time_segments
TYPE: PERIODIC # (3)
FILE: "data/external/timesegments_periodic.csv" # (4)
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE
PHONE_DATA_STREAMS:
USE: aware_csv # (5)
aware_csv:
FOLDER: data/external/aware_csv # (6)
# ... other irrelevant sections
############## PHONE ###########################################################
################################################################################
# ... other irrelevant sections
# Communication call features config, TYPES and FEATURES keys need to match
PHONE_CALLS:
CONTAINER: calls.csv # (7)
PROVIDERS:
RAPIDS:
COMPUTE: True # (8)
CALL_TYPES: ...
```
1. We added `p01` to PIDS after creating the participant file:
```bash
data/external/participant_files/p01.yaml
```
With the following content:
```yaml
PHONE:
DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524] # the participant's AWARE device id
@ -31,67 +121,25 @@ This is a quick guide for creating and running a simple pipeline to extract miss
END_DATE: 2021-01-01 # this can also be empty
```
4. **Select what [time segments](../../setup/configuration#time-segments) you want to extract features on.**
2. We use the default `SINGLE` time zone.
`[TIME_SEGMENTS][TYPE]` should be the default `PERIODIC`. Change `[TIME_SEGMENTS][FILE]` with the path (for example `data/external/timesegments_periodic.csv`) of a file containing the following lines:
```csv
label,start_time,length,repeats_on,repeats_value
daily,00:00:00,23H 59M 59S,every_day,0
night,00:00:00,5H 59M 59S,every_day,0
```
3. We use the default `PERIODIC` time segment `[TYPE]`.
5. **Modify your [device data source configuration](../../setup/configuration#device-data-source-configuration)**
4. We created this time segments file with these lines:
In this example we do not need to modify this section because we are using smartphone data collected with AWARE stored on a MySQL database.
```csv
label,start_time,length,repeats_on,repeats_value
daily,00:00:00,23H 59M 59S,every_day,0
night,00:00:00,5H 59M 59S,every_day,0
```
6. **Select what [sensors and features](../../setup/configuration#sensor-and-features-to-process) you want to process.**
5. We set `[USE]` to `aware_csv` to tell RAPIDS to process sensor data collected with the AWARE Framework and stored in CSV files.
Set `[PHONE_CALLS][PROVIDERS][RAPIDS][COMPUTE]` to `True` in the `config.yaml` file.
6. We used the default `[FOLDER]` for `aware_csv` since we already stored our test `calls.csv` file there.
7. We changed `[CONTAINER]` to `calls.csv` to process our test call data.
??? example "Example of the `config.yaml` sections after the changes outlined above"
Highlighted lines are related to the configuration steps above.
``` yaml hl_lines="1 4 7 12 13 38"
PIDS: [p01]
TIMEZONE: &timezone
America/New_York
DATABASE_GROUP: &database_group
MY_GROUP
# ... other irrelevant sections
TIME_SEGMENTS: &time_segments
TYPE: PERIODIC
FILE: "data/external/timesegments_periodic.csv" # make sure the three lines specified above are in the file
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE
# No need to change this if you collected AWARE data on a database and your credentials are grouped under `MY_GROUP` in `.env`
DEVICE_DATA:
PHONE:
SOURCE:
TYPE: DATABASE
DATABASE_GROUP: *database_group
DEVICE_ID_COLUMN: device_id # column name
TIMEZONE:
TYPE: SINGLE # SINGLE or MULTIPLE
VALUE: *timezone
############## PHONE ###########################################################
################################################################################
# ... other irrelevant sections
# Communication call features config, TYPES and FEATURES keys need to match
PHONE_CALLS:
TABLE: calls # change if your calls table has a different name
PROVIDERS:
RAPIDS:
COMPUTE: True # set this to True!
CALL_TYPES: ...
```
8. We flipped `[COMPUTE]` to `True` to extract call behavioral features using the `RAPIDS` feature provider.
3. Run RAPIDS
```bash
@ -99,7 +147,7 @@ This is a quick guide for creating and running a simple pipeline to extract miss
```
4. The call features for the daily and night time segments will be in
```
/data/processed/features/p01/phone_calls.csv
data/processed/features/all_participants/all_sensor_features.csv
```
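Once the pipeline finishes, a quick way to inspect the extracted call features is a short pandas snippet; the `phone_calls_rapids_` column prefix below is an assumption about RAPIDS' feature naming:
```python
import pandas as pd

features = pd.read_csv("data/processed/features/all_participants/all_sensor_features.csv")
call_columns = [column for column in features.columns if column.startswith("phone_calls_rapids_")]
print(features[call_columns].head())
```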

View File

@ -15,10 +15,19 @@ if len(config["PIDS"]) == 0:
for provider in config["PHONE_DATA_YIELD"]["PROVIDERS"].keys():
if config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]:
allowed_phone_sensors = get_phone_sensor_names()
if not (set(config["PHONE_DATA_YIELD"]["SENSORS"]) <= set(allowed_phone_sensors)):
raise ValueError('\nInvalid sensor(s) for PHONE_DATA_YIELD. config["PHONE_DATA_YIELD"]["SENSORS"] can have '
'one or more of the following phone sensors: {}.\nInstead you provided "{}".\n'
'Keep in mind that the sensors\' CONTAINER attribute must point to a valid database table or file'\
.format(', '.join(allowed_phone_sensors),
', '.join(set(config["PHONE_DATA_YIELD"]["SENSORS"]) - set(allowed_phone_sensors))))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_DATA_YIELD"]["SENSORS"])))
files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_data_yield_features/phone_data_yield_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_data_yield_features/phone_data_yield_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_data_yield.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -27,7 +36,7 @@ for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
if config["PHONE_MESSAGES"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_messages_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_messages_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_messages_features/phone_messages_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_MESSAGES"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_messages_features/phone_messages_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_MESSAGES"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_messages.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -36,8 +45,7 @@ for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime_unified.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -46,7 +54,7 @@ for provider in config["PHONE_BLUETOOTH"]["PROVIDERS"].keys():
if config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_bluetooth_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_bluetooth_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_bluetooth.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -55,11 +63,10 @@ for provider in config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"].keys():
if config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_with_datetime_unified.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_features/phone_activity_recognition_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_features/phone_activity_recognition_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_activity_recognition.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -70,20 +77,23 @@ for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_features/phone_battery_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BATTERY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_battery_features/phone_battery_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_BATTERY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_battery.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]:
# if "PHONE_SCREEN" in config["PHONE_DATA_YIELD"]["SENSORS"]:# not used for now because we took episodepersensedminutes out of the list of supported features
# files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"]))
# else:
# raise ValueError("Error: Add PHONE_SCREEN (and as many PHONE_SENSORS as you have in your database) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data)")
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime_unified.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_features/phone_screen_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_SCREEN"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_screen_features/phone_screen_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_SCREEN"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_screen.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -92,7 +102,7 @@ for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys():
if config["PHONE_LIGHT"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_light_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_light_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_light_features/phone_light_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LIGHT"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_light_features/phone_light_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_LIGHT"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_light.csv", pid=config["PIDS"],))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -101,7 +111,7 @@ for provider in config["PHONE_ACCELEROMETER"]["PROVIDERS"].keys():
if config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_accelerometer_features/phone_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_accelerometer_features/phone_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_accelerometer.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -111,7 +121,7 @@ for provider in config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_foreground_features/phone_applications_foreground_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_foreground_features/phone_applications_foreground_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_foreground.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -120,7 +130,7 @@ for provider in config["PHONE_WIFI_VISIBLE"]["PROVIDERS"].keys():
if config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_visible_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_visible_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_visible_features/phone_wifi_visible_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_visible_features/phone_wifi_visible_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_visible.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -129,7 +139,7 @@ for provider in config["PHONE_WIFI_CONNECTED"]["PROVIDERS"].keys():
if config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_connected_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_connected_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_connected_features/phone_wifi_connected_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_connected_features/phone_wifi_connected_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_connected.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -138,34 +148,84 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys():
if config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_with_datetime_unified.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_conversation_features/phone_conversation_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_conversation_features/phone_conversation_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_conversation.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
# We can delete these if statements as soon as we add feature PROVIDERS to any of these sensors (see the note after the PHONE_LOG block below on why they are needed)
if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):
for provider in config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"].keys():
if config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_with_datetime_with_categories.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_crashes_features/phone_applications_crashes_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_crashes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
if isinstance(config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"], dict):
for provider in config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"].keys():
if config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_with_datetime_with_categories.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_applications_notifications_features/phone_applications_notifications_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_notifications.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
if isinstance(config["PHONE_KEYBOARD"]["PROVIDERS"], dict):
for provider in config["PHONE_KEYBOARD"]["PROVIDERS"].keys():
if config["PHONE_KEYBOARD"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_keyboard_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_keyboard_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_keyboard_features/phone_keyboard_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_KEYBOARD"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_keyboard.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
if isinstance(config["PHONE_LOG"]["PROVIDERS"], dict):
for provider in config["PHONE_LOG"]["PROVIDERS"].keys():
if config["PHONE_LOG"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_log_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_log_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_log_features/phone_log_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_LOG"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_log.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
if config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]:
if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] == "FUSED_RESAMPLED":
if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] in ["FUSED_RESAMPLED","ALL_RESAMPLED"]:
if "PHONE_LOCATIONS" in config["PHONE_DATA_YIELD"]["SENSORS"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"]))
else:
raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data) which is used to resample fused location data (ALL_RESAMPLED and RESAMPLED_FUSED)")
files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["FITBIT_DATA_YIELD"]["PROVIDERS"].keys():
if config["FITBIT_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_data_yield.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"].keys():
if config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_summary_features/fitbit_heartrate_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_summary_features/fitbit_heartrate_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_summary.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -173,9 +233,8 @@ for provider in config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"].keys():
for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys():
if config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_intraday.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -183,19 +242,29 @@ for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys():
for provider in config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"].keys():
if config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_summary_features/fitbit_sleep_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_summary_features/fitbit_sleep_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_sleep_summary.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"].keys():
if config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_features/fitbit_sleep_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_sleep_intraday.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys():
if config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_summary.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -203,13 +272,78 @@ for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys():
for provider in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].keys():
if config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_intraday.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["EMPATICA_ACCELEROMETER"]["PROVIDERS"].keys():
if config["EMPATICA_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/empatica_accelerometer_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_accelerometer_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_accelerometer.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["EMPATICA_HEARTRATE"]["PROVIDERS"].keys():
if config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_heartrate.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["EMPATICA_TEMPERATURE"]["PROVIDERS"].keys():
if config["EMPATICA_TEMPERATURE"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/empatica_temperature_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_temperature_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_temperature_features/empatica_temperature_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_TEMPERATURE"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_temperature.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"].keys():
if config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/empatica_electrodermal_activity_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_electrodermal_activity_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_electrodermal_activity.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys():
if config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/empatica_blood_volume_pulse_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_blood_volume_pulse_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_blood_volume_pulse.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
if config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/empatica_inter_beat_interval_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_inter_beat_interval_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_inter_beat_interval.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
if isinstance(config["EMPATICA_TAGS"]["PROVIDERS"], dict):
for provider in config["EMPATICA_TAGS"]["PROVIDERS"].keys():
if config["EMPATICA_TAGS"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/empatica_tags_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/empatica_tags_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/empatica_tags_features/empatica_tags_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_TAGS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_tags.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
# Visualization for Data Exploration
if config["HISTOGRAM_PHONE_DATA_YIELD"]["PLOT"]:
files_to_compute.append("reports/data_exploration/histogram_phone_data_yield.html")

View File

@ -1,62 +1,73 @@
# See https://www.rapids.science/setup/configuration/#database-credentials
DATABASE_GROUP: &database_group
MY_GROUP
########################################################################################################################
# GLOBAL CONFIGURATION #
########################################################################################################################
# See https://www.rapids.science/setup/configuration/#timezone-of-your-study
TIMEZONE: &timezone
America/New_York
# See https://www.rapids.science/setup/configuration/#participant-files
# See https://www.rapids.science/latest/setup/configuration/#participant-files
PIDS: [example01, example02]
# See https://www.rapids.science/setup/configuration/#automatic-creation-of-participant-files
# See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files
CREATE_PARTICIPANT_FILES:
SOURCE:
TYPE: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE
DATABASE_GROUP: *database_group
CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format
TIMEZONE: *timezone
CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format
PHONE_SECTION:
ADD: TRUE
ADD: True
DEVICE_ID_COLUMN: device_id # column name
IGNORED_DEVICE_IDS: []
FITBIT_SECTION:
ADD: TRUE
DEVICE_ID_COLUMN: device_id # column name
ADD: True
DEVICE_ID_COLUMN: fitbit_id # column name
IGNORED_DEVICE_IDS: []
EMPATICA_SECTION:
ADD: False
DEVICE_ID_COLUMN: empatica_id # column name
IGNORED_DEVICE_IDS: []
# See https://www.rapids.science/setup/configuration/#time-segments
# See https://www.rapids.science/latest/setup/configuration/#time-segments
TIME_SEGMENTS: &time_segments
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
FILE: "example_profile/exampleworkflow_timesegments.csv"
INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs
# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
TIMEZONE:
TYPE: SINGLE
SINGLE:
TZCODE: America/New_York
MULTIPLE:
TZCODES_FILE: data/external/multiple_timezones_example.csv
IF_MISSING_TZCODE: STOP
DEFAULT_TZCODE: America/New_York
FITBIT:
ALLOW_MULTIPLE_TZ_PER_DEVICE: False
INFER_FROM_SMARTPHONE_TZ: False
########################################################################################################################
# PHONE #
########################################################################################################################
# See https://www.rapids.science/setup/configuration/#device-data-source-configuration
PHONE_DATA_CONFIGURATION:
SOURCE:
TYPE: DATABASE
DATABASE_GROUP: *database_group
DEVICE_ID_COLUMN: device_id # column name
TIMEZONE:
TYPE: SINGLE # SINGLE or MULTIPLE
VALUE: *timezone # IF TYPE=SINGLE, see docs
# See https://www.rapids.science/latest/setup/configuration/#data-stream-configuration
PHONE_DATA_STREAMS:
USE: aware_csv
# AVAILABLE:
aware_mysql:
DATABASE_GROUP: MY_GROUP
aware_csv:
FOLDER: data/external/example_workflow
aware_influxdb:
DATABASE_GROUP: MY_GROUP
# Sensors ------
# https://www.rapids.science/latest/features/phone-accelerometer/
PHONE_ACCELEROMETER:
TABLE: accelerometer
CONTAINER: accelerometer
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
SRC_FOLDER: "rapids" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_accelerometer/rapids/main.py
PANDA:
COMPUTE: False
@ -64,13 +75,13 @@ PHONE_ACCELEROMETER:
FEATURES:
exertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
nonexertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"]
SRC_FOLDER: "panda" # inside src/features/phone_accelerometer
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_accelerometer/panda/main.py
# See https://www.rapids.science/latest/features/phone-activity-recognition/
PHONE_ACTIVITY_RECOGNITION:
TABLE:
ANDROID: plugin_google_activity_recognition
IOS: plugin_ios_activity_recognition
CONTAINER:
ANDROID: plugin_google_activity_recognition.csv
IOS: plugin_ios_activity_recognition.csv
EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same activity episode.
PROVIDERS:
RAPIDS:
@ -80,11 +91,21 @@ PHONE_ACTIVITY_RECOGNITION:
STATIONARY: ["still", "tilting"]
MOBILE: ["on_foot", "walking", "running", "on_bicycle"]
VEHICLE: ["in_vehicle"]
SRC_FOLDER: "rapids" # inside src/features/phone_activity_recognition
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_activity_recognition/rapids/main.py
# See https://www.rapids.science/latest/features/phone-applications-crashes/
PHONE_APPLICATIONS_CRASHES:
CONTAINER: applications_crashes
APPLICATION_CATEGORIES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE; if CATALOGUE_SOURCE is equal to GOOGLE, all scraped genres will be saved to CATALOGUE_FILE
SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD
# See https://www.rapids.science/latest/features/phone-applications-foreground/
PHONE_APPLICATIONS_FOREGROUND:
TABLE: applications_foreground
CONTAINER: applications_foreground.csv
APPLICATION_CATEGORIES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
@ -101,30 +122,57 @@ PHONE_APPLICATIONS_FOREGROUND:
EXCLUDED_CATEGORIES: ["system_apps"]
EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"]
FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
SRC_FOLDER: "rapids" # inside src/features/phone_applications_foreground
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_applications_foreground/rapids/main.py
# See https://www.rapids.science/latest/features/phone-applications-notifications/
PHONE_APPLICATIONS_NOTIFICATIONS:
CONTAINER: applications_notifications
APPLICATION_CATEGORIES:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE; if CATALOGUE_SOURCE is equal to GOOGLE, all scraped genres will be saved to CATALOGUE_FILE
SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD
# See https://www.rapids.science/latest/features/phone-battery/
PHONE_BATTERY:
TABLE: battery
CONTAINER: battery.csv
EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
SRC_FOLDER: "rapids" # inside src/features/phone_battery
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_battery/rapids/main.py
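As an illustration of how EPISODE_THRESHOLD_BETWEEN_ROWS above is meant to behave (a sketch of the grouping rule, not the RAPIDS implementation):

THRESHOLD = 30                           # minutes, as configured above
gaps_between_rows = [5, 20, 45, 10]      # minutes between consecutive battery rows
starts_new_episode = [gap > THRESHOLD for gap in gaps_between_rows]
# [False, False, True, False]: the 45-minute gap closes one episode and opens the next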
# See https://www.rapids.science/latest/features/phone-bluetooth/
PHONE_BLUETOOTH:
TABLE: bluetooth
CONTAINER: bluetooth.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth
SRC_LANGUAGE: "r"
SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R
DORYAB:
COMPUTE: False
FEATURES:
ALL:
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
OWN:
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
OTHERS:
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
SRC_SCRIPT: src/features/phone_bluetooth/doryab/main.py
# See https://www.rapids.science/latest/features/phone-calls/
PHONE_CALLS:
TABLE: calls
CONTAINER: calls.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
@ -133,13 +181,13 @@ PHONE_CALLS:
missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_calls
SRC_SCRIPT: src/features/phone_calls/rapids/main.R
# See https://www.rapids.science/latest/features/phone-conversation/
PHONE_CONVERSATION:
TABLE:
ANDROID: plugin_studentlife_audio_android
IOS: plugin_studentlife_audio
CONTAINER:
ANDROID: plugin_studentlife_audio_android.csv
IOS: plugin_studentlife_audio.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
@ -151,57 +199,78 @@ PHONE_CONVERSATION:
"unknownexpectedfraction","countconversation"]
RECORDING_MINUTES: 1
PAUSED_MINUTES : 3
SRC_FOLDER: "rapids" # inside src/features/phone_conversation
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_conversation/rapids/main.py
# See https://www.rapids.science/latest/features/phone-data-yield/
PHONE_DATA_YIELD:
SENSORS: [PHONE_ACCELEROMETER, PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE]
SENSORS: [PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE]
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1 representing the number of minutes with at least
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_data_yield
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R
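A worked example of MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS, which decides whether an hour counts toward ratiovalidyieldedhours (illustration only):

threshold = 0.5               # as configured above
valid_minutes_in_hour = 36    # minutes in this hour with at least one sensed row
is_valid_yielded_hour = (valid_minutes_in_hour / 60) >= threshold   # True: 0.6 >= 0.5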
# See https://www.rapids.science/latest/features/phone-keyboard/
PHONE_KEYBOARD:
CONTAINER: keyboard
PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD
# See https://www.rapids.science/latest/features/phone-light/
PHONE_LIGHT:
TABLE: light
CONTAINER: light.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
SRC_FOLDER: "rapids" # inside src/features/phone_light
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_light/rapids/main.py
# See https://www.rapids.science/latest/features/phone-locations/
PHONE_LOCATIONS:
TABLE: locations
LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED
CONTAINER: locations.csv
LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED
FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
HOME_INFERENCE:
DBSCAN_EPS: 10 # meters
DBSCAN_MINSAMPLES: 5
THRESHOLD_STATIC : 1 # km/h
CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS
PROVIDERS:
DORYAB:
COMPUTE: True
FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"]
ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
DBSCAN_EPS: 10 # meters
DBSCAN_MINSAMPLES: 5
THRESHOLD_STATIC : 1 # km/h
MAXIMUM_GAP_ALLOWED: 300
MAXIMUM_ROW_GAP: 300
MAXIMUM_ROW_DURATION: 60
MINUTES_DATA_USED: False
SAMPLING_FREQUENCY: 0
SRC_FOLDER: "doryab" # inside src/features/phone_locations
SRC_LANGUAGE: "python"
CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT
CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS
RADIUS_FOR_HOME: 100
SRC_SCRIPT: src/features/phone_locations/doryab/main.py
BARNETT:
COMPUTE: False
FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
TIMEZONE: *timezone
IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
MINUTES_DATA_USED: False # Use this for quality control purposes; it reports how many minutes of data (location coordinates grouped by minute) were used to compute features
SRC_FOLDER: "barnett" # inside src/features/phone_locations
SRC_LANGUAGE: "r"
SRC_SCRIPT: src/features/phone_locations/barnett/main.R
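The ACCURACY_LIMIT shared by both providers above is a row filter on the reported location accuracy; a minimal illustration (the "accuracy" field name follows the AWARE convention and is assumed here):

ACCURACY_LIMIT = 51  # meters, as configured above
rows = [{"accuracy": 12}, {"accuracy": 49}, {"accuracy": 80}]
kept = [row for row in rows if row["accuracy"] <= ACCURACY_LIMIT]
# the 80 m row is dropped; there is ~68% probability the true fix lies within the reported radius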
# See https://www.rapids.science/latest/features/phone-log/
PHONE_LOG:
CONTAINER:
ANDROID: aware_log
IOS: ios_aware_log
PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD
# See https://www.rapids.science/latest/features/phone-messages/
PHONE_MESSAGES:
TABLE: messages
CONTAINER: messages.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
@ -209,11 +278,11 @@ PHONE_MESSAGES:
FEATURES:
received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
SRC_LANGUAGE: "r"
SRC_FOLDER: "rapids" # inside src/features/phone_messages
SRC_SCRIPT: src/features/phone_messages/rapids/main.R
# See https://www.rapids.science/latest/features/phone-screen/
PHONE_SCREEN:
TABLE: screen
CONTAINER: screen.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
@ -222,26 +291,25 @@ PHONE_SCREEN:
IGNORE_EPISODES_LONGER_THAN: 0 # in minutes, set to 0 to disable
FEATURES: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration", "firstuseafter"] # "episodepersensedminutes" needs to be added later
EPISODE_TYPES: ["unlock"]
SRC_FOLDER: "rapids" # inside src/features/phone_screen
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/phone_screen/rapids/main.py
# See https://www.rapids.science/latest/features/phone-wifi-connected/
PHONE_WIFI_CONNECTED:
TABLE: "sensor_wifi"
CONTAINER: sensor_wifi.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_wifi_connected
SRC_LANGUAGE: "r"
SRC_SCRIPT: src/features/phone_wifi_connected/rapids/main.R
# See https://www.rapids.science/latest/features/phone-wifi-visible/
PHONE_WIFI_VISIBLE:
TABLE: "wifi"
CONTAINER: wifi.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_FOLDER: "rapids" # inside src/features/phone_wifi_visible
SRC_LANGUAGE: "r"
SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R
@ -249,59 +317,115 @@ PHONE_WIFI_VISIBLE:
# FITBIT #
########################################################################################################################
# See https://www.rapids.science/latest/setup/configuration/#device-data-source-configuration
FITBIT_DATA_CONFIGURATION:
SOURCE:
TYPE: DATABASE # DATABASE or FILES (set each [FITBIT_SENSOR][TABLE] attribute with a table name or a file path accordingly)
COLUMN_FORMAT: JSON # JSON or PLAIN_TEXT
DATABASE_GROUP: *database_group
DEVICE_ID_COLUMN: device_id # column name
TIMEZONE:
TYPE: SINGLE # Fitbit only supports SINGLE timezones
VALUE: *timezone # see docs
HIDDEN:
SINGLE_FITBIT_TABLE: TRUE
# See https://www.rapids.science/latest/setup/configuration/#data-stream-configuration
FITBIT_DATA_STREAMS:
USE: fitbitjson_csv
# AVAILABLE:
fitbitjson_mysql:
DATABASE_GROUP: MY_GROUP
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
fitbitparsed_mysql:
DATABASE_GROUP: MY_GROUP
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
fitbitjson_csv:
FOLDER: data/external/example_workflow
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
fitbitparsed_csv:
FOLDER: data/external/fitbit_csv
SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
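The SLEEP_SUMMARY_EPISODE_DAY_ANCHOR option repeated above decides which calendar day a summary sleep episode is counted on: for example, an episode running from 23:30 on March 1 to 07:10 on March 2 is treated as a March 2 event with end, and would be assigned to March 1 with start.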
# Sensors ------
# See https://www.rapids.science/latest/features/fitbit-data-yield/
FITBIT_DATA_YIELD:
SENSOR: FITBIT_HEARTRATE_INTRADAY
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
SRC_SCRIPT: src/features/fitbit_data_yield/rapids/main.R
# See https://www.rapids.science/latest/features/fitbit-heartrate-summary/
FITBIT_HEARTRATE_SUMMARY:
TABLE: fitbit_data
CONTAINER: fitbit_data.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["maxrestinghr", "minrestinghr", "avgrestinghr", "medianrestinghr", "moderestinghr", "stdrestinghr", "diffmaxmoderestinghr", "diffminmoderestinghr", "entropyrestinghr"] # calories features' accuracy depend on the accuracy of the participants fitbit profile (e.g. height, weight) use these with care: ["sumcaloriesoutofrange", "maxcaloriesoutofrange", "mincaloriesoutofrange", "avgcaloriesoutofrange", "mediancaloriesoutofrange", "stdcaloriesoutofrange", "entropycaloriesoutofrange", "sumcaloriesfatburn", "maxcaloriesfatburn", "mincaloriesfatburn", "avgcaloriesfatburn", "mediancaloriesfatburn", "stdcaloriesfatburn", "entropycaloriesfatburn", "sumcaloriescardio", "maxcaloriescardio", "mincaloriescardio", "avgcaloriescardio", "mediancaloriescardio", "stdcaloriescardio", "entropycaloriescardio", "sumcaloriespeak", "maxcaloriespeak", "mincaloriespeak", "avgcaloriespeak", "mediancaloriespeak", "stdcaloriespeak", "entropycaloriespeak"]
SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_summary
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/fitbit_heartrate_summary/rapids/main.py
# See https://www.rapids.science/latest/features/fitbit-heartrate-intraday/
FITBIT_HEARTRATE_INTRADAY:
TABLE: fitbit_data
CONTAINER: fitbit_data.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "minutesonoutofrangezone", "minutesonfatburnzone", "minutesoncardiozone", "minutesonpeakzone"]
SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_intraday
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/fitbit_heartrate_intraday/rapids/main.py
# See https://www.rapids.science/latest/features/fitbit-sleep-summary/
FITBIT_SLEEP_SUMMARY:
TABLE: fitbit_data
CONTAINER: fitbit_data.csv
SLEEP_EPISODE_TIMESTAMP: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp.
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["countepisode", "avgefficiency", "sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgdurationafterwakeup", "avgdurationasleep", "avgdurationawake", "avgdurationtofallasleep", "avgdurationinbed"]
SLEEP_TYPES: ["main", "nap", "all"]
SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_summary
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/fitbit_sleep_summary/rapids/main.py
# See https://www.rapids.science/latest/features/fitbit-sleep-intraday/
FITBIT_SLEEP_INTRADAY:
CONTAINER: sleep_intraday
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES:
LEVELS_AND_TYPES_COMBINING_ALL: True
LEVELS_AND_TYPES: [countepisode, sumduration, maxduration, minduration, avgduration, medianduration, stdduration]
RATIOS_TYPE: [count, duration]
RATIOS_SCOPE: [ACROSS_LEVELS, ACROSS_TYPES, WITHIN_LEVELS, WITHIN_TYPES]
ROUTINE: [starttimefirstmainsleep, endtimelastmainsleep, starttimefirstnap, endtimelastnap]
SLEEP_LEVELS:
CLASSIC: [awake, restless, asleep]
STAGES: [wake, deep, light, rem]
UNIFIED: [awake, asleep]
SLEEP_TYPES: [main, nap]
INCLUDE_SLEEP_LATER_THAN: 0 # a number ranging from 0 (midnight) to 1439 (23:59)
REFERENCE_TIME: MIDNIGHT # chosen from "MIDNIGHT" and "START_OF_THE_SEGMENT"
SRC_SCRIPT: src/features/fitbit_sleep_intraday/rapids/main.py
PRICE:
COMPUTE: False
FEATURES: [avgduration, avgratioduration, avgstarttimeofepisodemain, avgendtimeofepisodemain, avgmidpointofepisodemain, "stdstarttimeofepisodemain", "stdendtimeofepisodemain", "stdmidpointofepisodemain", socialjetlag, meanssdstarttimeofepisodemain, meanssdendtimeofepisodemain, meanssdmidpointofepisodemain, medianssdstarttimeofepisodemain, medianssdendtimeofepisodemain, medianssdmidpointofepisodemain]
SLEEP_LEVELS:
CLASSIC: [awake, restless, asleep]
STAGES: [wake, deep, light, rem]
UNIFIED: [awake, asleep]
DAY_TYPES: [WEEKEND, WEEK, ALL]
GROUP_EPISODES_WITHIN: # by default: today's 6pm to tomorrow's noon
START_TIME: 1080 # number of minutes after the midnight (18:00) 18*60
LENGTH: 1080 # in minutes (18 hours) 18*60
SRC_SCRIPT: src/features/fitbit_sleep_intraday/price/main.py
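A quick check of the GROUP_EPISODES_WITHIN defaults above, which are expressed in minutes after midnight:

START_TIME = 1080                             # 1080 / 60 = 18 -> window opens at 18:00 today
LENGTH = 1080                                 # 18 hours long
window_end = (START_TIME + LENGTH) % 1440     # 720 -> window closes at 12:00 (noon) the next day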
# See https://www.rapids.science/latest/features/fitbit-steps-summary/
FITBIT_STEPS_SUMMARY:
TABLE: fitbit_data
CONTAINER: fitbit_data.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
FEATURES: ["maxsumsteps", "minsumsteps", "avgsumsteps", "mediansumsteps", "stdsumsteps"]
SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_summary
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/fitbit_steps_summary/rapids/main.py
# See https://www.rapids.science/latest/features/fitbit-steps-intraday/
FITBIT_STEPS_INTRADAY:
TABLE: fitbit_data
CONTAINER: fitbit_data.csv
PROVIDERS:
RAPIDS:
COMPUTE: True
@ -311,30 +435,109 @@ FITBIT_STEPS_INTRADAY:
ACTIVE_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
THRESHOLD_ACTIVE_BOUT: 10 # steps
INCLUDE_ZERO_STEP_ROWS: False
SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_intraday
SRC_LANGUAGE: "python"
SRC_SCRIPT: src/features/fitbit_steps_intraday/rapids/main.py
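A sketch of how THRESHOLD_ACTIVE_BOUT above splits intraday minutes into active bouts, assuming a minute counts as active when it has at least that many steps (an illustration of the rule, not the RAPIDS code):

THRESHOLD_ACTIVE_BOUT = 10                    # steps, as configured above
steps_per_minute = [0, 12, 25, 9, 30, 40, 0]
is_active = [s >= THRESHOLD_ACTIVE_BOUT for s in steps_per_minute]
# [False, True, True, False, True, True, False]: two active bouts of two minutes each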
########################################################################################################################
# EMPATICA #
########################################################################################################################
EMPATICA_DATA_STREAMS:
USE: empatica_zip
# AVAILABLE:
empatica_zip:
FOLDER: data/external/empatica
# Sensors ------
# See https://www.rapids.science/latest/features/empatica-accelerometer/
EMPATICA_ACCELEROMETER:
CONTAINER: ACC
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-heartrate/
EMPATICA_HEARTRATE:
CONTAINER: HR
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr"]
SRC_SCRIPT: src/features/empatica_heartrate/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-temperature/
EMPATICA_TEMPERATURE:
CONTAINER: TEMP
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-electrodermal-activity/
EMPATICA_ELECTRODERMAL_ACTIVITY:
CONTAINER: EDA
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-blood-volume-pulse/
EMPATICA_BLOOD_VOLUME_PULSE:
CONTAINER: BVP
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxbvp", "minbvp", "avgbvp", "medianbvp", "modebvp", "stdbvp", "diffmaxmodebvp", "diffminmodebvp", "entropybvp"]
SRC_SCRIPT: src/features/empatica_blood_volume_pulse/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-inter-beat-interval/
EMPATICA_INTER_BEAT_INTERVAL:
CONTAINER: IBI
PROVIDERS:
DBDP:
COMPUTE: False
FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
# See https://www.rapids.science/latest/features/empatica-tags/
EMPATICA_TAGS:
CONTAINER: TAGS
PROVIDERS: # None implemented yet
########################################################################################################################
# PLOTS #
########################################################################################################################
# Data quality ------
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#1-histograms-of-phone-data-yield
HISTOGRAM_PHONE_DATA_YIELD:
PLOT: True
HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT:
PLOT: True
HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:
PLOT: True
SENSORS: [PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE]
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#2-heatmaps-of-overall-data-yield
HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:
PLOT: True
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#3-heatmap-of-recorded-phone-sensors
HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT:
PLOT: True
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#4-heatmap-of-sensor-row-count
HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:
PLOT: False
SENSORS: [PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE]
# Features ------
# See https://www.rapids.science/latest/visualizations/feature-visualizations/#1-heatmap-correlation-matrix
HEATMAP_FEATURE_CORRELATION_MATRIX:
PLOT: TRUE
PLOT: False
MIN_ROWS_RATIO: 0.5
CORR_THRESHOLD: 0.1
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
@ -349,18 +552,14 @@ PARAMS_FOR_ANALYSIS:
CATEGORICAL_OPERATORS: [mostcommon]
DEMOGRAPHIC:
TABLE: participant_info
FOLDER: data/external/example_workflow
CONTAINER: participant_info.csv
FEATURES: [age, gender, inpatientdays]
CATEGORICAL_FEATURES: [gender]
SOURCE:
DATABASE_GROUP: *database_group
TIMEZONE: *timezone
TARGET:
TABLE: participant_target
SOURCE:
DATABASE_GROUP: *database_group
TIMEZONE: *timezone
FOLDER: data/external/example_workflow
CONTAINER: participant_target.csv
# Cleaning Parameters
COLS_NAN_THRESHOLD: 0.3

View File

@ -24,6 +24,8 @@ markdown_extensions:
- pymdownx.mark
- pymdownx.smartsymbols
- pymdownx.superfences
- pymdownx.snippets:
check_paths: True
- pymdownx.tabbed
- pymdownx.tasklist:
custom_checkbox: True
@ -47,6 +49,7 @@ repo_url: 'https://github.com/carissalow/rapids'
copyright: 'Released under AGPL'
theme:
name: material
custom_dir: docs/overrides
logo: img/logo.png
favicon: img/logo.png
palette:
@ -71,14 +74,29 @@ extra_css:
nav:
- Home: 'index.md'
- Setup:
- File Structure: file-structure.md
- Overview: setup/overview.md
- Minimal Example: workflow-examples/minimal.md
- Installation: 'setup/installation.md'
- Configuration: setup/configuration.md
- Execution: setup/execution.md
- Citation: citation.md
- Example Workflows:
- Minimal: workflow-examples/minimal.md
- Analysis: workflow-examples/analysis.md
- Data Streams:
- Introduction: datastreams/data-streams-introduction.md
- Phone:
- aware_mysql: datastreams/aware-mysql.md
- aware_csv: datastreams/aware-csv.md
- aware_influxdb (beta): datastreams/aware-influxdb.md
- Mandatory Phone Format: datastreams/mandatory-phone-format.md
- Fitbit:
- fitbitjson_mysql: datastreams/fitbitjson-mysql.md
- fitbitjson_csv: datastreams/fitbitjson-csv.md
- fitbitparsed_mysql: datastreams/fitbitparsed-mysql.md
- fitbitparsed_csv: datastreams/fitbitparsed-csv.md
- Mandatory Fitbit Format: datastreams/mandatory-fitbit-format.md
- Empatica:
- empatica_zip: datastreams/empatica-zip.md
- Mandatory Empatica Format: datastreams/mandatory-empatica-format.md
- Add New Data Streams: datastreams/add-new-data-streams.md
- Behavioral Features:
- Introduction: features/feature-introduction.md
- Phone:
@ -87,7 +105,6 @@ nav:
- Phone Applications Crashes: features/phone-applications-crashes.md
- Phone Applications Foreground: features/phone-applications-foreground.md
- Phone Applications Notifications: features/phone-applications-notifications.md
- Phone Aware Log: features/phone-aware-log.md
- Phone Battery: features/phone-battery.md
- Phone Bluetooth: features/phone-bluetooth.md
- Phone Calls: features/phone-calls.md
@ -96,6 +113,7 @@ nav:
- Phone Keyboard: features/phone-keyboard.md
- Phone Light: features/phone-light.md
- Phone Locations: features/phone-locations.md
- Phone Log: features/phone-log.md
- Phone Messages: features/phone-messages.md
- Phone Screen: features/phone-screen.md
- Phone WiFi Connected: features/phone-wifi-connected.md
@ -120,6 +138,8 @@ nav:
- Visualizations:
- Data Quality: visualizations/data-quality-visualizations.md
- Features: visualizations/feature-visualizations.md
- Analysis Workflows:
- Complete Example: workflow-examples/analysis.md
- Developers:
- Git Flow: developers/git-flow.md
- Remote Support: developers/remote-support.md
@ -129,8 +149,8 @@ nav:
- Test cases: developers/test-cases.md
- Validation schema of config.yaml: developers/validation-schema-config.md
- Others:
- Migrating from beta: migrating-from-old-versions.md
- Migrating from an old version: migrating-from-old-versions.md
- Code of Conduct: code_of_conduct.md
- FAQ: faq.md
- Common Errors: common-errors.md
- Team: team.md
- Change Log: change-log.md

View File

@ -305,10 +305,10 @@
},
"data.table": {
"Package": "data.table",
"Version": "1.13.4",
"Version": "1.14.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "626afb25b020646f53105e6485e695c6"
"Hash": "d1b8b1a821ee564a3515fa6c6d5c52dc"
},
"dbplyr": {
"Package": "dbplyr",
@ -340,10 +340,17 @@
},
"dplyr": {
"Package": "dplyr",
"Version": "1.0.2",
"Version": "1.0.5",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "d0509913b27ea898189ee664b6030dc2"
"Hash": "d0d76c11ec807eb3f000eba4e3eb0f68"
},
"dtplyr": {
"Package": "dtplyr",
"Version": "1.1.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "1e14e4c5b2814de5225312394bc316da"
},
"ellipsis": {
"Package": "ellipsis",
@ -576,6 +583,13 @@
"Repository": "CRAN",
"Hash": "7b1f856410253d56ea67ad808f7cdff6"
},
"influxdbr": {
"Package": "influxdbr",
"Version": "0.14.2",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "fb3d0730f3a6de3b9c09081910cca9bc"
},
"isoband": {
"Package": "isoband",
"Version": "0.2.3",
@ -648,10 +662,10 @@
},
"lifecycle": {
"Package": "lifecycle",
"Version": "0.2.0",
"Version": "1.0.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "361811f31f71f8a617a9a68bf63f1f42"
"Hash": "3471fb65971f1a7b2d4ae7848cf2db8d"
},
"lubridate": {
"Package": "lubridate",
@ -891,6 +905,13 @@
"Repository": "CRAN",
"Hash": "d35964686307333a7121eb41c7dcd4e0"
},
"rappdirs": {
"Package": "rappdirs",
"Version": "0.3.3",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "5e3c5dc0b071b21fa128676560dbe94d"
},
"readr": {
"Package": "readr",
"Version": "1.4.0",
@ -940,6 +961,13 @@
"Repository": "CRAN",
"Hash": "bb5996d0bd962d214a11140d77589917"
},
"reticulate": {
"Package": "reticulate",
"Version": "1.18",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "fbd35cac6ae7554d0e4f440bca1adf3a"
},
"rjson": {
"Package": "rjson",
"Version": "0.2.20",
@ -949,10 +977,10 @@
},
"rlang": {
"Package": "rlang",
"Version": "0.4.9",
"Version": "0.4.10",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "9d7aba7bed9a79e2403b4777428a2b12"
"Hash": "599df23c40a4fce9c7b4764f28c37857"
},
"rmarkdown": {
"Package": "rmarkdown",
@ -1199,12 +1227,26 @@
"Repository": "CRAN",
"Hash": "b8acdf8af494d9ec19ccb2481a9b11c2"
},
"xts": {
"Package": "xts",
"Version": "0.12.1",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "ca2fd4ad8ef78cca3aa2b30f992798a8"
},
"yaml": {
"Package": "yaml",
"Version": "2.2.1",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "2826c5d9efb0a88f657c7a679c7106db"
},
"zoo": {
"Package": "zoo",
"Version": "1.8-9",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "035d1c7c12593038c26fb1c2fd40c4d2"
}
}
}

View File

@ -18,9 +18,6 @@ local({
if(grepl("Darwin", Sys.info()["sysname"], fixed = TRUE) & grepl("ARM64", Sys.info()["version"], fixed = TRUE)) # M1 Macs
Sys.setenv("TZDIR" = file.path(R.home(), "share", "zoneinfo"))
# set timezone library
#Sys.setenv("TZDIR" = file.path(R.home(), "share", "zoneinfo"))
# signal that we've consented to use renv
options(renv.consent = TRUE)

View File

@ -1,9 +1,21 @@
def get_script_language(script_path):
from pathlib import Path
script_path = Path(script_path)
if not script_path.exists():
raise ValueError("The following provider feature script does not exist: " + str(script_path))
if script_path.name.endswith(".py"):
return "python"
elif script_path.name.endswith(".R"):
return "r"
# Features.smk #########################################################################################################
def find_features_files(wildcards):
feature_files = []
for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():
if provider["COMPUTE"]:
feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=wildcards.sensor_key.lower(), language=provider["SRC_LANGUAGE"].lower(), provider_key=provider_key.lower()))
feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=wildcards.sensor_key.lower(), language=get_script_language(provider["SRC_SCRIPT"]), provider_key=provider_key.lower()))
return(feature_files)
def optional_steps_sleep_input(wildcards):
@ -30,15 +42,88 @@ def get_phone_sensor_names():
phone_sensor_names.append(config_key)
return phone_sensor_names
from pathlib import Path
def get_zip_suffixes(pid):
zipfiles = list((Path("data/external/empatica/") / Path(pid)).rglob("*.zip"))
suffixes = []
for zipfile in zipfiles:
suffixes.append(zipfile.stem)
return suffixes
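# For example (hypothetical layout), data/external/empatica/p01/phase1.zip and phase2.zip
# would make get_zip_suffixes("p01") return ["phase1", "phase2"], which
# get_all_raw_empatica_sensor_files() below uses to expand the per-zip raw CSV files.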
def pull_phone_data_input_with_mutation_scripts(wildcards):
import yaml
input = dict()
phone_stream = config["PHONE_DATA_STREAMS"]["USE"]
input["participant_file"] = "data/external/participant_files/{pid}.yaml"
input["rapids_schema_file"] = "src/data/streams/rapids_columns.yaml"
input["stream_format"] = "src/data/streams/" + phone_stream + "/format.yaml"
if Path("src/data/streams/"+ phone_stream + "/container.R").exists():
input["stream_container"] = "src/data/streams/"+ phone_stream + "/container.R"
elif Path("src/data/streams/"+ phone_stream + "/container.py").exists():
input["stream_container"] = "src/data/streams/"+ phone_stream + "/container.py"
else:
raise ValueError("The container script for {stream} is missing: src/data/streams/{stream}/container.[py|R]".format(stream=empatica_stream))
schema = yaml.load(open(input.get("stream_format"), 'r'), Loader=yaml.FullLoader)
sensor = ("phone_" + wilcards.sensor).upper()
if sensor not in schema:
raise ValueError("{sensor} is not defined in the schema {schema}".format(sensor=sensor, schema=input.get("stream_format")))
for device_os in schema[sensor].keys():
if "MUTATION" not in schema[sensor][device_os]:
raise ValueError("MUTATION is missing from [{sensor}][{device_os}] of {schema}".format(sensor=sensor, device_os=device_os,schema=input.get("stream_format")))
if "COLUMN_MAPPINGS" not in schema[sensor][device_os]["MUTATION"]:
raise ValueError("COLUMN_MAPPINGS is missing from [{sensor}][{device_os}][MUTATION] of {schema}".format(sensor=sensor, device_os=device_os, schema=input.get("stream_format")))
if "SCRIPTS" not in schema[sensor][device_os]["MUTATION"]:
raise ValueError("SCRIPTS is missing from [{sensor}][{device_os}][MUTATION] of {schema}".format(sensor=sensor, device_os=device_os, schema=input.get("stream_format")))
scripts = schema[sensor][device_os]["MUTATION"]["SCRIPTS"]
if isinstance(scripts, list):
for idx, script in enumerate(scripts):
if not script.lower().endswith((".py", ".r")):
raise ValueError("Mutate scripts can only be Python or R scripts (.py, .R).\n Instead we got {script} in \n [{sensor}][{device_os}] of {schema}".format(script=script, sensor=sensor, device_os=device_os, schema=input.get("stream_format")))
input["mutationscript"+str(idx)] = script
return input
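# A minimal sketch (illustrative keys only) of the format.yaml structure this function validates
# for each phone sensor and OS; MUTATION, COLUMN_MAPPINGS and SCRIPTS are the entries checked
# above, everything else depends on the specific data stream:
#   PHONE_MESSAGES:
#     ANDROID:
#       MUTATION:
#         COLUMN_MAPPINGS:
#         SCRIPTS: # a list of .py or .R mutation scripts, or empty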
def input_tzcodes_file(wildcards):
from pathlib import Path
if config["TIMEZONE"]["TYPE"] == "MULTIPLE":
if not config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"].lower().endswith(".csv"):
raise ValueError("[TIMEZONE][MULTIPLE][TZCODES_FILE] should point to a CSV file, instead you typed: " + config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"])
if not Path(config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"]).exists():
raise ValueError("[TIMEZONE][MULTIPLE][TZCODES_FILE] should point to a CSV file, the file in the path you typed does not exist: " + config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"])
return [config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"]]
return []
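# [TIMEZONE][MULTIPLE][TZCODES_FILE] is expected to be a CSV with device_id, timestamp and tzcode
# columns (see validate_user_timezones() in src/data/datetime/readable_datetime.R); a hypothetical
# two-row example:
#   device_id,timestamp,tzcode
#   a748ee1a-1d0b-4ae9-9074-279a2b6ba524,0,America/New_York
#   a748ee1a-1d0b-4ae9-9074-279a2b6ba524,1588032000000,Europe/Madrid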
def pull_wearable_data_input_with_mutation_scripts(wildcards):
import yaml
from pathlib import Path
input = dict()
device = wildcards.device_type.upper()
device_stream = config[device+"_DATA_STREAMS"]["USE"]
input["participant_file"] = "data/external/participant_files/{pid}.yaml"
input["rapids_schema_file"] = "src/data/streams/rapids_columns.yaml"
input["stream_format"] = "src/data/streams/" + device_stream + "/format.yaml"
if Path("src/data/streams/"+ device_stream + "/container.R").exists():
input["stream_container"] = "src/data/streams/"+ device_stream + "/container.R"
elif Path("src/data/streams/"+ device_stream + "/container.py").exists():
input["stream_container"] = "src/data/streams/"+ device_stream + "/container.py"
else:
raise ValueError("The container script for {stream} is missing: src/data/streams/{stream}/container.[py|R]".format(stream=device_stream))
schema = yaml.load(open(input.get("stream_format"), 'r'), Loader=yaml.FullLoader)
sensor = (device + "_" + wilcards.sensor).upper()
if sensor not in schema:
raise ValueError("{sensor} is not defined in the schema {schema}".format(sensor=sensor, schema=input.get("stream_format")))
if "MUTATION" not in schema[sensor]:
raise ValueError("MUTATION is missing from [{sensor}] of {schema}".format(sensor=sensor, schema=input.get("stream_format")))
if "COLUMN_MAPPINGS" not in schema[sensor]["MUTATION"]:
raise ValueError("COLUMN_MAPPINGS is missing from [{sensor}][MUTATION] of {schema}".format(sensor=sensor, schema=input.get("stream_format")))
if "SCRIPTS" not in schema[sensor]["MUTATION"]:
raise ValueError("SCRIPTS is missing from [{sensor}][MUTATION] of {schema}".format(sensor=sensor, schema=input.get("stream_format")))
scripts = schema[sensor]["MUTATION"]["SCRIPTS"]
if isinstance(scripts, list):
for idx, script in enumerate(scripts):
if not script.lower().endswith((".py", ".r")):
raise ValueError("Mutate scripts can only be Python or R scripts (.py, .R).\n Instead we got {script} in [{sensor}] of {schema}".format(script=script, sensor=sensor, schema=input.get("stream_format")))
input["mutationscript"+str(idx)] = script
return input
def get_all_raw_empatica_sensor_files(wildcards):
suffixes = get_zip_suffixes(wildcards.pid)
files = ["data/raw/{}/empatica_{}_raw_{}.csv".format(wildcards.pid, wildcards.sensor, suffix) for suffix in suffixes]
return(files)

View File

@ -62,9 +62,9 @@ rule phone_accelerometer_r_features:
rule activity_recognition_episodes:
input:
sensor_data = "data/raw/{pid}/phone_activity_recognition_with_datetime_unified.csv"
sensor_data = "data/raw/{pid}/phone_activity_recognition_with_datetime.csv"
params:
episode_threshold_between_rows = config["PHONE_BATTERY"]["EPISODE_THRESHOLD_BETWEEN_ROWS"]
episode_threshold_between_rows = config["PHONE_ACTIVITY_RECOGNITION"]["EPISODE_THRESHOLD_BETWEEN_ROWS"]
output:
"data/interim/{pid}/phone_activity_recognition_episodes.csv"
script:
@ -174,29 +174,29 @@ rule phone_applications_notifications_r_features:
script:
"../src/features/entry.R"
rule phone_aware_log_python_features:
rule phone_log_python_features:
input:
sensor_data = "data/raw/{pid}/phone_aware_log_with_datetime.csv",
sensor_data = "data/raw/{pid}/phone_log_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["PHONE_AWARE_LOG"]["PROVIDERS"][wildcards.provider_key.upper()],
provider = lambda wildcards: config["PHONE_LOG"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "phone_aware_log"
sensor_key = "phone_log"
output:
"data/interim/{pid}/phone_aware_log_features/phone_aware_log_python_{provider_key}.csv"
"data/interim/{pid}/phone_log_features/phone_log_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule phone_aware_log_r_features:
rule phone_log_r_features:
input:
sensor_data = "data/raw/{pid}/phone_aware_log_with_datetime.csv",
sensor_data = "data/raw/{pid}/phone_log_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["PHONE_AWARE_LOG"]["PROVIDERS"][wildcards.provider_key.upper()],
provider = lambda wildcards: config["PHONE_LOG"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "phone_aware_log"
sensor_key = "phone_log"
output:
"data/interim/{pid}/phone_aware_log_features/phone_aware_log_r_{provider_key}.csv"
"data/interim/{pid}/phone_log_features/phone_log_r_{provider_key}.csv"
script:
"../src/features/entry.R"
@ -264,7 +264,7 @@ rule phone_bluetooth_r_features:
rule calls_python_features:
input:
sensor_data = "data/raw/{pid}/phone_calls_with_datetime_unified.csv",
sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -277,7 +277,7 @@ rule calls_python_features:
rule calls_r_features:
input:
sensor_data = "data/raw/{pid}/phone_calls_with_datetime_unified.csv",
sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -290,7 +290,7 @@ rule calls_r_features:
rule conversation_python_features:
input:
sensor_data = "data/raw/{pid}/phone_conversation_with_datetime_unified.csv",
sensor_data = "data/raw/{pid}/phone_conversation_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["PHONE_CONVERSATION"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -303,7 +303,7 @@ rule conversation_python_features:
rule conversation_r_features:
input:
sensor_data = "data/raw/{pid}/phone_conversation_with_datetime_unified.csv",
sensor_data = "data/raw/{pid}/phone_conversation_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["PHONE_CONVERSATION"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -420,7 +420,7 @@ rule phone_messages_r_features:
rule screen_episodes:
input:
screen = "data/raw/{pid}/phone_screen_with_datetime_unified.csv"
screen = "data/raw/{pid}/phone_screen_with_datetime.csv"
output:
"data/interim/{pid}/phone_screen_episodes.csv"
script:
@ -506,7 +506,7 @@ rule phone_wifi_visible_r_features:
rule fitbit_data_yield_python_features:
input:
sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_DATA_YIELD"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -519,7 +519,7 @@ rule fitbit_data_yield_python_features:
rule fitbit_data_yield_r_features:
input:
sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_DATA_YIELD"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -532,7 +532,7 @@ rule fitbit_data_yield_r_features:
rule fitbit_heartrate_summary_python_features:
input:
sensor_data = "data/raw/{pid}/fitbit_heartrate_summary_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -545,7 +545,7 @@ rule fitbit_heartrate_summary_python_features:
rule fitbit_heartrate_summary_r_features:
input:
sensor_data = "data/raw/{pid}/fitbit_heartrate_summary_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -558,7 +558,7 @@ rule fitbit_heartrate_summary_r_features:
rule fitbit_heartrate_intraday_python_features:
input:
sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -571,7 +571,7 @@ rule fitbit_heartrate_intraday_python_features:
rule fitbit_heartrate_intraday_r_features:
input:
sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -584,7 +584,7 @@ rule fitbit_heartrate_intraday_r_features:
rule fitbit_steps_summary_python_features:
input:
sensor_data = "data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_steps_summary_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -597,7 +597,7 @@ rule fitbit_steps_summary_python_features:
rule fitbit_steps_summary_r_features:
input:
sensor_data = "data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_steps_summary_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -610,7 +610,7 @@ rule fitbit_steps_summary_r_features:
rule fitbit_steps_intraday_python_features:
input:
sensor_data = "data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_steps_intraday_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -623,7 +623,7 @@ rule fitbit_steps_intraday_python_features:
rule fitbit_steps_intraday_r_features:
input:
sensor_data = "data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_steps_intraday_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -636,7 +636,7 @@ rule fitbit_steps_intraday_r_features:
rule fitbit_sleep_summary_python_features:
input:
sensor_data = "data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -649,7 +649,7 @@ rule fitbit_sleep_summary_python_features:
rule fitbit_sleep_summary_r_features:
input:
sensor_data = "data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv",
sensor_data = "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -660,13 +660,13 @@ rule fitbit_sleep_summary_r_features:
script:
"../src/features/entry.R"
rule resample_sleep_episodes:
rule sleep_intraday_episodes:
input:
"data/raw/{pid}/fitbit_sleep_intraday_parsed.csv"
sleep_intraday = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv"
output:
"data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled.csv"
"data/interim/{pid}/fitbit_sleep_intraday_episodes.csv"
script:
"../src/features/utils/resample_episodes.R"
"../src/features/fitbit_sleep_intraday/episodes/sleep_intraday_episodes.py"
rule fitbit_sleep_intraday_python_features:
input:

View File

@ -1,9 +1,7 @@
rule download_demographic_data:
input:
participant_file = "data/external/participant_files/{pid}.yaml"
params:
source = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["SOURCE"],
table = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["TABLE"],
participant_file = "data/external/participant_files/{pid}.yaml",
data = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["CONTAINER"]
output:
"data/raw/{pid}/participant_info_raw.csv"
script:
@ -22,10 +20,8 @@ rule demographic_features:
rule download_target_data:
input:
participant_file = "data/external/participant_files/{pid}.yaml"
params:
source = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["SOURCE"],
table = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["TABLE"],
participant_file = "data/external/participant_files/{pid}.yaml",
data = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["TARGET"]["CONTAINER"]
output:
"data/raw/{pid}/participant_target_raw.csv"
script:
@ -34,15 +30,19 @@ rule download_target_data:
rule target_readable_datetime:
input:
sensor_input = "data/raw/{pid}/participant_target_raw.csv",
time_segments = "data/interim/time_segments/{pid}_time_segments.csv"
time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
pid_file = "data/external/participant_files/{pid}.yaml",
tzcodes_file = input_tzcodes_file,
params:
fixed_timezone = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["SOURCE"]["TIMEZONE"],
device_type = "fitbit",
timezone_parameters = config["TIMEZONE"],
pid = "{pid}",
time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
output:
"data/raw/{pid}/participant_target_with_datetime.csv"
script:
"../src/data/readable_datetime.R"
"../src/data/datetime/readable_datetime.R"
rule parse_targets:
input:

View File

@ -1,54 +1,27 @@
rule restore_sql_file:
input:
sql_file = "data/external/rapids_example.sql",
db_credentials = ".env"
params:
group = config["DATABASE_GROUP"]
output:
touch("data/interim/restore_sql_file.done")
script:
"../src/data/restore_sql_file.py"
rule create_example_participant_files:
output:
expand("data/external/participant_files/{pid}.yaml", pid = ["example01", "example02"])
shell:
"echo 'PHONE:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n PLATFORMS: [android]\n LABEL: test-01\n START_DATE: 2020-04-23\n END_DATE: 2020-05-04\nFITBIT:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n LABEL: test-01\n START_DATE: 2020-04-23\n END_DATE: 2020-05-04\n' >> ./data/external/participant_files/example01.yaml && echo 'PHONE:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n PLATFORMS: [ios]\n LABEL: test-02\n START_DATE: 2020-04-23\n END_DATE: 2020-05-04\nFITBIT:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n LABEL: test-02\n START_DATE: 2020-04-23\n END_DATE: 2020-05-04\n' >> ./data/external/participant_files/example02.yaml"
"echo 'PHONE:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n PLATFORMS: [android]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example01.yaml && echo 'PHONE:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n PLATFORMS: [ios]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example02.yaml"
rule create_participants_files:
input:
participants_file = [] if config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["TYPE"] == "AWARE_DEVICE_TABLE" else config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["CSV_FILE_PATH"]
participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"]
params:
config = config["CREATE_PARTICIPANT_FILES"]
script:
"../src/data/create_participants_files.R"
rule download_phone_data:
input:
"data/external/participant_files/{pid}.yaml"
rule pull_phone_data:
input: unpack(pull_phone_data_input_with_mutation_scripts)
params:
source = config["PHONE_DATA_CONFIGURATION"]["SOURCE"],
data_configuration = config["PHONE_DATA_STREAMS"][config["PHONE_DATA_STREAMS"]["USE"]],
sensor = "phone_" + "{sensor}",
table = lambda wildcards: config["PHONE_" + str(wildcards.sensor).upper()]["TABLE"],
timezone = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
aware_multiplatform_tables = config["PHONE_ACTIVITY_RECOGNITION"]["TABLE"]["ANDROID"] + "," + config["PHONE_ACTIVITY_RECOGNITION"]["TABLE"]["IOS"] + "," + config["PHONE_CONVERSATION"]["TABLE"]["ANDROID"] + "," + config["PHONE_CONVERSATION"]["TABLE"]["IOS"],
tables = lambda wildcards: config["PHONE_" + str(wildcards.sensor).upper()]["CONTAINER"],
output:
"data/raw/{pid}/phone_{sensor}_raw.csv"
script:
"../src/data/download_phone_data.R"
rule download_fitbit_data:
input:
participant_file = "data/external/participant_files/{pid}.yaml",
input_file = [] if config["FITBIT_DATA_CONFIGURATION"]["SOURCE"]["TYPE"] == "DATABASE" else lambda wildcards: config["FITBIT_" + str(wildcards.sensor).upper()]["TABLE"]
params:
data_configuration = config["FITBIT_DATA_CONFIGURATION"],
sensor = "fitbit_" + "{sensor}",
table = lambda wildcards: config["FITBIT_" + str(wildcards.sensor).upper()]["TABLE"],
output:
"data/raw/{pid}/fitbit_{sensor}_raw.csv"
script:
"../src/data/download_fitbit_data.R"
"../src/data/streams/pull_phone_data.R"
rule compute_time_segments:
input:
@ -66,16 +39,19 @@ rule compute_time_segments:
rule phone_readable_datetime:
input:
sensor_input = "data/raw/{pid}/phone_{sensor}_raw.csv",
time_segments = "data/interim/time_segments/{pid}_time_segments.csv"
time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
pid_file = "data/external/participant_files/{pid}.yaml",
tzcodes_file = input_tzcodes_file,
params:
timezones = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["TYPE"],
fixed_timezone = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
device_type = "phone",
timezone_parameters = config["TIMEZONE"],
pid = "{pid}",
time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
output:
"data/raw/{pid}/phone_{sensor}_with_datetime.csv"
script:
"../src/data/readable_datetime.R"
"../src/data/datetime/readable_datetime.R"
rule phone_yielded_timestamps:
input:
@ -90,16 +66,19 @@ rule phone_yielded_timestamps:
rule phone_yielded_timestamps_with_datetime:
input:
sensor_input = "data/interim/{pid}/phone_yielded_timestamps.csv",
time_segments = "data/interim/time_segments/{pid}_time_segments.csv"
time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
pid_file = "data/external/participant_files/{pid}.yaml",
tzcodes_file = input_tzcodes_file,
params:
timezones = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["TYPE"],
fixed_timezone = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
device_type = "phone",
timezone_parameters = config["TIMEZONE"],
pid = "{pid}",
time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
output:
"data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv"
script:
"../src/data/readable_datetime.R"
"../src/data/datetime/readable_datetime.R"
rule unify_ios_android:
input:
@ -128,16 +107,19 @@ rule process_phone_locations_types:
rule phone_locations_processed_with_datetime:
input:
sensor_input = "data/interim/{pid}/phone_locations_processed.csv",
time_segments = "data/interim/time_segments/{pid}_time_segments.csv"
time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
pid_file = "data/external/participant_files/{pid}.yaml",
tzcodes_file = input_tzcodes_file,
params:
timezones = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["TYPE"],
fixed_timezone = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
device_type = "phone",
timezone_parameters = config["TIMEZONE"],
pid = "{pid}",
time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
output:
"data/interim/{pid}/phone_locations_processed_with_datetime.csv"
script:
"../src/data/readable_datetime.R"
"../src/data/datetime/readable_datetime.R"
rule phone_locations_processed_with_datetime_with_home:
input:
@ -163,16 +145,20 @@ rule resample_episodes:
rule resample_episodes_with_datetime:
input:
sensor_input = "data/interim/{pid}/{sensor}_episodes_resampled.csv",
time_segments = "data/interim/time_segments/{pid}_time_segments.csv"
time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
pid_file = "data/external/participant_files/{pid}.yaml",
tzcodes_file = input_tzcodes_file,
params:
timezones = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["TYPE"],
fixed_timezone = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
device_type = lambda wildcards: wildcards.sensor.split("_")[0],
timezone_parameters = config["TIMEZONE"],
pid = "{pid}",
time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
output:
"data/interim/{pid}/{sensor}_episodes_resampled_with_datetime.csv"
script:
"../src/data/readable_datetime.R"
"../src/data/datetime/readable_datetime.R"
rule phone_application_categories:
input:
@ -187,119 +173,51 @@ rule phone_application_categories:
script:
"../src/data/application_categories.R"
rule fitbit_parse_heartrate:
input:
participant_file = "data/external/participant_files/{pid}.yaml",
raw_data = "data/raw/{pid}/fitbit_heartrate_{fitbit_data_type}_raw.csv"
rule pull_wearable_data:
input: unpack(pull_wearable_data_input_with_mutation_scripts)
params:
timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
table = lambda wildcards: config["FITBIT_HEARTRATE_"+str(wildcards.fitbit_data_type).upper()]["TABLE"],
column_format = config["FITBIT_DATA_CONFIGURATION"]["SOURCE"]["COLUMN_FORMAT"],
fitbit_data_type = "{fitbit_data_type}"
data_configuration = lambda wildcards: config[wildcards.device_type.upper() +"_DATA_STREAMS"][config[wildcards.device_type.upper() +"_DATA_STREAMS"]["USE"]],
device_type = "{device_type}",
sensor = "{device_type}" + "_" + "{sensor}",
pid = "{pid}",
tables = lambda wildcards: config[wildcards.device_type.upper() + "_" + str(wildcards.sensor).upper()]["CONTAINER"],
wildcard_constraints:
device_type="(empatica|fitbit)"
output:
"data/raw/{pid}/fitbit_heartrate_{fitbit_data_type}_parsed.csv"
"data/raw/{pid}/{device_type}_{sensor}_raw.csv"
script:
"../src/data/fitbit_parse_heartrate.py"
rule fitbit_parse_steps:
input:
participant_file = "data/external/participant_files/{pid}.yaml",
raw_data = "data/raw/{pid}/fitbit_steps_{fitbit_data_type}_raw.csv"
params:
timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
table = lambda wildcards: config["FITBIT_STEPS_"+str(wildcards.fitbit_data_type).upper()]["TABLE"],
column_format = config["FITBIT_DATA_CONFIGURATION"]["SOURCE"]["COLUMN_FORMAT"],
fitbit_data_type = "{fitbit_data_type}"
output:
"data/raw/{pid}/fitbit_steps_{fitbit_data_type}_parsed.csv"
script:
"../src/data/fitbit_parse_steps.py"
rule fitbit_parse_sleep:
input:
participant_file = "data/external/participant_files/{pid}.yaml",
raw_data = "data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_raw.csv"
params:
timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
table = lambda wildcards: config["FITBIT_SLEEP_"+str(wildcards.fitbit_data_type).upper()]["TABLE"],
column_format = config["FITBIT_DATA_CONFIGURATION"]["SOURCE"]["COLUMN_FORMAT"],
fitbit_data_type = "{fitbit_data_type}",
sleep_episode_timestamp = config["FITBIT_SLEEP_SUMMARY"]["SLEEP_EPISODE_TIMESTAMP"]
output:
"data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_parsed.csv"
script:
"../src/data/fitbit_parse_sleep.py"
# rule fitbit_parse_calories:
# input:
# data = expand("data/raw/{{pid}}/fitbit_calories_{fitbit_data_type}_raw.csv", fitbit_data_type = (["json"] if config["FITBIT_CALORIES"]["TABLE_FORMAT"] == "JSON" else ["summary", "intraday"]))
# params:
# timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
# table = config["FITBIT_CALORIES"]["TABLE"],
# table_format = config["FITBIT_CALORIES"]["TABLE_FORMAT"]
# output:
# summary_data = "data/raw/{pid}/fitbit_calories_summary_parsed.csv",
# intraday_data = "data/raw/{pid}/fitbit_calories_intraday_parsed.csv"
# script:
# "../src/data/fitbit_parse_calories.py"
"../src/data/streams/pull_wearable_data.R"
rule fitbit_readable_datetime:
input:
sensor_input = "data/raw/{pid}/fitbit_{sensor}_{fitbit_data_type}_parsed.csv",
time_segments = "data/interim/time_segments/{pid}_time_segments.csv"
sensor_input = "data/raw/{pid}/fitbit_{sensor}_raw.csv",
time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
pid_file = "data/external/participant_files/{pid}.yaml",
tzcodes_file = input_tzcodes_file,
params:
fixed_timezone = config["FITBIT_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
device_type = "fitbit",
timezone_parameters = config["TIMEZONE"],
pid = "{pid}",
time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
output:
"data/raw/{pid}/fitbit_{sensor}_{fitbit_data_type}_parsed_with_datetime.csv"
"data/raw/{pid}/fitbit_{sensor}_with_datetime.csv"
script:
"../src/data/readable_datetime.R"
from pathlib import Path
rule unzip_empatica_data:
input:
input_file = Path(config["EMPATICA_DATA_CONFIGURATION"]["SOURCE"]["FOLDER"]) / Path("{pid}") / Path("{suffix}.zip"),
participant_file = "data/external/participant_files/{pid}.yaml"
params:
sensor = "{sensor}"
output:
sensor_output = "data/raw/{pid}/empatica_{sensor}_unzipped_{suffix}.csv"
script:
"../src/data/empatica/unzip_empatica_data.py"
rule extract_empatica_data:
input:
input_file = "data/raw/{pid}/empatica_{sensor}_unzipped_{suffix}.csv",
participant_file = "data/external/participant_files/{pid}.yaml"
params:
data_configuration = config["EMPATICA_DATA_CONFIGURATION"],
sensor = "{sensor}",
table = lambda wildcards: config["EMPATICA_" + str(wildcards.sensor).upper()]["TABLE"],
output:
sensor_output = "data/raw/{pid}/empatica_{sensor}_raw_{suffix}.csv"
script:
"../src/data/empatica/extract_empatica_data.py"
rule join_empatica_data:
input:
input_files = get_all_raw_empatica_sensor_files,
output:
sensor_output = "data/raw/{pid}/empatica_{sensor}_joined.csv"
script:
"../src/data/empatica/join_empatica_data.R"
"../src/data/datetime/readable_datetime.R"
rule empatica_readable_datetime:
input:
sensor_input = "data/raw/{pid}/empatica_{sensor}_joined.csv",
time_segments = "data/interim/time_segments/{pid}_time_segments.csv"
sensor_input = "data/raw/{pid}/empatica_{sensor}_raw.csv",
time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
pid_file = "data/external/participant_files/{pid}.yaml",
tzcodes_file = input_tzcodes_file,
params:
timezones = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["TYPE"],
fixed_timezone = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"],
device_type = "empatica",
timezone_parameters = config["TIMEZONE"],
pid = "{pid}",
time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
output:
"data/raw/{pid}/empatica_{sensor}_with_datetime.csv"
script:
"../src/data/readable_datetime.R"
"../src/data/datetime/readable_datetime.R"

View File

View File

@ -11,43 +11,25 @@ group <- config$SOURCE$DATABASE_GROUP
timezone <- config$SOURCE$TIMEZONE
phone_device_id_column = config$PHONE_SECTION$DEVICE_ID_COLUMN
fitbit_device_id_column = config$FITBIT_SECTION$DEVICE_ID_COLUMN
empatica_device_id_column = config$EMPATICA_SECTION$DEVICE_ID_COLUMN
add_phone_section = config$PHONE_SECTION$ADD
add_fitbit_section = config$FITBIT_SECTION$ADD
add_empatica_section = config$EMPATICA_SECTION$ADD
phone_ignored = config$PHONE_SECTION$IGNORED_DEVICE_IDS
fitbit_ignored = config$FITBIT_SECTION$IGNORED_DEVICE_IDS
empatica_ignored = config$EMPATICA_SECTION$IGNORED_DEVICE_IDS
rmysql.settingsfile <- "./.env"
if(config$SOURCE$TYPE == "AWARE_DEVICE_TABLE"){
database <- dbConnect(MariaDB(), default.file = rmysql.settingsfile, group = group)
if(config$FITBIT_SECTION$ADD == TRUE){
query <- paste("SELECT",phone_device_id_column, ",",fitbit_device_id_column," as _temp_fitbit_id, brand, label, timestamp FROM aware_device order by timestamp asc")
fitbit_device_id_column <- "_temp_fitbit_id"
}
else
query <- paste("SELECT ",phone_device_id_column,", brand, label, timestamp FROM aware_device order by timestamp asc")
participants <- dbGetQuery(database, query)
dbDisconnect(database)
participants <- participants %>%
mutate(pid = if_else(row_number()<10, paste0("p","0",row_number()), paste0("p", row_number())),
platform = if_else(brand == "iPhone", "ios", "android"), brand = NULL,
label = iconv(if_else(label == "", "EMPTY_LABEL", label), from = "UTF-8", to = "UTF-8", sub=''),
start_date = format(as.POSIXct(timestamp / 1000, origin = "1970-01-01", tz = timezone), "%Y-%m-%d"),
end_date = format(Sys.Date(), "%Y-%m-%d"),
!!phone_device_id_column := if_else(!!rlang::sym(phone_device_id_column) %in% phone_ignored, NA_character_, !!rlang::sym(phone_device_id_column)),
!!fitbit_device_id_column := if_else(!!rlang::sym(fitbit_device_id_column) %in% fitbit_ignored, NA_character_, !!rlang::sym(fitbit_device_id_column)))
} else if(config$SOURCE$TYPE == "CSV_FILE"){
participants <- read_csv(config$SOURCE$CSV_FILE_PATH, col_types=cols_only(device_id="c",pid="c",label="c",platform="c",
start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c")) %>%
mutate(start_date = as.character(start_date), end_date = as.character(end_date)) # we read as date to validate format
participants <- participants %>%
mutate(!!phone_device_id_column := str_replace(!!rlang::sym(phone_device_id_column), ";",","),
platform = str_replace(platform, ";",","),
!!phone_device_id_column := if_else(!!rlang::sym(phone_device_id_column) %in% phone_ignored, NA_character_, !!rlang::sym(phone_device_id_column)),
!!fitbit_device_id_column := if_else(!!rlang::sym(fitbit_device_id_column) %in% fitbit_ignored, NA_character_, !!rlang::sym(fitbit_device_id_column)))
}
participants <- read_csv(config$CSV_FILE_PATH, col_types=cols_only(device_id="c",pid="c",label="c",platform="c",
start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c",empatica_id="c")) %>%
mutate(start_date = as.character(start_date), end_date = as.character(end_date)) # we read as date to validate format
participants <- participants %>%
mutate(!!phone_device_id_column := str_replace(!!rlang::sym(phone_device_id_column), ";",","),
platform = str_replace(platform, ";",","),
!!phone_device_id_column := if_else(!!rlang::sym(phone_device_id_column) %in% phone_ignored, NA_character_, !!rlang::sym(phone_device_id_column)),
!!empatica_device_id_column := if_else(!!rlang::sym(empatica_device_id_column) %in% empatica_ignored, NA_character_, !!rlang::sym(empatica_device_id_column)),
!!fitbit_device_id_column := if_else(!!rlang::sym(fitbit_device_id_column) %in% fitbit_ignored, NA_character_, !!rlang::sym(fitbit_device_id_column)))
dir.create(file.path("./data/external/participant_files/"))
@ -73,8 +55,8 @@ participants %>%
} else
lines <- append(lines, empty_fitbit)
if(add_empatica_section == TRUE){
lines <- append(lines, c("EMPATICA:",
if(add_empatica_section == TRUE && !is.na(row[empatica_device_id_column])){
lines <- append(lines, c("EMPATICA:", paste0(" DEVICE_IDS: [",row[empatica_device_id_column],"]"),
paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date)))
} else
lines <- append(lines, empty_empatica)
@ -83,7 +65,7 @@ participants %>%
writeLines(lines, file_connection)
close(file_connection)
}, add_phone_section, add_fitbit_section, phone_device_id_column, fitbit_device_id_column)
}, add_phone_section, add_fitbit_section, phone_device_id_column, fitbit_device_id_column, empatica_device_id_column)
file_lines <-readLines("./config.yaml")
for (i in 1:length(file_lines)){

View File

@ -0,0 +1,105 @@
library(tibble)
library(dplyr)
library(tidyr)
library(purrr)
library(yaml)
options(scipen = 999)
build_tz_intervals <- function(tz_codes){
tz_codes <- tz_codes %>%
group_by(device_id) %>%
mutate(end_timestamp = lead(timestamp)) %>%
ungroup() %>%
replace_na(list(end_timestamp = as.numeric(Sys.time())*1000))
return(tz_codes)
}
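# For example (hypothetical rows), a device with tz codes at timestamps 0 and 1588032000000 yields
# two intervals after build_tz_intervals(): [0, 1588032000000) for the first tzcode and
# [1588032000000, now) for the second, which assign_tz_code() then matches against each row's timestamp.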
filter_tz_per_device <- function(device_id, tz_codes, default, IF_MISSING_TZCODE){
device_tz_codes <- tz_codes %>% filter(device_id == !!device_id) %>% select(-device_id)
if(nrow(device_tz_codes) > 0)
return(device_tz_codes)
else if(IF_MISSING_TZCODE == "STOP")
stop(paste("The device id '", device_id, "' does not have any time zone codes in your [MULTIPLE][TZCODES_FILE], add one or set IF_MISSING_TZCODE to 'USE_DEFAULT'"))
else if(IF_MISSING_TZCODE == "USE_DEFAULT")
return(data.frame(timestamp = c(0), tzcode = default, end_timestamp = as.numeric(Sys.time())*1000))
stop("We should have obtained the time zones for a device, stop the execution or use the default tz but this didn't happen. Create an issue on Github")
}
assign_tz_code <- function(data, tz_codes){
for(i in 1:nrow(tz_codes)) {
start_timestamp <- tz_codes[[i, "timestamp"]]
end_timestamp <- tz_codes[[i, "end_timestamp"]]
time_zone <- trimws(tz_codes[[i, "tzcode"]], which="both")
data$local_timezone <- if_else(start_timestamp <= data$timestamp & data$timestamp < end_timestamp, time_zone, data$local_timezone)
}
return(data %>% filter(!is.na(local_timezone)))
}
validate_single_tz_per_fitbit_device <- function(tz_codes, INFER_FROM_SMARTPHONE_TZ){
if(INFER_FROM_SMARTPHONE_TZ)
stop("If [TIMEZONE][MULTIPLE][FITBIT][INFER_FROM_SMARTPHONE_TZ] is True (you want to infer Fitbit time zones with smartphone data), you need to set ALLOW_MULTIPLE_TZ_PER_DEVICE to True. However, read the docs to understand why this can be innacurate")
tz_per_device <- tz_codes %>% group_by(device_id) %>% summarise(n = n(), .groups = "drop_last") %>% filter(n > 1)
if(nrow(tz_per_device) > 0)
stop(paste("The following Fitbit device ids have more than one time zone change which is not allowed if [TIMEZONE][MULTIPLE][FITBIT][ALLOW_MULTIPLE_TZ_PER_DEVICE] is False:", paste(tz_per_device %>% pull(device_id), collapse = ",")))
nonzero_ts <- tz_codes %>% filter(timestamp > 0)
if(nrow(nonzero_ts) > 0)
stop(paste("The following Fitbit device ids have a time zone change with a timestamp bigger than 0, which is not allowed if [TIMEZONE][MULTIPLE][FITBIT][ALLOW_MULTIPLE_TZ_PER_DEVICE] is False: ", paste(nonzero_ts %>% pull(device_id), collapse = ",")))
}
validate_devices_exist_in_participant_file <- function(devices, device_type, pid, participant_file){
if(length(devices) == 0)
stop("[TIMEZONE][MULTIPLE][FITBIT][ALLOW_MULTIPLE_TZ_PER_DEVICE] is True (you want to infer Fitbit time zones with smartphone data), however participant ", pid," does not have any [",device_type,"][DEVICE_IDS] in ", participant_file)
}
# TODO include CSV timezone file in rule
multiple_time_zone_assignment <- function(sensor_data, timezone_parameters, device_type, pid, participant_file){
tz_codes <- read.csv(timezone_parameters$MULTIPLE$TZCODES_FILE)
default <- timezone_parameters$MULTIPLE$DEFAULT_TZCODE
IF_MISSING_TZCODE <- timezone_parameters$MULTIPLE$IF_MISSING_TZCODE
ALLOW_MULTIPLE_TZ_PER_DEVICE <- timezone_parameters$MULTIPLE$FITBIT$ALLOW_MULTIPLE_TZ_PER_DEVICE
INFER_FROM_SMARTPHONE_TZ <- timezone_parameters$MULTIPLE$FITBIT$INFER_FROM_SMARTPHONE_TZ
participant_data <- read_yaml(participant_file)
phone_ids <- participant_data$PHONE$DEVICE_IDS
fitbit_ids <- participant_data$FITBIT$DEVICE_IDS
if(device_type == "fitbit"){
if(!ALLOW_MULTIPLE_TZ_PER_DEVICE){
validate_single_tz_per_fitbit_device(tz_codes, INFER_FROM_SMARTPHONE_TZ)
} else if(INFER_FROM_SMARTPHONE_TZ){
validate_devices_exist_in_participant_file(phone_ids, "PHONE", pid, participant_file)
validate_devices_exist_in_participant_file(fitbit_ids, "FITBIT", pid, participant_file)
unified_device_id <- paste0("unified_device_id", pid)
sensor_data <- sensor_data %>% mutate(device_id = if_else(device_id %in% phone_ids, unified_device_id, device_id))
tz_codes <- tz_codes %>% mutate(device_id = if_else(device_id %in% fitbit_ids, unified_device_id, device_id))
}
}
tz_intervals <- build_tz_intervals(tz_codes)
sensor_data <- sensor_data %>% mutate(local_timezone = NA_character_)
if(nrow(sensor_data) > 0){
sensor_data <- sensor_data %>%
group_by(device_id) %>%
nest() %>%
mutate(tz_codes_per_device = map(device_id, filter_tz_per_device, tz_intervals, default, IF_MISSING_TZCODE)) %>%
mutate(data = map2(data, tz_codes_per_device, assign_tz_code )) %>%
select(-tz_codes_per_device) %>%
unnest(cols = data)
}
return(sensor_data)
}

View File

@ -2,7 +2,7 @@ library("tidyverse")
library("lubridate", warn.conflicts = F)
options(scipen=999)
day_type_delay <- function(day_type, include_past_periodic_segments){
day_type_delay <- function(time_segments, day_type, include_past_periodic_segments){
delay <- time_segments %>% mutate(length_duration = duration(length)) %>% filter(repeats_on == day_type) %>% arrange(-length_duration) %>% pull(length_duration) %>% first()
return(if_else(is.na(delay) | include_past_periodic_segments == FALSE, duration("0days"), delay))
}
@ -80,7 +80,8 @@ assign_to_time_segment <- function(sensor_data, time_segments, time_segments_typ
mutate(data = map2(data, local_timezone, assign_rows_to_segments_frequency, time_segments)) %>%
unnest(cols = data) %>%
arrange(timestamp) %>%
select(-local_time_obj)
select(-local_time_obj) %>%
ungroup()
return(sensor_data)
@ -90,10 +91,10 @@ assign_to_time_segment <- function(sensor_data, time_segments, time_segments_typ
# We need to take into account segment start dates that could include the first day of data
time_segments <- time_segments %>% mutate(length_duration = duration(length))
every_day_delay <- duration("0days")
wday_delay <- day_type_delay("wday", include_past_periodic_segments)
mday_delay <- day_type_delay("mday", include_past_periodic_segments)
qday_delay <- day_type_delay("qday", include_past_periodic_segments)
yday_delay <- day_type_delay("yday", include_past_periodic_segments)
wday_delay <- day_type_delay(time_segments, "wday", include_past_periodic_segments)
mday_delay <- day_type_delay(time_segments, "mday", include_past_periodic_segments)
qday_delay <- day_type_delay(time_segments, "qday", include_past_periodic_segments)
yday_delay <- day_type_delay(time_segments, "yday", include_past_periodic_segments)
sensor_data <- sensor_data %>%
group_by(local_timezone) %>%
@ -171,5 +172,5 @@ assign_to_time_segment <- function(sensor_data, time_segments, time_segments_typ
arrange(timestamp)
}
return(sensor_data)
return(sensor_data %>% ungroup())
}

View File

@ -0,0 +1,128 @@
source("renv/activate.R")
library("tidyverse")
library("readr")
library("tidyr")
library("lubridate")
library("yaml")
source("src/data/datetime/assign_to_time_segment.R")
source("src/data/datetime/assign_to_multiple_timezones.R")
split_local_date_time <- function(data){
data <- data %>%
separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
mutate(local_hour = as.numeric(local_hour),
local_minute = as.numeric(local_minute))
return(data)
}
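# e.g. a local_date_time of "2020-04-23 14:30:59" (illustrative) is split into
# local_date "2020-04-23", local_time "14:30:59", local_hour 14 and local_minute 30.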
is_valid_timezone <- function(timezone) {
return(timezone %in% (OlsonNames()))
}
validate_user_timezones <- function(timezone_parameters){
if(!timezone_parameters$TYPE %in% c("SINGLE", "MULTIPLE"))
stop("Invalid [TIMEZONE][TYPE], only valid options are SINGLE or MULTIPLE")
if(timezone_parameters$TYPE == "SINGLE"){
if(!is_valid_timezone(timezone_parameters$SINGLE$TZCODE))
stop(paste("[TIMEZONE][SINGLE][TZCODE] is not a valid timezone: ", timezone_parameters$SINGLE$TZCODE))
} else if(timezone_parameters$TYPE == "MULTIPLE"){
tz_codes <- read.csv(timezone_parameters$MULTIPLE$TZCODES_FILE)
valid_file_columns <- c("device_id", "timestamp", "tzcode")
if(length(colnames(tz_codes)) != length(valid_file_columns) || !setequal(colnames(tz_codes), valid_file_columns))
stop(paste("[TIMEZONE][MULTIPLE][TZCODES_FILE] has does not have the required columns. You provided",paste(colnames(tz_codes), collapse=","),"but we need",paste(valid_file_columns, collapse=",")))
invalid_tz_codes <- tz_codes %>%
mutate(row = (1:n()) + 1,
tzcode = trimws(tzcode, which="both"),
is_valid = is_valid_timezone(tzcode)) %>%
filter(is_valid == FALSE)
if(nrow(invalid_tz_codes) > 0)
stop(paste("[TIMEZONE][MULTIPLE][TZCODES_FILE] has invalid time zone codes. In file ", timezone_parameters$MULTIPLE$TZCODES_FILE, ".\nAffected rows=[", paste(invalid_tz_codes %>% pull(row),collapse=","), "], with invalid codes=[", paste(invalid_tz_codes %>% pull(tzcode),collapse=",") ,"]"))
}
}
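# A minimal sketch of the two accepted config shapes (values are illustrative):
#   TIMEZONE:
#     TYPE: SINGLE
#     SINGLE:
#       TZCODE: America/New_York
# or TYPE: MULTIPLE with MULTIPLE.TZCODES_FILE pointing to the CSV validated above.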
create_missing_temporal_column <- function(data, device_type){
if(device_type == "fitbit" && all(data$timestamp == 0)){
# For Fitbit we infer the timestamp from Fitbit's local date time
if(nrow(data) == 0)
return(data %>% mutate(timestamp = NA_real_))
if(any(is.na(parse_date_time(data$local_date_time, orders= c("%Y/%m/%d %H:%M:%S","%Y-%m-%d %H:%M:%S"), exact=T))))
stop("One or more values in the local_date_time column do not have the expected format: yyyy-mm-dd hh:mm:ss or yyyy/mm/dd hh:mm:ss")
return(data %>%
group_by(local_timezone) %>%
nest() %>%
mutate(data = map2(data, local_timezone, function(nested_data, tz){
return(nested_data %>% mutate(timestamp = as.numeric(ymd_hms(local_date_time, tz=tz)) * 1000) %>% drop_na(timestamp))
})) %>%
unnest(cols = everything()) %>%
ungroup())
} else {
# For the rest of the devices we infer the local date time from the timestamp
if(nrow(data) == 0)
return(data %>% mutate(local_date_time = NA_character_))
return(data %>%
group_by(local_timezone) %>%
nest() %>%
mutate(data = map2(data, local_timezone, function(nested_data, tz){
return(nested_data %>% mutate(local_date_time = format(as_datetime(timestamp / 1000, tz=tz), format="%Y-%m-%d %H:%M:%S")) %>% drop_na(local_date_time) )
})) %>%
unnest(cols = everything()) %>%
ungroup())
}
}
filter_wanted_dates <- function(output, participant_file, device_type){
participant_data <- read_yaml(participant_file)
device_type <- toupper(device_type)
start_date <- participant_data[[device_type]]$START_DATE
end_date <- participant_data[[device_type]]$END_DATE
if(!is.null(start_date)){
start_date <- parse_date_time(start_date, orders = c("ymd", "ymdhMs", "ymdhM", "ymdh"))
if(is.na(start_date))
stop(paste0("[",device_type, "][START_DATE] does not have one of these valid formats: [ymd, ymd hms, ymd hm, ymd h], you typed: '", participant_data[[device_type]]$START_DATE, "' in ", participant_file))
output <- output %>% filter(ymd_hms(local_date_time) >= start_date)
}
if(!is.null(end_date)){
end_date <- parse_date_time(end_date, orders = c("ymd", "ymdhMs", "ymdhM", "ymdh"))
if(is.na(end_date))
stop(paste0("[",device_type, "][END_DATE] does not have one of these valid formats: [ymd, ymd hms, ymd hm, ymd h], you typed: '", participant_data[[device_type]]$END_DATE, "' in ", participant_file))
output <- output %>% filter(ymd_hms(local_date_time) <= end_date)
}
return(output)
}
readable_datetime <- function(){
input <- read.csv(snakemake@input[["sensor_input"]]) %>% arrange(timestamp)
time_segments <- read.csv(snakemake@input[["time_segments"]])
participant_file <- snakemake@input[["pid_file"]]
device_type <- snakemake@params[["device_type"]]
timezone_parameters <- snakemake@params[["timezone_parameters"]]
pid <- snakemake@params[["pid"]]
time_segments_type <- snakemake@params[["time_segments_type"]]
include_past_periodic_segments <- snakemake@params[["include_past_periodic_segments"]]
validate_user_timezones(timezone_parameters)
if(timezone_parameters$TYPE == "SINGLE")
output <- input %>% mutate(local_timezone = timezone_parameters$SINGLE$TZCODE)
else if(timezone_parameters$TYPE == "MULTIPLE")
output <- multiple_time_zone_assignment(input, timezone_parameters, device_type, pid, participant_file)
output <- create_missing_temporal_column(output, device_type)
output <- split_local_date_time(output)
output <- assign_to_time_segment(output, time_segments, time_segments_type, include_past_periodic_segments)
output <- filter_wanted_dates(output, participant_file, device_type)
output <- output %>% arrange(timestamp)
write_csv(output, snakemake@output[[1]])
}
readable_datetime()

View File

@ -1,46 +0,0 @@
source("renv/activate.R")
library(RMariaDB)
library("dplyr", warn.conflicts = F)
library(readr)
library(stringr)
library(yaml)
participant_file <- snakemake@input[["participant_file"]]
input_file <- snakemake@input[["input_file"]]
data_configuration <- snakemake@params[["data_configuration"]]
source <- data_configuration$SOURCE
sensor <- snakemake@params[["sensor"]]
table <- snakemake@params[["table"]]
sensor_file <- snakemake@output[[1]]
participant <- read_yaml(participant_file)
if(! "FITBIT" %in% names(participant)){
stop(paste("The following participant file does not have a FITBIT section, create one manually or automatically (see the docs):", participant_file))
}
device_ids <- participant$FITBIT$DEVICE_IDS
unified_device_id <- tail(device_ids, 1)
# As opposed to phone data, we don't filter by date here because the data can still be in JSON format; we need to parse it first
if(source$TYPE == "DATABASE"){
dbEngine <- dbConnect(MariaDB(), default.file = "./.env", group = source$DATABASE_GROUP)
query <- paste0("SELECT * FROM ", table, " WHERE ",source$DEVICE_ID_COLUMN," IN ('", paste0(device_ids, collapse = "','"), "')")
sensor_data <- dbGetQuery(dbEngine, query)
dbDisconnect(dbEngine)
} else if(source$TYPE == "FILES"){
sensor_data <- read_csv_chunked(input_file, callback = DataFrameCallback$new(function(x, pos) subset(x,x[[source$DEVICE_ID_COLUMN]] %in% device_ids)), progress = T, chunk_size = 50000)
if(is.null(sensor_data)) # empty file
sensor_data <- read.csv(input_file)
}
sensor_data <- sensor_data %>%
rename(device_id = source$DEVICE_ID_COLUMN) %>%
mutate(device_id = unified_device_id) # Unify device_id
if("HIDDEN" %in% names(data_configuration) && data_configuration$HIDDEN$SINGLE_FITBIT_TABLE == TRUE) # For MoSHI use, we didn't split fitbit sensors into different tables
sensor_data <- sensor_data %>% filter(fitbit_data_type == str_split(sensor, "_", simplify = TRUE)[[2]])
# Dropping duplicates on all columns except for _id or id
sensor_data <- sensor_data %>% distinct(!!!syms(setdiff(names(sensor_data), c("_id", "id"))))
write_csv(sensor_data, sensor_file)

View File

@ -1,107 +0,0 @@
source("renv/activate.R")
source("src/data/unify_utils.R")
library(RMariaDB)
library(stringr)
library("dplyr", warn.conflicts = F)
library(readr)
library(yaml)
library(lubridate)
options(scipen=999)
validate_deviceid_platforms <- function(device_ids, platforms){
if(length(device_ids) == 1){
if(length(platforms) > 1 || (platforms != "android" && platforms != "ios"))
stop(paste0("If you have 1 device_id, its platform should be 'android' or 'ios' but you typed: '", paste0(platforms, collapse = ","), "'. Participant file: ", participant))
} else if(length(device_ids) > 1 && length(platforms) == 1){
if(platforms != "android" && platforms != "ios" && platforms != "multiple")
stop(paste0("If you have more than 1 device_id, platform should be 'android', 'ios' OR 'multiple' but you typed: '", paste0(platforms, collapse = "s,"), "'. Participant file: ", participant))
} else if(length(device_ids) > 1 && length(platforms) > 1){
if(length(device_ids) != length(platforms))
stop(paste0("The number of device_ids should match the number of platforms. Participant file:", participant))
if(length(intersect(c("android", "ios"), unique(platforms))) != 2)
stop(paste0("If you have more than 1 device_id and more than 1 platform, the platforms should be a mix of 'android' AND 'ios' but you typed: '", paste0(platforms, collapse = ","), "'. Participant file: ", participant))
}
}
is_multiplaform_participant <- function(dbEngine, device_ids, platforms){
# Multiple android and ios platforms or the same platform (android, ios) for multiple devices
if((length(device_ids) > 1 && length(platforms) > 1) || (length(device_ids) > 1 && length(platforms) == 1 && (platforms == "android" || platforms == "ios"))){
return(TRUE)
}
# When platforms is "multiple", look up each device's platform in the aware_device table
if(length(device_ids) > 1 && length(platforms) == 1 && platforms == "multiple"){
devices_platforms <- dbGetQuery(dbEngine, paste0("SELECT device_id,brand FROM aware_device WHERE device_id IN ('", paste0(device_ids, collapse = "','"), "')"))
platforms <- devices_platforms %>% distinct(brand) %>% pull(brand)
# Android phones have different brands so we check that we got at least two different platforms and one of them is iPhone
if(length(platforms) > 1 && "iPhone" %in% platforms){
return(TRUE)
}
}
return(FALSE)
}
get_timestamp_filter <- function(device_ids, participant, timezone){
# Read start and end date from the participant file to filter data within that range
start_date <- ymd_hms(paste(participant$PHONE$START_DATE,"00:00:00"), tz=timezone, quiet=TRUE)
end_date <- ymd_hms(paste(participant$PHONE$END_DATE, "23:59:59"), tz=timezone, quiet=TRUE)
start_timestamp = as.numeric(start_date) * 1000
end_timestamp = as.numeric(end_date) * 1000
if(is.na(start_timestamp)){
message(paste("PHONE[START_DATE] was not provided or failed to parse (", participant$PHONE$START_DATE,"), all data for", paste0(device_ids, collapse=","),"is returned"))
return("")
}else if(is.na(end_timestamp)){
message(paste("PHONE[END_DATE] was not provided or failed to parse (", participant$PHONE$END_DATE,"), all data for", paste0(device_ids, collapse=","),"is returned"))
return("")
} else if(start_timestamp > end_timestamp){
stop(paste("Start date has to be before end date in PHONE[TIME_SPAN] (",start_date,",", date(end_date),"), all data for", paste0(device_ids, collapse=","),"is returned"))
return("")
} else {
message(paste("Filtering data between", start_date, "and", end_date, "in", timezone, "for",paste0(device_ids, collapse=",")))
return(paste0("AND timestamp BETWEEN ", start_timestamp, " AND ", end_timestamp))
}
}
participant_file <- snakemake@input[[1]]
source <- snakemake@params[["source"]]
group <- source$DATABASE_GROUP
table <- snakemake@params[["table"]]
sensor <- snakemake@params[["sensor"]]
timezone <- snakemake@params[["timezone"]]
aware_multiplatform_tables <- str_split(snakemake@params[["aware_multiplatform_tables"]], ",")[[1]]
sensor_file <- snakemake@output[[1]]
participant <- read_yaml(participant_file)
if(! "PHONE" %in% names(participant)){
stop(paste("The following participant file does not have a PHONE section, create one manually or automatically (see the docs):", participant_file))
}
device_ids <- participant$PHONE$DEVICE_IDS
unified_device_id <- tail(device_ids, 1)
platforms <- participant$PHONE$PLATFORMS
validate_deviceid_platforms(device_ids, platforms)
timestamp_filter <- get_timestamp_filter(device_ids, participant, timezone)
dbEngine <- dbConnect(MariaDB(), default.file = "./.env", group = group)
if(is_multiplaform_participant(dbEngine, device_ids, platforms)){
sensor_data <- unify_raw_data(dbEngine, table, sensor, timestamp_filter, aware_multiplatform_tables, device_ids, platforms)
}else {
# table has two elements for conversation and activity recognition (iOS and Android store their data in different tables)
if(length(table) > 1)
table <- table[[toupper(platforms[1])]]
query <- paste0("SELECT * FROM ", table, " WHERE ",source$DEVICE_ID_COLUMN," IN ('", paste0(device_ids, collapse = "','"), "')", timestamp_filter)
sensor_data <- dbGetQuery(dbEngine, query) %>%
rename(device_id = source$DEVICE_ID_COLUMN)
}
sensor_data <- sensor_data %>% arrange(timestamp)
# Unify device_id
sensor_data <- sensor_data %>% mutate(device_id = unified_device_id)
# Removing blob_feature conversation column (it's loaded as a list column that crashes write_csv)
sensor_data <- sensor_data %>% select(-any_of("blob_feature"))
# Dropping duplicates on all columns except for _id or id
sensor_data <- sensor_data %>% distinct(!!!syms(setdiff(names(sensor_data), c("_id", "id"))))
write_csv(sensor_data, sensor_file)
dbDisconnect(dbEngine)
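
get_timestamp_filter turns PHONE[START_DATE]/PHONE[END_DATE] into millisecond bounds in the study timezone before appending them to the SQL query. A minimal Python sketch of that conversion (timezone and dates are placeholders):

# Illustrative sketch of the millisecond bounds built by get_timestamp_filter.
import pandas as pd

tz = "America/New_York"                              # assumed study timezone
start = pd.Timestamp("2021-01-01 00:00:00", tz=tz)
end = pd.Timestamp("2021-03-01 23:59:59", tz=tz)
start_ms, end_ms = int(start.timestamp() * 1000), int(end.timestamp() * 1000)
print(f"AND timestamp BETWEEN {start_ms} AND {end_ms}")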

View File

@ -1,94 +0,0 @@
import pandas as pd
import yaml
import csv
from collections import OrderedDict
def processAcceleration(x, y, z):
x = float(x)
y = float(y)
z = float(z)
return {'x': x, 'y': y, 'z': z}
def readFile(file, dtype):
dict = OrderedDict()
with open(file, 'rt') as csvfile:
if dtype in ('electrodermal_activity', 'temperature', 'heartrate', 'blood_volume_pulse'):
reader = csv.reader(csvfile, delimiter='\n')
elif dtype == 'accelerometer':
reader = csv.reader(csvfile, delimiter=',')
i = 0
for row in reader:
if i == 0:
timestamp = float(row[0])
elif i == 1:
hertz = float(row[0])
else:
if i == 2:
pass
else:
timestamp = timestamp + 1.0 / hertz
if dtype in ('electrodermal_activity', 'temperature', 'heartrate', 'blood_volume_pulse'):
dict[timestamp] = row[0]
elif dtype == 'accelerometer':
dict[timestamp] = processAcceleration(row[0], row[1], row[2])
i += 1
return dict
def extract_empatica_data(sensor_data_file, output_file, start_date, end_date, timezone, sensor):
# read sensor data
if sensor in ('electrodermal_activity', 'temperature', 'heartrate', 'blood_volume_pulse'):
ddict = readFile(sensor_data_file, sensor)
df = pd.DataFrame.from_dict(ddict, orient='index', columns=[sensor])
df[sensor] = df[sensor].astype(float)
df.index.name = 'timestamp'
elif sensor == 'accelerometer':
ddict = readFile(sensor_data_file, sensor)
df = pd.DataFrame.from_dict(ddict, orient='index', columns=['x', 'y', 'z'])
df['x'] = df['x'].astype(float)
df['y'] = df['y'].astype(float)
df['z'] = df['z'].astype(float)
df.index.name = 'timestamp'
elif sensor == 'inter_beat_interval':
df = pd.read_csv(sensor_data_file, names=['timestamp', sensor], header=None)
timestampstart = float(df['timestamp'][0])
df['timestamp'] = (df['timestamp'][1:len(df)]).astype(float) + timestampstart
df = df.drop([0])
df[sensor] = df[sensor].astype(float)
df = df.set_index('timestamp')
else:
raise ValueError(
"sensor can only be one of ['electrodermal_activity','temperature','heartrate','blood_volume_pulse','accelerometer','inter_beat_interval'].")
# filter based on given start and end date
start_date_utc = pd.Timestamp(start_date, tz=timezone).timestamp()
end_date_utc = pd.Timestamp(end_date, tz=timezone).timestamp()
df = df[start_date_utc:end_date_utc]
# format timestamps
df.index *= 1000
df.index = df.index.astype(int)
# output csv file
df.to_csv(output_file)
sensor_data_file = snakemake.input[0]
output_file = snakemake.output[0]
with open(snakemake.input[1], "r", encoding="utf-8") as f:
participant_file = yaml.safe_load(f)
start_date = participant_file["EMPATICA"]["START_DATE"]
end_date = participant_file["EMPATICA"]["END_DATE"]
timezone = snakemake.params["data_configuration"]["TIMEZONE"]["VALUE"]
sensor = snakemake.params["sensor"]
extract_empatica_data(sensor_data_file, output_file, start_date, end_date, timezone, sensor)
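
readFile above assumes Empatica's CSV export layout: the first row holds the initial Unix timestamp, the second the sampling rate in Hz, and every following row one sample. A small standalone sketch of the same timestamp reconstruction on made-up EDA values:

# Illustrative: rebuild per-sample timestamps from an Empatica-style export.
import csv, io

fake_eda = io.StringIO("1609459200.0\n4.0\n0.01\n0.02\n0.03\n")  # start, Hz, samples
rows = [row[0] for row in csv.reader(fake_eda)]
start, hertz = float(rows[0]), float(rows[1])
samples = {start + i / hertz: float(v) for i, v in enumerate(rows[2:])}
print(samples)  # {1609459200.0: 0.01, 1609459200.25: 0.02, 1609459200.5: 0.03}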

View File

@ -1,17 +0,0 @@
source("renv/activate.R")
library("tidyr")
library("dplyr", warn.conflicts = F)
empatica_files <- snakemake@input[["input_files"]]
empatica_data <- setNames(data.frame(matrix(ncol = 1, nrow = 0)), c("timestamp"))
for(file in empatica_files){
data <- read.csv(file)
if(! ("timestamp" %in% colnames(data)))
stop(paste("This file does not have a timestamp column, something might have gone wrong while unzipping it:", file))
empatica_data <- merge(empatica_data, data, all = TRUE)
}
write.csv(empatica_data, snakemake@output[[1]], row.names = FALSE)
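
merge(..., all = TRUE) above is a full outer join on the shared timestamp column, so every sensor keeps its own column and rows align by timestamp. A rough pandas equivalent with made-up values:

# Illustrative pandas equivalent of outer-joining Empatica sensor files on timestamp.
import pandas as pd
from functools import reduce

eda = pd.DataFrame({"timestamp": [1, 2], "electrodermal_activity": [0.01, 0.02]})
temp = pd.DataFrame({"timestamp": [2, 3], "temperature": [33.1, 33.2]})
merged = reduce(lambda a, b: pd.merge(a, b, on="timestamp", how="outer"), [eda, temp])
print(merged)  # one row per timestamp; NaN where a sensor has no sample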

View File

@ -1,21 +0,0 @@
from zipfile import ZipFile
import warnings
sensor_short_name = {"accelerometer":"ACC",
"temperature":"TEMP",
"tags":"tags",
"heartrate":"HR",
"inter_beat_interval":"IBI",
"blood_volume_pulse":"BVP",
"electrodermal_activity":"EDA"}
sensor_csv = sensor_short_name[snakemake.params["sensor"]] + '.csv'
warning = True
with ZipFile(snakemake.input[0], 'r') as zipFile:
listOfFileNames = zipFile.namelist()
for fileName in listOfFileNames:
if fileName == sensor_csv:
with open(snakemake.output[0], 'wb') as outputFile:
outputFile.write(zipFile.read(fileName))
warning = False
if(warning):
warnings.warn("We could not find a zipped file for {} in {} (we tried to find {})".format(snakemake.params["sensor"], snakemake.input[0], sensor_csv))

View File

@ -1,161 +0,0 @@
import yaml, json, sys
import pandas as pd
import numpy as np
from datetime import datetime, timezone
from math import trunc
HR_SUMMARY_COLUMNS = ("device_id",
"local_date_time",
"timestamp",
"heartrate_daily_restinghr",
"heartrate_daily_caloriesoutofrange",
"heartrate_daily_caloriesfatburn",
"heartrate_daily_caloriescardio",
"heartrate_daily_caloriespeak")
HR_INTRADAY_COLUMNS = ("device_id",
"heartrate",
"heartrate_zone",
"local_date_time",
"timestamp")
def parseHeartrateZones(heartrate_data):
# Get the range of heartrate zones: outofrange, fatburn, cardio, peak
# refer to: https://help.fitbit.com/articles/en_US/Help_article/1565
heartrate_fitbit_data = json.loads(heartrate_data["fitbit_data"].iloc[0])["activities-heart"][0]
# API Version X: not sure the exact version
if "heartRateZones" in heartrate_fitbit_data:
heartrate_zones = heartrate_fitbit_data["heartRateZones"]
# API VERSION Y: not sure the exact version
elif "value" in heartrate_fitbit_data:
heartrate_zones = heartrate_fitbit_data["value"]["heartRateZones"]
else:
raise ValueError("Heartrate zone are stored in an unkown format, this could mean Fitbit's heartrate API changed")
heartrate_zones_range = {}
for hrzone in heartrate_zones:
heartrate_zones_range[hrzone["name"].lower().replace(" ", "")] = [hrzone["min"], hrzone["max"]]
return heartrate_zones_range
def parseHeartrateSummaryData(record_summary, device_id, curr_date):
# API Version X: not sure the exact version
if "heartRateZones" in record_summary:
heartrate_zones = record_summary["heartRateZones"]
d_resting_heartrate = record_summary["value"] if "value" in record_summary else None
# API VERSION Y: not sure the exact version
elif "value" in record_summary:
heartrate_zones = record_summary["value"]["heartRateZones"]
d_resting_heartrate = record_summary["value"]["restingHeartRate"] if "restingHeartRate" in record_summary["value"] else None
else:
ValueError("Heartrate zone are stored in an unkown format, this could mean Fitbit's heartrate API changed")
if "caloriesOut" in heartrate_zones[0]:
d_calories_outofrange = heartrate_zones[0]["caloriesOut"]
d_calories_fatburn = heartrate_zones[1]["caloriesOut"]
d_calories_cardio = heartrate_zones[2]["caloriesOut"]
d_calories_peak = heartrate_zones[3]["caloriesOut"]
else:
d_calories_outofrange, d_calories_fatburn, d_calories_cardio, d_calories_peak = None, None, None, None
row_summary = (device_id,
curr_date,
0,
d_resting_heartrate,
d_calories_outofrange,
d_calories_fatburn,
d_calories_cardio,
d_calories_peak)
return row_summary
def parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date, heartrate_zones_range):
for data in dataset:
d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
d_datetime = datetime.combine(curr_date, d_time)
d_hr = data["value"]
# Get heartrate zone by range: min <= heartrate < max
d_hrzone = None
for hrzone, hrrange in heartrate_zones_range.items():
if d_hr >= hrrange[0] and d_hr < hrrange[1]:
d_hrzone = hrzone
break
row_intraday = (device_id,
d_hr, d_hrzone,
d_datetime,
0)
records_intraday.append(row_intraday)
return records_intraday
def parseHeartrateData(heartrate_data, fitbit_data_type):
if heartrate_data.empty:
if fitbit_data_type == "summary":
return pd.DataFrame(columns=HR_SUMMARY_COLUMNS)
elif fitbit_data_type == "intraday":
return pd.DataFrame(columns=HR_INTRADAY_COLUMNS)
device_id = heartrate_data["device_id"].iloc[0]
records_summary, records_intraday = [], []
heartrate_zones_range = parseHeartrateZones(heartrate_data)
# Parse JSON into individual records
for record in heartrate_data.fitbit_data:
record = json.loads(record) # Parse text into JSON
curr_date = datetime.strptime(record["activities-heart"][0]["dateTime"], "%Y-%m-%d")
if fitbit_data_type == "summary":
record_summary = record["activities-heart"][0]
row_summary = parseHeartrateSummaryData(record_summary, device_id, curr_date)
records_summary.append(row_summary)
if fitbit_data_type == "intraday":
dataset = record["activities-heart-intraday"]["dataset"]
records_intraday = parseHeartrateIntradayData(records_intraday, dataset, device_id, curr_date, heartrate_zones_range)
if fitbit_data_type == "summary":
parsed_data = pd.DataFrame(data=records_summary, columns=HR_SUMMARY_COLUMNS)
elif fitbit_data_type == "intraday":
parsed_data = pd.DataFrame(data=records_intraday, columns=HR_INTRADAY_COLUMNS)
return parsed_data
timezone = snakemake.params["timezone"]
column_format = snakemake.params["column_format"]
fitbit_data_type = snakemake.params["fitbit_data_type"]
with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f:
participant_file = yaml.safe_load(f)
local_start_date = pd.Timestamp(participant_file["FITBIT"]["START_DATE"])
local_end_date = pd.Timestamp(participant_file["FITBIT"]["END_DATE"]) + pd.DateOffset(1)
if column_format == "JSON":
json_raw = pd.read_csv(snakemake.input["raw_data"])
parsed_data = parseHeartrateData(json_raw, fitbit_data_type)
elif column_format == "PLAIN_TEXT":
parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
else:
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
# discard rows with restinghr = 0
if fitbit_data_type == "summary":
parsed_data = parsed_data[(parsed_data["heartrate_daily_restinghr"] != "0") & (parsed_data["heartrate_daily_restinghr"] != 0)]
# Only keep dates in the range of [local_start_date, local_end_date)
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
if parsed_data.shape[0] > 0:
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
parsed_data.dropna(subset=['timestamp'], inplace=True)
parsed_data.to_csv(snakemake.output[0], index=False)
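
parseHeartrateZones pulls the zone ranges out of the activities-heart payload. A made-up record matching the "value" branch handled above, and the ranges the parser derives from it (field names follow the code; every value is invented):

# Illustrative only: a minimal summary record and the zone ranges extracted from it.
import json

record = json.loads("""{
  "activities-heart": [{
    "dateTime": "2021-01-01",
    "value": {
      "restingHeartRate": 62,
      "heartRateZones": [
        {"name": "Out of Range", "min": 30, "max": 91, "caloriesOut": 1200.0},
        {"name": "Fat Burn", "min": 91, "max": 127, "caloriesOut": 300.0},
        {"name": "Cardio", "min": 127, "max": 154, "caloriesOut": 50.0},
        {"name": "Peak", "min": 154, "max": 220, "caloriesOut": 5.0}
      ]
    }
  }]
}""")
zones = record["activities-heart"][0]["value"]["heartRateZones"]
ranges = {z["name"].lower().replace(" ", ""): [z["min"], z["max"]] for z in zones}
print(ranges)  # {'outofrange': [30, 91], 'fatburn': [91, 127], 'cardio': [127, 154], 'peak': [154, 220]}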

View File

@ -1,251 +0,0 @@
import json, yaml
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import dateutil.parser
SLEEP_CODE2LEVEL = ["asleep", "restless", "awake"]
SLEEP_SUMMARY_COLUMNS_V1_2 = ("device_id", "efficiency",
"minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed",
"is_main_sleep", "type",
"local_start_date_time", "local_end_date_time",
"timestamp")
SLEEP_SUMMARY_COLUMNS_V1 = SLEEP_SUMMARY_COLUMNS_V1_2 + ("count_awake", "duration_awake", "count_awakenings", "count_restless", "duration_restless")
SLEEP_INTRADAY_COLUMNS = (# Extract "type_episode_id" field based on summary data: start from 0
"type_episode_id",
"duration",
# For "classic" type, original_level is one of {"awake", "restless", "asleep"}
# For "stages" type, original_level is one of {"wake", "deep", "light", "rem"}
"level",
# For "classic" type, unified_level is one of {0, 1} where 0: awake {"awake" + "restless"}, 1: asleep {"asleep"}
# For "stages" type, unified_level is one of {0, 1} where 0: awake {"wake"}, 1: asleep {"deep" + "light" + "rem"}
"unified_level",
# One of {0, 1} where 0: nap, 1: main sleep
"is_main_sleep",
# One of {"classic", "stages"}
"type",
"local_date_time",
"start_timestamp",
"end_timestamp")
def mergeLongAndShortData(data_intraday):
long_data = pd.DataFrame(columns=["dateTime", "level"])
short_data = pd.DataFrame(columns=["dateTime", "level"])
window_length = 30
for data in data_intraday["data"]:
counter = 0
for times in range(data["seconds"] // window_length):
row = {"dateTime": dateutil.parser.parse(data["dateTime"])+timedelta(seconds=counter*window_length), "level": data["level"]}
long_data = long_data.append(row, ignore_index = True)
counter = counter + 1
for data in data_intraday["shortData"]:
counter = 0
for times in range(data["seconds"] // window_length):
row = {"dateTime": dateutil.parser.parse(data["dateTime"])+timedelta(seconds=counter*window_length), "level": data["level"]}
short_data = short_data.append(row, ignore_index = True)
counter = counter + 1
long_data.set_index("dateTime",inplace=True)
short_data.set_index("dateTime",inplace=True)
long_data["level"] = np.where(long_data.index.isin(short_data.index) == True, "wake", long_data["level"])
long_data.reset_index(inplace=True)
return long_data.values.tolist()
# Parse one record for sleep API version 1
def parseOneRecordForV1(record, device_id, type_episode_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type):
sleep_record_type = "classic"
d_start_datetime = datetime.strptime(record["startTime"][:18], "%Y-%m-%dT%H:%M:%S")
d_end_datetime = datetime.strptime(record["endTime"][:18], "%Y-%m-%dT%H:%M:%S")
# Summary data
if fitbit_data_type == "summary":
row_summary = (device_id, record["efficiency"],
record["minutesAfterWakeup"], record["minutesAsleep"], record["minutesAwake"], record["minutesToFallAsleep"], record["timeInBed"],
d_is_main_sleep, sleep_record_type,
d_start_datetime, d_end_datetime,
0,
record["awakeCount"], record["awakeDuration"], record["awakeningsCount"],
record["restlessCount"], record["restlessDuration"])
records_summary.append(row_summary)
# Intraday data
if fitbit_data_type == "intraday":
start_date = d_start_datetime.date()
end_date = d_end_datetime.date()
is_before_midnight = True
curr_date = start_date
for data in record["minuteData"]:
# For overnight episodes, use end_date once we are over midnight
d_time = datetime.strptime(data["dateTime"], '%H:%M:%S').time()
if is_before_midnight and d_time.hour == 0:
curr_date = end_date
d_datetime = datetime.combine(curr_date, d_time)
# API 1.2 stores original_level as strings, so we convert original_levels of API 1 to strings too
# (1: "asleep", 2: "restless", 3: "awake")
d_original_level = SLEEP_CODE2LEVEL[int(data["value"])-1]
row_intraday = (type_episode_id, 60,
d_original_level, -1, d_is_main_sleep, sleep_record_type,
d_datetime, 0, 0)
records_intraday.append(row_intraday)
return records_summary, records_intraday
# Parse one record for sleep API version 1.2
def parseOneRecordForV12(record, device_id, type_episode_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type):
sleep_record_type = record['type']
d_start_datetime = datetime.strptime(record["startTime"][:18], "%Y-%m-%dT%H:%M:%S")
d_end_datetime = datetime.strptime(record["endTime"][:18], "%Y-%m-%dT%H:%M:%S")
# Summary data
if fitbit_data_type == "summary":
row_summary = (device_id, record["efficiency"],
record["minutesAfterWakeup"], record["minutesAsleep"], record["minutesAwake"], record["minutesToFallAsleep"], record["timeInBed"],
d_is_main_sleep, sleep_record_type,
d_start_datetime, d_end_datetime,
0)
records_summary.append(row_summary)
# Intraday data
if fitbit_data_type == "intraday":
if sleep_record_type == "classic":
for data in record["levels"]["data"]:
d_datetime = dateutil.parser.parse(data["dateTime"])
row_intraday = (type_episode_id, data["seconds"],
data["level"], -1, d_is_main_sleep, sleep_record_type,
d_datetime, 0, 0)
records_intraday.append(row_intraday)
else:
# For sleep type "stages"
for data in mergeLongAndShortData(record["levels"]):
row_intraday = (type_episode_id, 30,
data[1], -1, d_is_main_sleep, sleep_record_type,
data[0], 0, 0)
records_intraday.append(row_intraday)
return records_summary, records_intraday
def parseSleepData(sleep_data, fitbit_data_type):
SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1_2
if sleep_data.empty:
if fitbit_data_type == "summary":
return pd.DataFrame(columns=SLEEP_SUMMARY_COLUMNS)
elif fitbit_data_type == "intraday":
return pd.DataFrame(columns=SLEEP_INTRADAY_COLUMNS)
device_id = sleep_data["device_id"].iloc[0]
records_summary, records_intraday = [], []
type_episode_id = 0
# Parse JSON into individual records
for multi_record in sleep_data.fitbit_data:
for record in json.loads(multi_record)["sleep"]:
# Whether the sleep episode is nap (0) or main sleep (1)
d_is_main_sleep = 1 if record["isMainSleep"] else 0
# For sleep API version 1
if "awakeCount" in record:
SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1
records_summary, records_intraday = parseOneRecordForV1(record, device_id, type_episode_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type)
# For sleep API version 1.2
else:
SLEEP_SUMMARY_COLUMNS = SLEEP_SUMMARY_COLUMNS_V1_2
records_summary, records_intraday = parseOneRecordForV12(record, device_id, type_episode_id, d_is_main_sleep, records_summary, records_intraday, fitbit_data_type)
type_episode_id = type_episode_id + 1
if fitbit_data_type == "summary":
parsed_data = pd.DataFrame(data=records_summary, columns=SLEEP_SUMMARY_COLUMNS)
elif fitbit_data_type == "intraday":
parsed_data = pd.DataFrame(data=records_intraday, columns=SLEEP_INTRADAY_COLUMNS)
return parsed_data
def mergeSleepEpisodes(sleep_data, cols_for_groupby):
sleep_episodes = pd.DataFrame(columns=["type_episode_id", "level_episode_id", "level", "unified_level", "is_main_sleep", "type", "start_timestamp", "end_timestamp"])
if not sleep_data.empty:
sleep_data = sleep_data.groupby(by=cols_for_groupby)
sleep_episodes = sleep_data[["start_timestamp"]].first()
sleep_episodes["end_timestamp"] = sleep_data["end_timestamp"].last()
sleep_episodes.reset_index(inplace=True, drop=False)
return sleep_episodes
timezone = snakemake.params["timezone"]
column_format = snakemake.params["column_format"]
fitbit_data_type = snakemake.params["fitbit_data_type"]
sleep_episode_timestamp = snakemake.params["sleep_episode_timestamp"]
with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f:
participant_file = yaml.safe_load(f)
local_start_date = pd.Timestamp(participant_file["FITBIT"]["START_DATE"])
local_end_date = pd.Timestamp(participant_file["FITBIT"]["END_DATE"]) + pd.DateOffset(1)
if column_format == "JSON":
json_raw = pd.read_csv(snakemake.input["raw_data"])
parsed_data = parseSleepData(json_raw, fitbit_data_type)
elif column_format == "PLAIN_TEXT":
if fitbit_data_type == "summary":
parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_start_date_time", "local_end_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
elif fitbit_data_type == "intraday":
parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
else:
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
# Drop duplicates
parsed_data.drop_duplicates(inplace=True)
if parsed_data.shape[0] > 0 and fitbit_data_type == "summary":
if sleep_episode_timestamp != "start" and sleep_episode_timestamp != "end":
raise ValueError("SLEEP_EPISODE_TIMESTAMP can only be one of ['start', 'end'].")
# Column name to be considered as the event datetime
datetime_column = "local_" + sleep_episode_timestamp + "_date_time"
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
parsed_data = parsed_data.loc[(parsed_data[datetime_column] >= local_start_date) & (parsed_data[datetime_column] < local_end_date)]
# Sort by "local_start_date_time" column
parsed_data.sort_values(by="local_start_date_time", ascending=True, inplace=True)
parsed_data["timestamp"] = parsed_data[datetime_column].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
parsed_data.dropna(subset=['timestamp'], inplace=True)
parsed_data.drop(["local_start_date_time", "local_end_date_time"], axis = 1, inplace=True)
if parsed_data.shape[0] > 0 and fitbit_data_type == "intraday":
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
# Sort by "local_date_time" column
parsed_data.sort_values(by="local_date_time", ascending=True, inplace=True)
parsed_data["start_timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
parsed_data.dropna(subset=['start_timestamp'], inplace=True)
parsed_data["end_timestamp"] = parsed_data["start_timestamp"] + ((parsed_data["duration"] - 1) * 1000) + 999
parsed_data["unified_level"] = np.where(parsed_data["level"].isin(["awake", "restless", "wake"]), 0, 1)
# Put consecutive rows with the same "level" field together and merge episodes
parsed_data.insert(2, "level_episode_id", (parsed_data[["type_episode_id", "level"]] != parsed_data[["type_episode_id", "level"]].shift()).any(axis=1).cumsum())
parsed_data = mergeSleepEpisodes(parsed_data, ["type_episode_id", "level_episode_id", "level", "unified_level", "is_main_sleep", "type"])
parsed_data.to_csv(snakemake.output[0], index=False)
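
The level_episode_id column inserted above is a cumulative count of the points where type_episode_id or level changes, which is what lets mergeSleepEpisodes collapse consecutive rows with the same level into one episode. A small pandas sketch of the same idea on made-up intraday rows:

# Illustrative: collapse consecutive rows with the same sleep level into episodes.
import pandas as pd

df = pd.DataFrame({
    "type_episode_id": [0, 0, 0, 0],
    "level": ["light", "light", "deep", "light"],
    "start_timestamp": [0, 30000, 60000, 90000],
    "end_timestamp": [29999, 59999, 89999, 119999],
})
keys = ["type_episode_id", "level"]
df["level_episode_id"] = (df[keys] != df[keys].shift()).any(axis=1).cumsum()
episodes = (df.groupby(["type_episode_id", "level_episode_id", "level"])
              .agg(start_timestamp=("start_timestamp", "first"),
                   end_timestamp=("end_timestamp", "last"))
              .reset_index())
print(episodes)  # light 0-59999, deep 60000-89999, light 90000-119999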

View File

@ -1,78 +0,0 @@
import json, yaml
import pandas as pd
import numpy as np
from datetime import datetime, timezone
from math import trunc
STEPS_COLUMNS = ("device_id", "steps", "local_date_time", "timestamp")
def parseStepsData(steps_data, fitbit_data_type):
if steps_data.empty:
return pd.DataFrame(columns=STEPS_COLUMNS)
device_id = steps_data["device_id"].iloc[0]
records = []
# Parse JSON into individual records
for record in steps_data.fitbit_data:
record = json.loads(record) # Parse text into JSON
if "activities-steps" in record.keys():
curr_date = datetime.strptime(record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
# Parse summary data
if fitbit_data_type == "summary":
row_summary = (device_id,
record["activities-steps"][0]["value"],
curr_date,
0)
records.append(row_summary)
# Parse intraday data
if (fitbit_data_type == "intraday") and ("activities-steps-intraday" in record.keys()):
dataset = record["activities-steps-intraday"]["dataset"]
for data in dataset:
d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
d_datetime = datetime.combine(curr_date, d_time)
row_intraday = (device_id,
data["value"],
d_datetime,
0)
records.append(row_intraday)
parsed_data = pd.DataFrame(data=records, columns=STEPS_COLUMNS)
return parsed_data
timezone = snakemake.params["timezone"]
column_format = snakemake.params["column_format"]
fitbit_data_type = snakemake.params["fitbit_data_type"]
with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f:
participant_file = yaml.safe_load(f)
local_start_date = pd.Timestamp(participant_file["FITBIT"]["START_DATE"])
local_end_date = pd.Timestamp(participant_file["FITBIT"]["END_DATE"]) + pd.DateOffset(1)
if column_format == "JSON":
json_raw = pd.read_csv(snakemake.input["raw_data"])
parsed_data = parseStepsData(json_raw, fitbit_data_type)
elif column_format == "PLAIN_TEXT":
parsed_data = pd.read_csv(snakemake.input["raw_data"], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
else:
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
# Only keep dates in the range of [local_start_date, local_end_date)
if not pd.isnull(local_start_date) and not pd.isnull(local_end_date):
parsed_data = parsed_data.loc[(parsed_data["local_date_time"] >= local_start_date) & (parsed_data["local_date_time"] < local_end_date)]
if parsed_data.shape[0] > 0:
parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone, ambiguous=False, nonexistent="NaT").dropna().astype(np.int64) // 10**6
parsed_data.dropna(subset=['timestamp'], inplace=True)
parsed_data.to_csv(snakemake.output[0], index=False)
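
For the intraday branch, each activities-steps-intraday record carries minute-level time/value pairs that are combined with that record's date. A made-up record and the rows it yields (values are invented; field names follow the code above):

# Illustrative: minute-level step entries as parseStepsData reads them.
import json
from datetime import datetime

record = json.loads("""{
  "activities-steps": [{"dateTime": "2021-01-01", "value": "8100"}],
  "activities-steps-intraday": {"dataset": [
    {"time": "00:00:00", "value": 0},
    {"time": "00:01:00", "value": 12}
  ]}
}""")
curr_date = datetime.strptime(record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
rows = [(datetime.combine(curr_date, datetime.strptime(d["time"], "%H:%M:%S").time()), d["value"])
        for d in record["activities-steps-intraday"]["dataset"]]
print(rows)  # [(2021-01-01 00:00:00, 0), (2021-01-01 00:01:00, 12)]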

View File

@ -1,53 +0,0 @@
import pandas as pd
import pytz, json
from datetime import datetime
from fitbit_parse_sensors.fitbit_parse_heartrate import parseHeartrateData
from fitbit_parse_sensors.fitbit_parse_sleep import parseSleepData
from fitbit_parse_sensors.fitbit_parse_steps import parseStepsData
from fitbit_parse_sensors.fitbit_parse_calories import parseCaloriesData
NIGHT = "night"
MORNING = "morning"
AFTERNOON = "afternoon"
EVENING = "evening"
HOUR2EPOCH = [NIGHT] * 6 + [MORNING] * 6 + [AFTERNOON] * 6 + [EVENING] * 6
def drop_duplicates(data, local_timezone):
"""
Data is pulled in an intraday manner, so every pull repeats the records
already retrieved earlier that day. Sort by timestamp, then keep only the
last record for each local (timezone-aware) date.
"""
local_date_col = data["timestamp"].apply(lambda ts: str(datetime.fromtimestamp(ts/1000, tz=local_timezone).date()))
data = data.assign(local_date=local_date_col.values)
data.sort_values(by="timestamp", ascending=True, inplace=True)
data.drop_duplicates(subset="local_date", keep="last", inplace=True)
return data
fitbit_data = pd.read_csv(snakemake.input[0])
local_timezone = pytz.timezone(snakemake.params["local_timezone"])
sensor = snakemake.params["fitbit_sensor"]
data = fitbit_data[fitbit_data["fitbit_data_type"] == sensor]
data = drop_duplicates(data, local_timezone)
if sensor == "heartrate":
summary_data, intraday_data = parseHeartrateData(data, HOUR2EPOCH)
elif sensor == "sleep":
summary_data, intraday_data = parseSleepData(data, HOUR2EPOCH)
elif sensor == "steps":
summary_data, intraday_data = parseStepsData(data, HOUR2EPOCH)
elif sensor == "calories":
summary_data, intraday_data = parseCaloriesData(data, HOUR2EPOCH)
else:
raise ValueError("We only support heartrate, sleep, step, or calories sensors on Fitbit devices.")
# Summary data does not exist for steps and calories as it is not provided by Fitbit's API
if sensor == "heartrate" or sensor == "sleep":
summary_data.to_csv(snakemake.output["summary_data"], index=False)
intraday_data.to_csv(snakemake.output["intraday_data"], index=False)
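
drop_duplicates above keeps only the last pull for each local day, since every intraday pull repeats the records already retrieved that day. A minimal pandas sketch with made-up timestamps and an assumed timezone:

# Illustrative: keep only the last record pulled for each local date.
import pandas as pd
import pytz
from datetime import datetime

tz = pytz.timezone("America/New_York")               # assumed local timezone
data = pd.DataFrame({"timestamp": [1609502400000, 1609513200000, 1609599600000]})
data["local_date"] = data["timestamp"].apply(
    lambda ts: str(datetime.fromtimestamp(ts / 1000, tz=tz).date()))
data = data.sort_values("timestamp").drop_duplicates(subset="local_date", keep="last")
print(data)  # one row per local date; the latest pull wins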

Some files were not shown because too many files have changed in this diff.