diff --git a/example_profile/Snakefile b/example_profile/Snakefile
index 92416fcd..cd3f2a5e 100644
--- a/example_profile/Snakefile
+++ b/example_profile/Snakefile
@@ -15,6 +15,15 @@ if len(config["PIDS"]) == 0:
 for provider in config["PHONE_DATA_YIELD"]["PROVIDERS"].keys():
     if config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]:
+
+        allowed_phone_sensors = get_phone_sensor_names()
+        if not (set(config["PHONE_DATA_YIELD"]["SENSORS"]) <= set(allowed_phone_sensors)):
+            raise ValueError('\nInvalid sensor(s) for PHONE_DATA_YIELD. config["PHONE_DATA_YIELD"]["SENSORS"] can have '
+                'one or more of the following phone sensors: {}.\nInstead you provided "{}".\n'
+                'Keep in mind that the sensors\' TABLE attribute must point to a valid database table'\
+                .format(', '.join(allowed_phone_sensors),
+                    ', '.join(set(config["PHONE_DATA_YIELD"]["SENSORS"]) - set(allowed_phone_sensors))))
+
         files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_DATA_YIELD"]["SENSORS"])))
         files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv", pid=config["PIDS"]))
@@ -36,7 +45,6 @@ for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
     if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
         files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
-        files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime_unified.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
         files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
@@ -76,9 +84,12 @@ for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys():
 for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
     if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]:
+        # if "PHONE_SCREEN" in config["PHONE_DATA_YIELD"]["SENSORS"]:# not used for now because we took episodepersensedminutes out of the list of supported features
+        #     files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"]))
+        # else:
+        #     raise ValueError("Error: Add PHONE_SCREEN (and as many PHONE_SENSORS as you have in your database) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. 
This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data)") files_to_compute.extend(expand("data/raw/{pid}/phone_screen_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime_unified.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) @@ -142,27 +153,78 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") +# We can delete these if's as soon as we add feature PROVIDERS to any of these sensors +if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict): + for provider in config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"].keys(): + if config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_crashes_with_datetime_with_categories.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_applications_crashes_features/phone_applications_crashes_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_crashes.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +if isinstance(config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"], dict): + for provider in config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"].keys(): + if config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_notifications_with_datetime_with_categories.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_applications_notifications_features/phone_applications_notifications_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_NOTIFICATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_notifications.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + 
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +if isinstance(config["PHONE_KEYBOARD"]["PROVIDERS"], dict): + for provider in config["PHONE_KEYBOARD"]["PROVIDERS"].keys(): + if config["PHONE_KEYBOARD"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_keyboard_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_keyboard_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_keyboard_features/phone_keyboard_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_KEYBOARD"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_keyboard.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +if isinstance(config["PHONE_LOG"]["PROVIDERS"], dict): + for provider in config["PHONE_LOG"]["PROVIDERS"].keys(): + if config["PHONE_LOG"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_log_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_log_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_log_features/phone_log_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOG"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_log.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): if config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]: - if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] == "FUSED_RESAMPLED": + if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] in ["FUSED_RESAMPLED","ALL_RESAMPLED"]: if "PHONE_LOCATIONS" in config["PHONE_DATA_YIELD"]["SENSORS"]: files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"])) else: - raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") + raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. 
This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data) which is used to resample fused location data (ALL_RESAMPLED and RESAMPLED_FUSED)") files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") +for provider in config["FITBIT_DATA_YIELD"]["PROVIDERS"].keys(): + if config["FITBIT_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_data_yield.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + for provider in config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"].keys(): if config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_raw.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_parsed.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_parsed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_summary_features/fitbit_heartrate_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_summary.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) @@ -171,8 +233,7 @@ for provider in config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"].keys(): for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys(): if config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv", pid=config["PIDS"])) + 
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_intraday.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) @@ -181,18 +242,28 @@ for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys(): for provider in config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"].keys(): if config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_raw.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_parsed.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_summary_features/fitbit_sleep_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_sleep_summary.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") +for provider in config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"].keys(): + if config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_features/fitbit_sleep_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_sleep_intraday.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys(): if config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_raw.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed.csv", pid=config["PIDS"])) - 
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_summary.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) @@ -201,13 +272,78 @@ for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys(): for provider in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].keys(): if config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_raw.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_intraday.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["EMPATICA_ACCELEROMETER"]["PROVIDERS"].keys(): + if config["EMPATICA_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/empatica_accelerometer_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/empatica_accelerometer_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/empatica_accelerometer.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["EMPATICA_HEARTRATE"]["PROVIDERS"].keys(): + if config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + 
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_heartrate.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + + +for provider in config["EMPATICA_TEMPERATURE"]["PROVIDERS"].keys(): + if config["EMPATICA_TEMPERATURE"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/empatica_temperature_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/empatica_temperature_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/empatica_temperature_features/empatica_temperature_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_TEMPERATURE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/empatica_temperature.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"].keys(): + if config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/empatica_electrodermal_activity_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/empatica_electrodermal_activity_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/empatica_electrodermal_activity.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys(): + if config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/empatica_blood_volume_pulse_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/empatica_blood_volume_pulse_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/empatica_blood_volume_pulse.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys(): + if config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/empatica_inter_beat_interval_raw.csv", 
pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/empatica_inter_beat_interval_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/empatica_inter_beat_interval.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +if isinstance(config["EMPATICA_TAGS"]["PROVIDERS"], dict): + for provider in config["EMPATICA_TAGS"]["PROVIDERS"].keys(): + if config["EMPATICA_TAGS"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/empatica_tags_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/empatica_tags_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/empatica_tags_features/empatica_tags_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["EMPATICA_TAGS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/empatica_tags.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + # Visualization for Data Exploration if config["HISTOGRAM_PHONE_DATA_YIELD"]["PLOT"]: files_to_compute.append("reports/data_exploration/histogram_phone_data_yield.html") diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml index 25aa5863..f3bbb44a 100644 --- a/example_profile/example_config.yaml +++ b/example_profile/example_config.yaml @@ -1,54 +1,62 @@ -# See https://www.rapids.science/setup/configuration/#database-credentials -DATABASE_GROUP: &database_group - MY_GROUP +######################################################################################################################## +# GLOBAL CONFIGURATION # +######################################################################################################################## -# See https://www.rapids.science/setup/configuration/#timezone-of-your-study -TIMEZONE: &timezone - America/New_York - -# See https://www.rapids.science/setup/configuration/#participant-files +# See https://www.rapids.science/latest/setup/configuration/#participant-files PIDS: [example01, example02] -# See https://www.rapids.science/setup/configuration/#automatic-creation-of-participant-files +# See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files CREATE_PARTICIPANT_FILES: - SOURCE: - TYPE: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE - DATABASE_GROUP: *database_group - CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format - TIMEZONE: *timezone + CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format PHONE_SECTION: - ADD: TRUE + ADD: True DEVICE_ID_COLUMN: device_id # column name IGNORED_DEVICE_IDS: [] FITBIT_SECTION: - ADD: TRUE - DEVICE_ID_COLUMN: device_id # column name + ADD: True + DEVICE_ID_COLUMN: fitbit_id # column 
name + IGNORED_DEVICE_IDS: [] + EMPATICA_SECTION: + ADD: False + DEVICE_ID_COLUMN: empatica_id # column name IGNORED_DEVICE_IDS: [] -# See https://www.rapids.science/setup/configuration/#time-segments +# See https://www.rapids.science/latest/setup/configuration/#time-segments TIME_SEGMENTS: &time_segments TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT FILE: "example_profile/exampleworkflow_timesegments.csv" INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs - +# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study +TIMEZONE: + TYPE: SINGLE + SINGLE: + TZCODE: America/New_York + MULTIPLE: + TZCODES_FILE: data/external/multiple_timezones_example.csv + IF_MISSING_TZCODE: STOP + DEFAULT_TZCODE: America/New_York + FITBIT: + ALLOW_MULTIPLE_TZ_PER_DEVICE: False + INFER_FROM_SMARTPHONE_TZ: False ######################################################################################################################## # PHONE # ######################################################################################################################## -# See https://www.rapids.science/setup/configuration/#device-data-source-configuration -PHONE_DATA_CONFIGURATION: - SOURCE: - TYPE: DATABASE - DATABASE_GROUP: *database_group - DEVICE_ID_COLUMN: device_id # column name - TIMEZONE: - TYPE: SINGLE # SINGLE or MULTIPLE - VALUE: *timezone # IF TYPE=SINGLE, see docs +# See https://www.rapids.science/latest/setup/configuration/#device-data-source-configuration +PHONE_DATA_STREAMS: + USE: aware_csv + + # AVAILABLE: + aware_mysql: + DATABASE_GROUP: MY_GROUP + aware_csv: + FOLDER: data/external/example_workflow # Sensors ------ +# https://www.rapids.science/latest/features/phone-accelerometer/ PHONE_ACCELEROMETER: CONTAINER: accelerometer PROVIDERS: @@ -67,10 +75,11 @@ PHONE_ACCELEROMETER: SRC_FOLDER: "panda" # inside src/features/phone_accelerometer SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/phone-activity-recognition/ PHONE_ACTIVITY_RECOGNITION: CONTAINER: - ANDROID: plugin_google_activity_recognition - IOS: plugin_ios_activity_recognition + ANDROID: plugin_google_activity_recognition.csv + IOS: plugin_ios_activity_recognition.csv EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode. PROVIDERS: RAPIDS: @@ -83,8 +92,19 @@ PHONE_ACTIVITY_RECOGNITION: SRC_FOLDER: "rapids" # inside src/features/phone_activity_recognition SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/phone-applications-crashes/ +PHONE_APPLICATIONS_CRASHES: + CONTAINER: applications_crashes + APPLICATION_CATEGORIES: + CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store) + CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv" + UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE + SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. 
If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway + PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD + +# See https://www.rapids.science/latest/features/phone-applications-foreground/ PHONE_APPLICATIONS_FOREGROUND: - CONTAINER: applications_foreground + CONTAINER: applications_foreground.csv APPLICATION_CATEGORIES: CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store) CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv" @@ -104,8 +124,19 @@ PHONE_APPLICATIONS_FOREGROUND: SRC_FOLDER: "rapids" # inside src/features/phone_applications_foreground SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/phone-applications-notifications/ +PHONE_APPLICATIONS_NOTIFICATIONS: + CONTAINER: applications_notifications + APPLICATION_CATEGORIES: + CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store) + CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv" + UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE + SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway + PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD + +# See https://www.rapids.science/latest/features/phone-battery/ PHONE_BATTERY: - CONTAINER: battery + CONTAINER: battery.csv EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode. 
PROVIDERS: RAPIDS: @@ -114,17 +145,36 @@ PHONE_BATTERY: SRC_FOLDER: "rapids" # inside src/features/phone_battery SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/phone-bluetooth/ PHONE_BLUETOOTH: - CONTAINER: bluetooth + CONTAINER: bluetooth.csv PROVIDERS: RAPIDS: COMPUTE: True FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth SRC_LANGUAGE: "r" + DORYAB: + COMPUTE: False + FEATURES: + ALL: + DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"] + SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + OWN: + DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"] + SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + OTHERS: + DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"] + SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"] + SRC_FOLDER: "doryab" # inside src/features/phone_bluetooth + SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/phone-calls/ PHONE_CALLS: - CONTAINER: calls + CONTAINER: calls.csv PROVIDERS: RAPIDS: COMPUTE: True @@ -136,10 +186,11 @@ PHONE_CALLS: SRC_LANGUAGE: "r" SRC_FOLDER: "rapids" # inside src/features/phone_calls +# See https://www.rapids.science/latest/features/phone-conversation/ PHONE_CONVERSATION: CONTAINER: - ANDROID: plugin_studentlife_audio_android - IOS: plugin_studentlife_audio + ANDROID: plugin_studentlife_audio_android.csv + IOS: plugin_studentlife_audio.csv PROVIDERS: RAPIDS: COMPUTE: True @@ -154,18 +205,25 @@ PHONE_CONVERSATION: SRC_FOLDER: "rapids" # inside src/features/phone_conversation SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/phone-data-yield/ PHONE_DATA_YIELD: - SENSORS: [PHONE_ACCELEROMETER, PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE] + SENSORS: [PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE] PROVIDERS: RAPIDS: COMPUTE: True FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours] - MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1 representing the number of minutes with at least + MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid. 
SRC_LANGUAGE: "r" SRC_FOLDER: "rapids" # inside src/features/phone_data_yield +# See https://www.rapids.science/latest/features/phone-keyboard/ +PHONE_KEYBOARD: + CONTAINER: keyboard + PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD + +# See https://www.rapids.science/latest/features/phone-light/ PHONE_LIGHT: - CONTAINER: light + CONTAINER: light.csv PROVIDERS: RAPIDS: COMPUTE: True @@ -173,21 +231,32 @@ PHONE_LIGHT: SRC_FOLDER: "rapids" # inside src/features/phone_light SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/phone-locations/ PHONE_LOCATIONS: - CONTAINER: locations - LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED + CONTAINER: locations.csv + LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row + HOME_INFERENCE: + DBSCAN_EPS: 10 # meters + DBSCAN_MINSAMPLES: 5 + THRESHOLD_STATIC : 1 # km/h + CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS + PROVIDERS: DORYAB: COMPUTE: True - FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"] + FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"] + ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius DBSCAN_EPS: 10 # meters DBSCAN_MINSAMPLES: 5 THRESHOLD_STATIC : 1 # km/h - MAXIMUM_GAP_ALLOWED: 300 + MAXIMUM_ROW_GAP: 300 + MAXIMUM_ROW_DURATION: 60 MINUTES_DATA_USED: False - SAMPLING_FREQUENCY: 0 + CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT + CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS + RADIUS_FOR_HOME: 100 SRC_FOLDER: "doryab" # inside src/features/phone_locations SRC_LANGUAGE: "python" @@ -195,13 +264,21 @@ PHONE_LOCATIONS: COMPUTE: False FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"] ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. 
This number means there's a 68% probability the true location is within this radius - TIMEZONE: *timezone + IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates gruped by minute) were used to compute features SRC_FOLDER: "barnett" # inside src/features/phone_locations SRC_LANGUAGE: "r" +# See https://www.rapids.science/latest/features/phone-log/ +PHONE_LOG: + CONTAINER: + ANDROID: aware_log + IOS: ios_aware_log + PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD + +# See https://www.rapids.science/latest/features/phone-messages/ PHONE_MESSAGES: - CONTAINER: messages + CONTAINER: messages.csv PROVIDERS: RAPIDS: COMPUTE: True @@ -212,8 +289,9 @@ PHONE_MESSAGES: SRC_LANGUAGE: "r" SRC_FOLDER: "rapids" # inside src/features/phone_messages +# See https://www.rapids.science/latest/features/phone-screen/ PHONE_SCREEN: - CONTAINER: screen + CONTAINER: screen.csv PROVIDERS: RAPIDS: COMPUTE: True @@ -225,8 +303,9 @@ PHONE_SCREEN: SRC_FOLDER: "rapids" # inside src/features/phone_screen SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/phone-wifi-connected/ PHONE_WIFI_CONNECTED: - CONTAINER: "sensor_wifi" + CONTAINER: sensor_wifi.csv PROVIDERS: RAPIDS: COMPUTE: True @@ -234,8 +313,9 @@ PHONE_WIFI_CONNECTED: SRC_FOLDER: "rapids" # inside src/features/phone_wifi_connected SRC_LANGUAGE: "r" +# See https://www.rapids.science/latest/features/phone-wifi-visible/ PHONE_WIFI_VISIBLE: - CONTAINER: "wifi" + CONTAINER: wifi.csv PROVIDERS: RAPIDS: COMPUTE: True @@ -250,20 +330,43 @@ PHONE_WIFI_VISIBLE: ######################################################################################################################## # See https://www.rapids.science/latest/setup/configuration/#device-data-source-configuration -FITBIT_DATA_CONFIGURATION: - SOURCE: - TYPE: DATABASE # DATABASE or FILES (set each [FITBIT_SENSOR][CONTAINER] attribute with a table name or a file path accordingly) - COLUMN_FORMAT: JSON # JSON or PLAIN_TEXT - DATABASE_GROUP: *database_group - DEVICE_ID_COLUMN: device_id # column name - TIMEZONE: - TYPE: SINGLE # Fitbit only supports SINGLE timezones - VALUE: *timezone # see docs - HIDDEN: - SINGLE_FITBIT_TABLE: TRUE +FITBIT_DATA_STREAMS: + USE: fitbitjson_csv + + # AVAILABLE: + fitbitjson_mysql: + DATABASE_GROUP: MY_GROUP + SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp. + + fitbitparsed_mysql: + DATABASE_GROUP: MY_GROUP + SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp. + fitbitjson_csv: + FOLDER: data/external/example_workflow + SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp. + + fitbitparsed_csv: + FOLDER: data/external/fitbit_csv + SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp. + +# Sensors ------ + +# See https://www.rapids.science/latest/features/fitbit-data-yield/ +FITBIT_DATA_YIELD: + SENSOR: FITBIT_HEARTRATE_INTRADAY + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours] + MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid. 
+ SRC_LANGUAGE: "r" + SRC_FOLDER: "rapids" # inside src/features/fitbit_data_yield + + +# See https://www.rapids.science/latest/features/fitbit-heartrate-summary/ FITBIT_HEARTRATE_SUMMARY: - CONTAINER: fitbit_data + CONTAINER: fitbit_data.csv PROVIDERS: RAPIDS: COMPUTE: True @@ -271,8 +374,9 @@ FITBIT_HEARTRATE_SUMMARY: SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_summary SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/fitbit-heartrate-intraday/ FITBIT_HEARTRATE_INTRADAY: - CONTAINER: fitbit_data + CONTAINER: fitbit_data.csv PROVIDERS: RAPIDS: COMPUTE: True @@ -280,8 +384,9 @@ FITBIT_HEARTRATE_INTRADAY: SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_intraday SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/fitbit-sleep-summary/ FITBIT_SLEEP_SUMMARY: - CONTAINER: fitbit_data + CONTAINER: fitbit_data.csv SLEEP_EPISODE_TIMESTAMP: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp. PROVIDERS: RAPIDS: @@ -291,8 +396,45 @@ FITBIT_SLEEP_SUMMARY: SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_summary SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/fitbit-sleep-intraday/ +FITBIT_SLEEP_INTRADAY: + CONTAINER: sleep_intraday + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: + LEVELS_AND_TYPES_COMBINING_ALL: True + LEVELS_AND_TYPES: [countepisode, sumduration, maxduration, minduration, avgduration, medianduration, stdduration] + RATIOS_TYPE: [count, duration] + RATIOS_SCOPE: [ACROSS_LEVELS, ACROSS_TYPES, WITHIN_LEVELS, WITHIN_TYPES] + ROUTINE: [starttimefirstmainsleep, endtimelastmainsleep, starttimefirstnap, endtimelastnap] + SLEEP_LEVELS: + CLASSIC: [awake, restless, asleep] + STAGES: [wake, deep, light, rem] + UNIFIED: [awake, asleep] + SLEEP_TYPES: [main, nap] + INCLUDE_SLEEP_LATER_THAN: 0 # a number ranged from 0 (midnight) to 1439 (23:59) + REFERENCE_TIME: MIDNIGHT # chosen from "MIDNIGHT" and "START_OF_THE_SEGMENT" + SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_intraday + SRC_LANGUAGE: "python" + + PRICE: + COMPUTE: False + FEATURES: [avgduration, avgratioduration, avgstarttimeofepisodemain, avgendtimeofepisodemain, avgmidpointofepisodemain, "stdstarttimeofepisodemain", "stdendtimeofepisodemain", "stdmidpointofepisodemain", socialjetlag, meanssdstarttimeofepisodemain, meanssdendtimeofepisodemain, meanssdmidpointofepisodemain, medianssdstarttimeofepisodemain, medianssdendtimeofepisodemain, medianssdmidpointofepisodemain] + SLEEP_LEVELS: + CLASSIC: [awake, restless, asleep] + STAGES: [wake, deep, light, rem] + UNIFIED: [awake, asleep] + DAY_TYPES: [WEEKEND, WEEK, ALL] + GROUP_EPISODES_WITHIN: # by default: today's 6pm to tomorrow's noon + START_TIME: 1080 # number of minutes after the midnight (18:00) 18*60 + LENGTH: 1080 # in minutes (18 hours) 18*60 + SRC_FOLDER: "price" # inside src/features/fitbit_sleep_intraday + SRC_LANGUAGE: "python" + +# See https://www.rapids.science/latest/features/fitbit-steps-summary/ FITBIT_STEPS_SUMMARY: - CONTAINER: fitbit_data + CONTAINER: fitbit_data.csv PROVIDERS: RAPIDS: COMPUTE: True @@ -300,8 +442,9 @@ FITBIT_STEPS_SUMMARY: SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_summary SRC_LANGUAGE: "python" +# See https://www.rapids.science/latest/features/fitbit-steps-intraday/ FITBIT_STEPS_INTRADAY: - CONTAINER: fitbit_data + CONTAINER: fitbit_data.csv PROVIDERS: RAPIDS: COMPUTE: True @@ -314,27 +457,113 @@ FITBIT_STEPS_INTRADAY: SRC_FOLDER: "rapids" # inside 
src/features/fitbit_steps_intraday SRC_LANGUAGE: "python" +######################################################################################################################## +# EMPATICA # +######################################################################################################################## + +EMPATICA_DATA_STREAMS: + USE: empatica_zip + + # AVAILABLE: + empatica_zip: + FOLDER: data/external/empatica + +# Sensors ------ + +# See https://www.rapids.science/latest/features/empatica-accelerometer/ +EMPATICA_ACCELEROMETER: + CONTAINER: ACC + PROVIDERS: + DBDP: + COMPUTE: False + FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"] + SRC_FOLDER: "dbdp" # inside src/features/empatica_accelerometer + SRC_LANGUAGE: "python" + +# See https://www.rapids.science/latest/features/empatica-heartrate/ +EMPATICA_HEARTRATE: + CONTAINER: HR + PROVIDERS: + DBDP: + COMPUTE: False + FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr"] + SRC_FOLDER: "dbdp" # inside src/features/empatica_heartrate + SRC_LANGUAGE: "python" + +# See https://www.rapids.science/latest/features/empatica-temperature/ +EMPATICA_TEMPERATURE: + CONTAINER: TEMP + PROVIDERS: + DBDP: + COMPUTE: False + FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"] + SRC_FOLDER: "dbdp" # inside src/features/empatica_heartrate + SRC_LANGUAGE: "python" + +# See https://www.rapids.science/latest/features/empatica-electrodermal-activity/ +EMPATICA_ELECTRODERMAL_ACTIVITY: + CONTAINER: EDA + PROVIDERS: + DBDP: + COMPUTE: False + FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"] + SRC_FOLDER: "dbdp" # inside src/features/empatica_electrodermal_activity + SRC_LANGUAGE: "python" + +# See https://www.rapids.science/latest/features/empatica-blood-volume-pulse/ +EMPATICA_BLOOD_VOLUME_PULSE: + CONTAINER: BVP + PROVIDERS: + DBDP: + COMPUTE: False + FEATURES: ["maxbvp", "minbvp", "avgbvp", "medianbvp", "modebvp", "stdbvp", "diffmaxmodebvp", "diffminmodebvp", "entropybvp"] + SRC_FOLDER: "dbdp" # inside src/features/empatica_blood_volume_pulse + SRC_LANGUAGE: "python" + +# See https://www.rapids.science/latest/features/empatica-inter-beat-interval/ +EMPATICA_INTER_BEAT_INTERVAL: + CONTAINER: IBI + PROVIDERS: + DBDP: + COMPUTE: False + FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"] + SRC_FOLDER: "dbdp" # inside src/features/inter_beat_interval + SRC_LANGUAGE: "python" + +# See https://www.rapids.science/latest/features/empatica-tags/ +EMPATICA_TAGS: + CONTAINER: TAGS + PROVIDERS: # None implemented yet ######################################################################################################################## # PLOTS # ######################################################################################################################## +# Data quality ------ + +# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#1-histograms-of-phone-data-yield HISTOGRAM_PHONE_DATA_YIELD: PLOT: True -HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT: - PLOT: True - -HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT: - PLOT: True - SENSORS: [PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, 
PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE]
-
+# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#2-heatmaps-of-overall-data-yield
 HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:
   PLOT: True
 
+# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#3-heatmap-of-recorded-phone-sensors
+HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT:
+  PLOT: True
+
+# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#4-heatmap-of-sensor-row-count
+HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:
+  PLOT: False
+  SENSORS: [PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE]
+
+# Features ------
+
+# See https://www.rapids.science/latest/visualizations/feature-visualizations/#1-heatmap-correlation-matrix
 HEATMAP_FEATURE_CORRELATION_MATRIX:
-  PLOT: TRUE
+  PLOT: False
   MIN_ROWS_RATIO: 0.5
   CORR_THRESHOLD: 0.1
   CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
@@ -349,18 +578,14 @@ PARAMS_FOR_ANALYSIS:
   CATEGORICAL_OPERATORS: [mostcommon]
 
   DEMOGRAPHIC:
-    CONTAINER: participant_info
+    FOLDER: data/external/example_workflow
+    CONTAINER: participant_info.csv
     FEATURES: [age, gender, inpatientdays]
     CATEGORICAL_FEATURES: [gender]
-    SOURCE:
-      DATABASE_GROUP: *database_group
-      TIMEZONE: *timezone
 
   TARGET:
-    CONTAINER: participant_target
-    SOURCE:
-      DATABASE_GROUP: *database_group
-      TIMEZONE: *timezone
+    FOLDER: data/external/example_workflow
+    CONTAINER: participant_target.csv
 
   # Cleaning Parameters
   COLS_NAN_THRESHOLD: 0.3
diff --git a/rules/models.smk b/rules/models.smk
index aa7fe0e6..7fe6aa6e 100644
--- a/rules/models.smk
+++ b/rules/models.smk
@@ -1,9 +1,7 @@
 rule download_demographic_data:
     input:
-        participant_file = "data/external/participant_files/{pid}.yaml"
-    params:
-        source = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["SOURCE"],
-        table = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["CONTAINER"],
+        participant_file = "data/external/participant_files/{pid}.yaml",
+        data = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["CONTAINER"]
     output:
         "data/raw/{pid}/participant_info_raw.csv"
     script:
@@ -22,10 +20,8 @@ rule demographic_features:
 
 rule download_target_data:
     input:
-        participant_file = "data/external/participant_files/{pid}.yaml"
-    params:
-        source = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["SOURCE"],
-        table = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["CONTAINER"],
+        participant_file = "data/external/participant_files/{pid}.yaml",
+        data = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["TARGET"]["CONTAINER"]
     output:
         "data/raw/{pid}/participant_target_raw.csv"
     script:
@@ -34,15 +30,19 @@ rule download_target_data:
 rule target_readable_datetime:
     input:
         sensor_input = "data/raw/{pid}/participant_target_raw.csv",
-        time_segments = "data/interim/time_segments/{pid}_time_segments.csv"
+        time_segments = "data/interim/time_segments/{pid}_time_segments.csv",
+        pid_file = "data/external/participant_files/{pid}.yaml",
+        tzcodes_file = input_tzcodes_file,
     params:
-        fixed_timezone = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["SOURCE"]["TIMEZONE"],
+        device_type = "fitbit",
+        timezone_parameters = config["TIMEZONE"],
+        pid = "{pid}",
         time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
         include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
     output:
         "data/raw/{pid}/participant_target_with_datetime.csv"
     script:
-        "../src/data/readable_datetime.R"
+        "../src/data/datetime/readable_datetime.R"
 
 rule parse_targets:
     input:
diff --git a/src/data/workflow_example/download_demographic_data.R b/src/data/workflow_example/download_demographic_data.R
index 0adb1086..fa37c6be 100644
--- a/src/data/workflow_example/download_demographic_data.R
+++ b/src/data/workflow_example/download_demographic_data.R
@@ -1,5 +1,4 @@
 source("renv/activate.R")
-library(RMariaDB)
 library("dplyr", warn.conflicts = F)
 library(readr)
 library(stringr)
@@ -7,16 +6,13 @@ library(yaml)
 
 participant_file <- snakemake@input[["participant_file"]]
-source <- snakemake@params[["source"]]
-table <- snakemake@params[["table"]]
+
 sensor_file <- snakemake@output[[1]]
 
 participant <- read_yaml(participant_file)
 record_id <- participant$PHONE$LABEL
 
-dbEngine <- dbConnect(MariaDB(), default.file = "./.env", group = source$DATABASE_GROUP)
-query <- paste0("SELECT * FROM ", table, " WHERE record_id = '", record_id, "'")
-sensor_data <- dbGetQuery(dbEngine, query)
-dbDisconnect(dbEngine)
+demographic_data = read.csv(snakemake@input[["data"]])
+demographic_data = demographic_data[demographic_data$record_id == record_id, ]
 
-write_csv(sensor_data, sensor_file)
+write_csv(demographic_data, sensor_file)
diff --git a/src/data/workflow_example/download_target_data.R b/src/data/workflow_example/download_target_data.R
index 19ffa8d9..e100a64d 100644
--- a/src/data/workflow_example/download_target_data.R
+++ b/src/data/workflow_example/download_target_data.R
@@ -1,5 +1,4 @@
 source("renv/activate.R")
-library(RMariaDB)
 library("dplyr", warn.conflicts = F)
 library(readr)
 library(stringr)
@@ -8,19 +7,17 @@ library(lubridate)
 
 participant_file <- snakemake@input[["participant_file"]]
-source <- snakemake@params[["source"]]
-table <- snakemake@params[["table"]]
 sensor_file <- snakemake@output[[1]]
 
 participant <- read_yaml(participant_file)
 record_id <- participant$PHONE$LABEL
 
-dbEngine <- dbConnect(MariaDB(), default.file = "./.env", group = source$DATABASE_GROUP)
-query <- paste0("SELECT * FROM ", table, " WHERE record_id = '", record_id, "'")
-sensor_data <- dbGetQuery(dbEngine, query)
-dbDisconnect(dbEngine)
+target_data <- read.csv(snakemake@input[["data"]])
+target_data <- target_data[target_data$record_id == record_id, ]
 
-# generate timestamp based on local_date
-sensor_data$timestamp <- as.numeric(ymd_hms(paste(sensor_data$local_date, "00:00:00"), tz=source$TIMEZONE, quiet=TRUE)) * 1000
+target_data$local_date_time <- paste(target_data$local_date, "00:00:00")
+#target_data <- target_data %>% rename(local_date_time = local_date)
 
-write_csv(sensor_data, sensor_file)
+target_data$timestamp <- 0
+
+write_csv(target_data, sensor_file)