From b4a512faf3d68b7e49d8278ba1cfa1637f8787f4 Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Wed, 25 Nov 2020 16:34:05 -0500
Subject: [PATCH] Add analysis example workflow

---
 Snakefile                                     |  43 +-
 example_profile/Snakefile                     | 493 ++++++++-------
 example_profile/example_config.yaml           | 562 ++++++++++--------
 .../exampleworkflow_daysegments.csv           |   2 +
 rules/common.smk                              |  33 +-
 rules/features.smk                            |  33 +-
 rules/models.smk                              | 248 ++++----
 .../download_demographic_data.R               |  22 +
 .../workflow_example/download_target_data.R   |  26 +
 src/features/__init__.py                      |   0
 src/features/features_utils.py                |  85 ---
 src/features/phone_bluetooth/rapids/main.R    |   2 +-
 .../join_features_from_providers.R            |   2 +-
 ...ge_sensor_features_for_all_participants.R} |   2 +-
 ...sor_features_for_individual_participants.R |  22 +
 .../demographic_features.py                   |   3 +-
 src/models/__init__.py                        |   0
 src/models/clean_features_for_model.R         |  61 --
 src/models/merge_data_for_population_model.py |   8 -
 src/models/merge_features_and_targets.py      |  66 --
 .../merge_features_for_individual_model.R     |  35 --
 src/models/merge_population_model_results.py  |  16 -
 .../nan_cells_ratio_of_cleaned_features.py    |   8 -
 src/models/select_days_to_analyse.py          |  43 --
 src/models/targets.py                         |  18 -
 .../baselines.py}                             |  66 +-
 .../workflow_example/clean_sensor_features.R  |  29 +
 ...atures_and_targets_for_individual_model.py |  10 +
 ...atures_and_targets_for_population_model.py |  27 +
 src/models/{ => workflow_example}/modeling.py |  51 +-
 .../{ => workflow_example}/modeling_utils.py  |  25 +-
 src/models/workflow_example/parse_targets.py  |  28 +
 32 files changed, 983 insertions(+), 1086 deletions(-)
 create mode 100644 example_profile/exampleworkflow_daysegments.csv
 create mode 100644 src/data/workflow_example/download_demographic_data.R
 create mode 100644 src/data/workflow_example/download_target_data.R
 delete mode 100644 src/features/__init__.py
 delete mode 100644 src/features/features_utils.py
 rename src/features/{ => utils}/join_features_from_providers.R (84%)
 rename src/{models/merge_features_for_population_model.R => features/utils/merge_sensor_features_for_all_participants.R} (72%)
 create mode 100644 src/features/utils/merge_sensor_features_for_individual_participants.R
 rename src/features/{ => workflow_example}/demographic_features.py (86%)
 delete mode 100644 src/models/__init__.py
 delete mode 100644 src/models/clean_features_for_model.R
 delete mode 100644 src/models/merge_data_for_population_model.py
 delete mode 100644 src/models/merge_features_and_targets.py
 delete mode 100644 src/models/merge_features_for_individual_model.R
 delete mode 100644 src/models/merge_population_model_results.py
 delete mode 100644 src/models/nan_cells_ratio_of_cleaned_features.py
 delete mode 100644 src/models/select_days_to_analyse.py
 delete mode 100644 src/models/targets.py
 rename src/models/{baseline.py => workflow_example/baselines.py} (54%)
 create mode 100644 src/models/workflow_example/clean_sensor_features.R
 create mode 100644 src/models/workflow_example/merge_features_and_targets_for_individual_model.py
 create mode 100644 src/models/workflow_example/merge_features_and_targets_for_population_model.py
 rename src/models/{ => workflow_example}/modeling.py (75%)
 rename src/models/{ => workflow_example}/modeling_utils.py (83%)
 create mode 100644 src/models/workflow_example/parse_targets.py

diff --git a/Snakefile b/Snakefile
index 7082fc10..f7a188d1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -19,6 +19,8 @@ for provider in config["PHONE_DATA_YIELD"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_data_yield_features/phone_data_yield_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_data_yield.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys(): if config["PHONE_MESSAGES"]["PROVIDERS"][provider]["COMPUTE"]: @@ -26,6 +28,8 @@ for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/phone_messages_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_messages_features/phone_messages_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_MESSAGES"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_messages.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_CALLS"]["PROVIDERS"].keys(): if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]: @@ -34,6 +38,8 @@ for provider in config["PHONE_CALLS"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime_unified.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_BLUETOOTH"]["PROVIDERS"].keys(): if config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["COMPUTE"]: @@ -41,6 +47,8 @@ for provider in config["PHONE_BLUETOOTH"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/phone_bluetooth_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_bluetooth.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"].keys(): if config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]: @@ -52,7 +60,8 @@ for provider in config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"].keys(): 
files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_features/phone_activity_recognition_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_activity_recognition.csv", pid=config["PIDS"])) - + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys(): if config["PHONE_BATTERY"]["PROVIDERS"][provider]["COMPUTE"]: @@ -62,7 +71,8 @@ for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_battery_features/phone_battery_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BATTERY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_battery.csv", pid=config["PIDS"])) - + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys(): if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]: @@ -78,6 +88,8 @@ for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_screen_features/phone_screen_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_SCREEN"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_screen.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys(): if config["PHONE_LIGHT"]["PROVIDERS"][provider]["COMPUTE"]: @@ -85,6 +97,8 @@ for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/phone_light_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_light_features/phone_light_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LIGHT"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_light.csv", pid=config["PIDS"],)) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_ACCELEROMETER"]["PROVIDERS"].keys(): if config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]: @@ -92,6 +106,8 @@ for provider in 
config["PHONE_ACCELEROMETER"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_accelerometer_features/phone_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_accelerometer.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys(): if config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["COMPUTE"]: @@ -100,6 +116,8 @@ for provider in config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_applications_foreground_features/phone_applications_foreground_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_foreground.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_WIFI_VISIBLE"]["PROVIDERS"].keys(): if config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["COMPUTE"]: @@ -107,6 +125,8 @@ for provider in config["PHONE_WIFI_VISIBLE"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_visible_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_visible_features/phone_wifi_visible_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_visible.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_WIFI_CONNECTED"]["PROVIDERS"].keys(): if config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["COMPUTE"]: @@ -114,6 +134,8 @@ for provider in config["PHONE_WIFI_CONNECTED"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_connected_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_connected_features/phone_wifi_connected_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_connected.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + 
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys(): if config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["COMPUTE"]: @@ -122,6 +144,8 @@ for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_with_datetime_unified.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_conversation_features/phone_conversation_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_conversation.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): if config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]: @@ -136,6 +160,8 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") if config["FITBIT_CALORIES"]["TABLE_FORMAT"] not in ["JSON", "CSV"]: raise ValueError("config['FITBIT_CALORIES']['TABLE_FORMAT'] should be JSON or CSV but you typed" + config["FITBIT_CALORIES"]["TABLE_FORMAT"]) @@ -147,6 +173,8 @@ for provider in config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_parsed_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_summary_features/fitbit_heartrate_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_summary.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys(): if config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: @@ -155,6 +183,8 @@ for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), 
provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_intraday.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"].keys(): if config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]: @@ -163,6 +193,8 @@ for provider in config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_summary_features/fitbit_sleep_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_sleep_summary.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") # for provider in config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"].keys(): # if config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: @@ -177,6 +209,8 @@ for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_summary.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].keys(): if config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: @@ -185,12 +219,17 @@ for provider in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_intraday.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") for provider in config["FITBIT_CALORIES"]["PROVIDERS"].keys(): if config["FITBIT_CALORIES"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_{fitbit_data_type}_raw.csv", pid=config["PIDS"], fitbit_data_type=(["json"] if config["FITBIT_CALORIES"]["TABLE_FORMAT"] == "JSON" else ["summary", "intraday"]))) 
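Every provider block in this Snakefile gains the same two merged-feature targets: one all_sensor_features.csv per participant and one study-wide file under all_participants/. A minimal sketch of what those two added lines evaluate to, assuming a hypothetical config with two participant ids (inside a Snakefile, expand() is available without the import):

from snakemake.io import expand  # import only needed to run this sketch outside a Snakefile

config = {"PIDS": ["p01", "p02"]}  # hypothetical participant ids
files_to_compute = []

# expand() formats the {pid} wildcard once per participant id and returns a list of paths
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
# the study-wide merge has no wildcard, so it is appended once as a plain string
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")

# files_to_compute now holds:
#   data/processed/features/p01/all_sensor_features.csv
#   data/processed/features/p02/all_sensor_features.csv
#   data/processed/features/all_participants/all_sensor_features.csv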
files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_{fitbit_data_type}_parsed.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"])) files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_{fitbit_data_type}_parsed_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + # visualization for data exploration # if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]: diff --git a/example_profile/Snakefile b/example_profile/Snakefile index d455f9ff..970cd1b2 100644 --- a/example_profile/Snakefile +++ b/example_profile/Snakefile @@ -13,272 +13,245 @@ files_to_compute = [] if len(config["PIDS"]) == 0: raise ValueError("Add participants IDs to PIDS in config.yaml. Remember to create their participant files in data/external") -if config["PHONE_VALID_SENSED_BINS"]["COMPUTE"] or config["PHONE_VALID_SENSED_DAYS"]["COMPUTE"]: # valid sensed bins is necessary for sensed days, so we add these files anyways if sensed days are requested - if len(config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]) == 0: - raise ValueError("If you want to compute PHONE_VALID_SENSED_BINS or PHONE_VALID_SENSED_DAYS, you need to add at least one table to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml") +for provider in config["PHONE_DATA_YIELD"]["PROVIDERS"].keys(): + if config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=map(str.lower, config["PHONE_DATA_YIELD"]["SENSORS"]))) + files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_data_yield_features/phone_data_yield_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_data_yield.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") - pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"])) - pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"])) - tables_android = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist - tables_ios = [table for table in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist +for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys(): + if config["PHONE_MESSAGES"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_messages_raw.csv", pid=config["PIDS"])) + 
files_to_compute.extend(expand("data/raw/{pid}/phone_messages_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_messages_features/phone_messages_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_MESSAGES"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_messages.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") - for pids,table in zip([pids_android, pids_ios], [tables_android, tables_ios]): - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) +for provider in config["PHONE_CALLS"]["PROVIDERS"].keys(): + if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime_unified.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") -if config["PHONE_VALID_SENSED_DAYS"]["COMPUTE"]: - files_to_compute.extend(expand("data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv", +for provider in config["PHONE_BLUETOOTH"]["PROVIDERS"].keys(): + if config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_bluetooth_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_bluetooth_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_bluetooth_features/phone_bluetooth_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_bluetooth.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"].keys(): + if config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_with_datetime.csv", pid=config["PIDS"])) + 
files_to_compute.extend(expand("data/raw/{pid}/phone_activity_recognition_with_datetime_unified.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes_resampled.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_activity_recognition_features/phone_activity_recognition_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_activity_recognition.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys(): + if config["PHONE_BATTERY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_battery_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes_resampled.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_battery_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_battery_features/phone_battery_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_BATTERY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_battery.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys(): + if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_screen_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime_unified.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_screen_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_screen_features/phone_screen_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_SCREEN"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_screen.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + 
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys(): + if config["PHONE_LIGHT"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_light_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_light_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_light_features/phone_light_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LIGHT"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_light.csv", pid=config["PIDS"],)) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["PHONE_ACCELEROMETER"]["PROVIDERS"].keys(): + if config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_accelerometer_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_accelerometer_features/phone_accelerometer_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_accelerometer.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys(): + if config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_applications_foreground_with_datetime_with_categories.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_applications_foreground_features/phone_applications_foreground_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_foreground.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["PHONE_WIFI_VISIBLE"]["PROVIDERS"].keys(): + if config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_visible_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_visible_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_visible_features/phone_wifi_visible_{language}_{provider_key}.csv", 
pid=config["PIDS"], language=config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_visible.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["PHONE_WIFI_CONNECTED"]["PROVIDERS"].keys(): + if config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_connected_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_wifi_connected_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_wifi_connected_features/phone_wifi_connected_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_connected.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["PHONE_CONVERSATION"]["PROVIDERS"].keys(): + if config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_conversation_with_datetime_unified.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_conversation_features/phone_conversation_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CONVERSATION"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_conversation.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): + if config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]: + if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] == "FUSED_RESAMPLED": + if "PHONE_LOCATIONS" in config["PHONE_DATA_YIELD"]["SENSORS"]: + files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"])) + else: + raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. 
This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") + + files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"].keys(): + if config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_parsed.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_summary_parsed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_summary_features/fitbit_heartrate_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_summary.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys(): + if config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_parsed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_intraday.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"].keys(): + if config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_raw.csv", pid=config["PIDS"])) + 
files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_parsed.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_parsed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_summary_features/fitbit_sleep_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_sleep_summary.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys(): + if config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_summary.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + +for provider in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].keys(): + if config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_intraday.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + + +# Analysis Workflow Example +models, scalers = [], [] +for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]: + models = models + [model_name] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) + scalers = scalers + config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name] +results = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"] + +# Demographic features +files_to_compute.extend(expand("data/raw/{pid}/participant_info_raw.csv", pid=config["PIDS"])) +files_to_compute.extend(expand("data/processed/features/{pid}/demographic_features.csv", pid=config["PIDS"])) + +# Targets 
+files_to_compute.extend(expand("data/raw/{pid}/participant_target_raw.csv", pid=config["PIDS"])) +files_to_compute.extend(expand("data/raw/{pid}/participant_target_with_datetime.csv", pid=config["PIDS"])) +files_to_compute.extend(expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"])) + +# Individual model +files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"])) +files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"])) +files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"])) +files_to_compute.extend(expand( + expand("data/processed/models/individual_model/{pid}/output_{cv_method}/{{model}}/{{scaler}}/{result}.csv", pid=config["PIDS"], - min_valid_hours_per_day=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_HOURS_PER_DAY"], - min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) + cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"], + result = results), + zip, + model=models, + scaler=scalers)) -if config["MESSAGES"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/messages_{messages_type}_{day_segment}.csv", pid=config["PIDS"], messages_type = config["MESSAGES"]["TYPES"], day_segment = config["MESSAGES"]["DAY_SEGMENTS"])) +# Population model +files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv") +files_to_compute.append("data/processed/models/population_model/input.csv") +files_to_compute.extend(expand("data/processed/models/population_model/output_{cv_method}/baselines.csv", cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"])) +files_to_compute.extend(expand( + expand("data/processed/models/population_model/output_{cv_method}/{{model}}/{{scaler}}/{result}.csv", + cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"], + result = results), + zip, + model=models, + scaler=scalers)) -if config["CALLS"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}_{day_segment}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"], day_segment = config["CALLS"]["DAY_SEGMENTS"])) - -if config["BARNETT_LOCATION"]["COMPUTE"]: - if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": - if config["BARNETT_LOCATION"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]: - files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_resampled.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"])) - else: - raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to 
[PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BARNETT_LOCATION"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/location_barnett_{day_segment}.csv", pid=config["PIDS"], day_segment = config["BARNETT_LOCATION"]["DAY_SEGMENTS"])) - -if config["BLUETOOTH"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/bluetooth_{day_segment}.csv", pid=config["PIDS"], day_segment = config["BLUETOOTH"]["DAY_SEGMENTS"])) - -if config["ACTIVITY_RECOGNITION"]["COMPUTE"]: - pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"])) - pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"])) - - for pids,table in zip([pids_android, pids_ios], [config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]): - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/processed/{pid}/{sensor}_deltas.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/processed/{pid}/activity_recognition_{day_segment}.csv",pid=config["PIDS"], day_segment = config["ACTIVITY_RECOGNITION"]["DAY_SEGMENTS"])) - -if config["BATTERY"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BATTERY"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["BATTERY"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["BATTERY"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/processed/{pid}/battery_{day_segment}.csv", pid = config["PIDS"], day_segment = config["BATTERY"]["DAY_SEGMENTS"])) - -if config["SCREEN"]["COMPUTE"]: - if config["SCREEN"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]: - files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) - else: - raise ValueError("Error: Add your screen table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. 
This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data)") - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["SCREEN"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/screen_deltas.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/processed/{pid}/screen_{day_segment}.csv", pid = config["PIDS"], day_segment = config["SCREEN"]["DAY_SEGMENTS"])) - -if config["LIGHT"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["LIGHT"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["LIGHT"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/light_{day_segment}.csv", pid = config["PIDS"], day_segment = config["LIGHT"]["DAY_SEGMENTS"])) - -if config["ACCELEROMETER"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["ACCELEROMETER"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/accelerometer_{day_segment}.csv", pid = config["PIDS"], day_segment = config["ACCELEROMETER"]["DAY_SEGMENTS"])) - -if config["APPLICATIONS_FOREGROUND"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/interim/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/applications_foreground_{day_segment}.csv", pid = config["PIDS"], day_segment = config["APPLICATIONS_FOREGROUND"]["DAY_SEGMENTS"])) - -if config["WIFI"]["COMPUTE"]: - if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/processed/{pid}/wifi_{day_segment}.csv", pid = config["PIDS"], day_segment = config["WIFI"]["DAY_SEGMENTS"])) - - if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/processed/{pid}/wifi_{day_segment}.csv", pid = config["PIDS"], day_segment = config["WIFI"]["DAY_SEGMENTS"])) - -if config["HEARTRATE"]["COMPUTE"]: - 
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["HEARTRATE"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"])) - files_to_compute.extend(expand("data/processed/{pid}/fitbit_heartrate_{day_segment}.csv", pid = config["PIDS"], day_segment = config["HEARTRATE"]["DAY_SEGMENTS"])) - -if config["STEP"]["COMPUTE"]: - if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED": - files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["STEP"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_step_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday"])) - files_to_compute.extend(expand("data/processed/{pid}/fitbit_step_{day_segment}.csv", pid = config["PIDS"], day_segment = config["STEP"]["DAY_SEGMENTS"])) - -if config["SLEEP"]["COMPUTE"]: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SLEEP"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_{fitbit_data_type}_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["intraday", "summary"])) - files_to_compute.extend(expand("data/processed/{pid}/fitbit_sleep_{day_segment}.csv", pid = config["PIDS"], day_segment = config["SLEEP"]["DAY_SEGMENTS"])) - -if config["CONVERSATION"]["COMPUTE"]: - pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"])) - pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"])) - - for pids,table in zip([pids_android, pids_ios], [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"]]): - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/processed/{pid}/conversation_{day_segment}.csv",pid=config["PIDS"], day_segment = config["CONVERSATION"]["DAY_SEGMENTS"])) - -if config["DORYAB_LOCATION"]["COMPUTE"]: - if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": - if config["DORYAB_LOCATION"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["DB_TABLES"]: - files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_resampled.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"])) - else: - raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][DB_TABLES] in config.yaml. 
This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"])) - files_to_compute.extend(expand("data/processed/{pid}/location_doryab_{segment}.csv", pid=config["PIDS"], segment = config["DORYAB_LOCATION"]["DAY_SEGMENTS"])) - -# visualization for data exploration -if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]: - files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_features_correlations.html", min_valid_hours_per_day=config["HEATMAP_FEATURES_CORRELATIONS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) - -if config["HISTOGRAM_VALID_SENSED_HOURS"]["PLOT"]: - files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/histogram_valid_sensed_hours.html", min_valid_hours_per_day=config["HISTOGRAM_VALID_SENSED_HOURS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) - -if config["HEATMAP_DAYS_BY_SENSORS"]["PLOT"]: - files_to_compute.extend(expand("reports/interim/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{pid}/heatmap_days_by_sensors.html", pid=config["PIDS"], min_valid_hours_per_day=config["HEATMAP_DAYS_BY_SENSORS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) - files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_days_by_sensors_all_participants.html", min_valid_hours_per_day=config["HEATMAP_DAYS_BY_SENSORS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) - -if config["HEATMAP_SENSED_BINS"]["PLOT"]: - files_to_compute.extend(expand("reports/interim/heatmap_sensed_bins/{pid}/heatmap_sensed_bins.html", pid=config["PIDS"])) - files_to_compute.extend(["reports/data_exploration/heatmap_sensed_bins_all_participants.html"]) - -if config["OVERALL_COMPLIANCE_HEATMAP"]["PLOT"]: - files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/overall_compliance_heatmap.html", min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) - -# analysis example -if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]: - rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"] - cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"] - models, scalers, rows_nan_thresholds, cols_nan_thresholds = [], [], [], [] - for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]: - models = models + [model_name] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) * len(rows_nan_threshold) - scalers = scalers + config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name] * len(rows_nan_threshold) - rows_nan_thresholds = rows_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) 
for threshold in rows_nan_threshold)) - cols_nan_thresholds = cols_nan_thresholds + list(itertools.chain.from_iterable([threshold] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]) for threshold in cols_nan_threshold)) - results = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"] + ["merged_population_model_results"] - - files_to_compute.extend(expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv", - pid = config["PIDS"], - min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], - min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"], - source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"])) - files_to_compute.extend(expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv", - min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], - min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"], - source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"])) - files_to_compute.extend(expand( - expand("data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv", - pid = config["PIDS"], - min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], - min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"], - days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], - days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], - cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], - source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), - zip, - rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], - cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) - files_to_compute.extend(expand( - expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv", - min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], - min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"], - days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], - days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], - cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], - source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), - zip, - rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], - cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) - files_to_compute.extend(expand("data/processed/data_for_population_model/demographic_features.csv")) - 
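The targets in this removed block are built with Snakemake's nested expand pattern: the inner expand fills most wildcards but leaves the double-braced thresholds untouched, and the outer expand with zip then pairs the row and column NaN thresholds element-wise instead of taking their full cross product. Below is a minimal runnable sketch of that pattern, with illustrative paths and values rather than the pipeline's real targets.

# Hedged sketch of the nested expand()/zip pattern used in the block above (toy paths and values).
from snakemake.io import expand

rows_nan_threshold = [0.1, 0.3]
cols_nan_threshold = [0.1, 0.3]

# Inner expand: fills {source} but keeps {{rows_nan_threshold}} and {{cols_nan_threshold}}
# as literal wildcards for the second pass.
templates = expand(
    "data/processed/{{rows_nan_threshold}}|{{cols_nan_threshold}}/{source}_clean.csv",
    source=["phone_features", "fitbit_features"],
)

# Outer expand with zip: pairs the two threshold lists element-wise
# (0.1 with 0.1, 0.3 with 0.3) instead of producing every combination.
targets = expand(
    templates,
    zip,
    rows_nan_threshold=rows_nan_threshold,
    cols_nan_threshold=cols_nan_threshold,
)
print(targets)  # 4 paths: 2 sources x 2 zipped threshold pairs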
files_to_compute.extend(expand("data/processed/data_for_population_model/targets_{summarised}.csv", - summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"])) - files_to_compute.extend(expand( - expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv", - min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], - min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"], - days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], - days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], - cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], - source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]), - zip, - rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], - cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) - files_to_compute.extend(expand( - expand("data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv", - min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], - min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"], - days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], - days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], - cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], - source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"], - summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]), - zip, - rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], - cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) - files_to_compute.extend(expand( - expand("data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/baseline/{cv_method}/{source}_{day_segment}_{summarised}.csv", - min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], - min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"], - days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], - days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], - cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], - cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"], - source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"], - summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]), - zip, - rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], - cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"])) - files_to_compute.extend(expand( - 
expand("data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{{rows_nan_threshold}}|{{cols_nan_threshold}}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{{model}}/{cv_method}/{source}_{day_segment}_{summarised}_{{scaler}}/{result}.csv", - min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], - min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"], - days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"], - days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"], - cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], - cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"], - source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"], - day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"], - summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"], - result = results), - zip, - rows_nan_threshold = rows_nan_thresholds, - cols_nan_threshold = cols_nan_thresholds, - model = models, - scaler = scalers)) rule all: input: diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml index ff59e3cb..47197e4d 100644 --- a/example_profile/example_config.yaml +++ b/example_profile/example_config.yaml @@ -1,313 +1,365 @@ -# Participants to include in the analysis -# You must create a file for each participant named pXXX containing their device_id. This can be done manually or automatically -PIDS: [example01, example02] +# See https://www.rapids.science/setup/configuration/#database-credentials +DATABASE_GROUP: &database_group + RAPIDS_EXAMPLE -# Global var with common day segments -DAY_SEGMENTS: &day_segments - [daily] - -# Global timezone -# Use codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones -# Double check your code, for example EST is not US Eastern Time. 
+# See https://www.rapids.science/setup/configuration/#timezone-of-your-study TIMEZONE: &timezone America/New_York -DATABASE_GROUP: &database_group - MY_GROUP +# See https://www.rapids.science/setup/configuration/#participant-files +PIDS: [t01, t02] -DOWNLOAD_PARTICIPANTS: - IGNORED_DEVICE_IDS: [] # for example "5a1dd68c-6cd1-48fe-ae1e-14344ac5215f" - GROUP: *database_group +# See https://www.rapids.science/setup/configuration/#automatic-creation-of-participant-files +CREATE_PARTICIPANT_FILES: + SOURCE: + TYPE: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE + DATABASE_GROUP: *database_group + CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format + TIMEZONE: *timezone + PHONE_SECTION: + ADD: TRUE + DEVICE_ID_COLUMN: device_id # column name + IGNORED_DEVICE_IDS: [] + FITBIT_SECTION: + ADD: TRUE + DEVICE_ID_COLUMN: device_id # column name + IGNORED_DEVICE_IDS: [] -# Download data config -DOWNLOAD_DATASET: - GROUP: *database_group +# See https://www.rapids.science/setup/configuration/#day-segments +DAY_SEGMENTS: &day_segments + TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT + FILE: "example_profile/exampleworkflow_daysegments.csv" + INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs -# Readable datetime config -READABLE_DATETIME: - FIXED_TIMEZONE: *timezone +# See https://www.rapids.science/setup/configuration/#device-data-source-configuration +DEVICE_DATA: + PHONE: + SOURCE: + TYPE: DATABASE + DATABASE_GROUP: *database_group + DEVICE_ID_COLUMN: device_id # column name + TIMEZONE: + TYPE: SINGLE # SINGLE or MULTIPLE + VALUE: *timezone # IF TYPE=SINGLE, see docs + FITBIT: + SOURCE: + TYPE: DATABASE # DATABASE or FILES (set each FITBIT_SENSOR TABLE attribute accordingly with a table name or a file path) + COLUMN_FORMAT: JSON # JSON or PLAIN_TEXT + DATABASE_GROUP: *database_group + DEVICE_ID_COLUMN: device_id # column name + TIMEZONE: + TYPE: SINGLE # Fitbit only supports SINGLE timezones + VALUE: *timezone # see docs -PHONE_VALID_SENSED_BINS: - COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features - BIN_SIZE: &bin_size 5 # (in minutes) - # Add as many sensor tables as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS. - # If you are extracting screen or Barnett's location features, screen and locations tables are mandatory. 
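The DAY_SEGMENTS block added above switches the example workflow to a PERIODIC segments file (example_profile/exampleworkflow_daysegments.csv, whose two lines appear later in this patch with the columns label, start_time, length, repeats_on, repeats_value). Below is a hedged pandas sketch of loading and sanity-checking such a file; the checks are illustrative and not the validation RAPIDS itself performs.

# Hedged sketch: load a periodic day-segments CSV and verify its expected columns (illustrative checks only).
import pandas as pd

EXPECTED_COLUMNS = ["label", "start_time", "length", "repeats_on", "repeats_value"]

def load_day_segments(path: str) -> pd.DataFrame:
    segments = pd.read_csv(path)
    missing = set(EXPECTED_COLUMNS) - set(segments.columns)
    if missing:
        raise ValueError(f"Day segments file is missing columns: {sorted(missing)}")
    if segments["label"].duplicated().any():
        raise ValueError("Day segment labels should be unique")
    return segments

segments = load_day_segments("example_profile/exampleworkflow_daysegments.csv")
print(segments)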
- DB_TABLES: [messages, calls, locations, plugin_google_activity_recognition, plugin_ios_activity_recognition, battery, screen, light, applications_foreground, plugin_studentlife_audio_android, plugin_studentlife_audio, wifi, sensor_wifi, bluetooth, applications_notifications, aware_log, ios_status_monitor, push_notification, significant, timezone, touch, keyboard] +############## PHONE ########################################################### +################################################################################ -PHONE_VALID_SENSED_DAYS: - COMPUTE: False - MIN_VALID_HOURS_PER_DAY: &min_valid_hours_per_day [16, 20] # (out of 24) MIN_HOURS_PER_DAY - MIN_VALID_BINS_PER_HOUR: &min_valid_bins_per_hour [12] # (out of 60min/BIN_SIZE bins) +PHONE_DATA_YIELD: + SENSORS: [PHONE_ACCELEROMETER, PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE] + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours] + MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1 representing the number of minutes with at least + SRC_LANGUAGE: "r" + SRC_FOLDER: "rapids" # inside src/features/phone_data_yield # Communication SMS features config, TYPES and FEATURES keys need to match -MESSAGES: - COMPUTE: True - DB_TABLE: messages - TYPES : [received, sent] - FEATURES: - received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] - sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] - DAY_SEGMENTS: *day_segments +PHONE_MESSAGES: + TABLE: messages + PROVIDERS: + RAPIDS: + COMPUTE: True + MESSAGES_TYPES : [received, sent] + FEATURES: + received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] + sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] + SRC_LANGUAGE: "r" + SRC_FOLDER: "rapids" # inside src/features/phone_messages # Communication call features config, TYPES and FEATURES keys need to match -CALLS: - COMPUTE: True - DB_TABLE: calls - TYPES: [missed, incoming, outgoing] - FEATURES: - missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact] - incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact] - outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact] - DAY_SEGMENTS: *day_segments +PHONE_CALLS: + TABLE: calls + PROVIDERS: + RAPIDS: + COMPUTE: True + CALL_TYPES: [missed, incoming, outgoing] + FEATURES: + missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact] + incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact] + outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact] + SRC_LANGUAGE: "r" + SRC_FOLDER: "rapids" # inside src/features/phone_calls -APPLICATION_GENRES: - CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE 
(genres are scrapped from the Play Store) - CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv" - UPDATE_CATALOGUE_FILE: false # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE - SCRAPE_MISSING_GENRES: false # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway +PHONE_LOCATIONS: + TABLE: locations + LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED + FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold + FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row + PROVIDERS: + DORYAB: + COMPUTE: True + FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"] + DBSCAN_EPS: 10 # meters + DBSCAN_MINSAMPLES: 5 + THRESHOLD_STATIC : 1 # km/h + MAXIMUM_GAP_ALLOWED: 300 + MINUTES_DATA_USED: False + SAMPLING_FREQUENCY: 0 + SRC_FOLDER: "doryab" # inside src/features/phone_locations + SRC_LANGUAGE: "python" -RESAMPLE_FUSED_LOCATION: - CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold - TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row - TIMEZONE: *timezone + BARNETT: + COMPUTE: False + FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"] + ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius + TIMEZONE: *timezone + MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates gruped by minute) were used to compute features + SRC_FOLDER: "barnett" # inside src/features/phone_locations + SRC_LANGUAGE: "r" -BARNETT_LOCATION: - COMPUTE: False - DB_TABLE: locations - DAY_SEGMENTS: [daily] # These features are only available on a daily basis - FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"] - LOCATIONS_TO_USE: ALL # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED - ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. 
This number means there's a 68% probability the true location is within this radius
-    TIMEZONE: *timezone
-    MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates gruped by minute) were used to compute features
+PHONE_BLUETOOTH:
+  TABLE: bluetooth
+  PROVIDERS:
+    RAPIDS:
+      COMPUTE: True
+      FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
+      SRC_FOLDER: "rapids" # inside src/features/phone_bluetooth
+      SRC_LANGUAGE: "r"
-DORYAB_LOCATION:
-    COMPUTE: True
-    DB_TABLE: locations
-    DAY_SEGMENTS: *day_segments
-    FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
-    LOCATIONS_TO_USE: RESAMPLE_FUSED # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED
-    DBSCAN_EPS: 10 # meters
-    DBSCAN_MINSAMPLES: 5
-    THRESHOLD_STATIC : 1 # km/h
-    MAXIMUM_GAP_ALLOWED: 300
-    MINUTES_DATA_USED: False
-BLUETOOTH:
-    COMPUTE: True
-    DB_TABLE: bluetooth
-    DAY_SEGMENTS: *day_segments
-    FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
-
-ACTIVITY_RECOGNITION:
-    COMPUTE: True
-    DB_TABLE:
+PHONE_ACTIVITY_RECOGNITION:
+  TABLE:
     ANDROID: plugin_google_activity_recognition
     IOS: plugin_ios_activity_recognition
-    DAY_SEGMENTS: *day_segments
-    FEATURES: ["count","mostcommonactivity","countuniqueactivities","activitychangecount","sumstationary","summobile","sumvehicle"]
+  EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same activity episode.
+  PROVIDERS:
+    RAPIDS:
+      COMPUTE: True
+      FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"]
+      ACTIVITY_CLASSES:
+        STATIONARY: ["still", "tilting"]
+        MOBILE: ["on_foot", "walking", "running", "on_bicycle"]
+        VEHICLE: ["in_vehicle"]
+      SRC_FOLDER: "rapids" # inside src/features/phone_activity_recognition
+      SRC_LANGUAGE: "python"
-BATTERY:
-    COMPUTE: True
-    DB_TABLE: battery
-    DAY_SEGMENTS: *day_segments
-    FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
+PHONE_BATTERY:
+  TABLE: battery
+  EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
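EPISODE_THRESHOLD_BETWEEN_ROWS above means two consecutive battery rows more than 30 minutes apart are split into separate episodes. A rough pandas sketch of that gap rule follows, assuming a data frame with a Unix timestamp column in milliseconds; it is an illustration, not the RAPIDS implementation.

# Hedged sketch: split sensor rows into episodes with a 30-minute gap rule (not the RAPIDS code).
import pandas as pd

def assign_episodes(rows: pd.DataFrame, threshold_minutes: int = 30) -> pd.DataFrame:
    rows = rows.sort_values("timestamp").copy()
    gap_minutes = rows["timestamp"].diff() / (1000 * 60)  # timestamps assumed to be in milliseconds
    # A new episode starts whenever the gap to the previous row exceeds the threshold.
    rows["episode_id"] = (gap_minutes > threshold_minutes).cumsum()
    return rows

battery = pd.DataFrame({"timestamp": [0, 10 * 60 * 1000, 50 * 60 * 1000, 55 * 60 * 1000]})
print(assign_episodes(battery)["episode_id"].tolist())  # [0, 0, 1, 1]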
+ PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"] + SRC_FOLDER: "rapids" # inside src/features/phone_battery + SRC_LANGUAGE: "python" -SCREEN: - COMPUTE: True - DB_TABLE: screen - DAY_SEGMENTS: *day_segments - REFERENCE_HOUR_FIRST_USE: 0 - IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable - IGNORE_EPISODES_LONGER_THAN: 0 # in minutes, set to 0 to disable - FEATURES_DELTAS: ["countepisode", "episodepersensedminutes", "sumduration", "maxduration", "minduration", "avgduration", "stdduration", "firstuseafter"] - EPISODE_TYPES: ["unlock"] +PHONE_SCREEN: + TABLE: screen + PROVIDERS: + RAPIDS: + COMPUTE: True + REFERENCE_HOUR_FIRST_USE: 0 + IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable + IGNORE_EPISODES_LONGER_THAN: 0 # in minutes, set to 0 to disable + FEATURES: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration", "firstuseafter"] # "episodepersensedminutes" needs to be added later + EPISODE_TYPES: ["unlock"] + SRC_FOLDER: "rapids" # inside src/features/phone_screen + SRC_LANGUAGE: "python" -LIGHT: - COMPUTE: True - DB_TABLE: light - DAY_SEGMENTS: *day_segments - FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"] +PHONE_LIGHT: + TABLE: light + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"] + SRC_FOLDER: "rapids" # inside src/features/phone_light + SRC_LANGUAGE: "python" -ACCELEROMETER: - COMPUTE: False - DB_TABLE: accelerometer - DAY_SEGMENTS: *day_segments - FEATURES: - MAGNITUDE: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"] - EXERTIONAL_ACTIVITY_EPISODE: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"] - NONEXERTIONAL_ACTIVITY_EPISODE: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"] - VALID_SENSED_MINUTES: True +PHONE_ACCELEROMETER: + TABLE: accelerometer + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"] + SRC_FOLDER: "rapids" # inside src/features/phone_accelerometer + SRC_LANGUAGE: "python" + + PANDA: + COMPUTE: False + VALID_SENSED_MINUTES: False + FEATURES: + exertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"] + nonexertional_activity_episode: ["sumduration", "maxduration", "minduration", "avgduration", "medianduration", "stdduration"] + SRC_FOLDER: "panda" # inside src/features/phone_accelerometer + SRC_LANGUAGE: "python" -APPLICATIONS_FOREGROUND: - COMPUTE: True - DB_TABLE: applications_foreground - DAY_SEGMENTS: *day_segments - SINGLE_CATEGORIES: ["all", "email"] - MULTIPLE_CATEGORIES: - social: ["socialnetworks", "socialmediatools"] - entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"] - SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps - EXCLUDED_CATEGORIES: ["system_apps"] - EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] - FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"] +PHONE_APPLICATIONS_FOREGROUND: + TABLE: 
applications_foreground + APPLICATION_CATEGORIES: + CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store) + CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv" + UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE + SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway + PROVIDERS: + RAPIDS: + COMPUTE: True + SINGLE_CATEGORIES: ["all", "email"] + MULTIPLE_CATEGORIES: + social: ["socialnetworks", "socialmediatools"] + entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"] + SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps + EXCLUDED_CATEGORIES: ["system_apps"] + EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] + FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"] + SRC_FOLDER: "rapids" # inside src/features/phone_applications_foreground + SRC_LANGUAGE: "python" -HEARTRATE: - COMPUTE: True - DB_TABLE: fitbit_data - DAY_SEGMENTS: *day_segments - SUMMARY_FEATURES: ["restinghr"] # calories features' accuracy depend on the accuracy of the participants fitbit profile (e.g. heigh, weight) use with care: ["caloriesoutofrange", "caloriesfatburn", "caloriescardio", "caloriespeak"] - INTRADAY_FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "minutesonoutofrangezone", "minutesonfatburnzone", "minutesoncardiozone", "minutesonpeakzone"] +PHONE_WIFI_VISIBLE: + TABLE: "wifi" + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] + SRC_FOLDER: "rapids" # inside src/features/phone_wifi_visible + SRC_LANGUAGE: "r" -STEP: - COMPUTE: True - DB_TABLE: fitbit_data - DAY_SEGMENTS: *day_segments - EXCLUDE_SLEEP: - EXCLUDE: False - TYPE: FIXED # FIXED OR FITBIT_BASED (CONFIGURE FITBIT's SLEEP DB_TABLE) - FIXED: - START: "23:00" - END: "07:00" - FEATURES: - ALL_STEPS: ["sumallsteps", "maxallsteps", "minallsteps", "avgallsteps", "stdallsteps"] - SEDENTARY_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"] - ACTIVE_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"] - THRESHOLD_ACTIVE_BOUT: 10 # steps - INCLUDE_ZERO_STEP_ROWS: False +PHONE_WIFI_CONNECTED: + TABLE: "sensor_wifi" + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] + SRC_FOLDER: "rapids" # inside src/features/phone_wifi_connected + SRC_LANGUAGE: "r" -SLEEP: - COMPUTE: True - DB_TABLE: fitbit_data - DAY_SEGMENTS: *day_segments - SLEEP_TYPES: ["main", "nap", "all"] - SUMMARY_FEATURES: ["sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgefficiency", "countepisode"] - -WIFI: - COMPUTE: True - DB_TABLE: - VISIBLE_ACCESS_POINTS: "wifi" # if you only have a CONNECTED_ACCESS_POINTS table, set this value to "" - CONNECTED_ACCESS_POINTS: "sensor_wifi" # if 
you only have a VISIBLE_ACCESS_POINTS table, set this value to "" - DAY_SEGMENTS: *day_segments - FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] - -CONVERSATION: - COMPUTE: True - DB_TABLE: +PHONE_CONVERSATION: + TABLE: ANDROID: plugin_studentlife_audio_android IOS: plugin_studentlife_audio - DAY_SEGMENTS: *day_segments - FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration", - "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy", - "avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction", + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration", + "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy", + "noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy", + "voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction", "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction", "unknownexpectedfraction","countconversation"] - RECORDINGMINUTES: 1 - PAUSEDMINUTES : 3 + RECORDING_MINUTES: 1 + PAUSED_MINUTES : 3 + SRC_FOLDER: "rapids" # inside src/features/phone_conversation + SRC_LANGUAGE: "python" -### Visualizations ################################################################ -HEATMAP_FEATURES_CORRELATIONS: - PLOT: True - MIN_ROWS_RATIO: 0.5 - MIN_VALID_HOURS_PER_DAY: *min_valid_hours_per_day - MIN_VALID_BINS_PER_HOUR: *min_valid_bins_per_hour - PHONE_FEATURES: [activity_recognition, applications_foreground, battery, calls_incoming, calls_missed, calls_outgoing, conversation, light, location_doryab, messages_received, messages_sent, screen] - FITBIT_FEATURES: [fitbit_heartrate, fitbit_step, fitbit_sleep] - CORR_THRESHOLD: 0.1 - CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"} +############## FITBIT ########################################################## +################################################################################ -HISTOGRAM_VALID_SENSED_HOURS: - PLOT: True - MIN_VALID_HOURS_PER_DAY: *min_valid_hours_per_day - MIN_VALID_BINS_PER_HOUR: *min_valid_bins_per_hour +FITBIT_HEARTRATE_SUMMARY: + TABLE: fitbit_data + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["maxrestinghr", "minrestinghr", "avgrestinghr", "medianrestinghr", "moderestinghr", "stdrestinghr", "diffmaxmoderestinghr", "diffminmoderestinghr", "entropyrestinghr"] # calories features' accuracy depend on the accuracy of the participants fitbit profile (e.g. 
height, weight) use these with care: ["sumcaloriesoutofrange", "maxcaloriesoutofrange", "mincaloriesoutofrange", "avgcaloriesoutofrange", "mediancaloriesoutofrange", "stdcaloriesoutofrange", "entropycaloriesoutofrange", "sumcaloriesfatburn", "maxcaloriesfatburn", "mincaloriesfatburn", "avgcaloriesfatburn", "mediancaloriesfatburn", "stdcaloriesfatburn", "entropycaloriesfatburn", "sumcaloriescardio", "maxcaloriescardio", "mincaloriescardio", "avgcaloriescardio", "mediancaloriescardio", "stdcaloriescardio", "entropycaloriescardio", "sumcaloriespeak", "maxcaloriespeak", "mincaloriespeak", "avgcaloriespeak", "mediancaloriespeak", "stdcaloriespeak", "entropycaloriespeak"] + SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_summary + SRC_LANGUAGE: "python" -HEATMAP_DAYS_BY_SENSORS: - PLOT: True - MIN_VALID_HOURS_PER_DAY: *min_valid_hours_per_day - MIN_VALID_BINS_PER_HOUR: *min_valid_bins_per_hour - EXPECTED_NUM_OF_DAYS: -1 - DB_TABLES: [applications_foreground, battery, bluetooth, calls, light, locations, messages, screen, wifi, sensor_wifi, plugin_google_activity_recognition, plugin_ios_activity_recognition, plugin_studentlife_audio_android, plugin_studentlife_audio] +FITBIT_HEARTRATE_INTRADAY: + TABLE: fitbit_data + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "minutesonoutofrangezone", "minutesonfatburnzone", "minutesoncardiozone", "minutesonpeakzone"] + SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_intraday + SRC_LANGUAGE: "python" +FITBIT_STEPS_SUMMARY: + TABLE: fitbit_data + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["maxsumsteps", "minsumsteps", "avgsumsteps", "mediansumsteps", "stdsumsteps"] + SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_summary + SRC_LANGUAGE: "python" -HEATMAP_SENSED_BINS: - PLOT: True - BIN_SIZE: *bin_size +FITBIT_STEPS_INTRADAY: + TABLE: fitbit_data + PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: + STEPS: ["sum", "max", "min", "avg", "std"] + SEDENTARY_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"] + ACTIVE_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"] + THRESHOLD_ACTIVE_BOUT: 10 # steps + INCLUDE_ZERO_STEP_ROWS: False + SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_intraday + SRC_LANGUAGE: "python" -OVERALL_COMPLIANCE_HEATMAP: - PLOT: True - ONLY_SHOW_VALID_DAYS: False - EXPECTED_NUM_OF_DAYS: -1 - BIN_SIZE: *bin_size - MIN_VALID_HOURS_PER_DAY: *min_valid_hours_per_day - MIN_VALID_BINS_PER_HOUR: *min_valid_bins_per_hour +FITBIT_SLEEP_SUMMARY: + TABLE: fitbit_data + SLEEP_EPISODE_TIMESTAMP: end # summary sleep episodes are considered as events based on either the start timestamp or end timestamp. 
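SLEEP_EPISODE_TIMESTAMP above controls whether a summary sleep episode is anchored to day segments by its start or its end time. A toy pandas sketch of that choice follows, using hypothetical start_timestamp/end_timestamp column names rather than the pipeline's actual schema.

# Hedged sketch: pick the timestamp that anchors each summary sleep episode (hypothetical column names).
import pandas as pd

def sleep_event_times(sleep: pd.DataFrame, sleep_episode_timestamp: str = "end") -> pd.Series:
    column = "end_timestamp" if sleep_episode_timestamp == "end" else "start_timestamp"
    # Each summary episode is treated as an event at this single point in time.
    return pd.to_datetime(sleep[column], unit="ms")

episodes = pd.DataFrame({
    "start_timestamp": [1_600_000_000_000],
    "end_timestamp":   [1_600_028_800_000],  # eight hours later
})
print(sleep_event_times(episodes, "end").iloc[0])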
+ PROVIDERS: + RAPIDS: + COMPUTE: True + FEATURES: ["countepisode", "avgefficiency", "sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgdurationafterwakeup", "avgdurationasleep", "avgdurationawake", "avgdurationtofallasleep", "avgdurationinbed"] + SLEEP_TYPES: ["main", "nap", "all"] + SRC_FOLDER: "rapids" # inside src/features/fitbit_sleep_summary + SRC_LANGUAGE: "python" + +### Analysis Workflow Example ################################################## +################################################################################ -### Example Analysis ################################################################ PARAMS_FOR_ANALYSIS: - COMPUTE: True - GROUNDTRUTH_TABLE: participant_info - TARGET_TABLE: participant_target - SOURCES: &sources ["phone_features", "fitbit_features", "phone_fitbit_features"] - DAY_SEGMENTS: *day_segments - PHONE_FEATURES: [activity_recognition, applications_foreground, battery, bluetooth, calls_incoming, calls_missed, calls_outgoing, conversation, light, location_doryab, messages_received, messages_sent, screen, wifi] - FITBIT_FEATURES: [fitbit_heartrate, fitbit_step, fitbit_sleep] - PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile - DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays] - CATEGORICAL_DEMOGRAPHIC_FEATURES: ["gender"] - FEATURES_EXCLUDE_DAY_IDX: True + CATEGORICAL_OPERATORS: [mostcommon] - # Whether or not to include only days with enough valid sensed hours - # logic can be found in rule phone_valid_sensed_days of rules/preprocessing.snakefile - DROP_VALID_SENSED_DAYS: - ENABLED: True - - # Whether or not to include certain days in the analysis, logic can be found in rule days_to_analyse of rules/mystudy.snakefile - # If you want to include all days downloaded for each participant, set ENABLED to False - DAYS_TO_ANALYSE: - ENABLED: True - DAYS_BEFORE_SURGERY: 6 #15 - DAYS_IN_HOSPITAL: F # T or F - DAYS_AFTER_DISCHARGE: 5 #7 + DEMOGRAPHIC: + TABLE: participant_info + FEATURES: [age, gender, inpatientdays] + CATEGORICAL_FEATURES: [gender] + SOURCE: + DATABASE_GROUP: *database_group + TIMEZONE: *timezone + + TARGET: + TABLE: participant_target + SOURCE: + DATABASE_GROUP: *database_group + TIMEZONE: *timezone # Cleaning Parameters - COLS_NAN_THRESHOLD: [0.1, 0.3] + COLS_NAN_THRESHOLD: 0.3 COLS_VAR_THRESHOLD: True - ROWS_NAN_THRESHOLD: [0.1, 0.3] - PARTICIPANT_DAYS_BEFORE_THRESHOLD: 3 - PARTICIPANT_DAYS_AFTER_THRESHOLD: 3 - - # Extract summarised features from daily features with any of the following substrings - NUMERICAL_OPERATORS: ["count", "sum", "length", "avg", "restinghr"] - CATEGORICAL_OPERATORS: ["mostcommon"] - - MODEL_NAMES: ["LogReg", "kNN", "SVM", "DT", "RF", "GB", "XGBoost", "LightGBM"] - CV_METHODS: ["LeaveOneOut"] - SUMMARISED: ["notsummarised"] # "summarised" or "notsummarised" - RESULT_COMPONENTS: ["fold_predictions", "fold_metrics", "overall_results", "fold_feature_importances"] + ROWS_NAN_THRESHOLD: 0.3 + DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75 + + MODEL_NAMES: [LogReg, kNN , SVM, DT, RF, GB, XGBoost, LightGBM] + CV_METHODS: [LeaveOneOut] + RESULT_COMPONENTS: [fold_predictions, fold_metrics, overall_results, fold_feature_importances] MODEL_SCALER: - LogReg: ["notnormalized", "minmaxscaler", "standardscaler", "robustscaler"] - kNN: ["minmaxscaler", "standardscaler", "robustscaler"] - SVM: ["minmaxscaler", "standardscaler", "robustscaler"] - DT: ["notnormalized"] - RF: ["notnormalized"] 
- GB: ["notnormalized"] - XGBoost: ["notnormalized"] - LightGBM: ["notnormalized"] + LogReg: [notnormalized, minmaxscaler, standardscaler, robustscaler] + kNN: [minmaxscaler, standardscaler, robustscaler] + SVM: [minmaxscaler, standardscaler, robustscaler] + DT: [notnormalized] + RF: [notnormalized] + GB: [notnormalized] + XGBoost: [notnormalized] + LightGBM: [notnormalized] MODEL_HYPERPARAMS: LogReg: {"clf__C": [0.01, 0.1, 1, 10, 100], "clf__solver": ["newton-cg", "lbfgs", "liblinear", "saga"], "clf__penalty": ["l2"]} kNN: - {"clf__n_neighbors": [1, 3, 5], "clf__weights": ["uniform", "distance"], "clf__metric": ["euclidean", "manhattan", "minkowski"]} + {"clf__n_neighbors": [3, 5, 7], "clf__weights": ["uniform", "distance"], "clf__metric": ["euclidean", "manhattan", "minkowski"]} SVM: {"clf__C": [0.01, 0.1, 1, 10, 100], "clf__gamma": ["scale", "auto"], "clf__kernel": ["rbf", "poly", "sigmoid"]} DT: - {"clf__criterion": ["gini", "entropy"], "clf__max_depth": [null, 3, 5, 7, 9], "clf__max_features": [null, "auto", "sqrt", "log2"]} + {"clf__criterion": ["gini", "entropy"], "clf__max_depth": [null, 3, 7, 15], "clf__max_features": [null, "auto", "sqrt", "log2"]} RF: - {"clf__n_estimators": [2, 5, 10, 100],"clf__max_depth": [null, 3, 5, 7, 9]} + {"clf__n_estimators": [10, 100, 200],"clf__max_depth": [null, 3, 7, 15]} GB: - {"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [5, 10, 100, 200], "clf__subsample": [0.5, 0.7, 1.0], "clf__max_depth": [3, 5, 7, 9]} + {"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [10, 100, 200], "clf__subsample": [0.5, 0.7, 1.0], "clf__max_depth": [null, 3, 5, 7]} XGBoost: - {"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [5, 10, 100, 200], "clf__num_leaves": [5, 16, 31, 62]} + {"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [10, 100, 200], "clf__max_depth": [3, 5, 7]} LightGBM: - {"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [5, 10, 100, 200], "clf__num_leaves": [5, 16, 31, 62]} + {"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [10, 100, 200], "clf__num_leaves": [3, 5, 7], "clf__colsample_bytree": [0.6, 0.8, 1]} diff --git a/example_profile/exampleworkflow_daysegments.csv b/example_profile/exampleworkflow_daysegments.csv new file mode 100644 index 00000000..4338e809 --- /dev/null +++ b/example_profile/exampleworkflow_daysegments.csv @@ -0,0 +1,2 @@ +label,start_time,length,repeats_on,repeats_value +daily,00:00:00,23H 59M 59S,every_day,0 \ No newline at end of file diff --git a/rules/common.smk b/rules/common.smk index 72852b16..b3b0c815 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -28,30 +28,15 @@ def optional_steps_sleep_input(wildcards): else: return [] -# Models.smk ########################################################################################################### - -def input_merge_features_of_single_participant(wildcards): - if wildcards.source == "phone_fitbit_features": - return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"]["PHONE_FEATURES"] + config["PARAMS_FOR_ANALYSIS"]["FITBIT_FEATURES"], day_segment=wildcards.day_segment) - else: - return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"][wildcards.source.upper()], day_segment=wildcards.day_segment) - -def optional_input_days_to_include(wildcards): - if config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"]: - # This input automatically trigers the rule days_to_analyse in 
mystudy.snakefile - return ["data/interim/{pid}/days_to_analyse" + \ - "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"]) + \ - "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]) + \ - "_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"]) + ".csv"] - else: - return [] - -def optional_input_valid_sensed_days(wildcards): - if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]: - # This input automatically trigers the rule phone_valid_sensed_days in preprocessing.snakefile - return ["data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"] - else: - return [] +def input_merge_sensor_features_for_individual_participants(wildcards): + feature_files = [] + for config_key in config.keys(): + if config_key.startswith(("PHONE", "FITBIT")) and "PROVIDERS" in config[config_key]: + for provider_key, provider in config[config_key]["PROVIDERS"].items(): + if "COMPUTE" in provider.keys() and provider["COMPUTE"]: + feature_files.append("data/processed/features/{pid}/" + config_key.lower() + ".csv") + break + return feature_files # Reports.smk ########################################################################################################### diff --git a/rules/features.smk b/rules/features.smk index 023f27d4..8942beb5 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -1,10 +1,12 @@ rule join_features_from_providers: input: - location_features = find_features_files + sensor_features = find_features_files + wildcard_constraints: + sensor_key = '(phone|fitbit).*' output: "data/processed/features/{pid}/{sensor_key}.csv" script: - "../src/features/join_features_from_providers.R" + "../src/features/utils/join_features_from_providers.R" rule phone_data_yield_python_features: input: @@ -528,15 +530,18 @@ rule fitbit_sleep_summary_r_features: script: "../src/features/entry.R" -# rule fitbit_sleep_features: -# input: -# sleep_summary_data = "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv", -# sleep_intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv" -# params: -# day_segment = "{day_segment}", -# summary_features = config["SLEEP"]["SUMMARY_FEATURES"], -# sleep_types = config["SLEEP"]["SLEEP_TYPES"] -# output: -# "data/processed/{pid}/fitbit_sleep_{day_segment}.csv" -# script: -# "../src/features/fitbit_sleep_features.py" +rule merge_sensor_features_for_individual_participants: + input: + feature_files = input_merge_sensor_features_for_individual_participants + output: + "data/processed/features/{pid}/all_sensor_features.csv" + script: + "../src/features/utils/merge_sensor_features_for_individual_participants.R" + +rule merge_sensor_features_for_all_participants: + input: + feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]) + output: + "data/processed/features/all_participants/all_sensor_features.csv" + script: + "../src/features/utils/merge_sensor_features_for_all_participants.R" diff --git a/rules/models.smk b/rules/models.smk index ce6fcba6..b0742e32 100644 --- a/rules/models.smk +++ b/rules/models.smk @@ -1,174 +1,174 @@ -ruleorder: nan_cells_ratio_of_cleaned_features > merge_features_and_targets - -rule days_to_analyse: +rule download_demographic_data: input: - participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv" + participant_file = "data/external/participant_files/{pid}.yaml" params: - days_before_surgery = 
"{days_before_surgery}", - days_in_hospital = "{days_in_hospital}", - days_after_discharge= "{days_after_discharge}" + source = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["SOURCE"], + table = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["TABLE"], output: - "data/interim/{pid}/days_to_analyse_{days_before_surgery}_{days_in_hospital}_{days_after_discharge}.csv" + "data/raw/{pid}/participant_info_raw.csv" script: - "../src/models/select_days_to_analyse.py" - -rule targets: - input: - participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["TARGET_TABLE"] + "_raw.csv" - params: - pid = "{pid}", - summarised = "{summarised}" - output: - "data/processed/{pid}/targets_{summarised}.csv" - script: - "../src/models/targets.py" + "../src/data/workflow_example/download_demographic_data.R" rule demographic_features: input: - participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv" + participant_info = "data/raw/{pid}/participant_info_raw.csv" params: pid = "{pid}", - features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC_FEATURES"] + features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FEATURES"] output: - "data/processed/{pid}/demographic_features.csv" + "data/processed/features/{pid}/demographic_features.csv" script: - "../src/features/demographic_features.py" + "../src/features/workflow_example/demographic_features.py" -rule merge_features_for_individual_model: +rule download_target_data: input: - feature_files = input_merge_features_of_single_participant, - phone_valid_sensed_days = optional_input_valid_sensed_days, - days_to_include = optional_input_days_to_include + participant_file = "data/external/participant_files/{pid}.yaml" params: - source = "{source}" + source = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["SOURCE"], + table = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["TABLE"], output: - "data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv" + "data/raw/{pid}/participant_target_raw.csv" script: - "../src/models/merge_features_for_individual_model.R" + "../src/data/workflow_example/download_target_data.R" -rule merge_features_for_population_model: +rule target_readable_datetime: input: - feature_files = expand("data/processed/{pid}/data_for_individual_model/{{min_valid_hours_per_day}}hours_{{min_valid_bins_per_hour}}bins/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"]) - output: - "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv" - script: - "../src/models/merge_features_for_population_model.R" - -rule merge_demographicfeatures_for_population_model: - input: - data_files = expand("data/processed/{pid}/demographic_features.csv", pid=config["PIDS"]) - output: - "data/processed/data_for_population_model/demographic_features.csv" - script: - "../src/models/merge_data_for_population_model.py" - -rule merge_targets_for_population_model: - input: - data_files = expand("data/processed/{pid}/targets_{{summarised}}.csv", pid=config["PIDS"]) - output: - "data/processed/data_for_population_model/targets_{summarised}.csv" - script: - "../src/models/merge_data_for_population_model.py" - -rule clean_features_for_individual_model: - input: - rules.merge_features_for_individual_model.output + sensor_input = "data/raw/{pid}/participant_target_raw.csv", + day_segments = "data/interim/day_segments/{pid}_day_segments.csv" params: - features_exclude_day_idx = 
config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"], - cols_nan_threshold = "{cols_nan_threshold}", - cols_var_threshold = "{cols_var_threshold}", - days_before_threshold = "{days_before_threshold}", - days_after_threshold = "{days_after_threshold}", - rows_nan_threshold = "{rows_nan_threshold}", + fixed_timezone = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["SOURCE"]["TIMEZONE"], + day_segments_type = config["DAY_SEGMENTS"]["TYPE"], + include_past_periodic_segments = config["DAY_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"] output: - "data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" + "data/raw/{pid}/participant_target_with_datetime.csv" script: - "../src/models/clean_features_for_model.R" + "../src/data/readable_datetime.R" -rule clean_features_for_population_model: +rule parse_targets: input: - rules.merge_features_for_population_model.output - params: - features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"], - cols_nan_threshold = "{cols_nan_threshold}", - cols_var_threshold = "{cols_var_threshold}", - days_before_threshold = "{days_before_threshold}", - days_after_threshold = "{days_after_threshold}", - rows_nan_threshold = "{rows_nan_threshold}", + targets = "data/raw/{pid}/participant_target_with_datetime.csv", + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" output: - "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" + "data/processed/targets/{pid}/parsed_targets.csv" script: - "../src/models/clean_features_for_model.R" + "../src/models/workflow_example/parse_targets.py" -rule nan_cells_ratio_of_cleaned_features: +rule clean_sensor_features_for_individual_participants: input: - cleaned_features = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv" - output: - "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv" - script: - "../src/models/nan_cells_ratio_of_cleaned_features.py" - -rule merge_features_and_targets: - input: - cleaned_features = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv", - demographic_features = "data/processed/data_for_population_model/demographic_features.csv", - targets = "data/processed/data_for_population_model/targets_{summarised}.csv", + rules.merge_sensor_features_for_individual_participants.output params: - summarised = "{summarised}", - cols_var_threshold = "{cols_var_threshold}", - numerical_operators = config["PARAMS_FOR_ANALYSIS"]["NUMERICAL_OPERATORS"], - categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"], - features_exclude_day_idx = 
config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"], + cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], + cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], + rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], + data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"], output: - "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv" + "data/processed/features/{pid}/all_sensor_features_cleaned.csv" script: - "../src/models/merge_features_and_targets.py" - -rule baseline: + "../src/models/workflow_example/clean_sensor_features.R" + +rule clean_sensor_features_for_all_participants: input: - "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv" + rules.merge_sensor_features_for_all_participants.output + params: + cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], + cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"], + rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], + data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"], + output: + "data/processed/features/all_participants/all_sensor_features_cleaned.csv" + script: + "../src/models/workflow_example/clean_sensor_features.R" + + + + + + + + + + +rule merge_features_and_targets_for_individual_model: + input: + cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv", + targets = "data/processed/targets/{pid}/parsed_targets.csv", + output: + "data/processed/models/individual_model/{pid}/input.csv" + script: + "../src/models/workflow_example/merge_features_and_targets_for_individual_model.py" + +rule merge_features_and_targets_for_population_model: + input: + cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned.csv", + demographic_features = expand("data/processed/features/{pid}/demographic_features.csv", pid=config["PIDS"]), + targets = expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]), + output: + "data/processed/models/population_model/input.csv" + script: + "../src/models/workflow_example/merge_features_and_targets_for_population_model.py" + +rule baselines_for_individual_model: + input: + "data/processed/models/individual_model/{pid}/input.csv" params: cv_method = "{cv_method}", - rowsnan_colsnan_days_colsvar_threshold = "{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}", - demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC_FEATURES"] + colnames_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FEATURES"], output: - "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/baseline/{cv_method}/{source}_{day_segment}_{summarised}.csv" + "data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv" log: - 
"data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/baseline/{cv_method}/{source}_{day_segment}_{summarised}_notes.log" + "data/processed/models/individual_model/{pid}/output_{cv_method}/baselines_notes.log" script: - "../src/models/baseline.py" - - -rule modeling: + "../src/models/workflow_example/baselines.py" + +rule baselines_for_population_model: input: - data = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv" + "data/processed/models/population_model/input.csv" + params: + cv_method = "{cv_method}", + colnames_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FEATURES"], + output: + "data/processed/models/population_model/output_{cv_method}/baselines.csv" + log: + "data/processed/models/population_model/output_{cv_method}/baselines_notes.log" + script: + "../src/models/workflow_example/baselines.py" + +rule modeling_for_individual_participants: + input: + data = "data/processed/models/individual_model/{pid}/input.csv" params: model = "{model}", cv_method = "{cv_method}", - source = "{source}", - day_segment = "{day_segment}", - summarised = "{summarised}", scaler = "{scaler}", categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"], - categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_DEMOGRAPHIC_FEATURES"], + categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["CATEGORICAL_FEATURES"], model_hyperparams = config["PARAMS_FOR_ANALYSIS"]["MODEL_HYPERPARAMS"], - rowsnan_colsnan_days_colsvar_threshold = "{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}" output: - fold_predictions = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_predictions.csv", - fold_metrics = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_metrics.csv", - overall_results = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/overall_results.csv", - fold_feature_importances = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_feature_importances.csv" + fold_predictions = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/fold_predictions.csv", + fold_metrics = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/fold_metrics.csv", + 
overall_results = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/overall_results.csv", + fold_feature_importances = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/fold_feature_importances.csv" log: - "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/notes.log" + "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/notes.log" script: - "../src/models/modeling.py" + "../src/models/workflow_example/modeling.py" -rule merge_population_model_results: +rule modeling_for_all_participants: input: - overall_results = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/overall_results.csv", - nan_cells_ratio = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv", - baseline = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/baseline/{cv_method}/{source}_{day_segment}_{summarised}.csv" + data = "data/processed/models/population_model/input.csv" + params: + model = "{model}", + cv_method = "{cv_method}", + scaler = "{scaler}", + categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"], + categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["CATEGORICAL_FEATURES"], + model_hyperparams = config["PARAMS_FOR_ANALYSIS"]["MODEL_HYPERPARAMS"], output: - "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/merged_population_model_results.csv" + fold_predictions = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/fold_predictions.csv", + fold_metrics = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/fold_metrics.csv", + overall_results = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/overall_results.csv", + fold_feature_importances = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/fold_feature_importances.csv" + log: + "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/notes.log" script: - "../src/models/merge_population_model_results.py" + "../src/models/workflow_example/modeling.py" diff --git a/src/data/workflow_example/download_demographic_data.R b/src/data/workflow_example/download_demographic_data.R new file mode 100644 index 00000000..ecfa2d40 --- /dev/null +++ b/src/data/workflow_example/download_demographic_data.R @@ -0,0 +1,22 @@ +source("renv/activate.R") +library(RMySQL) +library("dplyr", warn.conflicts = F) +library(readr) +library(stringr) +library(yaml) + + +participant_file <- 
snakemake@input[["participant_file"]] +source <- snakemake@params[["source"]] +table <- snakemake@params[["table"]] +sensor_file <- snakemake@output[[1]] + +participant <- read_yaml(participant_file) +record_id <- participant$PHONE$LABEL + +dbEngine <- dbConnect(MySQL(), default.file = "./.env", group = source$DATABASE_GROUP) +query <- paste0("SELECT * FROM ", table, " WHERE record_id = '", record_id, "'") +sensor_data <- dbGetQuery(dbEngine, query) +dbDisconnect(dbEngine) + +write_csv(sensor_data, sensor_file) diff --git a/src/data/workflow_example/download_target_data.R b/src/data/workflow_example/download_target_data.R new file mode 100644 index 00000000..22e6d892 --- /dev/null +++ b/src/data/workflow_example/download_target_data.R @@ -0,0 +1,26 @@ +source("renv/activate.R") +library(RMySQL) +library("dplyr", warn.conflicts = F) +library(readr) +library(stringr) +library(yaml) +library(lubridate) + + +participant_file <- snakemake@input[["participant_file"]] +source <- snakemake@params[["source"]] +table <- snakemake@params[["table"]] +sensor_file <- snakemake@output[[1]] + +participant <- read_yaml(participant_file) +record_id <- participant$PHONE$LABEL + +dbEngine <- dbConnect(MySQL(), default.file = "./.env", group = source$DATABASE_GROUP) +query <- paste0("SELECT * FROM ", table, " WHERE record_id = '", record_id, "'") +sensor_data <- dbGetQuery(dbEngine, query) +dbDisconnect(dbEngine) + +# generate timestamp based on local_date +sensor_data$timestamp <- as.numeric(ymd_hms(paste(sensor_data$local_date, "00:00:00"), tz=source$TIMEZONE, quiet=TRUE)) * 1000 + +write_csv(sensor_data, sensor_file) diff --git a/src/features/__init__.py b/src/features/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/features/features_utils.py b/src/features/features_utils.py deleted file mode 100644 index f824402b..00000000 --- a/src/features/features_utils.py +++ /dev/null @@ -1,85 +0,0 @@ -import pandas as pd -from datetime import datetime, timedelta, time - -SEGMENT = {"night": 0, "morning": 1, "afternoon": 2, "evening": 3} -EPOCH_TIMES = {"night": [0,5], "morning": [6,11], "afternoon": [12,17], "evening": [18,23]} - -def truncateTime(df, segment_column, new_day_segment, datetime_column, date_column, new_time): - df.loc[:, segment_column] = new_day_segment - df.loc[:, datetime_column] = df[date_column].apply(lambda date: datetime.combine(date, new_time)) - return df - -# calculate truncated time differences and truncated extra_cols if it is not empty -def computeTruncatedDifferences(df, extra_cols): - df["truncated_time_diff"] = df["local_end_date_time"] - df["local_start_date_time"] - df["truncated_time_diff"] = df["truncated_time_diff"].apply(lambda time: time.total_seconds()/60) - if extra_cols: - for extra_col in extra_cols: - df[extra_col] = df[extra_col] * (df["truncated_time_diff"] / df["time_diff"]) - del df["time_diff"] - df.rename(columns={"truncated_time_diff": "time_diff"}, inplace=True) - return df - -def splitOvernightEpisodes(sensor_deltas, extra_cols, fixed_cols): - overnight = sensor_deltas[(sensor_deltas["local_start_date"] + timedelta(days=1)) == sensor_deltas["local_end_date"]] - not_overnight = sensor_deltas[sensor_deltas["local_start_date"] == sensor_deltas["local_end_date"]] - - if not overnight.empty: - today = overnight[extra_cols + fixed_cols + ["time_diff", "local_start_date_time", "local_start_date", "local_start_day_segment"]].copy() - tomorrow = overnight[extra_cols + fixed_cols + ["time_diff", "local_end_date_time", "local_end_date", 
"local_end_day_segment"]].copy() - - # truncate the end time of all overnight periods to midnight - today = truncateTime(today, "local_end_day_segment", "evening", "local_end_date_time", "local_start_date", time(23,59,59)) - today["local_end_date"] = overnight["local_start_date"] - - # set the start time of all periods after midnight to midnight - tomorrow = truncateTime(tomorrow, "local_start_day_segment", "night", "local_start_date_time", "local_end_date", time(0,0,0)) - tomorrow["local_start_date"] = overnight["local_end_date"] - - overnight = pd.concat([today, tomorrow], axis=0, sort=False) - - # calculate new time_diff and extra_cols for split overnight periods - overnight = computeTruncatedDifferences(overnight, extra_cols) - - # sort by local_start_date_time and reset the index - days = pd.concat([not_overnight, overnight], axis=0, sort=False) - days = days.sort_values(by=['local_start_date_time']).reset_index(drop=True) - - return days - -def splitMultiSegmentEpisodes(sensor_deltas, day_segment, extra_cols): - # extract episodes that start and end at the same epochs - exact_segments = sensor_deltas.query("local_start_day_segment == local_end_day_segment and local_start_day_segment == @day_segment").copy() - - # extract episodes that start and end at different epochs - across_segments = sensor_deltas.query("local_start_day_segment != local_end_day_segment").copy() - # 1) if start time is in current day_segment - start_segment = across_segments[across_segments["local_start_day_segment"] == day_segment].copy() - if not start_segment.empty: - start_segment = truncateTime(start_segment, "local_end_day_segment", day_segment, "local_end_date_time", "local_end_date", time(EPOCH_TIMES[day_segment][1],59,59)) - # 2) if end time is in current day_segment - end_segment = across_segments[across_segments["local_end_day_segment"] == day_segment].copy() - if not end_segment.empty: - end_segment = truncateTime(end_segment, "local_start_day_segment", day_segment, "local_start_date_time", "local_start_date", time(EPOCH_TIMES[day_segment][0],0,0)) - # 3) if current episode comtains day_segment - across_segments.loc[:,"start_segment"] = across_segments["local_start_day_segment"].apply(lambda seg: SEGMENT[seg]) - across_segments.loc[:,"end_segment"] = across_segments["local_end_day_segment"].apply(lambda seg: SEGMENT[seg]) - day_segment_num = SEGMENT[day_segment] - within_segments = across_segments.query("start_segment < @day_segment_num and end_segment > @day_segment_num") - del across_segments["start_segment"], across_segments["end_segment"] - del within_segments["start_segment"], within_segments["end_segment"] - - if not within_segments.empty: - within_segments = truncateTime(within_segments, "local_start_day_segment", day_segment, "local_start_date_time", "local_start_date", time(EPOCH_TIMES[day_segment][0],0,0)) - within_segments = truncateTime(within_segments, "local_end_day_segment", day_segment, "local_end_date_time", "local_end_date", time(EPOCH_TIMES[day_segment][1],59,59)) - - across_segments = pd.concat([start_segment, end_segment, within_segments], axis=0, sort=False) - - if not across_segments.empty: - accross_segments = computeTruncatedDifferences(across_segments, extra_cols) - - # sort by local_start_date_time and reset the index - segments = pd.concat([exact_segments, across_segments], axis=0, sort=False) - segments = segments.sort_values(by=['local_start_date_time']).reset_index(drop=True) - - return segments \ No newline at end of file diff --git 
a/src/features/phone_bluetooth/rapids/main.R b/src/features/phone_bluetooth/rapids/main.R index 3f8af8c1..599db220 100644 --- a/src/features/phone_bluetooth/rapids/main.R +++ b/src/features/phone_bluetooth/rapids/main.R @@ -46,7 +46,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){ features <- merge(features, feature, by="local_segment", all = TRUE) } - features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0))) %>% select(-local_segment) + features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0))) return(features) } \ No newline at end of file diff --git a/src/features/join_features_from_providers.R b/src/features/utils/join_features_from_providers.R similarity index 84% rename from src/features/join_features_from_providers.R rename to src/features/utils/join_features_from_providers.R index 1b07a197..bbbc18b3 100644 --- a/src/features/join_features_from_providers.R +++ b/src/features/utils/join_features_from_providers.R @@ -3,7 +3,7 @@ source("renv/activate.R") library("tidyr") library("dplyr", warn.conflicts = F) -location_features_files <- snakemake@input[["location_features"]] +location_features_files <- snakemake@input[["sensor_features"]] location_features <- setNames(data.frame(matrix(ncol = 1, nrow = 0)), c("local_segment")) diff --git a/src/models/merge_features_for_population_model.R b/src/features/utils/merge_sensor_features_for_all_participants.R similarity index 72% rename from src/models/merge_features_for_population_model.R rename to src/features/utils/merge_sensor_features_for_all_participants.R index 2808455b..972e56d3 100644 --- a/src/models/merge_features_for_population_model.R +++ b/src/features/utils/merge_sensor_features_for_all_participants.R @@ -9,7 +9,7 @@ feature_files <- snakemake@input[["feature_files"]] features_of_all_participants <- tibble(filename = feature_files) %>% # create a data frame - mutate(file_contents = map(filename, ~ read.csv(., stringsAsFactors = F, colClasses = c(local_date = "character"))), + mutate(file_contents = map(filename, ~ read.csv(., stringsAsFactors = F, colClasses = c(local_segment = "character", local_segment_label = "character", local_segment_start_datetime="character", local_segment_end_datetime="character"))), pid = str_match(filename, ".*/([a-zA-Z]+?[0-9]+?)/.*")[,2]) %>% unnest(cols = c(file_contents)) %>% select(-filename) diff --git a/src/features/utils/merge_sensor_features_for_individual_participants.R b/src/features/utils/merge_sensor_features_for_individual_participants.R new file mode 100644 index 00000000..e1264e1e --- /dev/null +++ b/src/features/utils/merge_sensor_features_for_individual_participants.R @@ -0,0 +1,22 @@ +source("renv/activate.R") + +library(tidyr) +library(purrr) +library("dplyr", warn.conflicts = F) +library("methods") +library("mgm") +library("qgraph") +library("dplyr", warn.conflicts = F) +library("scales") +library("ggplot2") +library("purrr") +library("tidyr") +library("reshape2") + +feature_files <- snakemake@input[["feature_files"]] + +features_for_individual_model <- feature_files %>% + map(read.csv, stringsAsFactors = F, colClasses = c(local_segment = "character", local_segment_label = "character", local_segment_start_datetime="character", local_segment_end_datetime="character")) %>% + reduce(full_join, by=c("local_segment","local_segment_label","local_segment_start_datetime","local_segment_end_datetime")) + +write.csv(features_for_individual_model, 
snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/demographic_features.py b/src/features/workflow_example/demographic_features.py similarity index 86% rename from src/features/demographic_features.py rename to src/features/workflow_example/demographic_features.py index 63718350..8768e214 100644 --- a/src/features/demographic_features.py +++ b/src/features/workflow_example/demographic_features.py @@ -2,10 +2,9 @@ import pandas as pd pid = snakemake.params["pid"] requested_features = snakemake.params["features"] -demographic_features = pd.DataFrame(columns=["pid"] + requested_features) +demographic_features = pd.DataFrame(columns=requested_features) participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"]) -demographic_features.loc[0, "pid"] = pid if not participant_info.empty: if "age" in requested_features: demographic_features.loc[0, "age"] = participant_info.loc[0, "age"] diff --git a/src/models/__init__.py b/src/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/models/clean_features_for_model.R b/src/models/clean_features_for_model.R deleted file mode 100644 index 7079c686..00000000 --- a/src/models/clean_features_for_model.R +++ /dev/null @@ -1,61 +0,0 @@ -source("renv/activate.R") -library(tidyr) -library("dplyr", warn.conflicts = F) - -filter_participant_without_enough_days <- function(clean_features, days_before_threshold, days_after_threshold){ - clean_features$day_type <- ifelse(clean_features$day_idx < 0, -1, ifelse(clean_features$day_idx > 0, 1, 0)) - if("pid" %in% colnames(clean_features)){ - clean_features <- clean_features %>% - group_by(pid) %>% - add_count(pid, day_type) # this adds a new column "n" - } else { - clean_features <- clean_features %>% add_count(day_type < 0) - } - - # Only keep participants with enough days before surgery and after discharge - clean_features <- clean_features %>% - mutate(count_before = ifelse(day_type == -1, n, NA), # before surgery - count_after = ifelse(day_type == 1, n, NA)) %>% # after discharge - fill(count_before, .direction = "downup") %>% - fill(count_after, .direction = "downup") %>% - filter(count_before >= days_before_threshold & count_after >= days_after_threshold) %>% - select(-n, -count_before, -count_after, -day_type) %>% - ungroup() - - return(clean_features) -} - -clean_features <- read.csv(snakemake@input[[1]]) -cols_nan_threshold <- as.numeric(snakemake@params[["cols_nan_threshold"]]) -drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]]) -rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]]) -days_before_threshold <- as.numeric(snakemake@params[["days_before_threshold"]]) -days_after_threshold <- as.numeric(snakemake@params[["days_after_threshold"]]) -features_exclude_day_idx <- as.logical(snakemake@params[["features_exclude_day_idx"]]) - - -# We have to do this before and after dropping rows, that's why is duplicated -clean_features <- filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold) - -# drop columns with a percentage of NA values above cols_nan_threshold -if(nrow(clean_features)) - clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) 
<= cols_nan_threshold ) - -if(drop_zero_variance_columns) - clean_features <- clean_features %>% select_if(grepl("pid|local_date",names(.)) | sapply(., n_distinct, na.rm = T) > 1) - -# drop rows with a percentage of NA values above rows_nan_threshold -clean_features <- clean_features %>% - mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>% - filter(percentage_na < rows_nan_threshold) %>% - select(-percentage_na) - -if(nrow(clean_features) != 0){ - clean_features <- filter_participant_without_enough_days(clean_features, days_before_threshold, days_after_threshold) - - # include "day_idx" as features or not - if(features_exclude_day_idx) - clean_features <- clean_features %>% select(-day_idx) -} - -write.csv(clean_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/models/merge_data_for_population_model.py b/src/models/merge_data_for_population_model.py deleted file mode 100644 index 376b43a8..00000000 --- a/src/models/merge_data_for_population_model.py +++ /dev/null @@ -1,8 +0,0 @@ -import pandas as pd - -data_all_participants = pd.DataFrame() -for data_file in snakemake.input["data_files"]: - data_single_participant = pd.read_csv(data_file) - data_all_participants = pd.concat([data_all_participants, data_single_participant], axis=0) - -data_all_participants.to_csv(snakemake.output[0], index=False) diff --git a/src/models/merge_features_and_targets.py b/src/models/merge_features_and_targets.py deleted file mode 100644 index 6a8cac97..00000000 --- a/src/models/merge_features_and_targets.py +++ /dev/null @@ -1,66 +0,0 @@ -import pandas as pd -import numpy as np -from modeling_utils import getMatchingColNames, dropZeroVarianceCols - - -def summarisedNumericalFeatures(col_names, features): - numerical_features = features.groupby(["pid"])[col_names].var() - numerical_features.columns = numerical_features.columns.str.replace("daily", "overallvar") - return numerical_features - -def summarisedCategoricalFeatures(col_names, features): - categorical_features = features.groupby(["pid"])[col_names].agg(lambda x: int(pd.Series.mode(x)[0])) - categorical_features.columns = categorical_features.columns.str.replace("daily", "overallmode") - return categorical_features - -def summariseFeatures(features, numerical_operators, categorical_operators, cols_var_threshold): - numerical_col_names = getMatchingColNames(numerical_operators, features) - categorical_col_names = getMatchingColNames(categorical_operators, features) - numerical_features = summarisedNumericalFeatures(numerical_col_names, features) - categorical_features = summarisedCategoricalFeatures(categorical_col_names, features) - features = pd.concat([numerical_features, categorical_features], axis=1) - if cols_var_threshold == "True": # double check the categorical features - features = dropZeroVarianceCols(features) - elif cols_var_threshold == "Flase": - pass - else: - ValueError("COLS_VAR_THRESHOLD parameter in config.yaml can only be 'True' or 'False'") - return features - - -summarised = snakemake.params["summarised"] -cols_var_threshold = snakemake.params["cols_var_threshold"] -numerical_operators = snakemake.params["numerical_operators"] -categorical_operators = snakemake.params["categorical_operators"] -features_exclude_day_idx = snakemake.params["features_exclude_day_idx"] - - -# Extract summarised features based on daily features: -# for categorical features: calculate variance across all days -# for numerical features: calculate mode across all days -if summarised == "summarised": - - features = 
pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"]) - demographic_features = pd.read_csv(snakemake.input["demographic_features"], index_col=["pid"]) - targets = pd.read_csv(snakemake.input["targets"], index_col=["pid"]) - - features = summariseFeatures(features, numerical_operators, categorical_operators, cols_var_threshold) - data = pd.concat([features, demographic_features, targets], axis=1, join="inner") - -elif summarised == "notsummarised": - - features = pd.read_csv(snakemake.input["cleaned_features"]) - demographic_features = pd.read_csv(snakemake.input["demographic_features"]) - - features = features.merge(demographic_features, on="pid", how="left").set_index(["pid", "local_date"]) - targets = pd.read_csv(snakemake.input["targets"], index_col=["pid", "local_date"]) - data = pd.concat([features, targets], axis=1, join="inner") - -else: - raise ValueError("SUMMARISED parameter in config.yaml can only be 'summarised' or 'notsummarised'") - -if features_exclude_day_idx and ("day_idx" in data.columns): - del data["day_idx"] - -data.to_csv(snakemake.output[0], index=True) - diff --git a/src/models/merge_features_for_individual_model.R b/src/models/merge_features_for_individual_model.R deleted file mode 100644 index cb3a1f02..00000000 --- a/src/models/merge_features_for_individual_model.R +++ /dev/null @@ -1,35 +0,0 @@ -source("renv/activate.R") - -library(tidyr) -library(purrr) -library("dplyr", warn.conflicts = F) -library("methods") -library("mgm") -library("qgraph") -library("dplyr", warn.conflicts = F) -library("scales") -library("ggplot2") -library("purrr") -library("tidyr") -library("reshape2") - -feature_files <- snakemake@input[["feature_files"]] -phone_valid_sensed_days <- snakemake@input[["phone_valid_sensed_days"]] -days_to_include <- snakemake@input[["days_to_include"]] -source <- snakemake@params[["source"]] - -features_for_individual_model <- feature_files %>% - map(read.csv, stringsAsFactors = F, colClasses = c(local_date = "character")) %>% - reduce(full_join, by="local_date") - -if(!is.null(phone_valid_sensed_days) && source %in% c("phone_features", "phone_fitbit_features")){ - valid_days <- read.csv(phone_valid_sensed_days) - valid_days <- valid_days[valid_days$is_valid_sensed_day == TRUE, ] - features_for_individual_model <- merge(features_for_individual_model, valid_days, by="local_date") %>% select(-valid_sensed_hours, -is_valid_sensed_day) -} - -if(!is.null(days_to_include)){ - features_for_individual_model <- merge(features_for_individual_model, read.csv(days_to_include), by="local_date") -} - -write.csv(features_for_individual_model, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file diff --git a/src/models/merge_population_model_results.py b/src/models/merge_population_model_results.py deleted file mode 100644 index d81cc596..00000000 --- a/src/models/merge_population_model_results.py +++ /dev/null @@ -1,16 +0,0 @@ -import pandas as pd - -overall_results = pd.read_csv(snakemake.input["overall_results"]) -nan_cells_ratio = pd.read_csv(snakemake.input["nan_cells_ratio"]) -baseline = pd.read_csv(snakemake.input["baseline"], index_col=["method"]) - -# add nan cells ratio -overall_results.insert(3, "nan_cells_ratio", nan_cells_ratio["nan_cells_ratio"]) - -# add baseline -baseline = baseline.stack().to_frame().T -baseline.columns = ['{}_{}'.format(*col) for col in baseline.columns] -baseline = baseline.add_prefix('b_') -results = pd.concat([overall_results, baseline], axis=1) - -results.to_csv(snakemake.output[0], 
index=False) diff --git a/src/models/nan_cells_ratio_of_cleaned_features.py b/src/models/nan_cells_ratio_of_cleaned_features.py deleted file mode 100644 index de06a0c2..00000000 --- a/src/models/nan_cells_ratio_of_cleaned_features.py +++ /dev/null @@ -1,8 +0,0 @@ -import pandas as pd - -features = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"]) - -# Compute the proportion of missing value cells among all features -nan_cells_ratio = features.isnull().sum().sum() / (features.shape[0] * features.shape[1]) - -pd.DataFrame({"nan_cells_ratio": [nan_cells_ratio]}).to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/models/select_days_to_analyse.py b/src/models/select_days_to_analyse.py deleted file mode 100644 index 5a1370b0..00000000 --- a/src/models/select_days_to_analyse.py +++ /dev/null @@ -1,43 +0,0 @@ -import numpy as np -import pandas as pd -from datetime import timedelta - -def appendDaysInRange(days_to_analyse, start_date, end_date, day_type): - num_of_days = (end_date - start_date).days - if np.isnan(num_of_days): - return days_to_analyse - - for day in range(num_of_days + 1): - - if day_type == -1: - day_idx = (num_of_days - day + 1) * day_type - elif day_type == 1: - day_idx = day + 1 - else: - day_idx = 0 - - days_to_analyse = days_to_analyse.append({"local_date": start_date + timedelta(days = day), "day_idx": day_idx}, ignore_index=True) - - return days_to_analyse - -days_before_surgery = int(snakemake.params["days_before_surgery"]) -days_in_hospital = str(snakemake.params["days_in_hospital"]) -days_after_discharge = int(snakemake.params["days_after_discharge"]) -participant_info = pd.read_csv(snakemake.input["participant_info"], parse_dates=["surgery_date", "discharge_date"]) -days_to_analyse = pd.DataFrame(columns = ["local_date", "day_idx"]) - -try: - surgery_date, discharge_date = participant_info["surgery_date"].iloc[0].date(), participant_info["discharge_date"].iloc[0].date() -except: - pass -else: - start_date = surgery_date - timedelta(days = days_before_surgery) - end_date = discharge_date + timedelta(days = days_after_discharge) - - # days before surgery: -1; in hospital: 0; after discharge: 1 - days_to_analyse = appendDaysInRange(days_to_analyse, start_date, surgery_date - timedelta(days = 1), -1) - if days_in_hospital == "T": - days_to_analyse = appendDaysInRange(days_to_analyse, surgery_date, discharge_date, 0) - days_to_analyse = appendDaysInRange(days_to_analyse, discharge_date + timedelta(days = 1), end_date, 1) - -days_to_analyse.to_csv(snakemake.output[0], index=False) diff --git a/src/models/targets.py b/src/models/targets.py deleted file mode 100644 index e12794d4..00000000 --- a/src/models/targets.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -import numpy as np - -pid = snakemake.params["pid"] -summarised = snakemake.params["summarised"] -participant_info = pd.read_csv(snakemake.input["participant_info"]) - -if summarised == "summarised": - raise ValueError("Do not support summarised features for example dataset.") - -elif summarised == "notsummarised": - targets = participant_info[["local_date", "target"]] - targets.insert(0, "pid", pid) - -else: - raise ValueError("SUMMARISED parameter in config.yaml can only be 'summarised' or 'notsummarised'") - -targets.to_csv(snakemake.output[0], index=False) diff --git a/src/models/baseline.py b/src/models/workflow_example/baselines.py similarity index 54% rename from src/models/baseline.py rename to src/models/workflow_example/baselines.py index 
faa5bfeb..8db60593 100644 --- a/src/models/baseline.py +++ b/src/models/workflow_example/baselines.py @@ -10,18 +10,18 @@ from sklearn.model_selection import LeaveOneOut def baselineAccuracyOfMajorityClassClassifier(targets): majority_class = targets["target"].value_counts().idxmax() pred_y = [majority_class] * targets.shape[0] - pred_y_prob = pred_y - metrics = getMetrics(pred_y, pred_y_prob, targets["target"].values.ravel().tolist()) + pred_y_proba = pred_y + metrics = getMetrics(pred_y, pred_y_proba, targets["target"].values.ravel().tolist()) return metrics, majority_class def baselineMetricsOfRandomWeightedClassifier(targets, majority_ratio, majority_class, iter_times): - metrics_all_iters = {"accuracy": [], "precision0":[], "recall0": [], "f10": [], "precision1": [], "recall1": [], "f11": [], "auc": [], "kappa": []} + metrics_all_iters = {"accuracy": [], "precision0":[], "recall0": [], "f10": [], "precision1": [], "recall1": [], "f11": [], "f1_macro": [], "auc": [], "kappa": []} probabilities = [0, 0] probabilities[majority_class], probabilities[1 - majority_class] = majority_ratio, 1 - majority_ratio for i in range(iter_times): pred_y = np.random.RandomState(i).multinomial(1, probabilities, targets.shape[0])[:,1].tolist() - pred_y_prob = pred_y - metrics = getMetrics(pred_y, pred_y_prob, targets["target"].values.ravel().tolist()) + pred_y_proba = pred_y + metrics = getMetrics(pred_y, pred_y_proba, targets["target"].values.ravel().tolist()) for key in metrics_all_iters.keys(): metrics_all_iters[key].append(metrics[key].item()) # Calculate average metrics across all iterations @@ -38,21 +38,25 @@ def baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x, data_y, oversa clf = createPipeline("DT", oversampler_type) clf.fit(train_x, train_y.values.ravel()) pred_y = pred_y + clf.predict(test_x).ravel().tolist() - pred_y_prob = pred_y + pred_y_proba = pred_y true_y = true_y + test_y.values.ravel().tolist() - return getMetrics(pred_y, pred_y_prob, true_y) + return getMetrics(pred_y, pred_y_proba, true_y) cv_method = globals()[snakemake.params["cv_method"]]() -colnames_demographic_features = snakemake.params["demographic_features"] -rowsnan_colsnan_days_colsvar_threshold = snakemake.params["rowsnan_colsnan_days_colsvar_threshold"] +colnames_demographic_features = snakemake.params["colnames_demographic_features"] + +data = pd.read_csv(snakemake.input[0]) +index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] +if "pid" in data.columns: + index_columns.append("pid") +data.set_index(index_columns, inplace=True) -data = pd.read_csv(snakemake.input[0], index_col=["pid"]) data_x, data_y = data.drop("target", axis=1), data[["target"]] targets_value_counts = data_y["target"].value_counts() -baseline_metrics = pd.DataFrame(columns=["method", "fullMethodName", "accuracy", "precision0", "recall0", "f10", "precision1", "recall1", "f11", "auc", "kappa"]) +baseline_metrics = pd.DataFrame(columns=["method", "fullMethodName", "accuracy", "precision0", "recall0", "f10", "precision1", "recall1", "f11", "f1_macro", "auc", "kappa"]) if len(targets_value_counts) < 2: fout = open(snakemake.log[0], "w") fout.write(targets_value_counts.to_string()) @@ -69,21 +73,33 @@ else: majority_ratio = baseline1_metrics["accuracy"] # Baseline 2: random weighted classifier => random classifier with binomial distribution baseline2_metrics = baselineMetricsOfRandomWeightedClassifier(data_y, majority_ratio, majority_class, 1000) - # Baseline 3: decision tree 
with demographic features - baseline3_metrics = baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x[colnames_demographic_features], data_y, oversampler_type) - baselines = [baseline1_metrics, baseline2_metrics, baseline3_metrics] + if "pid" in index_columns: + # Baseline 3: decision tree with demographic features + baseline3_metrics = baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x[colnames_demographic_features], data_y, oversampler_type) + + baselines = [baseline1_metrics, baseline2_metrics, baseline3_metrics] + methods = ["majority", "rwc", "dt"] + fullMethodNames = ["MajorityClassClassifier", "RandomWeightedClassifier", "DecisionTreeWithDemographicFeatures"] + + else: + # Only have 2 baselines + baselines = [baseline1_metrics, baseline2_metrics] + methods = ["majority", "rwc"] + fullMethodNames = ["MajorityClassClassifier", "RandomWeightedClassifier"] + + baseline_metrics = pd.DataFrame({"method": methods, + "fullMethodName": fullMethodNames, + "accuracy": [baseline["accuracy"] for baseline in baselines], + "precision0": [baseline["precision0"] for baseline in baselines], + "recall0": [baseline["recall0"] for baseline in baselines], + "f10": [baseline["f10"] for baseline in baselines], + "precision1": [baseline["precision1"] for baseline in baselines], + "recall1": [baseline["recall1"] for baseline in baselines], + "f11": [baseline["f11"] for baseline in baselines], + "f1_macro": [baseline["f1_macro"] for baseline in baselines], + "auc": [baseline["auc"] for baseline in baselines], + "kappa": [baseline["kappa"] for baseline in baselines]}) - baseline_metrics = pd.DataFrame({"method": ["majority", "rwc", "dt"], - "fullMethodName": ["MajorityClassClassifier", "RandomWeightedClassifier", "DecisionTreeWithDemographicFeatures"], - "accuracy": [baseline["accuracy"] for baseline in baselines], - "precision0": [baseline["precision0"] for baseline in baselines], - "recall0": [baseline["recall0"] for baseline in baselines], - "f10": [baseline["f10"] for baseline in baselines], - "precision1": [baseline["precision1"] for baseline in baselines], - "recall1": [baseline["recall1"] for baseline in baselines], - "f11": [baseline["f11"] for baseline in baselines], - "auc": [baseline["auc"] for baseline in baselines], - "kappa": [baseline["kappa"] for baseline in baselines]}) baseline_metrics.to_csv(snakemake.output[0], index=False) diff --git a/src/models/workflow_example/clean_sensor_features.R b/src/models/workflow_example/clean_sensor_features.R new file mode 100644 index 00000000..57b9b285 --- /dev/null +++ b/src/models/workflow_example/clean_sensor_features.R @@ -0,0 +1,29 @@ +source("renv/activate.R") +library(tidyr) +library("dplyr", warn.conflicts = F) + + +clean_features <- read.csv(snakemake@input[[1]]) +cols_nan_threshold <- as.numeric(snakemake@params[["cols_nan_threshold"]]) +drop_zero_variance_columns <- as.logical(snakemake@params[["cols_var_threshold"]]) +rows_nan_threshold <- as.numeric(snakemake@params[["rows_nan_threshold"]]) +data_yielded_hours_ratio_threshold <- as.numeric(snakemake@params[["data_yielded_hours_ratio_threshold"]]) + +# drop rows with the value of "phone_data_yield_rapids_ratiovalidyieldedhours" column less than data_yielded_hours_ratio_threshold +clean_features <- clean_features %>% + filter(phone_data_yield_rapids_ratiovalidyieldedhours > data_yielded_hours_ratio_threshold) + +# drop columns with a percentage of NA values above cols_nan_threshold +if(nrow(clean_features)) + clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / 
length(.) <= cols_nan_threshold ) + +if(drop_zero_variance_columns) + clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1) + +# drop rows with a percentage of NA values above rows_nan_threshold +clean_features <- clean_features %>% + mutate(percentage_na = rowSums(is.na(.)) / ncol(.)) %>% + filter(percentage_na < rows_nan_threshold) %>% + select(-percentage_na) + +write.csv(clean_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/models/workflow_example/merge_features_and_targets_for_individual_model.py b/src/models/workflow_example/merge_features_and_targets_for_individual_model.py new file mode 100644 index 00000000..95ce9041 --- /dev/null +++ b/src/models/workflow_example/merge_features_and_targets_for_individual_model.py @@ -0,0 +1,10 @@ +import pandas as pd +import numpy as np + +index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] +sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"], index_col=index_columns) +targets = pd.read_csv(snakemake.input["targets"], index_col=index_columns) + +data = pd.concat([sensor_features, targets[["target"]]], axis=1, join="inner") + +data.to_csv(snakemake.output[0], index=True) diff --git a/src/models/workflow_example/merge_features_and_targets_for_population_model.py b/src/models/workflow_example/merge_features_and_targets_for_population_model.py new file mode 100644 index 00000000..69c4cca7 --- /dev/null +++ b/src/models/workflow_example/merge_features_and_targets_for_population_model.py @@ -0,0 +1,27 @@ +import pandas as pd +import numpy as np + +merge_keys = ["pid", "local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] +sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"]) + +all_demographic_features = pd.DataFrame() +for demographic_features_path in snakemake.input["demographic_features"]: + pid = demographic_features_path.split("/")[3] + demographic_features = pd.read_csv(demographic_features_path) + demographic_features = demographic_features.assign(pid=pid) + all_demographic_features = pd.concat([all_demographic_features, demographic_features], axis=0) + +# merge sensor features and demographic features +features = sensor_features.merge(all_demographic_features, on="pid", how="left") + +all_targets = pd.DataFrame() +for targets_path in snakemake.input["targets"]: + pid = targets_path.split("/")[3] + targets = pd.read_csv(targets_path) + targets = targets.assign(pid=pid) + all_targets = pd.concat([all_targets, targets], axis=0) + +# merge features and targets +data = features.merge(all_targets[["target"] + merge_keys], on=merge_keys, how="inner") + +data.to_csv(snakemake.output[0], index=False) diff --git a/src/models/modeling.py b/src/models/workflow_example/modeling.py similarity index 75% rename from src/models/modeling.py rename to src/models/workflow_example/modeling.py index 57a50eb7..41323600 100644 --- a/src/models/modeling.py +++ b/src/models/workflow_example/modeling.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np -from modeling_utils import getMatchingColNames, dropZeroVarianceCols, getNormAllParticipantsScaler, getMetrics, getFeatureImportances, createPipeline -from sklearn.model_selection import train_test_split, LeaveOneOut, GridSearchCV, cross_val_score, KFold +from modeling_utils import 
getMatchingColNames, getNormAllParticipantsScaler, getMetrics, getFeatureImportances, createPipeline +from sklearn.model_selection import LeaveOneOut, GridSearchCV @@ -25,7 +25,8 @@ def preprocessCategoricalFeatures(categorical_features, mode_categorical_feature categorical_features = categorical_features.fillna(mode_categorical_features) # one-hot encoding categorical_features = categorical_features.apply(lambda col: col.astype("category")) - categorical_features = pd.get_dummies(categorical_features) + if not categorical_features.empty: + categorical_features = pd.get_dummies(categorical_features) return categorical_features def splitNumericalCategoricalFeatures(features, categorical_feature_colnames): @@ -48,32 +49,32 @@ def preprocesFeatures(train_numerical_features, test_numerical_features, categor # Step 4. Save results, parameters, and metrics to CSV files ############################################################## - +# For reproducibility +np.random.seed(0) # Step 1. Read parameters and data # Read parameters model = snakemake.params["model"] -source = snakemake.params["source"] -summarised = snakemake.params["summarised"] -day_segment = snakemake.params["day_segment"] scaler = snakemake.params["scaler"] cv_method = snakemake.params["cv_method"] categorical_operators = snakemake.params["categorical_operators"] categorical_colnames_demographic_features = snakemake.params["categorical_demographic_features"] model_hyperparams = snakemake.params["model_hyperparams"][model] -rowsnan_colsnan_days_colsvar_threshold = snakemake.params["rowsnan_colsnan_days_colsvar_threshold"] # thresholds for data cleaning # Read data and split -if summarised == "summarised": - data = pd.read_csv(snakemake.input["data"], index_col=["pid"]) -elif summarised == "notsummarised": - data = pd.read_csv(snakemake.input["data"], index_col=["pid", "local_date"]) -else: - raise ValueError("SUMMARISED parameter in config.yaml can only be 'summarised' or 'notsummarised'") +data = pd.read_csv(snakemake.input["data"]) +index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] +if "pid" in data.columns: + index_columns.append("pid") +data.set_index(index_columns, inplace=True) data_x, data_y = data.drop("target", axis=1), data[["target"]] -categorical_feature_colnames = categorical_colnames_demographic_features + getMatchingColNames(categorical_operators, data_x) + +if "pid" in index_columns: + categorical_feature_colnames = categorical_colnames_demographic_features + getMatchingColNames(categorical_operators, data_x) +else: + categorical_feature_colnames = getMatchingColNames(categorical_operators, data_x) @@ -82,7 +83,7 @@ cv_class = globals()[cv_method] inner_cv = cv_class() outer_cv = cv_class() -fold_id, pid, best_params, true_y, pred_y, pred_y_prob = [], [], [], [], [], [] +fold_id, pid, best_params, true_y, pred_y, pred_y_proba = [], [], [], [], [], [] feature_importances_all_folds = pd.DataFrame() fold_count = 1 @@ -99,7 +100,7 @@ for train_index, test_index in outer_cv.split(data_x): mode_categorical_features = train_categorical_features.mode().iloc[0] train_x = preprocesFeatures(train_numerical_features, None, train_categorical_features, mode_categorical_features, scaler, "train") test_x = preprocesFeatures(train_numerical_features, test_numerical_features, test_categorical_features, mode_categorical_features, scaler, "test") - train_x, test_x = train_x.align(test_x, join='outer', axis=1, fill_value=0) # in case we get rid off categorical columns + 
train_x, test_x = train_x.align(test_x, join="outer", axis=1, fill_value=0) # in case we get rid off categorical columns # Compute number of participants and features # values do not change between folds @@ -129,7 +130,7 @@ for train_index, test_index in outer_cv.split(data_x): pred_y = pred_y + cur_fold_pred proba_of_two_categories = clf.predict_proba(test_x).tolist() - pred_y_prob = pred_y_prob + [probabilities[clf.classes_.tolist().index(1)] for probabilities in proba_of_two_categories] + pred_y_proba = pred_y_proba + [probabilities[clf.classes_.tolist().index(1)] for probabilities in proba_of_two_categories] true_y = true_y + test_y.values.ravel().tolist() pid = pid + test_y.index.tolist() # each test partition (fold) in the outer cv is a participant (LeaveOneOut cv) @@ -140,16 +141,16 @@ for train_index, test_index in outer_cv.split(data_x): # Step 3. Model evaluation if len(pred_y) > 1: - metrics = getMetrics(pred_y, pred_y_prob, true_y) + metrics = getMetrics(pred_y, pred_y_proba, true_y) else: - metrics = {"accuracy": None, "precision0": None, "recall0": None, "f10": None, "precision1": None, "recall1": None, "f11": None, "auc": None, "kappa": None} + metrics = {"accuracy": None, "precision0": None, "recall0": None, "f10": None, "precision1": None, "recall1": None, "f11": None, "f1_macro": None, "auc": None, "kappa": None} # Step 4. Save results, parameters, and metrics to CSV files -fold_predictions = pd.DataFrame({"fold_id": fold_id, "pid": pid, "hyperparameters": best_params, "true_y": true_y, "pred_y": pred_y, "pred_y_prob": pred_y_prob}) -fold_metrics = pd.DataFrame({"fold_id":[], "accuracy":[], "precision0": [], "recall0": [], "f10": [], "precision1": [], "recall1": [], "f11": [], "auc": [], "kappa": []}) -overall_results = pd.DataFrame({"num_of_rows": [num_of_rows], "num_of_features": [num_of_features], "rowsnan_colsnan_days_colsvar_threshold": [rowsnan_colsnan_days_colsvar_threshold], "model": [model], "cv_method": [cv_method], "source": [source], "scaler": [scaler], "day_segment": [day_segment], "summarised": [summarised], "accuracy": [metrics["accuracy"]], "precision0": [metrics["precision0"]], "recall0": [metrics["recall0"]], "f10": [metrics["f10"]], "precision1": [metrics["precision1"]], "recall1": [metrics["recall1"]], "f11": [metrics["f11"]], "auc": [metrics["auc"]], "kappa": [metrics["kappa"]]}) -feature_importances_all_folds.insert(loc=0, column='fold_id', value=fold_id) -feature_importances_all_folds.insert(loc=1, column='pid', value=pid) +fold_predictions = pd.DataFrame({"fold_id": fold_id, "pid": pid, "hyperparameters": best_params, "true_y": true_y, "pred_y": pred_y, "pred_y_proba": pred_y_proba}) +fold_metrics = pd.DataFrame({"fold_id":[], "accuracy":[], "precision0": [], "recall0": [], "f10": [], "precision1": [], "recall1": [], "f11": [], "f1_macro": [], "auc": [], "kappa": []}) +overall_results = pd.DataFrame({"num_of_rows": [num_of_rows], "num_of_features": [num_of_features], "model": [model], "cv_method": [cv_method], "scaler": [scaler], "accuracy": [metrics["accuracy"]], "precision0": [metrics["precision0"]], "recall0": [metrics["recall0"]], "f10": [metrics["f10"]], "precision1": [metrics["precision1"]], "recall1": [metrics["recall1"]], "f11": [metrics["f11"]], "f1_macro": [metrics["f1_macro"]], "auc": [metrics["auc"]], "kappa": [metrics["kappa"]]}) +feature_importances_all_folds.insert(loc=0, column="fold_id", value=fold_id) +feature_importances_all_folds.insert(loc=1, column="pid", value=pid) fold_predictions.to_csv(snakemake.output["fold_predictions"], 
index=False) fold_metrics.to_csv(snakemake.output["fold_metrics"], index=False) diff --git a/src/models/modeling_utils.py b/src/models/workflow_example/modeling_utils.py similarity index 83% rename from src/models/modeling_utils.py rename to src/models/workflow_example/modeling_utils.py index 1ba17c50..5d11c995 100644 --- a/src/models/modeling_utils.py +++ b/src/models/workflow_example/modeling_utils.py @@ -1,4 +1,5 @@ import pandas as pd +import numpy as np from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix from sklearn.metrics import precision_recall_fscore_support @@ -44,24 +45,24 @@ def getNormAllParticipantsScaler(features, scaler_flag): scaler.fit(features) return scaler -# get metrics: accuracy, precision1, recall1, f11, auc, kappa -def getMetrics(pred_y, pred_y_prob, true_y): +# get metrics: accuracy, precision0, recall0, f10, precision1, recall1, f11, f1_macro, auc, kappa +def getMetrics(pred_y, pred_y_proba, true_y): metrics = {} + count = len(np.unique(true_y)) + label= np.unique(true_y)[0] # metrics for all categories metrics["accuracy"] = accuracy_score(true_y, pred_y) - try: - metrics["auc"] = roc_auc_score(true_y, pred_y_prob) - except: - metrics["auc"] = None + metrics["f1_macro"] = f1_score(true_y, pred_y, average="macro") # unweighted mean + metrics["auc"] = np.nan if count == 1 else roc_auc_score(true_y, pred_y_proba) metrics["kappa"] = cohen_kappa_score(true_y, pred_y) # metrics for label 0 - metrics["precision0"] = precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[0] - metrics["recall0"] = recall_score(true_y, pred_y, average=None, labels=[0,1])[0] - metrics["f10"] = f1_score(true_y, pred_y, average=None, labels=[0,1])[0] + metrics["precision0"] = np.nan if (count == 1 and label == 1) else precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[0] + metrics["recall0"] = np.nan if (count == 1 and label == 1) else recall_score(true_y, pred_y, average=None, labels=[0,1])[0] + metrics["f10"] = np.nan if (count == 1 and label == 1) else f1_score(true_y, pred_y, average=None, labels=[0,1])[0] # metrics for label 1 - metrics["precision1"] = precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[1] - metrics["recall1"] = recall_score(true_y, pred_y, average=None, labels=[0,1])[1] - metrics["f11"] = f1_score(true_y, pred_y, average=None, labels=[0,1])[1] + metrics["precision1"] = np.nan if (count == 1 and label == 0) else precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[1] + metrics["recall1"] = np.nan if (count == 1 and label == 0) else recall_score(true_y, pred_y, average=None, labels=[0,1])[1] + metrics["f11"] = np.nan if (count == 1 and label == 0) else f1_score(true_y, pred_y, average=None, labels=[0,1])[1] return metrics diff --git a/src/models/workflow_example/parse_targets.py b/src/models/workflow_example/parse_targets.py new file mode 100644 index 00000000..a3fd1e8b --- /dev/null +++ b/src/models/workflow_example/parse_targets.py @@ -0,0 +1,28 @@ +import pandas as pd +import numpy as np +from importlib import import_module, util +from pathlib import Path + + +# import filter_data_by_segment from src/features/utils/utils.py +spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent.parent / "features" / "utils" / "utils.py")) +mod = util.module_from_spec(spec) +spec.loader.exec_module(mod) 
+filter_data_by_segment = getattr(mod, "filter_data_by_segment") + +targets = pd.read_csv(snakemake.input["targets"]) +day_segments_labels = pd.read_csv(snakemake.input["day_segments_labels"], header=0) + +all_targets = pd.DataFrame(columns=["local_segment"]) +for day_segment in day_segments_labels["label"]: + filtered_targets = filter_data_by_segment(targets, day_segment) + all_targets = all_targets.merge(filtered_targets, how="outer") + +segment_columns = pd.DataFrame() +split_segment_columns = all_targets["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True) +new_segment_columns = split_segment_columns.iloc[:,1:4] if split_segment_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"]) +segment_columns[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns +for i in range(segment_columns.shape[1]): + all_targets.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]]) + +all_targets.to_csv(snakemake.output[0], index=False)
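
Note on the local_segment parsing in parse_targets.py above: the check split_segment_columns.shape[1] == 5 relies on pandas treating a multi-character pat as a regular expression, in which case the captured groups are kept in the split result, the same behaviour as re.split(). A minimal sketch of that behaviour, using a hypothetical segment value in the "label#start,end" shape the regex expects (not taken from real data):

import re

# Hypothetical local_segment value; only its "label#start,end" shape matters here.
local_segment = "daily#2020-10-01 00:00:00,2020-10-01 23:59:59"

parts = re.split(r"(.*)#(.*),(.*)", local_segment)
# The pattern matches the whole string, so re.split returns the empty prefix,
# the three captured groups, and the empty suffix: 5 elements in total.
assert len(parts) == 5
label, start, end = parts[1:4]
print(label, start, end)
# daily 2020-10-01 00:00:00 2020-10-01 23:59:59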
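
For reference, the inner join used by merge_features_and_targets_for_individual_model.py keeps only the day segments that have both cleaned sensor features and a parsed target. A toy sketch with made-up segments and column values (illustrative only, not taken from the repository's data):

import pandas as pd

index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]

# Two made-up segments with features, but only one of them has a target.
features = pd.DataFrame(
    {"local_segment": ["daily#A,B", "daily#C,D"],
     "local_segment_label": ["daily", "daily"],
     "local_segment_start_datetime": ["A", "C"],
     "local_segment_end_datetime": ["B", "D"],
     "some_sensor_feature": [3.0, 1.0]}).set_index(index_columns)

targets = pd.DataFrame(
    {"local_segment": ["daily#A,B"],
     "local_segment_label": ["daily"],
     "local_segment_start_datetime": ["A"],
     "local_segment_end_datetime": ["B"],
     "target": [1]}).set_index(index_columns)

# join="inner" drops the segment without a target, so only rows present in both
# frames end up in the model input.
data = pd.concat([features, targets[["target"]]], axis=1, join="inner")
print(data)  # one row: the segment present in both frames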