From bb3c6141358ce40bfddbb723fb09d0b434a4792a Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Mon, 28 Jun 2021 16:30:44 -0400 Subject: [PATCH] Update analysis workflow example --- docs/workflow-examples/analysis.md | 2 +- example_profile/Snakefile | 23 ++++++++++++++++++++++- example_profile/example_config.yaml | 6 +++++- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/docs/workflow-examples/analysis.md b/docs/workflow-examples/analysis.md index e11ae3e7..c5288e97 100644 --- a/docs/workflow-examples/analysis.md +++ b/docs/workflow-examples/analysis.md @@ -69,7 +69,7 @@ Note you will see a lot of warning messages, you can ignore them since they happ ??? info "6. Feature cleaning." In this stage we perform four steps to clean our sensor feature file. First, we discard days with a data yield hour ratio less than or equal to 0.75, i.e. we include days with at least 18 hours of data. Second, we drop columns (features) with more than 30% of missing rows. Third, we drop columns with zero variance. Fourth, we drop rows (days) with more than 30% of missing columns (features). In this cleaning stage several parameters are created and exposed in `example_profile/example_config.yaml`. - After this step, we kept 158 features over 11 days for the individual model of p01, 101 features over 12 days for the individual model of p02 and 106 features over 20 days for the population model. Note that the difference in the number of features between p01 and p02 is mostly due to iOS restrictions that stops researchers from collecting the same number of sensors than in Android phones. + After this step, we kept 161 features over 11 days for the individual model of p01, 101 features over 12 days for the individual model of p02 and 109 features over 20 days for the population model. 
Note that the difference in the number of features between p01 and p02 is mostly due to iOS restrictions that stop researchers from collecting the same number of sensors as on Android phones. Feature cleaning for the individual models is done in the `clean_sensor_features_for_individual_participants` rule and for the population model in the `clean_sensor_features_for_all_participants` rule in `rules/models.smk`. diff --git a/example_profile/Snakefile b/example_profile/Snakefile index f969fdcb..65cea721 100644 --- a/example_profile/Snakefile +++ b/example_profile/Snakefile @@ -204,15 +204,28 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): else: raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data) which is used to resample fused location data (ALL_RESAMPLED and RESAMPLED_FUSED)") + if provider == "BARNETT": + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_barnett_daily.csv", pid=config["PIDS"])) + if provider == "DORYAB": + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_doryab_columns.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), 
provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") +for provider in config["FITBIT_CALORIES_INTRADAY"]["PROVIDERS"].keys(): + if config["FITBIT_CALORIES_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_intraday_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_intraday_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_calories_intraday_features/fitbit_calories_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_CALORIES_INTRADAY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_calories_intraday.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + for provider in config["FITBIT_DATA_YIELD"]["PROVIDERS"].keys(): if config["FITBIT_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"])) @@ -271,6 +284,12 @@ for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys(): for provider in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].keys(): if config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: + + if config["FITBIT_STEPS_INTRADAY"]["EXCLUDE_SLEEP"]["TIME_BASED"]["EXCLUDE"] or config["FITBIT_STEPS_INTRADAY"]["EXCLUDE_SLEEP"]["FITBIT_BASED"]["EXCLUDE"]: + if 
config["FITBIT_STEPS_INTRADAY"]["EXCLUDE_SLEEP"]["FITBIT_BASED"]["EXCLUDE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_with_datetime_exclude_sleep.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower())) @@ -357,6 +376,8 @@ if config["HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT"]["PLOT"]: files_to_compute.append("reports/data_exploration/heatmap_sensor_row_count_per_time_segment.html") if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]: + if not config["PHONE_DATA_YIELD"]["PROVIDERS"]["RAPIDS"]["COMPUTE"]: + raise ValueError("Error: [PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] must be True in config.yaml to get heatmaps of overall data yield.") files_to_compute.append("reports/data_exploration/heatmap_phone_data_yield_per_participant_per_time_segment.html") if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]: diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml index be3deb29..c2f269c7 100644 --- a/example_profile/example_config.yaml +++ b/example_profile/example_config.yaml @@ -198,7 +198,11 @@ PHONE_DATA_YIELD: # See https://www.rapids.science/latest/features/phone-keyboard/ PHONE_KEYBOARD: CONTAINER: keyboard - PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: 
["sessioncount","averageinterkeydelay","averagesessionlength","changeintextlengthlessthanminusone","changeintextlengthequaltominusone","changeintextlengthequaltoone","changeintextlengthmorethanone","maxtextlength","lastmessagelength","totalkeyboardtouches"] + SRC_SCRIPT: src/features/phone_keyboard/rapids/main.py # See https://www.rapids.science/latest/features/phone-light/ PHONE_LIGHT: