From 3e2473c4c0232dfd877867a880d6da97d7517e2a Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Tue, 2 Nov 2021 15:26:01 -0400
Subject: [PATCH] Update example workflow and docs

---
 .../complete-workflow-example.md}             |  8 +-
 docs/analysis/data-cleaning.md                | 81 +++++++++++++++++++
 .../minimal.md                                |  0
 docs/features/phone-keyboard.md               |  6 ++
 docs/index.md                                 |  4 +-
 docs/setup/overview.md                        |  4 +-
 example_profile/Snakefile                     |  9 ++-
 example_profile/example_config.yaml           | 41 ++++++++--
 mkdocs.yml                                    |  7 +-
 rules/models.smk                              |  4 +-
 10 files changed, 140 insertions(+), 24 deletions(-)
 rename docs/{workflow-examples/analysis.md => analysis/complete-workflow-example.md} (94%)
 create mode 100644 docs/analysis/data-cleaning.md
 rename docs/{workflow-examples => analysis}/minimal.md (100%)

diff --git a/docs/workflow-examples/analysis.md b/docs/analysis/complete-workflow-example.md
similarity index 94%
rename from docs/workflow-examples/analysis.md
rename to docs/analysis/complete-workflow-example.md
index 053c1825..885c9f20 100644
--- a/docs/workflow-examples/analysis.md
+++ b/docs/analysis/complete-workflow-example.md
@@ -1,8 +1,8 @@
 # Analysis Workflow Example
 
 !!! info "TL;DR"
-    - In addition to using RAPIDS to extract behavioral features and create plots, you can structure your data analysis within RAPIDS (i.e. cleaning your features and creating ML/statistical models)
-    - We include an analysis example in RAPIDS that covers raw data processing, cleaning, feature extraction, machine learning modeling, and evaluation
+    - In addition to using RAPIDS to extract behavioral features, create plots, and clean sensor features, you can structure your data analysis within RAPIDS (i.e. creating ML/statistical models and evaluating your models)
+    - We include an analysis example in RAPIDS that covers raw data processing, feature extraction, cleaning, machine learning modeling, and evaluation
     - Use this example as a guide to structure your own analysis within RAPIDS
     - RAPIDS analysis workflows are compatible with your favorite data science tools and libraries
     - RAPIDS analysis workflows are reproducible and we encourage you to publish them along with your research papers
@@ -69,12 +69,12 @@ Note you will see a lot of warning messages, you can ignore them since they happ
 ??? info "6. Feature cleaning."
     In this stage we perform four steps to clean our sensor feature file. First, we discard days with a data yield hour ratio less than or equal to 0.75, i.e. we include days with at least 18 hours of data. Second, we drop columns (features) with more than 30% of missing rows. Third, we drop columns with zero variance. Fourth, we drop rows (days) with more than 30% of missing columns (features). In this cleaning stage several parameters are created and exposed in `example_profile/example_config.yaml`. 
 
-    After this step, we kept 163 features over 11 days for the individual model of p01, 101 features over 12 days for the individual model of p02 and 109 features over 20 days for the population model. Note that the difference in the number of features between p01 and p02 is mostly due to iOS restrictions that stops researchers from collecting the same number of sensors than in Android phones. 
+    After this step, we kept 173 features over 11 days for the individual model of p01, 101 features over 12 days for the individual model of p02 and 117 features over 22 days for the population model. Note that the difference in the number of features between p01 and p02 is mostly due to iOS restrictions that stop researchers from collecting the same number of sensors as in Android phones.
     
     Feature cleaning for the individual models is done in the `clean_sensor_features_for_individual_participants` rule and for the population model in the `clean_sensor_features_for_all_participants` rule in `rules/models.smk`.
 
 ??? info "7. Merge features and targets."
-    In this step we merge the cleaned features and target labels for our individual models in the `merge_features_and_targets_for_individual_model` rule in `rules/models.smk`. Additionally, we merge the cleaned features, target labels, and demographic features of our two participants for the population model in the `merge_features_and_targets_for_population_model` rule in `rules/models.smk`. These two merged files are the input for our individual and population models. 
+    In this step we merge the cleaned features and target labels for our individual models in the `merge_features_and_targets_for_individual_model` rule in `rules/models.smk`. Additionally, we merge the cleaned features, target labels, and demographic features of our two participants for the population model in the `merge_features_and_targets_for_population_model` rule in `rules/models.smk`. These two merged files are the input for our individual and population models. 
 
 ??? info "8. Modelling."
     This stage has three phases: model building, training and evaluation. 
diff --git a/docs/analysis/data-cleaning.md b/docs/analysis/data-cleaning.md
new file mode 100644
index 00000000..20b13850
--- /dev/null
+++ b/docs/analysis/data-cleaning.md
@@ -0,0 +1,81 @@
+Data Cleaning
+=============
+
+This module cleans the extracted sensor features before they are merged with the target labels.
+    
+## Clean sensor features for individual participants
+
+!!! info "File Sequence"
+    ```bash
+    - data/processed/features/{pid}/all_sensor_features.csv
+    - data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}.csv
+    ```
+
+### RAPIDS provider
+
+Parameters description for `[ALL_CLEANING_INDIVIDUAL][PROVIDERS][RAPIDS]`:
+
+|Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;            | Description |
+|----------------|-----------------------------------------------------------------------------------------------------------------------------------
+|`[COMPUTE]` | Set to `True` to clean sensor features for individual participants from the `RAPIDS` provider|
+|`[IMPUTE_SELECTED_EVENT_FEATURES]`     | Fill NA with 0 for the selected event features, see table below
+|`[COLS_NAN_THRESHOLD]`                 | Discard columns with missing value ratios higher than `[COLS_NAN_THRESHOLD]`. Set to 1 to disable
+|`[COLS_VAR_THRESHOLD]`                 | Set to `True` to discard columns with zero variance
+|`[ROWS_NAN_THRESHOLD]`                 | Discard rows with missing value ratios higher than `[ROWS_NAN_THRESHOLD]`. Set to 1 to disable
+|`[DATA_YIELDED_HOURS_RATIO_THRESHOLD]` | Discard rows with `phone_data_yield_rapids_ratiovalidyieldedhours` feature less than `[DATA_YIELDED_HOURS_RATIO_THRESHOLD]`. Set to 0 to disable
+|`[DROP_HIGHLY_CORRELATED_FEATURES]`    | Discard highly correlated features, see table below
+
+Parameters description for `[ALL_CLEANING_INDIVIDUAL][PROVIDERS][RAPIDS][IMPUTE_SELECTED_EVENT_FEATURES]`:
+
+|Parameters                             | Description                                                    |
+|-------------------------------------- |----------------------------------------------------------------|
+|`[COMPUTE]`                            | Set to `True` to fill NA with 0 for the selected event features
+|`[MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]` | Assume the selected event sensor is working when `phone_data_yield_rapids_ratiovalidyieldedminutes` > `[MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]`. |
+
+Parameters description for `[ALL_CLEANING_INDIVIDUAL][PROVIDERS][RAPIDS][DROP_HIGHLY_CORRELATED_FEATURES]`:
+
+|Parameters                             | Description                                                    |
+|-------------------------------------- |----------------------------------------------------------------|
+|`[COMPUTE]`                            | Set to `True` to drop highly correlated features
+|`[MIN_OVERLAP_FOR_CORR_THRESHOLD]`     | Minimum ratio of observations required per pair of columns (features) to be considered as a valid correlation. 
+|`[CORR_THRESHOLD]` | The absolute values of pair-wise correlations are calculated. If two variables have a valid correlation higher than `[CORR_THRESHOLD]`, we look at the mean absolute correlation of each variable and remove the variable with the largest mean absolute correlation.
+
+Steps to clean sensor features for individual participants. It only considers the **phone sensors** currently.
+
+??? info "1. Fill NA with 0 for the selected event features."
+    Some event features should be zero instead of NA. In this step, we fill those missing features with 0 when the `phone_data_yield_rapids_ratiovalidyieldedminutes` column is higher than the `[IMPUTE_SELECTED_EVENT_FEATURES][MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]` parameter. Plugins such as Activity Recognition sensor are not considered. You can skip this step by setting `[IMPUTE_SELECTED_EVENT_FEATURES][COMPUTE]` to `False`.
+    
+    Take the phone calls sensor as an example. If there are no call records during a time segment for a participant, then either (1) the calls sensor was not working during that time segment; or (2) the calls sensor was working and the participant did not have any calls during that time segment. To differentiate these two situations, we assume the selected sensors are working when `phone_data_yield_rapids_ratiovalidyieldedminutes > [MIN_DATA_YIELDED_MINUTES_TO_IMPUTE]`.
+
+??? info "2. Discard unreliable rows."
+    Extracted features might not be reliable if the sensor only worked for a short period during a time segment. In this step, we discard rows when the `phone_data_yield_rapids_ratiovalidyieldedhours` column is less than the `[DATA_YIELDED_HOURS_RATIO_THRESHOLD]` parameter. We do not recommend you to skip this step, but you can do it by setting `[DATA_YIELDED_HOURS_RATIO_THRESHOLD]` to 0.
+
+??? info "3. Discard columns (features) with too many missing values."
+    In this step, we discard columns with missing value ratios higher than `[COLS_NAN_THRESHOLD]`. We do not recommend you to skip this step, but you can do it by setting `[COLS_NAN_THRESHOLD]` to 1.
+
+??? info "4. Discard columns (features) with zero variance."
+    In this step, we discard columns with zero variance. We do not recommend you to skip this step, but you can do it by setting `[COLS_VAR_THRESHOLD]` to `False`.
+
+??? info "5. Drop highly correlated features."
+    As highly correlated features might not bring additional information and will increase the complexity of our model, we drop them in this step. The absolute values of pair-wise correlations are calculated. A correlation is regarded as valid only if the ratio of observations available for that pair of columns (features) is at least `[DROP_HIGHLY_CORRELATED_FEATURES][MIN_OVERLAP_FOR_CORR_THRESHOLD]`. If two variables have a valid correlation higher than `[DROP_HIGHLY_CORRELATED_FEATURES][CORR_THRESHOLD]`, we look at the mean absolute correlation of each variable and remove the variable with the largest mean absolute correlation. This step can be skipped by setting `[DROP_HIGHLY_CORRELATED_FEATURES][COMPUTE]` to `False`.
+
+??? info "6. Discard rows with too many missing values."
+    In this step, we discard rows with missing value ratios higher than `[ROWS_NAN_THRESHOLD]`. We do not recommend you to skip this step, but you can do it by setting `[ROWS_NAN_THRESHOLD]` to 1.
+
+
+
+
+## Clean sensor features for all participants
+
+!!! info "File Sequence"
+    ```bash
+    - data/processed/features/all_participants/all_sensor_features.csv
+    - data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}.csv
+    ```
+
+
+### RAPIDS provider
+
+Parameters description and the steps are the same as the above [RAPIDS provider](#rapids-provider) section for individual participants.
+
+
diff --git a/docs/workflow-examples/minimal.md b/docs/analysis/minimal.md
similarity index 100%
rename from docs/workflow-examples/minimal.md
rename to docs/analysis/minimal.md
diff --git a/docs/features/phone-keyboard.md b/docs/features/phone-keyboard.md
index 905873ff..e5344d41 100644
--- a/docs/features/phone-keyboard.md
+++ b/docs/features/phone-keyboard.md
@@ -6,6 +6,12 @@ Sensor parameters description for `[PHONE_KEYBOARD]`:
 |----------------|-----------------------------------------------------------------------------------------------------------------------------------
 |`[CONTAINER]`| Data stream [container](../../datastreams/data-streams-introduction/) (database table, CSV file, etc.) where the keyboard data is stored
 
+## RAPIDS provider
+
+!!! info "Available time segments and platforms"
+    - Available for all time segments
+    - Available for Android only
+
 !!! info "File Sequence"
     ```bash
     - data/raw/{pid}/phone_keyboard_raw.csv
diff --git a/docs/index.md b/docs/index.md
index 2989fe81..ae201807 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,12 +1,12 @@
 # Welcome to RAPIDS documentation
 
-Reproducible Analysis Pipeline for Data Streams (RAPIDS) allows you to process smartphone and wearable data to [extract](features/feature-introduction.md) and [create](features/add-new-features.md) **behavioral features** (a.k.a. digital biomarkers), [visualize](visualizations/data-quality-visualizations.md) mobile sensor data, and [structure](workflow-examples/analysis.md) your analysis into reproducible workflows.
+Reproducible Analysis Pipeline for Data Streams (RAPIDS) allows you to process smartphone and wearable data to [extract](features/feature-introduction.md) and [create](features/add-new-features.md) **behavioral features** (a.k.a. digital biomarkers), [visualize](visualizations/data-quality-visualizations.md) mobile sensor data, and [structure](analysis/complete-workflow-example.md) your analysis into reproducible workflows.
 
 RAPIDS is open source, documented, multi-platform, modular, tested, and reproducible. At the moment, we support [data streams](datastreams/data-streams-introduction) logged by smartphones, Fitbit wearables, and Empatica wearables (the latter in collaboration with the [DBDP](https://dbdp.org/)). 
 
 !!! tip "Where do I start?"
 
-    :material-power-standby: New to RAPIDS? Check our [Overview + FAQ](setup/overview/) and [minimal example](workflow-examples/minimal)
+    :material-power-standby: New to RAPIDS? Check our [Overview + FAQ](setup/overview/) and [minimal example](analysis/minimal)
 
     :material-play-speed: [Install](setup/installation), [configure](setup/configuration), and [execute](setup/execution) RAPIDS to [extract](features/feature-introduction.md) and [plot](visualizations/data-quality-visualizations.md) behavioral features
 
diff --git a/docs/setup/overview.md b/docs/setup/overview.md
index 72e732e4..eb6da504 100644
--- a/docs/setup/overview.md
+++ b/docs/setup/overview.md
@@ -23,10 +23,10 @@ Let's review some key concepts we use throughout these docs:
     - [Add your own behavioral features](../../features/add-new-features/) (we can include them in RAPIDS if you want to share them with the community)
     - [Add support for new data streams](../../datastreams/add-new-data-streams/) if yours cannot be processed by RAPIDS yet
     - Create visualizations for [data quality control](../../visualizations/data-quality-visualizations/)  and [feature inspection](../../visualizations/feature-visualizations/)
-    - [Extending RAPIDS to organize your analysis](../../workflow-examples/analysis/) and publish a code repository along with your code
+    - [Extending RAPIDS to organize your analysis](../../analysis/complete-workflow-example/) and publish a code repository along with your code
 
 !!! hint
-    - We recommend you follow the [Minimal Example](../../workflow-examples/minimal/) tutorial to get familiar with RAPIDS
+    - We recommend you follow the [Minimal Example](../../analysis/minimal/) tutorial to get familiar with RAPIDS
 
     - In order to follow any of the previous tutorials, you will have to [Install](../installation/), [Configure](../configuration/), and learn how to [Execute](../execution/) RAPIDS.
 
diff --git a/example_profile/Snakefile b/example_profile/Snakefile
index 1dd19ea6..e8723de4 100644
--- a/example_profile/Snakefile
+++ b/example_profile/Snakefile
@@ -385,9 +385,12 @@ if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
     files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html")
 
 # Data Cleaning
-if config["DATA_CLEANING"]["COMPUTE"]:
-    files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned.csv", pid=config["PIDS"]))
-    files_to_compute.append("data/processed/features/all_participants/all_sensor_features_cleaned.csv")
+for provider in config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"].keys(): # per-participant cleaning providers
+    if config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][provider]["COMPUTE"]: # only providers whose COMPUTE flag is truthy
+        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() +".csv", pid=config["PIDS"])) # one cleaned file per pid, suffixed with the lower-cased provider key
+for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys(): # all-participants cleaning providers
+    if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]: # only providers whose COMPUTE flag is truthy
+        files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv")) # all_participants cleaned file, suffixed with the lower-cased provider key
 
 # Analysis Workflow Example
 models, scalers = [], []
diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml
index 3034ed30..873e7f98 100644
--- a/example_profile/example_config.yaml
+++ b/example_profile/example_config.yaml
@@ -538,14 +538,39 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
 #                                                    Data Cleaning                                                     #
 ########################################################################################################################
 
-DATA_CLEANING:
-  COMPUTE: True
-  COLS_NAN_THRESHOLD: 0.3
-  COLS_VAR_THRESHOLD: True
-  ROWS_NAN_THRESHOLD: 0.3
-  DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75
-  CORR_VALID_PAIRS_THRESHOLD: 0.5
-  CORR_THRESHOLD: 0.95
+ALL_CLEANING_INDIVIDUAL: # cleaning of the sensor features of each individual participant
+  PROVIDERS:
+    RAPIDS:
+      COMPUTE: True # set to True to clean sensor features for individual participants with this provider
+      IMPUTE_SELECTED_EVENT_FEATURES: # fill NA with 0 for the selected event features
+        COMPUTE: False # set to True to enable this imputation
+        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 # sensor is assumed to be working when phone_data_yield_rapids_ratiovalidyieldedminutes is above this value
+      COLS_NAN_THRESHOLD: 0.3 # discard columns with a missing value ratio higher than this; set to 1 to disable
+      COLS_VAR_THRESHOLD: True # set to True to discard columns with zero variance
+      ROWS_NAN_THRESHOLD: 0.3 # discard rows with a missing value ratio higher than this; set to 1 to disable
+      DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75 # discard rows whose phone_data_yield_rapids_ratiovalidyieldedhours is less than this; set to 0 to disable
+      DROP_HIGHLY_CORRELATED_FEATURES: # discard highly correlated features
+        COMPUTE: False # set to True to drop highly correlated features
+        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 # minimum ratio of observations per pair of columns for a correlation to count as valid
+        CORR_THRESHOLD: 0.95 # for valid correlations above this, the variable with the largest mean absolute correlation is removed
+      SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
+
+ALL_CLEANING_OVERALL: # cleaning of the sensor features of all participants together
+  PROVIDERS:
+    RAPIDS:
+      COMPUTE: True # set to True to clean the all-participants sensor features with this provider
+      IMPUTE_SELECTED_EVENT_FEATURES: # fill NA with 0 for the selected event features
+        COMPUTE: False # set to True to enable this imputation
+        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 # sensor is assumed to be working when phone_data_yield_rapids_ratiovalidyieldedminutes is above this value
+      COLS_NAN_THRESHOLD: 0.3 # discard columns with a missing value ratio higher than this; set to 1 to disable
+      COLS_VAR_THRESHOLD: True # set to True to discard columns with zero variance
+      ROWS_NAN_THRESHOLD: 0.3 # discard rows with a missing value ratio higher than this; set to 1 to disable
+      DATA_YIELDED_HOURS_RATIO_THRESHOLD: 0.75 # discard rows whose phone_data_yield_rapids_ratiovalidyieldedhours is less than this; set to 0 to disable
+      DROP_HIGHLY_CORRELATED_FEATURES: # discard highly correlated features
+        COMPUTE: False # set to True to drop highly correlated features
+        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 # minimum ratio of observations per pair of columns for a correlation to count as valid
+        CORR_THRESHOLD: 0.95 # for valid correlations above this, the variable with the largest mean absolute correlation is removed
+      SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
 
 
 ########################################################################################################################
diff --git a/mkdocs.yml b/mkdocs.yml
index ea96f145..d584e6c4 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -74,7 +74,7 @@ extra_css:
 nav:
   - Home: 'index.md'
   - Overview: setup/overview.md
-  - Minimal Example: workflow-examples/minimal.md
+  - Minimal Example: analysis/minimal.md
   - Citation: citation.md
   - Contributing: contributing.md
   - Setup:
@@ -140,8 +140,9 @@ nav:
   - Visualizations:
     - Data Quality: visualizations/data-quality-visualizations.md
     - Features: visualizations/feature-visualizations.md
-  - Analysis Workflows:
-    - Complete Example: workflow-examples/analysis.md
+  - Analysis:
+    - Data Cleaning: analysis/data-cleaning.md
+    - Complete Workflow Example: analysis/complete-workflow-example.md
   - Developers:
     - Git Flow: developers/git-flow.md
     - Remote Support: developers/remote-support.md
diff --git a/rules/models.smk b/rules/models.smk
index d3c5b7b4..6a07eb09 100644
--- a/rules/models.smk
+++ b/rules/models.smk
@@ -55,7 +55,7 @@ rule parse_targets:
 
 rule merge_features_and_targets_for_individual_model:
     input:
-        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv",
+        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_rapids.csv",
         targets = "data/processed/targets/{pid}/parsed_targets.csv",
     output:
         "data/processed/models/individual_model/{pid}/input.csv"
@@ -64,7 +64,7 @@ rule merge_features_and_targets_for_individual_model:
 
 rule merge_features_and_targets_for_population_model:
     input:
-        cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned.csv",
+        cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv",
         demographic_features = expand("data/processed/features/{pid}/demographic_features.csv", pid=config["PIDS"]),
         targets = expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]),
     output: