Merge branch 'ds2' into day_segments
commit
7f7eac0769
16
.travis.yml
16
.travis.yml
|
@ -1,6 +1,7 @@
|
||||||
services:
|
services:
|
||||||
- mysql
|
- mysql
|
||||||
- docker
|
- docker
|
||||||
|
sudo: required
|
||||||
language: python
|
language: python
|
||||||
jobs:
|
jobs:
|
||||||
include:
|
include:
|
||||||
|
@ -39,7 +40,6 @@ jobs:
|
||||||
- "$TRAVIS_BUILD_DIR/renv/library"
|
- "$TRAVIS_BUILD_DIR/renv/library"
|
||||||
script:
|
script:
|
||||||
- bash tests/scripts/run_tests.sh all test
|
- bash tests/scripts/run_tests.sh all test
|
||||||
# - bash tests/scripts/run_tests.sh periodic test && tests/scripts/run_tests.sh frequency test
|
|
||||||
- name: Python 3.7 on macOS
|
- name: Python 3.7 on macOS
|
||||||
os: osx
|
os: osx
|
||||||
osx_image: xcode11.3
|
osx_image: xcode11.3
|
||||||
|
@ -71,7 +71,6 @@ jobs:
|
||||||
- "$TRAVIS_BUILD_DIR/renv/library"
|
- "$TRAVIS_BUILD_DIR/renv/library"
|
||||||
script:
|
script:
|
||||||
- bash tests/scripts/run_tests.sh all test
|
- bash tests/scripts/run_tests.sh all test
|
||||||
# - bash tests/scripts/run_tests.sh periodic test # && tests/scripts/run_tests.sh frequency test
|
|
||||||
- stage: deploy
|
- stage: deploy
|
||||||
name: Python 3.7 on Xenial Linux Docker
|
name: Python 3.7 on Xenial Linux Docker
|
||||||
os: linux
|
os: linux
|
||||||
|
@ -81,13 +80,16 @@ jobs:
|
||||||
- docker login -u "agamk" -p $DOCKERPWD
|
- docker login -u "agamk" -p $DOCKERPWD
|
||||||
- docker tag rapids agamk/rapids:travislatest
|
- docker tag rapids agamk/rapids:travislatest
|
||||||
- docker push agamk/rapids:travislatest
|
- docker push agamk/rapids:travislatest
|
||||||
#branches:
|
branches:
|
||||||
# only:
|
only:
|
||||||
# - master
|
- master
|
||||||
# - day_segment
|
- day_segment
|
||||||
|
|
||||||
stages:
|
stages:
|
||||||
- name: deploy
|
- name: deploy
|
||||||
if: branch = master AND type = push
|
if: branch = master AND \
|
||||||
|
type = push
|
||||||
|
|
||||||
notifications:
|
notifications:
|
||||||
email: false
|
email: false
|
||||||
slack:
|
slack:
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
# getting base image ubuntu
|
||||||
|
FROM ubuntu:20.04
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
RUN apt update && apt install -y \
|
||||||
|
libcurl4-openssl-dev \
|
||||||
|
libssl-dev \
|
||||||
|
libxml2-dev \
|
||||||
|
libmysqlclient-dev \
|
||||||
|
mysql-server
|
||||||
|
RUN apt-get update && apt-get install -y gnupg
|
||||||
|
RUN apt-get update && apt-get install -y software-properties-common
|
||||||
|
RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
|
||||||
|
RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/'
|
||||||
|
RUN apt update && apt install -y r-base
|
||||||
|
RUN apt install -y pandoc
|
||||||
|
RUN apt install -y git
|
||||||
|
RUN apt-get update && apt-get install -y vim
|
||||||
|
RUN apt update && apt install -y unzip
|
||||||
|
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
|
||||||
|
ENV PATH /opt/conda/bin:$PATH
|
||||||
|
|
||||||
|
RUN apt-get update --fix-missing && \
|
||||||
|
apt-get install -y wget bzip2 ca-certificates curl git && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \
|
||||||
|
/bin/bash ~/miniconda.sh -b -p /opt/conda && \
|
||||||
|
rm ~/miniconda.sh && \
|
||||||
|
/opt/conda/bin/conda clean -tipsy && \
|
||||||
|
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
|
||||||
|
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
|
||||||
|
echo "conda activate base" >> ~/.bashrc
|
||||||
|
|
||||||
|
ENV TINI_VERSION v0.16.1
|
||||||
|
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /usr/bin/tini
|
||||||
|
RUN chmod +x /usr/bin/tini
|
||||||
|
RUN git clone https://github.com/carissalow/rapids
|
||||||
|
ENTRYPOINT [ "/usr/bin/tini", "--" ]
|
||||||
|
CMD [ "/bin/bash" ]
|
||||||
|
RUN conda update -n base -c defaults conda
|
||||||
|
WORKDIR /rapids
|
||||||
|
RUN conda env create -f environment.yml -n rapids
|
||||||
|
RUN Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'
|
||||||
|
RUN R -e 'renv::restore()'
|
||||||
|
ADD https://osf.io/587wc/download data/external
|
||||||
|
RUN mv data/external/download data/external/rapids_example.sql.zip
|
||||||
|
RUN unzip data/external/rapids_example.sql.zip
|
||||||
|
RUN cp rapids_example.sql data/external/rapids_example.sql
|
||||||
|
RUN rm data/external/rapids_example.sql.zip
|
||||||
|
RUN rm rapids_example.sql
|
||||||
|
RUN echo "source activate rapids" > ~/.bashrc
|
||||||
|
ENV PATH /opt/conda/envs/rapids/bin:$PATH
|
|
@ -243,8 +243,9 @@ PHONE_CONVERSATION:
|
||||||
RAPIDS:
|
RAPIDS:
|
||||||
COMPUTE: False
|
COMPUTE: False
|
||||||
FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
|
FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
|
||||||
"sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy",
|
"sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy",
|
||||||
"avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction",
|
"noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy",
|
||||||
|
"voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction",
|
||||||
"voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction",
|
"voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction",
|
||||||
"unknownexpectedfraction","countconversation"]
|
"unknownexpectedfraction","countconversation"]
|
||||||
RECORDING_MINUTES: 1
|
RECORDING_MINUTES: 1
|
||||||
|
|
|
@ -57,15 +57,27 @@ Nicolas is a rising senior studying computer science at the University of Pittsb
|
||||||
|
|
||||||
Nikunj Goel, BS
|
Nikunj Goel, BS
|
||||||
""""""""""""""""
|
""""""""""""""""
|
||||||
**intern**
|
**Intern**
|
||||||
|
|
||||||
Nik is a graduate student at the University of Pittsburgh pursuing Master of Science in Information Science. He earned his Bachelor of Technology degree in Information Technology from India. He is a Data Enthusiasts and passionate about finding the meaning out of raw data. In a long term, his goal is to create a breakthrough in Data Science and Deep Learning.
|
Nik is a graduate student at the University of Pittsburgh pursuing Master of Science in Information Science. He earned his Bachelor of Technology degree in Information Technology from India. He is a Data Enthusiasts and passionate about finding the meaning out of raw data. In a long term, his goal is to create a breakthrough in Data Science and Deep Learning.
|
||||||
|
|
||||||
`Nikunj Goel Linkedin Profile`_
|
`Nikunj Goel Linkedin Profile`_
|
||||||
|
|
||||||
|
Agam Kumar, BS
|
||||||
|
""""""""""""""""
|
||||||
|
**Research Assistant at CMU**
|
||||||
|
|
||||||
|
Agam is a junior at Carnegie Mellon University studying Statistics and Machine Learning and pursuing an additional major in Computer Science. He is a member of the Data Science team in the Health and Human Performance Lab at CMU and has keen interests in software development and data science. His research interests include ML applications in medicine.
|
||||||
|
|
||||||
|
`Agam Kumar Linkedin Profile`_
|
||||||
|
|
||||||
|
`Agam Kumar Github Profile`_
|
||||||
|
|
||||||
.. _`Julio Vega Personal Website`: https://juliovega.info/
|
.. _`Julio Vega Personal Website`: https://juliovega.info/
|
||||||
.. _`Meng Li Linkedin Profile`: https://www.linkedin.com/in/meng-li-57238414a
|
.. _`Meng Li Linkedin Profile`: https://www.linkedin.com/in/meng-li-57238414a
|
||||||
.. _`Meng Li Github Profile`: https://github.com/Meng6
|
.. _`Meng Li Github Profile`: https://github.com/Meng6
|
||||||
.. _`Kwesi Aguillera Linkedin Profile`: https://www.linkedin.com/in/kwesi-aguillera-29529823
|
.. _`Kwesi Aguillera Linkedin Profile`: https://www.linkedin.com/in/kwesi-aguillera-29529823
|
||||||
.. _`Echhit Joshi Linkedin Profile`: https://www.linkedin.com/in/echhitjoshi/
|
.. _`Echhit Joshi Linkedin Profile`: https://www.linkedin.com/in/echhitjoshi/
|
||||||
.. _`Nikunj Goel Linkedin Profile`: https://www.linkedin.com/in/nikunjgoel95/
|
.. _`Nikunj Goel Linkedin Profile`: https://www.linkedin.com/in/nikunjgoel95/
|
||||||
|
.. _`Agam Kumar Linkedin Profile`: https://www.linkedin.com/in/agam-kumar
|
||||||
|
.. _`Agam Kumar Github Profile`: https://github.com/agam-kumar
|
||||||
|
|
|
@ -1,17 +1,21 @@
|
||||||
How to Edit Documentation
|
How to Edit Documentation
|
||||||
============================
|
============================
|
||||||
|
|
||||||
The following is a basic guide for editing the documentation for this project. The documentation is rendered using Sphinx_ documentation builder. This guide is intended to be a basic guide that will allow a contributer to start editing the documentation for the RAPIDS Pipeline. The first step is to install Sphinx.
|
The following is a basic guide for editing the documentation for this project. The documentation is rendered using Sphinx_ documentation builder
|
||||||
|
|
||||||
Mac OS
|
Quick start up
|
||||||
|
----------------------------------
|
||||||
|
|
||||||
- ``brew install sphinx-doc``
|
#. Install Sphinx in Mac OS ``brew install sphinx-doc`` or Linux (Ubuntu) ``apt-get install python3-sphinx``
|
||||||
|
|
||||||
Linux (Ubuntu)
|
#. Go to the docs folder ``cd docs``
|
||||||
|
|
||||||
- ``apt-get install python3-sphinx``
|
#. Change any ``.rst`` file you need to modify
|
||||||
|
|
||||||
|
#. To visualise the results locally do ``make dirhtml`` and check the html files in the ``_build/dirhtml`` directory
|
||||||
|
|
||||||
|
#. When you are done, push your changes to the git repo.
|
||||||
|
|
||||||
Sphinx is a tool that translates a set of reStructuredText_ source files into various output formats such as HTML and PDF, automatically producing cross-references, indices, etc. The following is a basic outline of structure of Sphinx workspace and the syntax of reStructuredText.
|
|
||||||
|
|
||||||
Sphinx Workspace Structure
|
Sphinx Workspace Structure
|
||||||
----------------------------
|
----------------------------
|
||||||
|
@ -34,12 +38,6 @@ Thus the directory structure for the above example is shown below::
|
||||||
├── introduction.rst
|
├── introduction.rst
|
||||||
└── installation.rst
|
└── installation.rst
|
||||||
|
|
||||||
Once the ``index.rst`` has been editted and content has been added and/or editted the documentation is built using the following command::
|
|
||||||
|
|
||||||
$ make dirhtml
|
|
||||||
|
|
||||||
This command creates the ``_build`` directory which contains the generated HTML files of the documentation. It shoould be noted that once you have pushed your change to the repository the changes will be published even if you have not run ``make dirhtml``
|
|
||||||
|
|
||||||
|
|
||||||
Basic reStructuredText Syntax
|
Basic reStructuredText Syntax
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
Remote Support
|
||||||
|
======================================
|
||||||
|
|
||||||
|
We use the Live Share extension of Visual Studio Code to debug bugs when sharing data or database credentials is not possible.
|
||||||
|
|
||||||
|
#. Install `Visual Studio Code <https://code.visualstudio.com/>`_
|
||||||
|
|
||||||
|
#. Open you rapids folder in a new VSCode window
|
||||||
|
|
||||||
|
#. Open a new Terminal ``Terminal > New terminal``
|
||||||
|
|
||||||
|
#. Install the `Live Share extension pack <https://marketplace.visualstudio.com/items?itemName=MS-vsliveshare.vsliveshare-pack>`_
|
||||||
|
|
||||||
|
#. Press ``Ctrl+P``/``Cmd+P`` and run this command ``>live share: start collaboration session``
|
||||||
|
|
||||||
|
#. Follow the instructions and share the session link you receive
|
|
@ -100,7 +100,7 @@ Activity Recognition
|
||||||
Conversation
|
Conversation
|
||||||
"""""""""""""
|
"""""""""""""
|
||||||
|
|
||||||
- The raw conversation data file contains data for 1 day.
|
- The raw conversation data file contains data for 2 day.
|
||||||
- The raw conversation data contains records with a sample of both ``datatypes`` (i.e. ``voice/noise`` = ``0``, and ``conversation`` = ``2`` ) as well as rows with for samples of each of the ``inference`` values (i.e. ``silence`` = ``0``, ``noise`` = ``1``, ``voice`` = ``2``, and ``unknown`` = ``3``) for each ``epoch``. The different ``datatype`` and ``inference`` records are randomly distributed throughout the ``epoch``.
|
- The raw conversation data contains records with a sample of both ``datatypes`` (i.e. ``voice/noise`` = ``0``, and ``conversation`` = ``2`` ) as well as rows with for samples of each of the ``inference`` values (i.e. ``silence`` = ``0``, ``noise`` = ``1``, ``voice`` = ``2``, and ``unknown`` = ``3``) for each ``epoch``. The different ``datatype`` and ``inference`` records are randomly distributed throughout the ``epoch``.
|
||||||
- Additionally there are 2 - 5 records for conversations (``datatype`` = 2, and ``inference`` = -1) in each ``epoch`` and for each ``epoch`` except night, there is a conversation record that has a ``double_convo_start`` ``timestamp`` that is from the previous ``epoch``. This is to test the calculations of features across ``epochs``.
|
- Additionally there are 2 - 5 records for conversations (``datatype`` = 2, and ``inference`` = -1) in each ``epoch`` and for each ``epoch`` except night, there is a conversation record that has a ``double_convo_start`` ``timestamp`` that is from the previous ``epoch``. This is to test the calculations of features across ``epochs``.
|
||||||
- There is a raw conversation data file for both android and iOS platforms (``plugin_studentlife_audio_android_raw.csv`` and ``plugin_studentlife_audio_raw.csv`` respectively).
|
- There is a raw conversation data file for both android and iOS platforms (``plugin_studentlife_audio_android_raw.csv`` and ``plugin_studentlife_audio_raw.csv`` respectively).
|
||||||
|
|
|
@ -49,6 +49,8 @@ Global Parameters
|
||||||
- ``FIXED_TIMEZONE``. See ``TIMEZONE`` above. This assumes that all data of all participants was collected within one time zone.
|
- ``FIXED_TIMEZONE``. See ``TIMEZONE`` above. This assumes that all data of all participants was collected within one time zone.
|
||||||
- Support for multiple time zones for each participant coming soon based on the ``timezone`` table collected by Aware.
|
- Support for multiple time zones for each participant coming soon based on the ``timezone`` table collected by Aware.
|
||||||
|
|
||||||
|
.. _phone-valid-sensed-bins:
|
||||||
|
|
||||||
- ``PHONE_VALID_SENSED_BINS``
|
- ``PHONE_VALID_SENSED_BINS``
|
||||||
Contains three attributes: ``COMPUTE``, ``BIN_SIZE`` and ``TABLES``. See the PHONE_VALID_SENSED_BINS_ section in the ``config.yaml`` file
|
Contains three attributes: ``COMPUTE``, ``BIN_SIZE`` and ``TABLES``. See the PHONE_VALID_SENSED_BINS_ section in the ``config.yaml`` file
|
||||||
|
|
||||||
|
@ -492,13 +494,12 @@ features Features to be computed, see table below
|
||||||
====================== ============== =============
|
====================== ============== =============
|
||||||
Name Units Description
|
Name Units Description
|
||||||
====================== ============== =============
|
====================== ============== =============
|
||||||
count rows Number of detect activity events (rows).
|
count rows Number of episodes.
|
||||||
mostcommonactivity activity_type The most common ``activity_type``. If this feature is not unique the first ``activity_type`` of the set of most common ``activity_types`` is selected ordered by ``activity_type``.
|
mostcommonactivity activity_type The most common ``activity_type``. If this feature is not unique the first ``activity_type`` of the set of most common ``activity_types`` is selected ordered by ``activity_type``.
|
||||||
countuniqueactivities activities Number of unique activities.
|
countuniqueactivities activity_type Number of unique ``activity_type``.
|
||||||
activitychangecount transitions Number of transitions between two different activities; still to running for example.
|
durationstationary minutes The total duration of episodes of still and tilting (phone) activities.
|
||||||
sumstationary minutes The total duration of episodes of still and tilting (phone) activities.
|
durationmobile minutes The total duration of episodes of on foot, running, and on bicycle activities
|
||||||
summobile minutes The total duration of episodes of on foot, running, and on bicycle activities
|
durationvehicle minutes The total duration of episodes of on vehicle activity
|
||||||
sumvehicle minutes The total duration of episodes of on vehicle activity
|
|
||||||
====================== ============== =============
|
====================== ============== =============
|
||||||
|
|
||||||
**Assumptions/Observations:**
|
**Assumptions/Observations:**
|
||||||
|
@ -844,11 +845,16 @@ avgconversationduration minutes Average duration of all conversa
|
||||||
sdconversationduration minutes Standard Deviation of the duration of all conversations
|
sdconversationduration minutes Standard Deviation of the duration of all conversations
|
||||||
timefirstconversation minutes Minutes since midnight when the first conversation for a day segment was detected
|
timefirstconversation minutes Minutes since midnight when the first conversation for a day segment was detected
|
||||||
timelastconversation minutes Minutes since midnight when the last conversation for a day segment was detected
|
timelastconversation minutes Minutes since midnight when the last conversation for a day segment was detected
|
||||||
sumenergy L2-norm Sum of all energy values
|
noisesumenergy L2-norm Sum of all energy values when inference is noise
|
||||||
avgenergy L2-norm Average of all energy values
|
noiseavgenergy L2-norm Average of all energy values when inference is noise
|
||||||
sdenergy L2-norm Standard Deviation of all energy values
|
noisesdenergy L2-norm Standard Deviation of all energy values when inference is noise
|
||||||
minenergy L2-norm Minimum of all energy values
|
noiseminenergy L2-norm Minimum of all energy values when inference is noise
|
||||||
maxenergy L2-norm Maximum of all energy values
|
noisemaxenergy L2-norm Maximum of all energy values when inference is noise
|
||||||
|
voicesumenergy L2-norm Sum of all energy values when inference is voice
|
||||||
|
voiceavgenergy L2-norm Average of all energy values when inference is voice
|
||||||
|
voicesdenergy L2-norm Standard Deviation of all energy values when inference is voice
|
||||||
|
voiceminenergy L2-norm Minimum of all energy values when inference is voice
|
||||||
|
voicemaxenergy L2-norm Maximum of all energy values when inference is voice
|
||||||
silencesensedfraction Ratio between minutessilence and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
|
silencesensedfraction Ratio between minutessilence and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
|
||||||
noisesensedfraction Ratio between minutesnoise and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
|
noisesensedfraction Ratio between minutesnoise and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
|
||||||
voicesensedfraction Ratio between minutesvoice and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
|
voicesensedfraction Ratio between minutesvoice and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
|
||||||
|
|
|
@ -6,7 +6,9 @@
|
||||||
RAPIDS
|
RAPIDS
|
||||||
======
|
======
|
||||||
|
|
||||||
**R**\ eproducible **A**\ nalysis **Pi**\ pline for **D**\ ata **S**\ treams
|
**R**\ eproducible **A**\ nalysis **Pi**\ peline for **D**\ ata **S**\ treams
|
||||||
|
|
||||||
|
Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_
|
||||||
|
|
||||||
Contents:
|
Contents:
|
||||||
|
|
||||||
|
@ -27,13 +29,22 @@ Contents:
|
||||||
|
|
||||||
features/extracted
|
features/extracted
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
:caption: Visualization
|
||||||
|
|
||||||
|
visualization/data_exploration
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
:caption: Developers
|
:caption: Developers
|
||||||
|
|
||||||
|
develop/remotesupport
|
||||||
develop/documentation
|
develop/documentation
|
||||||
develop/features
|
develop/features
|
||||||
develop/environments
|
develop/environments
|
||||||
develop/contributors
|
develop/contributors
|
||||||
develop/testing
|
develop/testing
|
||||||
develop/test_cases
|
develop/test_cases
|
||||||
|
|
||||||
|
.. _slack: http://awareframework.com:3000/
|
||||||
|
|
|
@ -26,6 +26,8 @@ This is a quick guide for creating and running a simple pipeline to analysis an
|
||||||
|
|
||||||
#. Make sure your conda environment is active (the environment is already active in our docker container). See step 6 of :ref:`install-page`.
|
#. Make sure your conda environment is active (the environment is already active in our docker container). See step 6 of :ref:`install-page`.
|
||||||
|
|
||||||
|
#. If you installed RAPIDS from GitHub (did not use docker) you need to download the `example db backup <https://osf.io/skqfv/files/>`_ and save it to ``data/external/rapids_example.sql``.
|
||||||
|
|
||||||
#. Run the following command to restore database from ``rapids_example.sql`` file::
|
#. Run the following command to restore database from ``rapids_example.sql`` file::
|
||||||
|
|
||||||
snakemake -j1 restore_sql_file
|
snakemake -j1 restore_sql_file
|
||||||
|
|
|
@ -146,6 +146,33 @@ This is a bug in Ubuntu 20.04 when trying to connect to an old MySQL server with
|
||||||
|
|
||||||
If you can't update your server, the quickest solution would be to import your database to another server or to a local environment. Alternatively, you could replace ``mysql-client`` and ``libmysqlclient-dev`` with ``mariadb-client`` and ``libmariadbclient-dev`` and reinstall renv. More info about this issue here https://bugs.launchpad.net/ubuntu/+source/mysql-8.0/+bug/1872541
|
If you can't update your server, the quickest solution would be to import your database to another server or to a local environment. Alternatively, you could replace ``mysql-client`` and ``libmysqlclient-dev`` with ``mariadb-client`` and ``libmariadbclient-dev`` and reinstall renv. More info about this issue here https://bugs.launchpad.net/ubuntu/+source/mysql-8.0/+bug/1872541
|
||||||
|
|
||||||
|
11. ``DB_TABLES`` key not found
|
||||||
|
""""""""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
If you get the following error ``KeyError in line 43 of preprocessing.smk: 'DB_TABLES'``, means that the indentation of the key ``DB_TABLES`` is not matching the other child elements of ``PHONE_VALID_SENSED_BINS`` and you need to add or remove any leading whitespaces as needed.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
PHONE_VALID_SENSED_BINS:
|
||||||
|
COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features
|
||||||
|
BIN_SIZE: &bin_size 5 # (in minutes)
|
||||||
|
# Add as many sensor tables as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS.
|
||||||
|
# If you are extracting screen or Barnett's location features, screen and locations tables are mandatory.
|
||||||
|
DB_TABLES: []
|
||||||
|
|
||||||
|
12. Error while updating your conda environment in Ubuntu
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
If you get the following error try reinstalling conda.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
CondaMultiError: CondaVerificationError: The package for tk located at /home/ubuntu/miniconda2/pkgs/tk-8.6.9-hed695b0_1003
|
||||||
|
appears to be corrupted. The path 'include/mysqlStubs.h'
|
||||||
|
specified in the package manifest cannot be found.
|
||||||
|
ClobberError: This transaction has incompatible packages due to a shared path.
|
||||||
|
packages: conda-forge/linux-64::llvm-openmp-10.0.0-hc9558a2_0, anaconda/linux-64::intel-openmp-2019.4-243
|
||||||
|
path: 'lib/libiomp5.so'
|
||||||
|
|
||||||
|
|
||||||
.. ------------------------ Links --------------------------- ..
|
.. ------------------------ Links --------------------------- ..
|
||||||
|
|
|
@ -7,6 +7,8 @@ At the moment, mobile data can be collected using different sensing frameworks (
|
||||||
|
|
||||||
We recommend reading Snakemake_ docs, but the main idea behind the pipeline is that every link in the analysis chain is a rule with an input and an output. Input and output are files, which can be manipulated using any programming language (although Snakemake_ has wrappers for Julia_, Python_, and R_ that can make development slightly more comfortable). Snakemake_ also allows the pipeline rules to be executed in parallel on multiple cores without any code changes. This can drastically reduce the time needed to complete an analysis.
|
We recommend reading Snakemake_ docs, but the main idea behind the pipeline is that every link in the analysis chain is a rule with an input and an output. Input and output are files, which can be manipulated using any programming language (although Snakemake_ has wrappers for Julia_, Python_, and R_ that can make development slightly more comfortable). Snakemake_ also allows the pipeline rules to be executed in parallel on multiple cores without any code changes. This can drastically reduce the time needed to complete an analysis.
|
||||||
|
|
||||||
|
Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_
|
||||||
|
|
||||||
Available features:
|
Available features:
|
||||||
|
|
||||||
- :ref:`accelerometer-sensor-doc`
|
- :ref:`accelerometer-sensor-doc`
|
||||||
|
@ -39,3 +41,4 @@ We are updating these docs constantly, but if you think something needs clarific
|
||||||
.. _Fitbit: https://www.fitbit.com/us/home
|
.. _Fitbit: https://www.fitbit.com/us/home
|
||||||
.. _Python: https://www.python.org/
|
.. _Python: https://www.python.org/
|
||||||
.. _Julia: https://julialang.org/
|
.. _Julia: https://julialang.org/
|
||||||
|
.. _slack: http://awareframework.com:3000/
|
||||||
|
|
|
@ -0,0 +1,216 @@
|
||||||
|
.. _data_exploration:
|
||||||
|
|
||||||
|
Data Exploration
|
||||||
|
================
|
||||||
|
|
||||||
|
These plots are in beta, if you get an error while computing them please let us know.
|
||||||
|
|
||||||
|
.. _histogram-of-valid-sensed-hours:
|
||||||
|
|
||||||
|
Histogram of valid sensed hours
|
||||||
|
"""""""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
See `Histogram of Valid Sensed Hours Config Code`_
|
||||||
|
|
||||||
|
**Rule Chain:**
|
||||||
|
|
||||||
|
- Rule: ``rules/preprocessing.smk/download_dataset``
|
||||||
|
- Rule: ``rules/preprocessing.smk/readable_datetime``
|
||||||
|
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
|
||||||
|
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
|
||||||
|
- Rule: ``rules/reports.smk/histogram_valid_sensed_hours``
|
||||||
|
|
||||||
|
.. _figure1-parameters:
|
||||||
|
|
||||||
|
**Parameters of histogram_valid_sensed_hours Rule:**
|
||||||
|
|
||||||
|
======================= =======================
|
||||||
|
Name Description
|
||||||
|
======================= =======================
|
||||||
|
plot Whether the rule is executed or not. The available options are ``True`` and ``False``.
|
||||||
|
min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS<phone-valid-sensed-bins>` for more information.
|
||||||
|
min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS<phone-valid-sensed-days>` for more information.
|
||||||
|
======================= =======================
|
||||||
|
|
||||||
|
**Observations:**
|
||||||
|
|
||||||
|
This histogram shows the valid sensed hours of all participants processed in RAPIDS (See valid sensed :ref:`bins<phone-valid-sensed-bins>` and :ref:`days<phone-valid-sensed-days>` sections). It can be used as a rough indication of the AWARE client monitoring coverage during a study for all participants. See Figure 1.
|
||||||
|
|
||||||
|
.. figure:: figures/Figure1.png
|
||||||
|
:scale: 90 %
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
Figure 1 Histogram of valid sensed hours for all participants
|
||||||
|
|
||||||
|
|
||||||
|
.. _heatmap-of-phone-sensed-bins:
|
||||||
|
|
||||||
|
Heatmap of phone sensed bins
|
||||||
|
""""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
See `Heatmap of Phone Sensed Bins Config Code`_
|
||||||
|
|
||||||
|
**Rule Chain:**
|
||||||
|
|
||||||
|
- Rule: ``rules/preprocessing.smk/download_dataset``
|
||||||
|
- Rule: ``rules/preprocessing.smk/readable_datetime``
|
||||||
|
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
|
||||||
|
- Rule: ``rules/reports.smk/heatmap_sensed_bins``
|
||||||
|
|
||||||
|
.. _figure2-parameters:
|
||||||
|
|
||||||
|
**Parameters of heatmap_sensed_bins Rule:**
|
||||||
|
|
||||||
|
======================= =======================
|
||||||
|
Name Description
|
||||||
|
======================= =======================
|
||||||
|
plot Whether the rule is executed or not. The available options are ``True`` and ``False``.
|
||||||
|
bin_size Every hour is divided into N bins of size ``BIN_SIZE`` (in minutes). It modifies the way we compute ``data/interim/pXX/phone_sensed_bins.csv`` file.
|
||||||
|
======================= =======================
|
||||||
|
|
||||||
|
**Observations:**
|
||||||
|
|
||||||
|
In this heatmap rows are dates, columns are sensed bins for a participant, and cells’ color shows the number of mobile sensors that logged at least one row of data during that bin. This plot shows the periods of time without data for a participant and can be used as a rough indication of whether time-based sensors were following their sensing schedule (e.g. if location was being sensed every 2 minutes). See Figure 2.
|
||||||
|
|
||||||
|
.. figure:: figures/Figure2.png
|
||||||
|
:scale: 90 %
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
Figure 2 Heatmap of phone sensed bins for a single participant
|
||||||
|
|
||||||
|
|
||||||
|
.. _heatmap-of-days-by-sensors
|
||||||
|
|
||||||
|
Heatmap of days by sensors
|
||||||
|
""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
See `Heatmap of Days by Sensors Config Code`_
|
||||||
|
|
||||||
|
**Rule Chain:**
|
||||||
|
|
||||||
|
- Rule: ``rules/preprocessing.smk/download_dataset``
|
||||||
|
- Rule: ``rules/preprocessing.smk/readable_datetime``
|
||||||
|
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
|
||||||
|
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
|
||||||
|
- Rule: ``rules/reports.smk/heatmap_days_by_sensors``
|
||||||
|
|
||||||
|
.. _figure3-parameters:
|
||||||
|
|
||||||
|
**Parameters of heatmap_days_by_sensors Rule:**
|
||||||
|
|
||||||
|
======================= =======================
|
||||||
|
Name Description
|
||||||
|
======================= =======================
|
||||||
|
plot Whether the rule is executed or not. The available options are ``True`` and ``False``.
|
||||||
|
min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS<phone-valid-sensed-bins>` for more information.
|
||||||
|
min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS<phone-valid-sensed-days>` for more information.
|
||||||
|
expected_num_of_days The number of days of data to show starting from the first day of each participant.
|
||||||
|
db_tables List of sensor tables to compute valid bins & hours.
|
||||||
|
======================= =======================
|
||||||
|
|
||||||
|
**Observations:**
|
||||||
|
|
||||||
|
In this heatmap rows are sensors, columns are days and cells’ color shows the normalized (0 to 1) number of valid sensed hours (See valid sensed :ref:`bins<phone-valid-sensed-bins>` and :ref:`days<phone-valid-sensed-days>` sections) collected by a sensor during a day for a participant. The user can decide how many days of data to show starting from the first day of each participant. This plot can be used to judge missing data on a per participant, per sensor basis as well as the number of valid sensed hours (usable data) for each day. See Figure 3.
|
||||||
|
|
||||||
|
.. figure:: figures/Figure3.png
|
||||||
|
:scale: 90 %
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
Figure 3 Heatmap of days by sensors for a participant
|
||||||
|
|
||||||
|
|
||||||
|
.. _overall-compliance-heatmap:
|
||||||
|
|
||||||
|
Overall compliance heatmap
|
||||||
|
""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
See `Overall Compliance Heatmap Config Code`_
|
||||||
|
|
||||||
|
**Rule Chain:**
|
||||||
|
|
||||||
|
- Rule: ``rules/preprocessing.smk/download_dataset``
|
||||||
|
- Rule: ``rules/preprocessing.smk/readable_datetime``
|
||||||
|
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
|
||||||
|
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
|
||||||
|
- Rule: ``rules/reports.smk/overall_compliance_heatmap``
|
||||||
|
|
||||||
|
.. _figure4-parameters:
|
||||||
|
|
||||||
|
**Parameters of overall_compliance_heatmap Rule:**
|
||||||
|
|
||||||
|
======================= =======================
|
||||||
|
Name Description
|
||||||
|
======================= =======================
|
||||||
|
plot Whether the rule is executed or not. The available options are ``True`` and ``False``.
|
||||||
|
only_show_valid_days Whether the plot only shows valid days or not. The available options are ``True`` and ``False``.
|
||||||
|
expected_num_of_days The number of days to show before today.
|
||||||
|
bin_size Every hour is divided into N bins of size ``BIN_SIZE`` (in minutes). It modifies the way we compute ``data/interim/pXX/phone_sensed_bins.csv`` file.
|
||||||
|
min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS<phone-valid-sensed-bins>` for more information.
|
||||||
|
min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS<phone-valid-sensed-days>` for more information.
|
||||||
|
======================= =======================
|
||||||
|
|
||||||
|
**Observations:**
|
||||||
|
|
||||||
|
In this heatmap rows are participants, columns are days and cells’ color shows the valid sensed hours for a participant during a day (See valid sensed :ref:`bins<phone-valid-sensed-bins>` and :ref:`days<phone-valid-sensed-days>` sections). This plot can be configured to show a certain number of days before today using the ``EXPECTED_NUM_OF_DAYS`` parameter (by default -1 showing all days for every participant). As different participants might join the study on different dates, the x-axis has a day index instead of a date. This plot gives the user a quick overview of the amount of data collected per person and is complementary to the histogram of valid sensed hours as it is broken down per participant and per day. See Figure 4.
|
||||||
|
|
||||||
|
.. figure:: figures/Figure4.png
|
||||||
|
:scale: 90 %
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
Figure 4 Overall compliance heatmap for all participants
|
||||||
|
|
||||||
|
|
||||||
|
.. _heatmap-of-correlation-matrix-between-features:
|
||||||
|
|
||||||
|
Heatmap of correlation matrix between features
|
||||||
|
""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
See `Heatmap of Correlation Matrix Config Code`_
|
||||||
|
|
||||||
|
**Rule Chain:**
|
||||||
|
|
||||||
|
- Rules to extract features
|
||||||
|
- Rule: ``rules/preprocessing.smk/download_dataset``
|
||||||
|
- Rule: ``rules/preprocessing.smk/readable_datetime``
|
||||||
|
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
|
||||||
|
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
|
||||||
|
- Rule: ``rules/reports.smk/heatmap_features_correlations``
|
||||||
|
|
||||||
|
.. _figure5-parameters:
|
||||||
|
|
||||||
|
**Parameters of heatmap_features_correlations Rule:**
|
||||||
|
|
||||||
|
======================= ==============
|
||||||
|
Name Description
|
||||||
|
======================= ==============
|
||||||
|
plot Whether the rule is executed or not. The available options are ``True`` and ``False``.
|
||||||
|
min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS<phone-valid-sensed-bins>` for more information.
|
||||||
|
min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS<phone-valid-sensed-days>` for more information.
|
||||||
|
corr_method Method of correlation. The available options are ``pearson``, ``kendall`` and ``spearman``.
|
||||||
|
min_rows_ratio            Minimum number of observations required per pair of columns to have a valid correlation coefficient. Currently, only available for ``pearson`` and ``spearman`` correlation.
|
||||||
|
phone_features The list of phone features.
|
||||||
|
fitbit_features The list of Fitbit features.
|
||||||
|
corr_threshold Only correlation coefficients larger than ``corr_threshold`` can be shown in the heatmap.
|
||||||
|
======================= ==============
|
||||||
|
|
||||||
|
**Observations:**
|
||||||
|
|
||||||
|
Columns and rows are features computed in RAPIDS, cells’ color represents the correlation coefficient between all days of data for every pair of feature of all participants. The user can specify a minimum number of observations required to compute the correlation between two features using the ``MIN_ROWS_RATIO`` parameter (0.5 by default). In addition, this plot can be configured to only display correlation coefficients above a threshold using the ``CORR_THRESHOLD`` parameter (0.1 by default). See Figure 5.
|
||||||
|
|
||||||
|
.. figure:: figures/Figure5.png
|
||||||
|
:scale: 90 %
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
Figure 5 Correlation matrix heatmap for all the data of all participants
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
.. _`Histogram of Valid Sensed Hours Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L221
|
||||||
|
.. _`Heatmap of Phone Sensed Bins Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L233
|
||||||
|
.. _`Heatmap of Days by Sensors Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L226
|
||||||
|
.. _`Overall Compliance Heatmap Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L237
|
||||||
|
.. _`Heatmap of Correlation Matrix Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L211
|
Binary file not shown.
After Width: | Height: | Size: 22 KiB |
Binary file not shown.
After Width: | Height: | Size: 274 KiB |
Binary file not shown.
After Width: | Height: | Size: 121 KiB |
Binary file not shown.
After Width: | Height: | Size: 108 KiB |
Binary file not shown.
After Width: | Height: | Size: 113 KiB |
|
@ -2,6 +2,7 @@ source("renv/activate.R")
|
||||||
|
|
||||||
library("dplyr", warn.conflicts = F)
|
library("dplyr", warn.conflicts = F)
|
||||||
library(tidyr)
|
library(tidyr)
|
||||||
|
library(lubridate)
|
||||||
|
|
||||||
all_sensors <- snakemake@input[["all_sensors"]]
|
all_sensors <- snakemake@input[["all_sensors"]]
|
||||||
bin_size <- snakemake@params[["bin_size"]]
|
bin_size <- snakemake@params[["bin_size"]]
|
||||||
|
@ -16,16 +17,24 @@ for(sensor in all_sensors){
|
||||||
all_sensor_data <- rbind(all_sensor_data, sensor_data)
|
all_sensor_data <- rbind(all_sensor_data, sensor_data)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(nrow(all_sensor_data) == 0){
|
||||||
|
bins = seq(0, 59, by = bin_size)
|
||||||
|
hours = seq(0, 23, 1)
|
||||||
|
write.csv(crossing(hours, bins) %>% unite("hour_bin",hours, bins, sep = "_") %>% mutate(value = NA, local_date = NA) %>% pivot_wider(names_from = hour_bin, values_from=value) %>% head(0), output_file, row.names = FALSE)
|
||||||
|
} else{
|
||||||
phone_sensed_bins <- all_sensor_data %>%
|
phone_sensed_bins <- all_sensor_data %>%
|
||||||
mutate(bin = (local_minute %/% bin_size) * bin_size) %>% # bin rows into bin_size-minute bins
|
mutate(bin = (local_minute %/% bin_size) * bin_size) %>% # bin rows into bin_size-minute bins
|
||||||
group_by(local_date, local_hour, bin) %>%
|
group_by(local_date, local_hour, bin) %>%
|
||||||
summarise(sensor_count = n_distinct(sensor)) %>%
|
summarise(sensor_count = n_distinct(sensor)) %>%
|
||||||
ungroup() %>%
|
ungroup() %>%
|
||||||
|
mutate(local_date = lubridate::ymd(local_date)) %>%
|
||||||
|
complete(local_date = seq.Date(min(local_date), max(local_date), by="day"),
|
||||||
|
fill = list(local_hour = 0, bin = 0, sensor_count = 0)) %>%
|
||||||
complete(nesting(local_date),
|
complete(nesting(local_date),
|
||||||
local_hour = seq(0, 23, 1),
|
local_hour = seq(0, 23, 1),
|
||||||
bin = seq(0, (59 %/% bin_size) * bin_size, bin_size),
|
bin = seq(0, 59, bin_size),
|
||||||
fill = list(sensor_count=0)) %>%
|
fill = list(sensor_count=0)) %>%
|
||||||
pivot_wider(names_from = c(local_hour, bin), values_from = sensor_count)
|
pivot_wider(names_from = c(local_hour, bin), values_from = sensor_count)
|
||||||
|
|
||||||
write.csv(phone_sensed_bins, output_file, row.names = FALSE)
|
write.csv(phone_sensed_bins, output_file, row.names = FALSE)
|
||||||
|
}
|
||||||
|
|
|
@ -9,9 +9,8 @@ output_file <- snakemake@output[[1]]
|
||||||
|
|
||||||
phone_valid_sensed_days <- phone_sensed_bins %>%
|
phone_valid_sensed_days <- phone_sensed_bins %>%
|
||||||
pivot_longer(cols = -local_date, names_to = c("hour", "bin"), names_sep = "_") %>%
|
pivot_longer(cols = -local_date, names_to = c("hour", "bin"), names_sep = "_") %>%
|
||||||
filter(value > 0) %>%
|
|
||||||
group_by(local_date, hour) %>%
|
group_by(local_date, hour) %>%
|
||||||
summarise(valid_bins = n()) %>%
|
summarise(valid_bins = sum(value > 0)) %>%
|
||||||
group_by(local_date) %>%
|
group_by(local_date) %>%
|
||||||
summarise(valid_sensed_hours = sum(valid_bins >= min_valid_bins_per_hour)) %>%
|
summarise(valid_sensed_hours = sum(valid_bins >= min_valid_bins_per_hour)) %>%
|
||||||
mutate(is_valid_sensed_day = ifelse(valid_sensed_hours >= min_valid_hours_per_day, TRUE, FALSE))
|
mutate(is_valid_sensed_day = ifelse(valid_sensed_hours >= min_valid_hours_per_day, TRUE, FALSE))
|
||||||
|
|
|
@ -75,7 +75,12 @@ barnett_features <- function(sensor_data_files, day_segment, params){
|
||||||
|
|
||||||
# Select only the columns that the algorithm needs
|
# Select only the columns that the algorithm needs
|
||||||
location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)
|
location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)
|
||||||
|
if(nrow(location %>% filter(accuracy < accuracy_limit)) > 1){
|
||||||
outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)
|
outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)
|
||||||
|
} else {
|
||||||
|
print(paste("Cannot compute location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT", accuracy_limit))
|
||||||
|
outputMobility <- NULL
|
||||||
|
}
|
||||||
|
|
||||||
if(is.null(outputMobility)){
|
if(is.null(outputMobility)){
|
||||||
location_features <- create_empty_file(requested_features)
|
location_features <- create_empty_file(requested_features)
|
||||||
|
|
|
@ -46,6 +46,11 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_seg
|
||||||
|
|
||||||
location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)]
|
location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)]
|
||||||
|
|
||||||
|
if location_data.empty:
|
||||||
|
location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute])
|
||||||
|
location_features = location_features.reset_index(drop=True)
|
||||||
|
return location_features
|
||||||
|
|
||||||
if "locationvariance" in features_to_compute:
|
if "locationvariance" in features_to_compute:
|
||||||
location_features["locations_doryab_locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()
|
location_features["locations_doryab_locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()
|
||||||
|
|
||||||
|
@ -353,6 +358,8 @@ def radius_of_gyration(locationData,sampling_frequency):
|
||||||
rog = rog + (time_in_cluster * distance)
|
rog = rog + (time_in_cluster * distance)
|
||||||
|
|
||||||
time_all_clusters = valid_clusters.shape[0] * sampling_frequency
|
time_all_clusters = valid_clusters.shape[0] * sampling_frequency
|
||||||
|
if time_all_clusters == 0:
|
||||||
|
return 0
|
||||||
final_rog = (1/time_all_clusters) * rog
|
final_rog = (1/time_all_clusters) * rog
|
||||||
|
|
||||||
return np.sqrt(final_rog)
|
return np.sqrt(final_rog)
|
||||||
|
|
|
@ -40,16 +40,18 @@ def getOverallComplianceHeatmap(sensors_with_data, valid_sensed_hours, last_cert
|
||||||
x=[date.replace("-", "/") for date in last_certain_dates] if expected_num_of_days != -1 else last_certain_dates,
|
x=[date.replace("-", "/") for date in last_certain_dates] if expected_num_of_days != -1 else last_certain_dates,
|
||||||
y=[pid + "." + label for pid, label in zip(sensors_with_data["pid"].to_list(), sensors_with_data["label"].to_list())],
|
y=[pid + "." + label for pid, label in zip(sensors_with_data["pid"].to_list(), sensors_with_data["label"].to_list())],
|
||||||
text=sensors_with_data[last_certain_dates].values,
|
text=sensors_with_data[last_certain_dates].values,
|
||||||
hovertemplate="Date: %{x}<br>Participant: %{y}<br>Valid sensed hours: %{z}<br>Number of sensors with data: %{text}<extra></extra>" if expected_num_of_days != -1 else "Date_idx: %{x}<br>Participant: %{y}<br>Valid sensed hours: %{z}<br>Number of sensors with data: %{text}<extra></extra>",
|
hovertemplate="Date: %{x}<br>Participant: %{y}<br>Valid sensed hours: %{z}<br>Number of sensors with data: %{text}<extra></extra>" if expected_num_of_days != -1 else "Day index: %{x}<br>Participant: %{y}<br>Valid sensed hours: %{z}<br>Number of sensors with data: %{text}<extra></extra>",
|
||||||
colorscale="Viridis",
|
colorscale="Viridis",
|
||||||
colorbar={"tick0": 0,"dtick": 1},
|
colorbar={"tick0": 0,"dtick": 1},
|
||||||
showscale=True))
|
showscale=True))
|
||||||
if expected_num_of_days != -1:
|
if expected_num_of_days != -1:
|
||||||
plot.update_layout(title="Overall compliance heatmap for last " + str(expected_num_of_days) + " days.<br>Bin's color shows valid sensed hours for that day.<br>A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes")
|
plot.update_layout(title="Overall compliance heatmap for last " + str(expected_num_of_days) + " days.<br>Bin's color shows valid sensed hours for that day.<br>A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes.<br>You can hover over every day to see the number of sensors with data in that day.")
|
||||||
else:
|
else:
|
||||||
plot.update_layout(title="Overall compliance heatmap for all days.<br>Bin's color shows valid sensed hours for that day.<br>A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes")
|
plot.update_layout(title="Overall compliance heatmap for all days.<br>Bin's color shows valid sensed hours for that day.<br>A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes.<br>You can hover over every day to see the number of sensors with data in that day.")
|
||||||
|
|
||||||
plot["layout"]["xaxis"].update(side="bottom")
|
plot["layout"]["xaxis"].update(side="bottom")
|
||||||
|
plot["layout"].update(xaxis_title="Day indexes")
|
||||||
|
plot["layout"].update(margin=dict(t=160))
|
||||||
pio.write_html(plot, file=output_path, auto_open=False, include_plotlyjs="cdn")
|
pio.write_html(plot, file=output_path, auto_open=False, include_plotlyjs="cdn")
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue