Merge branch 'ds2' into day_segments

pull/103/head
JulioV 2020-10-28 17:53:00 -04:00
commit 7f7eac0769
24 changed files with 427 additions and 58 deletions

View File

@@ -1,6 +1,7 @@
 services:
   - mysql
   - docker
+sudo: required
 language: python
 jobs:
   include:
@@ -39,7 +40,6 @@ jobs:
         - "$TRAVIS_BUILD_DIR/renv/library"
       script:
         - bash tests/scripts/run_tests.sh all test
-        # - bash tests/scripts/run_tests.sh periodic test && tests/scripts/run_tests.sh frequency test
     - name: Python 3.7 on macOS
       os: osx
       osx_image: xcode11.3
@@ -71,7 +71,6 @@ jobs:
         - "$TRAVIS_BUILD_DIR/renv/library"
       script:
         - bash tests/scripts/run_tests.sh all test
-        # - bash tests/scripts/run_tests.sh periodic test # && tests/scripts/run_tests.sh frequency test
     - stage: deploy
       name: Python 3.7 on Xenial Linux Docker
       os: linux
@@ -81,13 +80,16 @@ jobs:
         - docker login -u "agamk" -p $DOCKERPWD
         - docker tag rapids agamk/rapids:travislatest
         - docker push agamk/rapids:travislatest
-#branches:
-#  only:
-#  - master
-#  - day_segment
+branches:
+  only:
+    - master
+    - day_segment
 stages:
   - name: deploy
-    if: branch = master AND type = push
+    if: branch = master AND \
+        type = push
 notifications:
   email: false
   slack:

Dockerfile 100644 (new file, 53 lines)
View File

@@ -0,0 +1,53 @@
# getting base image ubuntu
FROM ubuntu:20.04
ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt install -y \
libcurl4-openssl-dev \
libssl-dev \
libxml2-dev \
libmysqlclient-dev \
mysql-server
RUN apt-get update && apt-get install -y gnupg
RUN apt-get update && apt-get install -y software-properties-common
RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/'
RUN apt update && apt install -y r-base
RUN apt install -y pandoc
RUN apt install -y git
RUN apt-get update && apt-get install -y vim
RUN apt update && apt install -y unzip
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
ENV PATH /opt/conda/bin:$PATH
RUN apt-get update --fix-missing && \
apt-get install -y wget bzip2 ca-certificates curl git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \
/bin/bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda clean -tipsy && \
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
echo "conda activate base" >> ~/.bashrc
ENV TINI_VERSION v0.16.1
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /usr/bin/tini
RUN chmod +x /usr/bin/tini
RUN git clone https://github.com/carissalow/rapids
ENTRYPOINT [ "/usr/bin/tini", "--" ]
CMD [ "/bin/bash" ]
RUN conda update -n base -c defaults conda
WORKDIR /rapids
RUN conda env create -f environment.yml -n rapids
RUN Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'
RUN R -e 'renv::restore()'
ADD https://osf.io/587wc/download data/external
RUN mv data/external/download data/external/rapids_example.sql.zip
RUN unzip data/external/rapids_example.sql.zip
RUN cp rapids_example.sql data/external/rapids_example.sql
RUN rm data/external/rapids_example.sql.zip
RUN rm rapids_example.sql
RUN echo "source activate rapids" > ~/.bashrc
ENV PATH /opt/conda/envs/rapids/bin:$PATH

View File

@@ -243,8 +243,9 @@ PHONE_CONVERSATION:
   RAPIDS:
     COMPUTE: False
     FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration",
-      "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy",
-      "avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction",
+      "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy",
+      "noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy",
+      "voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction",
       "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction",
       "unknownexpectedfraction","countconversation"]
     RECORDING_MINUTES: 1

View File

@@ -57,15 +57,27 @@ Nicolas is a rising senior studying computer science at the University of Pittsburgh
 Nikunj Goel, BS
 """"""""""""""""
-**intern**
+**Intern**
 Nik is a graduate student at the University of Pittsburgh pursuing a Master of Science in Information Science. He earned his Bachelor of Technology degree in Information Technology in India. He is a data enthusiast, passionate about finding meaning in raw data. In the long term, his goal is to create a breakthrough in Data Science and Deep Learning.
 `Nikunj Goel Linkedin Profile`_
+Agam Kumar, BS
+""""""""""""""""
+**Research Assistant at CMU**
+Agam is a junior at Carnegie Mellon University studying Statistics and Machine Learning and pursuing an additional major in Computer Science. He is a member of the Data Science team in the Health and Human Performance Lab at CMU and has keen interests in software development and data science. His research interests include ML applications in medicine.
+`Agam Kumar Linkedin Profile`_
+`Agam Kumar Github Profile`_
 .. _`Julio Vega Personal Website`: https://juliovega.info/
 .. _`Meng Li Linkedin Profile`: https://www.linkedin.com/in/meng-li-57238414a
 .. _`Meng Li Github Profile`: https://github.com/Meng6
 .. _`Kwesi Aguillera Linkedin Profile`: https://www.linkedin.com/in/kwesi-aguillera-29529823
 .. _`Echhit Joshi Linkedin Profile`: https://www.linkedin.com/in/echhitjoshi/
 .. _`Nikunj Goel Linkedin Profile`: https://www.linkedin.com/in/nikunjgoel95/
+.. _`Agam Kumar Linkedin Profile`: https://www.linkedin.com/in/agam-kumar
+.. _`Agam Kumar Github Profile`: https://github.com/agam-kumar

View File

@@ -1,17 +1,21 @@
 How to Edit Documentation
 ============================
-The following is a basic guide for editing the documentation for this project. The documentation is rendered using Sphinx_ documentation builder. This guide is intended to be a basic guide that will allow a contributer to start editing the documentation for the RAPIDS Pipeline. The first step is to install Sphinx.
-Mac OS Quick start up
-----------------------------------
-- ``brew install sphinx-doc``
-Linux (Ubuntu)
-- ``apt-get install python3-sphinx``
+The following is a basic guide for editing the documentation for this project. The documentation is rendered using the Sphinx_ documentation builder.
+#. Install Sphinx in Mac OS ``brew install sphinx-doc`` or Linux (Ubuntu) ``apt-get install python3-sphinx``
+#. Go to the docs folder ``cd docs``
+#. Change any ``.rst`` file you need to modify
+#. To visualise the results locally do ``make dirhtml`` and check the html files in the ``_build/dirhtml`` directory
+#. When you are done, push your changes to the git repo.
+Sphinx is a tool that translates a set of reStructuredText_ source files into various output formats such as HTML and PDF, automatically producing cross-references, indices, etc. The following is a basic outline of the structure of the Sphinx workspace and the syntax of reStructuredText.
 Sphinx Workspace Structure
 ----------------------------
@@ -34,12 +38,6 @@ Thus the directory structure for the above example is shown below::
     ├── introduction.rst
     └── installation.rst
-Once the ``index.rst`` has been editted and content has been added and/or editted the documentation is built using the following command::
-    $ make dirhtml
-This command creates the ``_build`` directory which contains the generated HTML files of the documentation. It shoould be noted that once you have pushed your change to the repository the changes will be published even if you have not run ``make dirhtml``
 Basic reStructuredText Syntax
 -------------------------------

View File

@@ -0,0 +1,16 @@
Remote Support
======================================
We use the Live Share extension of Visual Studio Code to debug issues when sharing data or database credentials is not possible.
#. Install `Visual Studio Code <https://code.visualstudio.com/>`_
#. Open your rapids folder in a new VSCode window
#. Open a new Terminal ``Terminal > New terminal``
#. Install the `Live Share extension pack <https://marketplace.visualstudio.com/items?itemName=MS-vsliveshare.vsliveshare-pack>`_
#. Press ``Ctrl+P``/``Cmd+P`` and run this command ``>live share: start collaboration session``
#. Follow the instructions and share the session link you receive

View File

@@ -100,7 +100,7 @@ Activity Recognition
 Conversation
 """""""""""""
-- The raw conversation data file contains data for 1 day.
+- The raw conversation data file contains data for 2 days.
 - The raw conversation data contains records with a sample of both ``datatypes`` (i.e. ``voice/noise`` = ``0``, and ``conversation`` = ``2``) as well as rows with samples of each of the ``inference`` values (i.e. ``silence`` = ``0``, ``noise`` = ``1``, ``voice`` = ``2``, and ``unknown`` = ``3``) for each ``epoch``. The different ``datatype`` and ``inference`` records are randomly distributed throughout the ``epoch``.
 - Additionally, there are 2 - 5 records for conversations (``datatype`` = 2, and ``inference`` = -1) in each ``epoch``, and for each ``epoch`` except night there is a conversation record with a ``double_convo_start`` ``timestamp`` from the previous ``epoch``. This is to test the calculation of features across ``epochs``.
 - There is a raw conversation data file for both android and iOS platforms (``plugin_studentlife_audio_android_raw.csv`` and ``plugin_studentlife_audio_raw.csv`` respectively).
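As an illustration of this schema (not the real fixture values), a couple of such rows could be constructed as below; ``double_energy``, ``double_convo_end`` and the exact timestamps are assumptions, while ``datatype``, ``inference`` and ``double_convo_start`` come from the description above::

    import pandas as pd

    rows = [
        # datatype 0 = voice/noise sample with an inference (0 silence, 1 noise, 2 voice, 3 unknown)
        {"timestamp": 1587661220000, "datatype": 0, "inference": 1, "double_energy": 0.8, "double_convo_start": 0, "double_convo_end": 0},
        # datatype 2 = conversation row (inference = -1), which may start in the previous epoch
        {"timestamp": 1587661400000, "datatype": 2, "inference": -1, "double_energy": 0.0, "double_convo_start": 1587660100000, "double_convo_end": 1587661460000},
    ]
    conversation = pd.DataFrame(rows)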

View File

@@ -49,6 +49,8 @@ Global Parameters
 - ``FIXED_TIMEZONE``. See ``TIMEZONE`` above. This assumes that all data of all participants was collected within one time zone.
   - Support for multiple time zones for each participant is coming soon, based on the ``timezone`` table collected by Aware.
+.. _phone-valid-sensed-bins:
 - ``PHONE_VALID_SENSED_BINS``
   Contains three attributes: ``COMPUTE``, ``BIN_SIZE`` and ``TABLES``. See the PHONE_VALID_SENSED_BINS_ section in the ``config.yaml`` file.
@@ -492,13 +494,12 @@ features Features to be computed, see table below
 ======================  =============  =============
 Name                    Units          Description
 ======================  =============  =============
-count                   rows           Number of detect activity events (rows).
+count                   rows           Number of episodes.
 mostcommonactivity      activity_type  The most common ``activity_type``. If this feature is not unique, the first ``activity_type`` of the set of most common ``activity_types`` is selected, ordered by ``activity_type``.
-countuniqueactivities   activities     Number of unique activities.
-activitychangecount     transitions    Number of transitions between two different activities; still to running for example.
-sumstationary           minutes        The total duration of episodes of still and tilting (phone) activities.
-summobile               minutes        The total duration of episodes of on foot, running, and on bicycle activities
-sumvehicle              minutes        The total duration of episodes of on vehicle activity
+countuniqueactivities   activity_type  Number of unique ``activity_type``.
+durationstationary      minutes        The total duration of episodes of still and tilting (phone) activities.
+durationmobile          minutes        The total duration of episodes of on foot, running, and on bicycle activities.
+durationvehicle         minutes        The total duration of episodes of on vehicle activity.
 ======================  =============  =============
 **Assumptions/Observations:**
@@ -844,11 +845,16 @@ avgconversationduration minutes Average duration of all conversations
 sdconversationduration   minutes  Standard Deviation of the duration of all conversations
 timefirstconversation    minutes  Minutes since midnight when the first conversation for a day segment was detected
 timelastconversation     minutes  Minutes since midnight when the last conversation for a day segment was detected
-sumenergy                L2-norm  Sum of all energy values
-avgenergy                L2-norm  Average of all energy values
-sdenergy                 L2-norm  Standard Deviation of all energy values
-minenergy                L2-norm  Minimum of all energy values
-maxenergy                L2-norm  Maximum of all energy values
+noisesumenergy           L2-norm  Sum of all energy values when inference is noise
+noiseavgenergy           L2-norm  Average of all energy values when inference is noise
+noisesdenergy            L2-norm  Standard Deviation of all energy values when inference is noise
+noiseminenergy           L2-norm  Minimum of all energy values when inference is noise
+noisemaxenergy           L2-norm  Maximum of all energy values when inference is noise
+voicesumenergy           L2-norm  Sum of all energy values when inference is voice
+voiceavgenergy           L2-norm  Average of all energy values when inference is voice
+voicesdenergy            L2-norm  Standard Deviation of all energy values when inference is voice
+voiceminenergy           L2-norm  Minimum of all energy values when inference is voice
+voicemaxenergy           L2-norm  Maximum of all energy values when inference is voice
 silencesensedfraction             Ratio between minutessilence and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
 noisesensedfraction               Ratio between minutesnoise and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
 voicesensedfraction               Ratio between minutesvoice and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown)
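A rough sketch of how the sensed-fraction features in this table relate to the minutes features (``feats`` is a hypothetical frame holding the four minutes columns for one day segment; the voice and unknown fractions follow the same pattern)::

    sensed_minutes = feats[["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown"]].sum(axis=1)
    feats["silencesensedfraction"] = feats["minutessilence"] / sensed_minutes
    feats["noisesensedfraction"] = feats["minutesnoise"] / sensed_minutes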

View File

@@ -6,7 +6,9 @@
 RAPIDS
 ======
-**R**\ eproducible **A**\ nalysis **Pi**\ pline for **D**\ ata **S**\ treams
+**R**\ eproducible **A**\ nalysis **Pi**\ peline for **D**\ ata **S**\ treams
+Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_
 Contents:
@@ -27,13 +29,22 @@ Contents:
    features/extracted
+.. toctree::
+   :maxdepth: 2
+   :caption: Visualization
+   visualization/data_exploration
 .. toctree::
    :maxdepth: 2
    :caption: Developers
+   develop/remotesupport
    develop/documentation
    develop/features
    develop/environments
    develop/contributors
    develop/testing
    develop/test_cases
+.. _slack: http://awareframework.com:3000/

View File

@@ -26,6 +26,8 @@ This is a quick guide for creating and running a simple pipeline to analyse an
 #. Make sure your conda environment is active (the environment is already active in our docker container). See step 6 of :ref:`install-page`.
+#. If you installed RAPIDS from GitHub (did not use docker) you need to download the `example db backup <https://osf.io/skqfv/files/>`_ and save it to ``data/external/rapids_example.sql``.
 #. Run the following command to restore the database from the ``rapids_example.sql`` file::
     snakemake -j1 restore_sql_file

View File

@@ -146,6 +146,33 @@ This is a bug in Ubuntu 20.04 when trying to connect to an old MySQL server with
 If you can't update your server, the quickest solution would be to import your database to another server or to a local environment. Alternatively, you could replace ``mysql-client`` and ``libmysqlclient-dev`` with ``mariadb-client`` and ``libmariadbclient-dev`` and reinstall renv. More info about this issue here https://bugs.launchpad.net/ubuntu/+source/mysql-8.0/+bug/1872541
+11. ``DB_TABLES`` key not found
+""""""""""""""""""""""""""""""""
+If you get the following error: ``KeyError in line 43 of preprocessing.smk: 'DB_TABLES'``, it means that the indentation of the ``DB_TABLES`` key does not match the other child elements of ``PHONE_VALID_SENSED_BINS``; add or remove leading whitespace as needed.
+::
+    PHONE_VALID_SENSED_BINS:
+        COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features
+        BIN_SIZE: &bin_size 5 # (in minutes)
+        # Add as many sensor tables as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS.
+        # If you are extracting screen or Barnett's location features, screen and locations tables are mandatory.
+        DB_TABLES: []
+12. Error while updating your conda environment in Ubuntu
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+If you get the following error, try reinstalling conda.
+::
+    CondaMultiError: CondaVerificationError: The package for tk located at /home/ubuntu/miniconda2/pkgs/tk-8.6.9-hed695b0_1003
+    appears to be corrupted. The path 'include/mysqlStubs.h'
+    specified in the package manifest cannot be found.
+    ClobberError: This transaction has incompatible packages due to a shared path.
+    packages: conda-forge/linux-64::llvm-openmp-10.0.0-hc9558a2_0, anaconda/linux-64::intel-openmp-2019.4-243
+    path: 'lib/libiomp5.so'
 .. ------------------------ Links --------------------------- ..
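A quick way to sanity-check the indentation from item 11 above is to load the config and look for the key (a minimal sketch, assuming PyYAML is available in your environment)::

    import yaml

    with open("config.yaml") as f:
        cfg = yaml.safe_load(f)
    print("DB_TABLES" in cfg["PHONE_VALID_SENSED_BINS"])  # True when the key is nested correctly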

View File

@@ -7,6 +7,8 @@ At the moment, mobile data can be collected using different sensing frameworks (
 We recommend reading Snakemake_ docs, but the main idea behind the pipeline is that every link in the analysis chain is a rule with an input and an output. Input and output are files, which can be manipulated using any programming language (although Snakemake_ has wrappers for Julia_, Python_, and R_ that can make development slightly more comfortable). Snakemake_ also allows the pipeline rules to be executed in parallel on multiple cores without any code changes. This can drastically reduce the time needed to complete an analysis.
+Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_
 Available features:
 - :ref:`accelerometer-sensor-doc`
@@ -39,3 +41,4 @@ We are updating these docs constantly, but if you think something needs clarification
 .. _Fitbit: https://www.fitbit.com/us/home
 .. _Python: https://www.python.org/
 .. _Julia: https://julialang.org/
+.. _slack: http://awareframework.com:3000/

View File

@@ -0,0 +1,216 @@
.. _data_exploration:
Data Exploration
================
These plots are in beta; if you get an error while computing them, please let us know.
.. _histogram-of-valid-sensed-hours:
Histogram of valid sensed hours
"""""""""""""""""""""""""""""""
See `Histogram of Valid Sensed Hours Config Code`_
**Rule Chain:**
- Rule: ``rules/preprocessing.smk/download_dataset``
- Rule: ``rules/preprocessing.smk/readable_datetime``
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
- Rule: ``rules/reports.smk/histogram_valid_sensed_hours``
.. _figure1-parameters:
**Parameters of histogram_valid_sensed_hours Rule:**
======================= =======================
Name Description
======================= =======================
plot Whether the rule is executed or not. The available options are ``True`` and ``False``.
min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS<phone-valid-sensed-bins>` for more information.
min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS<phone-valid-sensed-days>` for more information.
======================= =======================
**Observations:**
This histogram shows the valid sensed hours of all participants processed in RAPIDS (See valid sensed :ref:`bins<phone-valid-sensed-bins>` and :ref:`days<phone-valid-sensed-days>` sections). It can be used as a rough indication of the AWARE client monitoring coverage during a study for all participants. See Figure 1.
.. figure:: figures/Figure1.png
:scale: 90 %
:align: center
Figure 1 Histogram of valid sensed hours for all participants
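A minimal pandas sketch of the valid-hours logic these plots rely on (assuming ``bins`` is the wide ``data/interim/pXX/phone_sensed_bins.csv`` matrix with a ``local_date`` column plus one ``hour_bin`` column per bin, and example thresholds; the canonical implementation is the ``phone_valid_sensed_days`` R rule shown further down in this diff)::

    import pandas as pd

    min_valid_bins_per_hour, min_valid_hours_per_day = 6, 16  # example thresholds
    bins = pd.read_csv("data/interim/p01/phone_sensed_bins.csv")  # "p01" stands in for pXX
    long = bins.melt(id_vars="local_date", var_name="hour_bin", value_name="sensor_count")
    long["hour"] = long["hour_bin"].str.split("_").str[0]
    valid_bins = (long["sensor_count"] > 0).groupby([long["local_date"], long["hour"]]).sum()
    valid_sensed_hours = (valid_bins >= min_valid_bins_per_hour).groupby("local_date").sum()
    is_valid_sensed_day = valid_sensed_hours >= min_valid_hours_per_day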
.. _heatmap-of-phone-sensed-bins:
Heatmap of phone sensed bins
""""""""""""""""""""""""""""
See `Heatmap of Phone Sensed Bins Config Code`_
**Rule Chain:**
- Rule: ``rules/preprocessing.smk/download_dataset``
- Rule: ``rules/preprocessing.smk/readable_datetime``
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
- Rule: ``rules/reports.smk/heatmap_sensed_bins``
.. _figure2-parameters:
**Parameters of heatmap_sensed_bins Rule:**
======================= =======================
Name Description
======================= =======================
plot Whether the rule is executed or not. The available options are ``True`` and ``False``.
bin_size                Every hour is divided into N bins of size ``BIN_SIZE`` (in minutes). It modifies the way we compute the ``data/interim/pXX/phone_sensed_bins.csv`` file.
======================= =======================
**Observations:**
In this heatmap, rows are dates, columns are sensed bins for a participant, and cell color shows the number of mobile sensors that logged at least one row of data during that bin. This plot shows the periods of time without data for a participant and can be used as a rough indication of whether time-based sensors were following their sensing schedule (e.g. if location was being sensed every 2 minutes). See Figure 2.
.. figure:: figures/Figure2.png
:scale: 90 %
:align: center
Figure 2 Heatmap of phone sensed bins for a single participant
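For reference, the bin assignment itself is a simple floor operation; a sketch of the core computation (column names follow the ``phone_sensed_bins.R`` rule included later in this diff, with ``df`` a hypothetical frame holding one row per sensor record)::

    bin_size = 5  # BIN_SIZE, in minutes
    df["bin"] = (df["local_minute"] // bin_size) * bin_size  # e.g. minute 13 falls in bin 10
    sensor_counts = df.groupby(["local_date", "local_hour", "bin"])["sensor"].nunique()  # sensors with >= 1 row per bin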
.. _heatmap-of-days-by-sensors:
Heatmap of days by sensors
""""""""""""""""""""""""""
See `Heatmap of Days by Sensors Config Code`_
**Rule Chain:**
- Rule: ``rules/preprocessing.smk/download_dataset``
- Rule: ``rules/preprocessing.smk/readable_datetime``
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
- Rule: ``rules/reports.smk/heatmap_days_by_sensors``
.. _figure3-parameters:
**Parameters of heatmap_days_by_sensors Rule:**
======================= =======================
Name Description
======================= =======================
plot Whether the rule is executed or not. The available options are ``True`` and ``False``.
min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS<phone-valid-sensed-bins>` for more information.
min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS<phone-valid-sensed-days>` for more information.
expected_num_of_days The number of days of data to show starting from the first day of each participant.
db_tables List of sensor tables to compute valid bins & hours.
======================= =======================
**Observations:**
In this heatmap, rows are sensors, columns are days, and cell color shows the normalized (0 to 1) number of valid sensed hours (see valid sensed :ref:`bins<phone-valid-sensed-bins>` and :ref:`days<phone-valid-sensed-days>` sections) collected by a sensor during a day for a participant. The user can decide how many days of data to show starting from the first day of each participant. This plot can be used to judge missing data on a per participant, per sensor basis as well as the number of valid sensed hours (usable data) for each day. See Figure 3.
.. figure:: figures/Figure3.png
:scale: 90 %
:align: center
Figure 3 Heatmap of days by sensors for a participant
.. _overall-compliance-heatmap:
Overall compliance heatmap
""""""""""""""""""""""""""
See `Overall Compliance Heatmap Config Code`_
**Rule Chain:**
- Rule: ``rules/preprocessing.smk/download_dataset``
- Rule: ``rules/preprocessing.smk/readable_datetime``
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
- Rule: ``rules/reports.smk/overall_compliance_heatmap``
.. _figure4-parameters:
**Parameters of overall_compliance_heatmap Rule:**
======================= =======================
Name Description
======================= =======================
plot Whether the rule is executed or not. The available options are ``True`` and ``False``.
only_show_valid_days Whether the plot only shows valid days or not. The available options are ``True`` and ``False``.
expected_num_of_days The number of days to show before today.
bin_size                Every hour is divided into N bins of size ``BIN_SIZE`` (in minutes). It modifies the way we compute the ``data/interim/pXX/phone_sensed_bins.csv`` file.
min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS<phone-valid-sensed-bins>` for more information.
min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS<phone-valid-sensed-days>` for more information.
======================= =======================
**Observations:**
In this heatmap, rows are participants, columns are days, and cell color shows the valid sensed hours for a participant during a day (see valid sensed :ref:`bins<phone-valid-sensed-bins>` and :ref:`days<phone-valid-sensed-days>` sections). This plot can be configured to show a certain number of days before today using the ``EXPECTED_NUM_OF_DAYS`` parameter (-1 by default, showing all days for every participant). As different participants might join the study on different dates, the x-axis shows a day index instead of a date. This plot gives the user a quick overview of the amount of data collected per person and is complementary to the histogram of valid sensed hours, as it is broken down per participant and per day. See Figure 4.
.. figure:: figures/Figure4.png
:scale: 90 %
:align: center
Figure 4 Overall compliance heatmap for all participants
.. _heatmap-of-correlation-matrix-between-features:
Heatmap of correlation matrix between features
""""""""""""""""""""""""""""""""""""""""""""""
See `Heatmap of Correlation Matrix Config Code`_
**Rule Chain:**
- Rules to extract features
- Rule: ``rules/preprocessing.smk/download_dataset``
- Rule: ``rules/preprocessing.smk/readable_datetime``
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
- Rule: ``rules/reports.smk/heatmap_features_correlations``
.. _figure5-parameters:
**Parameters of heatmap_features_correlations Rule:**
======================= ==============
Name Description
======================= ==============
plot Whether the rule is executed or not. The available options are ``True`` and ``False``.
min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS<phone-valid-sensed-bins>` for more information.
min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS<phone-valid-sensed-days>` for more information.
corr_method Method of correlation. The available options are ``pearson``, ``kendall`` and ``spearman``.
min_rows_ratio          Minimum number of observations required per pair of columns to have a valid correlation coefficient. Currently, only available for ``pearson`` and ``spearman`` correlation.
phone_features The list of phone features.
fitbit_features The list of Fitbit features.
corr_threshold          Only correlation coefficients larger than ``corr_threshold`` are shown in the heatmap.
======================= ==============
**Observations:**
Columns and rows are features computed in RAPIDS; cell color represents the correlation coefficient between all days of data for every pair of features of all participants. The user can specify a minimum number of observations required to compute the correlation between two features using the ``MIN_ROWS_RATIO`` parameter (0.5 by default). In addition, this plot can be configured to only display correlation coefficients above a threshold using the ``CORR_THRESHOLD`` parameter (0.1 by default). See Figure 5.
.. figure:: figures/Figure5.png
:scale: 90 %
:align: center
Figure 5 Correlation matrix heatmap for all the data of all participants
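A compact sketch of how such a thresholded correlation matrix can be computed with pandas (``features`` is a hypothetical per-day feature frame; ``min_periods`` plays the role of ``MIN_ROWS_RATIO`` and, as noted above, only applies to ``pearson`` and ``spearman``)::

    min_rows_ratio, corr_threshold = 0.5, 0.1
    min_periods = int(min_rows_ratio * len(features))
    corr = features.corr(method="spearman", min_periods=min_periods)
    corr = corr.where(corr.abs() > corr_threshold)  # blank out weak coefficients before plotting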
.. _`Histogram of Valid Sensed Hours Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L221
.. _`Heatmap of Phone Sensed Bins Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L233
.. _`Heatmap of Days by Sensors Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L226
.. _`Overall Compliance Heatmap Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L237
.. _`Heatmap of Correlation Matrix Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L211

Binary files not shown: five new figures for the Data Exploration docs (Figure1 to Figure5; 22, 274, 121, 108, and 113 KiB).

View File

@@ -2,6 +2,7 @@ source("renv/activate.R")
 library("dplyr", warn.conflicts = F)
 library(tidyr)
+library(lubridate)
 all_sensors <- snakemake@input[["all_sensors"]]
 bin_size <- snakemake@params[["bin_size"]]
@@ -16,16 +17,24 @@ for(sensor in all_sensors){
   all_sensor_data <- rbind(all_sensor_data, sensor_data)
 }
+if(nrow(all_sensor_data) == 0){
+  bins = seq(0, 59, by = bin_size)
+  hours = seq(0, 23, 1)
+  write.csv(crossing(hours, bins) %>% unite("hour_bin", hours, bins, sep = "_") %>% mutate(value = NA, local_date = NA) %>% pivot_wider(names_from = hour_bin, values_from = value) %>% head(0), output_file, row.names = FALSE)
+} else{
 phone_sensed_bins <- all_sensor_data %>%
   mutate(bin = (local_minute %/% bin_size) * bin_size) %>% # bin rows into bin_size-minute bins
   group_by(local_date, local_hour, bin) %>%
   summarise(sensor_count = n_distinct(sensor)) %>%
   ungroup() %>%
+  mutate(local_date = lubridate::ymd(local_date)) %>%
+  complete(local_date = seq.Date(min(local_date), max(local_date), by="day"),
+           fill = list(local_hour = 0, bin = 0, sensor_count = 0)) %>%
   complete(nesting(local_date),
            local_hour = seq(0, 23, 1),
-           bin = seq(0, (59 %/% bin_size) * bin_size, bin_size),
+           bin = seq(0, 59, bin_size),
            fill = list(sensor_count=0)) %>%
   pivot_wider(names_from = c(local_hour, bin), values_from = sensor_count)
 write.csv(phone_sensed_bins, output_file, row.names = FALSE)
+}

View File

@@ -9,9 +9,8 @@ output_file <- snakemake@output[[1]]
 phone_valid_sensed_days <- phone_sensed_bins %>%
   pivot_longer(cols = -local_date, names_to = c("hour", "bin"), names_sep = "_") %>%
-  filter(value > 0) %>%
   group_by(local_date, hour) %>%
-  summarise(valid_bins = n()) %>%
+  summarise(valid_bins = sum(value > 0)) %>%
   group_by(local_date) %>%
   summarise(valid_sensed_hours = sum(valid_bins >= min_valid_bins_per_hour)) %>%
   mutate(is_valid_sensed_day = ifelse(valid_sensed_hours >= min_valid_hours_per_day, TRUE, FALSE))

View File

@@ -75,7 +75,12 @@ barnett_features <- function(sensor_data_files, day_segment, params){
   # Select only the columns that the algorithm needs
   location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)
+  if(nrow(location %>% filter(accuracy < accuracy_limit)) > 1){
     outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)
+  } else {
+    print(paste("Cannot compute location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT", accuracy_limit))
+    outputMobility <- NULL
+  }
   if(is.null(outputMobility)){
     location_features <- create_empty_file(requested_features)

View File

@@ -46,6 +46,11 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_segment):
     location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)]
+    if location_data.empty:
+        location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute])
+        location_features = location_features.reset_index(drop=True)
+        return location_features
     if "locationvariance" in features_to_compute:
         location_features["locations_doryab_locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()
@@ -353,6 +358,8 @@ def radius_of_gyration(locationData,sampling_frequency):
         rog = rog + (time_in_cluster * distance)
     time_all_clusters = valid_clusters.shape[0] * sampling_frequency
+    if time_all_clusters == 0:
+        return 0
     final_rog = (1/time_all_clusters) * rog
     return np.sqrt(final_rog)

View File

@@ -40,16 +40,18 @@ def getOverallComplianceHeatmap(sensors_with_data, valid_sensed_hours, last_certain_dates,
         x=[date.replace("-", "/") for date in last_certain_dates] if expected_num_of_days != -1 else last_certain_dates,
         y=[pid + "." + label for pid, label in zip(sensors_with_data["pid"].to_list(), sensors_with_data["label"].to_list())],
         text=sensors_with_data[last_certain_dates].values,
-        hovertemplate="Date: %{x}<br>Participant: %{y}<br>Valid sensed hours: %{z}<br>Number of sensors with data: %{text}<extra></extra>" if expected_num_of_days != -1 else "Date_idx: %{x}<br>Participant: %{y}<br>Valid sensed hours: %{z}<br>Number of sensors with data: %{text}<extra></extra>",
+        hovertemplate="Date: %{x}<br>Participant: %{y}<br>Valid sensed hours: %{z}<br>Number of sensors with data: %{text}<extra></extra>" if expected_num_of_days != -1 else "Day index: %{x}<br>Participant: %{y}<br>Valid sensed hours: %{z}<br>Number of sensors with data: %{text}<extra></extra>",
         colorscale="Viridis",
         colorbar={"tick0": 0,"dtick": 1},
         showscale=True))
     if expected_num_of_days != -1:
-        plot.update_layout(title="Overall compliance heatmap for last " + str(expected_num_of_days) + " days.<br>Bin's color shows valid sensed hours for that day.<br>A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes")
+        plot.update_layout(title="Overall compliance heatmap for last " + str(expected_num_of_days) + " days.<br>Bin's color shows valid sensed hours for that day.<br>A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes.<br>You can hover over every day to see the number of sensors with data in that day.")
     else:
-        plot.update_layout(title="Overall compliance heatmap for all days.<br>Bin's color shows valid sensed hours for that day.<br>A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes")
+        plot.update_layout(title="Overall compliance heatmap for all days.<br>Bin's color shows valid sensed hours for that day.<br>A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes.<br>You can hover over every day to see the number of sensors with data in that day.")
     plot["layout"]["xaxis"].update(side="bottom")
+    plot["layout"].update(xaxis_title="Day indexes")
+    plot["layout"].update(margin=dict(t=160))
     pio.write_html(plot, file=output_path, auto_open=False, include_plotlyjs="cdn")