diff --git a/.travis.yml b/.travis.yml index 4e0ba7ff..d50cc0b3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,7 @@ services: - mysql - docker +sudo: required language: python jobs: include: @@ -39,7 +40,6 @@ jobs: - "$TRAVIS_BUILD_DIR/renv/library" script: - bash tests/scripts/run_tests.sh all test - # - bash tests/scripts/run_tests.sh periodic test && tests/scripts/run_tests.sh frequency test - name: Python 3.7 on macOS os: osx osx_image: xcode11.3 @@ -71,7 +71,6 @@ jobs: - "$TRAVIS_BUILD_DIR/renv/library" script: - bash tests/scripts/run_tests.sh all test - # - bash tests/scripts/run_tests.sh periodic test # && tests/scripts/run_tests.sh frequency test - stage: deploy name: Python 3.7 on Xenial Linux Docker os: linux @@ -81,13 +80,16 @@ jobs: - docker login -u "agamk" -p $DOCKERPWD - docker tag rapids agamk/rapids:travislatest - docker push agamk/rapids:travislatest -#branches: -# only: -# - master -# - day_segment +branches: + only: + - master + - day_segment + stages: - name: deploy - if: branch = master AND type = push + if: branch = master AND \ + type = push + notifications: email: false slack: diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..3bb4b72e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,53 @@ +# getting base image ubuntu +FROM ubuntu:20.04 +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && apt install -y \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev \ + libmysqlclient-dev \ + mysql-server +RUN apt-get update && apt-get install -y gnupg +RUN apt-get update && apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' +RUN apt update && apt install -y r-base +RUN apt install -y pandoc +RUN apt install -y git +RUN apt-get update && apt-get install -y vim +RUN apt update && apt install -y unzip +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 +ENV 
PATH /opt/conda/bin:$PATH + +RUN apt-get update --fix-missing && \ + apt-get install -y wget bzip2 ca-certificates curl git && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ + /bin/bash ~/miniconda.sh -b -p /opt/conda && \ + rm ~/miniconda.sh && \ + /opt/conda/bin/conda clean -tipsy && \ + ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc + +ENV TINI_VERSION v0.16.1 +ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /usr/bin/tini +RUN chmod +x /usr/bin/tini +RUN git clone https://github.com/carissalow/rapids +ENTRYPOINT [ "/usr/bin/tini", "--" ] +CMD [ "/bin/bash" ] +RUN conda update -n base -c defaults conda +WORKDIR /rapids +RUN conda env create -f environment.yml -n rapids +RUN Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")' +RUN R -e 'renv::restore()' +ADD https://osf.io/587wc/download data/external +RUN mv data/external/download data/external/rapids_example.sql.zip +RUN unzip data/external/rapids_example.sql.zip +RUN cp rapids_example.sql data/external/rapids_example.sql +RUN rm data/external/rapids_example.sql.zip +RUN rm rapids_example.sql +RUN echo "source activate rapids" > ~/.bashrc +ENV PATH /opt/conda/envs/rapids/bin:$PATH \ No newline at end of file diff --git a/config.yaml b/config.yaml index d82fcffa..c7d9cf49 100644 --- a/config.yaml +++ b/config.yaml @@ -243,10 +243,11 @@ PHONE_CONVERSATION: RAPIDS: COMPUTE: False FEATURES: ["minutessilence", "minutesnoise", "minutesvoice", "minutesunknown","sumconversationduration","avgconversationduration", - "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","sumenergy", - 
"avgenergy","sdenergy","minenergy","maxenergy","silencesensedfraction","noisesensedfraction", - "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction", - "unknownexpectedfraction","countconversation"] + "sdconversationduration","minconversationduration","maxconversationduration","timefirstconversation","timelastconversation","noisesumenergy", + "noiseavgenergy","noisesdenergy","noiseminenergy","noisemaxenergy","voicesumenergy", + "voiceavgenergy","voicesdenergy","voiceminenergy","voicemaxenergy","silencesensedfraction","noisesensedfraction", + "voicesensedfraction","unknownsensedfraction","silenceexpectedfraction","noiseexpectedfraction","voiceexpectedfraction", + "unknownexpectedfraction","countconversation"] RECORDING_MINUTES: 1 PAUSED_MINUTES : 3 SRC_FOLDER: "rapids" # inside src/features/phone_conversation diff --git a/docs/develop/contributors.rst b/docs/develop/contributors.rst index c4eb8440..d780279f 100644 --- a/docs/develop/contributors.rst +++ b/docs/develop/contributors.rst @@ -57,15 +57,27 @@ Nicolas is a rising senior studying computer science at the University of Pittsb Nikunj Goel, BS """""""""""""""" -**intern** +**Intern** Nik is a graduate student at the University of Pittsburgh pursuing Master of Science in Information Science. He earned his Bachelor of Technology degree in Information Technology from India. He is a Data Enthusiasts and passionate about finding the meaning out of raw data. In a long term, his goal is to create a breakthrough in Data Science and Deep Learning. `Nikunj Goel Linkedin Profile`_ +Agam Kumar, BS +"""""""""""""""" +**Research Assistant at CMU** + +Agam is a junior at Carnegie Mellon University studying Statistics and Machine Learning and pursuing an additional major in Computer Science. He is a member of the Data Science team in the Health and Human Performance Lab at CMU and has keen interests in software development and data science. 
His research interests include ML applications in medicine. + +`Agam Kumar Linkedin Profile`_ + +`Agam Kumar Github Profile`_ + .. _`Julio Vega Personal Website`: https://juliovega.info/ .. _`Meng Li Linkedin Profile`: https://www.linkedin.com/in/meng-li-57238414a .. _`Meng Li Github Profile`: https://github.com/Meng6 .. _`Kwesi Aguillera Linkedin Profile`: https://www.linkedin.com/in/kwesi-aguillera-29529823 .. _`Echhit Joshi Linkedin Profile`: https://www.linkedin.com/in/echhitjoshi/ .. _`Nikunj Goel Linkedin Profile`: https://www.linkedin.com/in/nikunjgoel95/ +.. _`Agam Kumar Linkedin Profile`: https://www.linkedin.com/in/agam-kumar +.. _`Agam Kumar Github Profile`: https://github.com/agam-kumar diff --git a/docs/develop/documentation.rst b/docs/develop/documentation.rst index 490601c9..cd9dc06c 100644 --- a/docs/develop/documentation.rst +++ b/docs/develop/documentation.rst @@ -1,17 +1,21 @@ How to Edit Documentation ============================ -The following is a basic guide for editing the documentation for this project. The documentation is rendered using Sphinx_ documentation builder. This guide is intended to be a basic guide that will allow a contributer to start editing the documentation for the RAPIDS Pipeline. The first step is to install Sphinx. +The following is a basic guide for editing the documentation for this project. The documentation is rendered using Sphinx_ documentation builder -Mac OS +Quick start up +---------------------------------- - - ``brew install sphinx-doc`` +#. Install Sphinx in Mac OS ``brew install sphinx-doc`` or Linux (Ubuntu) ``apt-get install python3-sphinx`` -Linux (Ubuntu) +#. Go to the docs folder ``cd docs`` - - ``apt-get install python3-sphinx`` +#. Change any ``.rst`` file you need to modify + +#. To visualise the results locally do ``make dirhtml`` and check the html files in the ``_build/dirhtml`` directory + +#. When you are done, push your changes to the git repo. 
-Sphinx is a tool that translates a set of reStructuredText_ source files into various output formats such as HTML and PDF, automatically producing cross-references, indices, etc. The following is a basic outline of structure of Sphinx workspace and the syntax of reStructuredText. Sphinx Workspace Structure ---------------------------- @@ -34,12 +38,6 @@ Thus the directory structure for the above example is shown below:: ├── introduction.rst └── installation.rst -Once the ``index.rst`` has been editted and content has been added and/or editted the documentation is built using the following command:: - - $ make dirhtml - -This command creates the ``_build`` directory which contains the generated HTML files of the documentation. It shoould be noted that once you have pushed your change to the repository the changes will be published even if you have not run ``make dirhtml`` - Basic reStructuredText Syntax ------------------------------- diff --git a/docs/develop/remotesupport.rst b/docs/develop/remotesupport.rst new file mode 100644 index 00000000..213b1420 --- /dev/null +++ b/docs/develop/remotesupport.rst @@ -0,0 +1,16 @@ +Remote Support +====================================== + +We use the Live Share extension of Visual Studio Code to debug bugs when sharing data or database credentials is not possible. + +#. Install `Visual Studio Code `_ + +#. Open you rapids folder in a new VSCode window + +#. Open a new Terminal ``Terminal > New terminal`` + +#. Install the `Live Share extension pack `_ + +#. Press ``Ctrl+P``/``Cmd+P`` and run this command ``>live share: start collaboration session`` + +#. 
Follow the instructions and share the session link you receive \ No newline at end of file diff --git a/docs/develop/test_cases.rst b/docs/develop/test_cases.rst index b2651754..593fdbbb 100644 --- a/docs/develop/test_cases.rst +++ b/docs/develop/test_cases.rst @@ -100,7 +100,7 @@ Activity Recognition Conversation """"""""""""" - - The raw conversation data file contains data for 1 day. + - The raw conversation data file contains data for 2 days. - The raw conversation data contains records with a sample of both ``datatypes`` (i.e. ``voice/noise`` = ``0``, and ``conversation`` = ``2`` ) as well as rows with for samples of each of the ``inference`` values (i.e. ``silence`` = ``0``, ``noise`` = ``1``, ``voice`` = ``2``, and ``unknown`` = ``3``) for each ``epoch``. The different ``datatype`` and ``inference`` records are randomly distributed throughout the ``epoch``. - Additionally there are 2 - 5 records for conversations (``datatype`` = 2, and ``inference`` = -1) in each ``epoch`` and for each ``epoch`` except night, there is a conversation record that has a ``double_convo_start`` ``timestamp`` that is from the previous ``epoch``. This is to test the calculations of features across ``epochs``. - There is a raw conversation data file for both android and iOS platforms (``plugin_studentlife_audio_android_raw.csv`` and ``plugin_studentlife_audio_raw.csv`` respectively). diff --git a/docs/features/extracted.rst b/docs/features/extracted.rst index 1e89f46f..e5f12cee 100644 --- a/docs/features/extracted.rst +++ b/docs/features/extracted.rst @@ -49,6 +49,8 @@ Global Parameters - ``FIXED_TIMEZONE``. See ``TIMEZONE`` above. This assumes that all data of all participants was collected within one time zone. - Support for multiple time zones for each participant coming soon based on the ``timezone`` table collected by Aware. +.. _phone-valid-sensed-bins: + - ``PHONE_VALID_SENSED_BINS`` Contains three attributes: ``COMPUTE``, ``BIN_SIZE`` and ``TABLES``. 
See the PHONE_VALID_SENSED_BINS_ section in the ``config.yaml`` file @@ -492,13 +494,12 @@ features Features to be computed, see table below ====================== ============== ============= Name Units Description ====================== ============== ============= -count rows Number of detect activity events (rows). +count rows Number of episodes. mostcommonactivity activity_type The most common ``activity_type``. If this feature is not unique the first ``activity_type`` of the set of most common ``activity_types`` is selected ordered by ``activity_type``. -countuniqueactivities activities Number of unique activities. -activitychangecount transitions Number of transitions between two different activities; still to running for example. -sumstationary minutes The total duration of episodes of still and tilting (phone) activities. -summobile minutes The total duration of episodes of on foot, running, and on bicycle activities -sumvehicle minutes The total duration of episodes of on vehicle activity +countuniqueactivities activity_type Number of unique ``activity_type``. +durationstationary minutes The total duration of episodes of still and tilting (phone) activities. 
+durationmobile minutes The total duration of episodes of on foot, running, and on bicycle activities +durationvehicle minutes The total duration of episodes of on vehicle activity ====================== ============== ============= **Assumptions/Observations:** @@ -844,11 +845,16 @@ avgconversationduration minutes Average duration of all conversa sdconversationduration minutes Standard Deviation of the duration of all conversations timefirstconversation minutes Minutes since midnight when the first conversation for a day segment was detected timelastconversation minutes Minutes since midnight when the last conversation for a day segment was detected -sumenergy L2-norm Sum of all energy values -avgenergy L2-norm Average of all energy values -sdenergy L2-norm Standard Deviation of all energy values -minenergy L2-norm Minimum of all energy values -maxenergy L2-norm Maximum of all energy values +noisesumenergy L2-norm Sum of all energy values when inference is noise +noiseavgenergy L2-norm Average of all energy values when inference is noise +noisesdenergy L2-norm Standard Deviation of all energy values when inference is noise +noiseminenergy L2-norm Minimum of all energy values when inference is noise +noisemaxenergy L2-norm Maximum of all energy values when inference is noise +voicesumenergy L2-norm Sum of all energy values when inference is voice +voiceavgenergy L2-norm Average of all energy values when inference is voice +voicesdenergy L2-norm Standard Deviation of all energy values when inference is voice +voiceminenergy L2-norm Minimum of all energy values when inference is voice +voicemaxenergy L2-norm Maximum of all energy values when inference is voice silencesensedfraction Ratio between minutessilence and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown) noisesensedfraction Ratio between minutesnoise and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown) voicesensedfraction Ratio between minutesvoice and the sum of 
(minutessilence, minutesnoise, minutesvoice, minutesunknown) diff --git a/docs/index.rst b/docs/index.rst index d3805bc9..9eb010f7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,7 +6,9 @@ RAPIDS ====== -**R**\ eproducible **A**\ nalysis **Pi**\ pline for **D**\ ata **S**\ treams +**R**\ eproducible **A**\ nalysis **Pi**\ peline for **D**\ ata **S**\ treams + +Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_ Contents: @@ -27,13 +29,22 @@ Contents: features/extracted +.. toctree:: + :maxdepth: 2 + :caption: Visualization + + visualization/data_exploration + .. toctree:: :maxdepth: 2 :caption: Developers + develop/remotesupport develop/documentation develop/features develop/environments develop/contributors develop/testing - develop/test_cases \ No newline at end of file + develop/test_cases + +.. _slack: http://awareframework.com:3000/ diff --git a/docs/usage/example.rst b/docs/usage/example.rst index 73cf9413..79be53ac 100644 --- a/docs/usage/example.rst +++ b/docs/usage/example.rst @@ -26,6 +26,8 @@ This is a quick guide for creating and running a simple pipeline to analysis an #. Make sure your conda environment is active (the environment is already active in our docker container). See step 6 of :ref:`install-page`. +#. If you installed RAPIDS from GitHub (did not use docker) you need to download the `example db backup `_ and save it to ``data/external/rapids_example.sql``. + #. Run the following command to restore database from ``rapids_example.sql`` file:: snakemake -j1 restore_sql_file diff --git a/docs/usage/faq.rst b/docs/usage/faq.rst index 30920b44..b1d0f9c7 100644 --- a/docs/usage/faq.rst +++ b/docs/usage/faq.rst @@ -146,6 +146,33 @@ This is a bug in Ubuntu 20.04 when trying to connect to an old MySQL server with If you can't update your server, the quickest solution would be to import your database to another server or to a local environment. 
Alternatively, you could replace ``mysql-client`` and ``libmysqlclient-dev`` with ``mariadb-client`` and ``libmariadbclient-dev`` and reinstall renv. More info about this issue here https://bugs.launchpad.net/ubuntu/+source/mysql-8.0/+bug/1872541 +11. ``DB_TABLES`` key not found +"""""""""""""""""""""""""""""""" + +If you get the following error ``KeyError in line 43 of preprocessing.smk: 'DB_TABLES'``, means that the indentation of the key ``DB_TABLES`` is not matching the other child elements of ``PHONE_VALID_SENSED_BINS`` and you need to add or remove any leading whitespaces as needed. + +:: + + PHONE_VALID_SENSED_BINS: + COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features + BIN_SIZE: &bin_size 5 # (in minutes) + # Add as many sensor tables as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS. + # If you are extracting screen or Barnett's location features, screen and locations tables are mandatory. + DB_TABLES: [] + +12. Error while updating your conda environment in Ubuntu +""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +If you get the following error try reinstalling conda. + +:: + + CondaMultiError: CondaVerificationError: The package for tk located at /home/ubuntu/miniconda2/pkgs/tk-8.6.9-hed695b0_1003 + appears to be corrupted. The path 'include/mysqlStubs.h' + specified in the package manifest cannot be found. + ClobberError: This transaction has incompatible packages due to a shared path. + packages: conda-forge/linux-64::llvm-openmp-10.0.0-hc9558a2_0, anaconda/linux-64::intel-openmp-2019.4-243 + path: 'lib/libiomp5.so' .. ------------------------ Links --------------------------- .. 
diff --git a/docs/usage/introduction.rst b/docs/usage/introduction.rst index bfe98b55..b14d8743 100644 --- a/docs/usage/introduction.rst +++ b/docs/usage/introduction.rst @@ -7,6 +7,8 @@ At the moment, mobile data can be collected using different sensing frameworks ( We recommend reading Snakemake_ docs, but the main idea behind the pipeline is that every link in the analysis chain is a rule with an input and an output. Input and output are files, which can be manipulated using any programming language (although Snakemake_ has wrappers for Julia_, Python_, and R_ that can make development slightly more comfortable). Snakemake_ also allows the pipeline rules to be executed in parallel on multiple cores without any code changes. This can drastically reduce the time needed to complete an analysis. +Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_ + Available features: - :ref:`accelerometer-sensor-doc` @@ -39,3 +41,4 @@ We are updating these docs constantly, but if you think something needs clarific .. _Fitbit: https://www.fitbit.com/us/home .. _Python: https://www.python.org/ .. _Julia: https://julialang.org/ +.. _slack: http://awareframework.com:3000/ diff --git a/docs/visualization/data_exploration.rst b/docs/visualization/data_exploration.rst new file mode 100644 index 00000000..89cbbd4b --- /dev/null +++ b/docs/visualization/data_exploration.rst @@ -0,0 +1,216 @@ +.. _data_exploration: + +Data Exploration +================ + +These plots are in beta, if you get an error while computing them please let us know. + +.. 
_histogram-of-valid-sensed-hours: + +Histogram of valid sensed hours +""""""""""""""""""""""""""""""" + +See `Histogram of Valid Sensed Hours Config Code`_ + +**Rule Chain:** + +- Rule: ``rules/preprocessing.smk/download_dataset`` +- Rule: ``rules/preprocessing.smk/readable_datetime`` +- Rule: ``rules/preprocessing.smk/phone_sensed_bins`` +- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days`` +- Rule: ``rules/reports.smk/histogram_valid_sensed_hours`` + +.. _figure1-parameters: + +**Parameters of histogram_valid_sensed_hours Rule:** + +======================= ======================= +Name Description +======================= ======================= +plot Whether the rule is executed or not. The available options are ``True`` and ``False``. +min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS` for more information. +min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS` for more information. +======================= ======================= + +**Observations:** + +This histogram shows the valid sensed hours of all participants processed in RAPIDS (See valid sensed :ref:`bins` and :ref:`days` sections). It can be used as a rough indication of the AWARE client monitoring coverage during a study for all participants. See Figure 1. + +.. figure:: figures/Figure1.png + :scale: 90 % + :align: center + + Figure 1 Histogram of valid sensed hours for all participants + + +.. 
_heatmap-of-phone-sensed-bins: + +Heatmap of phone sensed bins +"""""""""""""""""""""""""""" + +See `Heatmap of Phone Sensed Bins Config Code`_ + +**Rule Chain:** + +- Rule: ``rules/preprocessing.smk/download_dataset`` +- Rule: ``rules/preprocessing.smk/readable_datetime`` +- Rule: ``rules/preprocessing.smk/phone_sensed_bins`` +- Rule: ``rules/reports.smk/heatmap_sensed_bins`` + +.. _figure2-parameters: + +**Parameters of heatmap_sensed_bins Rule:** + +======================= ======================= +Name Description +======================= ======================= +plot Whether the rule is executed or not. The available options are ``True`` and ``False``. +bin_size Every hour is divided into N bins of size ``BIN_SIZE`` (in minutes). It modifies the way we compute ``data/interim/pXX/phone_sensed_bins.csv`` file. +======================= ======================= + +**Observations:** + +In this heatmap rows are dates, columns are sensed bins for a participant, and cells’ color shows the number of mobile sensors that logged at least one row of data during that bin. This plot shows the periods of time without data for a participant and can be used as a rough indication of whether time-based sensors were following their sensing schedule (e.g. if location was being sensed every 2 minutes). See Figure 2. + +.. figure:: figures/Figure2.png + :scale: 90 % + :align: center + + Figure 2 Heatmap of phone sensed bins for a single participant + + +.. _heatmap-of-days-by-sensors + +Heatmap of days by sensors +"""""""""""""""""""""""""" + +See `Heatmap of Days by Sensors Config Code`_ + +**Rule Chain:** + +- Rule: ``rules/preprocessing.smk/download_dataset`` +- Rule: ``rules/preprocessing.smk/readable_datetime`` +- Rule: ``rules/preprocessing.smk/phone_sensed_bins`` +- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days`` +- Rule: ``rules/reports.smk/heatmap_days_by_sensors`` + +.. 
_figure3-parameters: + +**Parameters of heatmap_days_by_sensors Rule:** + +======================= ======================= +Name Description +======================= ======================= +plot Whether the rule is executed or not. The available options are ``True`` and ``False``. +min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS` for more information. +min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS` for more information. +expected_num_of_days The number of days of data to show starting from the first day of each participant. +db_tables List of sensor tables to compute valid bins & hours. +======================= ======================= + +**Observations:** + +In this heatmap rows are sensors, columns are days and cells’ color shows the normalized (0 to 1) number of valid sensed hours (See valid sensed :ref:`bins` and :ref:`days` sections) collected by a sensor during a day for a participant. The user can decide how many days of data to show starting from the first day of each participant. This plot can used to judge missing data on a per participant, per sensor basis as well as the number of valid sensed hours (usable data) for each day. See Figure 3. + +.. figure:: figures/Figure3.png + :scale: 90 % + :align: center + + Figure 3 Heatmap of days by sensors for a participant + + +.. 
_overall-compliance-heatmap + +Overall compliance heatmap +"""""""""""""""""""""""""" + +See `Overall Compliance Heatmap Config Code`_ + +**Rule Chain:** + +- Rule: ``rules/preprocessing.smk/download_dataset`` +- Rule: ``rules/preprocessing.smk/readable_datetime`` +- Rule: ``rules/preprocessing.smk/phone_sensed_bins`` +- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days`` +- Rule: ``rules/reports.smk/overall_compliance_heatmap`` + +.. _figure4-parameters: + +**Parameters of overall_compliance_heatmap Rule:** + +======================= ======================= +Name Description +======================= ======================= +plot Whether the rule is executed or not. The available options are ``True`` and ``False``. +only_show_valid_days Whether the plot only shows valid days or not. The available options are ``True`` and ``False``. +expected_num_of_days The number of days to show before today. +bin_size Every hour is divided into N bins of size ``BIN_SIZE`` (in minutes). It modifies the way we compute ``data/interim/pXX/phone_sensed_bins.csv`` file. +min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS` for more information. +min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS` for more information. +======================= ======================= + +**Observations:** + +In this heatmap rows are participants, columns are days and cells’ color shows the valid sensed hours for a participant during a day (See valid sensed :ref:`bins` and :ref:`days` sections). This plot can be configured to show a certain number of days before today using the ``EXPECTED_NUM_OF_DAYS`` parameter (by default -1 showing all days for every participant). 
As different participants might join the study on different dates, the x-axis has a day index instead of a date. This plot gives the user a quick overview of the amount of data collected per person and is complementary to the histogram of valid sensed hours as it is broken down per participant and per day. See Figure 4. + +.. figure:: figures/Figure4.png + :scale: 90 % + :align: center + + Figure 4 Overall compliance heatmap for all participants + + +.. _heatmap-of-correlation-matrix-between-features: + +Heatmap of correlation matrix between features +"""""""""""""""""""""""""""""""""""""""""""""" + +See `Heatmap of Correlation Matrix Config Code`_ + +**Rule Chain:** + +- Rules to extract features +- Rule: ``rules/preprocessing.smk/download_dataset`` +- Rule: ``rules/preprocessing.smk/readable_datetime`` +- Rule: ``rules/preprocessing.smk/phone_sensed_bins`` +- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days`` +- Rule: ``rules/reports.smk/heatmap_features_correlations`` + +.. _figure5-parameters: + +**Parameters of heatmap_features_correlations Rule:** + +======================= ============== +Name Description +======================= ============== +plot Whether the rule is executed or not. The available options are ``True`` and ``False``. +min_valid_bins_per_hour The minimum valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS` for more information. +min_valid_hours_per_day The minimum valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS` for more information. +corr_method Method of correlation. The available options are ``pearson``, ``kendall`` and ``spearman``. +min_rows_ratio Minimum number of observations required per pair of columns to have a valid correlation coefficient. Currently, only available for ``pearson`` and ``spearman`` correlation. 
+phone_features The list of phone features. +fitbit_features The list of Fitbit features. +corr_threshold Only correlation coefficients larger than ``corr_threshold`` can be shown in the heatmap. +======================= ============== + +**Observations:** + +Columns and rows are features computed in RAPIDS, cells’ color represents the correlation coefficient between all days of data for every pair of feature of all participants. The user can specify a minimum number of observations required to compute the correlation between two features using the ``MIN_ROWS_RATIO`` parameter (0.5 by default). In addition, this plot can be configured to only display correlation coefficients above a threshold using the ``CORR_THRESHOLD`` parameter (0.1 by default). See Figure 5. + +.. figure:: figures/Figure5.png + :scale: 90 % + :align: center + + Figure 5 Correlation matrix heatmap for all the data of all participants + + + + + + + +.. _`Histogram of Valid Sensed Hours Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L221 +.. _`Heatmap of Phone Sensed Bins Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L233 +.. _`Heatmap of Days by Sensors Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L226 +.. _`Overall Compliance Heatmap Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L237 +.. 
_`Heatmap of Correlation Matrix Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L211 diff --git a/docs/visualization/figures/Figure1.png b/docs/visualization/figures/Figure1.png new file mode 100644 index 00000000..c4d47637 Binary files /dev/null and b/docs/visualization/figures/Figure1.png differ diff --git a/docs/visualization/figures/Figure2.png b/docs/visualization/figures/Figure2.png new file mode 100644 index 00000000..af22ccde Binary files /dev/null and b/docs/visualization/figures/Figure2.png differ diff --git a/docs/visualization/figures/Figure3.png b/docs/visualization/figures/Figure3.png new file mode 100644 index 00000000..09fd60b3 Binary files /dev/null and b/docs/visualization/figures/Figure3.png differ diff --git a/docs/visualization/figures/Figure4.png b/docs/visualization/figures/Figure4.png new file mode 100644 index 00000000..6bfeb752 Binary files /dev/null and b/docs/visualization/figures/Figure4.png differ diff --git a/docs/visualization/figures/Figure5.png b/docs/visualization/figures/Figure5.png new file mode 100644 index 00000000..93eeeee4 Binary files /dev/null and b/docs/visualization/figures/Figure5.png differ diff --git a/src/data/phone_sensed_bins.R b/src/data/phone_sensed_bins.R index 0276a862..cc6dc791 100644 --- a/src/data/phone_sensed_bins.R +++ b/src/data/phone_sensed_bins.R @@ -2,6 +2,7 @@ source("renv/activate.R") library("dplyr", warn.conflicts = F) library(tidyr) +library(lubridate) all_sensors <- snakemake@input[["all_sensors"]] bin_size <- snakemake@params[["bin_size"]] @@ -16,16 +17,24 @@ for(sensor in all_sensors){ all_sensor_data <- rbind(all_sensor_data, sensor_data) } -phone_sensed_bins <- all_sensor_data %>% - mutate(bin = (local_minute %/% bin_size) * bin_size) %>% # bin rows into bin_size-minute bins - group_by(local_date, local_hour, bin) %>% - summarise(sensor_count = n_distinct(sensor)) %>% - ungroup() %>% - complete(nesting(local_date), - local_hour = seq(0, 23, 1), - bin = seq(0, (59 
%/% bin_size) * bin_size, bin_size), - fill = list(sensor_count=0)) %>% - pivot_wider(names_from = c(local_hour, bin), values_from = sensor_count) - -write.csv(phone_sensed_bins, output_file, row.names = FALSE) +if(nrow(all_sensor_data) == 0){ + bins = seq(0, 59, by = bin_size) + hours = seq(0, 23, 1) + write.csv(crossing(hours, bins) %>% unite("hour_bin",hours, bins, sep = "_") %>% mutate(value = NA, local_date = NA) %>% pivot_wider(names_from = hour_bin, values_from=value) %>% head(0), output_file, row.names = FALSE) +} else{ + phone_sensed_bins <- all_sensor_data %>% + mutate(bin = (local_minute %/% bin_size) * bin_size) %>% # bin rows into bin_size-minute bins + group_by(local_date, local_hour, bin) %>% + summarise(sensor_count = n_distinct(sensor)) %>% + ungroup() %>% + mutate(local_date = lubridate::ymd(local_date)) %>% + complete(local_date = seq.Date(min(local_date), max(local_date), by="day"), + fill = list(local_hour = 0, bin = 0, sensor_count = 0)) %>% + complete(nesting(local_date), + local_hour = seq(0, 23, 1), + bin = seq(0, 59, bin_size), + fill = list(sensor_count=0)) %>% + pivot_wider(names_from = c(local_hour, bin), values_from = sensor_count) + write.csv(phone_sensed_bins, output_file, row.names = FALSE) +} diff --git a/src/data/phone_valid_sensed_days.R b/src/data/phone_valid_sensed_days.R index 3df749dc..6390f82f 100644 --- a/src/data/phone_valid_sensed_days.R +++ b/src/data/phone_valid_sensed_days.R @@ -9,9 +9,8 @@ output_file <- snakemake@output[[1]] phone_valid_sensed_days <- phone_sensed_bins %>% pivot_longer(cols = -local_date, names_to = c("hour", "bin"), names_sep = "_") %>% - filter(value > 0) %>% group_by(local_date, hour) %>% - summarise(valid_bins = n()) %>% + summarise(valid_bins = sum(value > 0)) %>% group_by(local_date) %>% summarise(valid_sensed_hours = sum(valid_bins >= min_valid_bins_per_hour)) %>% mutate(is_valid_sensed_day = ifelse(valid_sensed_hours >= min_valid_hours_per_day, TRUE, FALSE)) diff --git 
a/src/features/phone_locations/barnett/main.R b/src/features/phone_locations/barnett/main.R index 90f05b4e..ad544e1b 100644 --- a/src/features/phone_locations/barnett/main.R +++ b/src/features/phone_locations/barnett/main.R @@ -75,7 +75,12 @@ barnett_features <- function(sensor_data_files, day_segment, params){ # Select only the columns that the algorithm needs location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy) - outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone) + if(nrow(location %>% filter(accuracy < accuracy_limit)) > 1){ + outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone) + } else { + print(paste("Cannot compute location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT", accuracy_limit)) + outputMobility <- NULL + } if(is.null(outputMobility)){ location_features <- create_empty_file(requested_features) diff --git a/src/features/phone_locations/doryab/main.py b/src/features/phone_locations/doryab/main.py index 70097d0c..c3657336 100644 --- a/src/features/phone_locations/doryab/main.py +++ b/src/features/phone_locations/doryab/main.py @@ -46,6 +46,11 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_seg location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)] + if location_data.empty: + location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute]) + location_features = location_features.reset_index(drop=True) + return location_features + if "locationvariance" in features_to_compute: location_features["locations_doryab_locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var() @@ -351,8 +356,10 @@ def 
radius_of_gyration(locationData,sampling_frequency): time_in_cluster = locationData[locationData["location_label"]==labels].shape[0]* sampling_frequency rog = rog + (time_in_cluster * distance) - + time_all_clusters = valid_clusters.shape[0] * sampling_frequency + if time_all_clusters == 0: + return 0 final_rog = (1/time_all_clusters) * rog return np.sqrt(final_rog) diff --git a/src/visualization/overall_compliance_heatmap.py b/src/visualization/overall_compliance_heatmap.py index 5f2b879f..877ab0d2 100644 --- a/src/visualization/overall_compliance_heatmap.py +++ b/src/visualization/overall_compliance_heatmap.py @@ -40,16 +40,18 @@ def getOverallComplianceHeatmap(sensors_with_data, valid_sensed_hours, last_cert x=[date.replace("-", "/") for date in last_certain_dates] if expected_num_of_days != -1 else last_certain_dates, y=[pid + "." + label for pid, label in zip(sensors_with_data["pid"].to_list(), sensors_with_data["label"].to_list())], text=sensors_with_data[last_certain_dates].values, - hovertemplate="Date: %{x}
Participant: %{y}
Valid sensed hours: %{z}
Number of sensors with data: %{text}" if expected_num_of_days != -1 else "Date_idx: %{x}
Participant: %{y}
Valid sensed hours: %{z}
Number of sensors with data: %{text}", + hovertemplate="Date: %{x}
Participant: %{y}
Valid sensed hours: %{z}
Number of sensors with data: %{text}" if expected_num_of_days != -1 else "Day index: %{x}
Participant: %{y}
Valid sensed hours: %{z}
Number of sensors with data: %{text}", colorscale="Viridis", colorbar={"tick0": 0,"dtick": 1}, showscale=True)) if expected_num_of_days != -1: - plot.update_layout(title="Overall compliance heatmap for last " + str(expected_num_of_days) + " days.
Bin's color shows valid sensed hours for that day.
A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes") + plot.update_layout(title="Overall compliance heatmap for last " + str(expected_num_of_days) + " days.
Bin's color shows valid sensed hours for that day.
A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes.
You can hover over every day to see the number of sensors with data in that day.") else: - plot.update_layout(title="Overall compliance heatmap for all days.
Bin's color shows valid sensed hours for that day.
A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes") + plot.update_layout(title="Overall compliance heatmap for all days.
Bin's color shows valid sensed hours for that day.
A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes.
You can hover over every day to see the number of sensors with data in that day.") plot["layout"]["xaxis"].update(side="bottom") + plot["layout"].update(xaxis_title="Day indexes") + plot["layout"].update(margin=dict(t=160)) pio.write_html(plot, file=output_path, auto_open=False, include_plotlyjs="cdn") diff --git a/tests/data/raw/test01/phone_battery_raw.csv b/tests/data/raw/test01/phone_battery_raw.csv index b9a6d337..9295393d 100644 --- a/tests/data/raw/test01/phone_battery_raw.csv +++ b/tests/data/raw/test01/phone_battery_raw.csv @@ -13,7 +13,7 @@ timestamp,device_id,battery_status,battery_level,battery_scale,battery_voltage,b 1593597450123,wYESbVwI-4GfR-G5I6-7iKL-tOmCKs02MBun,3,80,100,4170,23,0,2,Li-ion 1593597589435,wYESbVwI-4GfR-G5I6-7iKL-tOmCKs02MBun,3,79,100,4094,23,0,2,Li-ion - + 1593597739321,wYESbVwI-4GfR-G5I6-7iKL-tOmCKs02MBun,3,78,100,4157,23,0,2,Li-ion 1593597872456,wYESbVwI-4GfR-G5I6-7iKL-tOmCKs02MBun,3,77,100,4157,23,0,2,Li-ion