diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index a4da3214..00000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,153 +0,0 @@
-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-PAPER         =
-BUILDDIR      = _build
-
-# Internal variables.
-PAPEROPT_a4     = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-# the i18n builder cannot share the environment and doctrees with the others
-I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info changes linkcheck doctest gettext
-
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  html       to make standalone HTML files"
-	@echo "  dirhtml    to make HTML files named index.html in directories"
-	@echo "  singlehtml to make a single large HTML file"
-	@echo "  pickle     to make pickle files"
-	@echo "  json       to make JSON files"
-	@echo "  htmlhelp   to make HTML files and an HTML help project"
-	@echo "  qthelp     to make HTML files and a qthelp project"
-	@echo "  devhelp    to make HTML files and a Devhelp project"
-	@echo "  epub       to make an epub"
-	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
-	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
-	@echo "  text       to make text files"
-	@echo "  man        to make manual pages"
-	@echo "  texinfo    to make Texinfo files"
-	@echo "  info       to make Texinfo files and run them through makeinfo"
-	@echo "  gettext    to make PO message catalogs"
-	@echo "  changes    to make an overview of all changed/added/deprecated items"
-	@echo "  linkcheck  to check all external links for integrity"
-	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
-
-clean:
-	-rm -rf $(BUILDDIR)/*
-
-html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
-	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-singlehtml:
-	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
-	@echo
-	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
-
-pickle:
-	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
-	@echo
-	@echo "Build finished; now you can process the pickle files."
-
-json:
-	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
-	@echo
-	@echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
-	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
-	@echo
-	@echo "Build finished; now you can run HTML Help Workshop with the" \
-	      ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
-	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
-	@echo
-	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
-	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/moshi-aware.qhcp"
-	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/moshi-aware.qhc"
-
-devhelp:
-	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
-	@echo
-	@echo "Build finished."
- @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/moshi-aware" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/moshi-aware" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index e5047625..00000000 --- a/docs/conf.py +++ /dev/null @@ -1,244 +0,0 @@ -# -*- coding: utf-8 -*- -# -# RAPIDS documentation build configuration file, created by -# sphinx-quickstart. -# -# This file is execfile()d with the current directory set to its containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import os -import sys - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ----------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
-extensions = []
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix of source filenames.
-source_suffix = '.rst'
-
-# The encoding of source files.
-# source_encoding = 'utf-8-sig'
-
-# The master toctree document.
-master_doc = 'index'
-
-# General information about the project.
-project = u'RAPIDS'
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
-# The short X.Y version.
-version = '0.1'
-# The full version, including alpha/beta/rc tags.
-release = '0.1'
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-# language = None
-
-# There are two options for replacing |today|: either, you set today to some
-# non-false value, then it is used:
-# today = ''
-# Else, today_fmt is used as the format for a strftime call.
-# today_fmt = '%B %d, %Y'
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-exclude_patterns = ['_build']
-
-# The reST default role (used for this markup: `text`) to use for all documents.
-# default_role = None
-
-# If true, '()' will be appended to :func: etc. cross-reference text.
-# add_function_parentheses = True
-
-# If true, the current module name will be prepended to all description
-# unit titles (such as .. function::).
-# add_module_names = True
-
-# If true, sectionauthor and moduleauthor directives will be shown in the
-# output. They are ignored by default.
-# show_authors = False
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-# A list of ignored prefixes for module index sorting.
-# modindex_common_prefix = []
-
-
-# -- Options for HTML output ---------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-html_theme = 'sphinx_rtd_theme'
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further. For a list of options available for each theme, see the
-# documentation.
-# html_theme_options = {}
-
-# Add any paths that contain custom themes here, relative to this directory.
-# html_theme_path = []
-
-# The name for this set of Sphinx documents. If None, it defaults to
-# "<project> v<release> documentation".
-# html_title = None
-
-# A shorter title for the navigation bar. Default is the same as html_title.
-# html_short_title = None
-
-# The name of an image file (relative to this directory) to place at the top
-# of the sidebar.
-# html_logo = None
-
-# The name of an image file (within the static path) to use as favicon of the
-# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
-# pixels large.
-# html_favicon = None
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
-
-# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
-# using the given strftime format.
-# html_last_updated_fmt = '%b %d, %Y'
-
-# If true, SmartyPants will be used to convert quotes and dashes to
-# typographically correct entities.
-# html_use_smartypants = True
-
-# Custom sidebar templates, maps document names to template names.
-# html_sidebars = {}
-
-# Additional templates that should be rendered to pages, maps page names to
-# template names.
-# html_additional_pages = {}
-
-# If false, no module index is generated.
-# html_domain_indices = True
-
-# If false, no index is generated.
-# html_use_index = True
-
-# If true, the index is split into individual pages for each letter.
-# html_split_index = False
-
-# If true, links to the reST sources are added to the pages.
-# html_show_sourcelink = True
-
-# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-# html_show_sphinx = True
-
-# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-# html_show_copyright = True
-
-# If true, an OpenSearch description file will be output, and all pages will
-# contain a <link> tag referring to it. The value of this option must be the
-# base URL from which the finished HTML is served.
-# html_use_opensearch = ''
-
-# This is the file name suffix for HTML files (e.g. ".xhtml").
-# html_file_suffix = None
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'rapidsdoc'
-
-
-# -- Options for LaTeX output --------------------------------------------------
-
-latex_elements = {
-    # The paper size ('letterpaper' or 'a4paper').
-    # 'papersize': 'letterpaper',
-
-    # The font size ('10pt', '11pt' or '12pt').
-    # 'pointsize': '10pt',
-
-    # Additional stuff for the LaTeX preamble.
-    # 'preamble': '',
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title, author, documentclass [howto/manual]).
-latex_documents = [
-    ('index',
-     'rapids.tex',
-     u'RAPIDS Documentation',
-     u"RAPIDS", 'manual'),
-]
-
-# The name of an image file (relative to this directory) to place at the top of
-# the title page.
-# latex_logo = None
-
-# For "manual" documents, if this is true, then toplevel headings are parts,
-# not chapters.
-# latex_use_parts = False
-
-# If true, show page references after internal links.
-# latex_show_pagerefs = False
-
-# If true, show URL addresses after external links.
-# latex_show_urls = False
-
-# Documents to append as an appendix to all manuals.
-# latex_appendices = []
-
-# If false, no module index is generated.
-# latex_domain_indices = True
-
-
-# -- Options for manual page output --------------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [
-    ('index', 'RAPIDS', u'RAPIDS Documentation',
-     [u"RAPIDS"], 1)
-]
-
-# If true, show URL addresses after external links.
-# man_show_urls = False
-
-
-# -- Options for Texinfo output ------------------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-#  dir menu entry, description, category)
-texinfo_documents = [
-    ('index', 'RAPIDS', u'RAPIDS Documentation',
-     u"RAPIDS", 'RAPIDS',
-     'Reproducible Analysis Pipeline for Data Streams', 'Miscellaneous'),
-]
-
-# Documents to append as an appendix to all manuals.
-# texinfo_appendices = []
-
-# If false, no module index is generated.
-# texinfo_domain_indices = True
-
-# How to display URL addresses: 'footnote', 'no', or 'inline'.
-# texinfo_show_urls = 'footnote'
diff --git a/docs/develop/contributors.rst b/docs/develop/contributors.rst
deleted file mode 100644
index d780279f..00000000
--- a/docs/develop/contributors.rst
+++ /dev/null
@@ -1,83 +0,0 @@
-RAPIDS Contributors
-====================
-
-Currently, RAPIDS is being developed by the Mobile Sensing + Health Institute (MoSHI), but if you are interested in contributing, feel free to submit a pull request or contact us.
-
-
-Julio Vega, PhD
-""""""""""""""""""
-**Postdoctoral Associate**
-
-vegaju@upmc.edu
-
-Julio Vega is a postdoctoral associate at the Mobile Sensing + Health Institute. He is interested in personalized methodologies to monitor chronic conditions that affect daily human behavior using mobile and wearable data. In the long term, his goal is to explore how we can enable patients to inform, amend, and evaluate their health tracking algorithms to improve disease self-management.
-
-`Julio Vega Personal Website`_
-
-
-
-Meng Li, MS
-"""""""""""""
-**Data Scientist**
-
-lim11@upmc.edu
-
-Meng Li received her Master of Science degree in Information Science from the University of Pittsburgh. She is interested in applying machine learning algorithms to the medical field.
-
-`Meng Li Linkedin Profile`_
-
-`Meng Li Github Profile`_
-
-
-
-
-Kwesi Aguillera, BS
-""""""""""""""""""""
-**Intern**
-
-Kwesi Aguillera is currently in his first year at the University of Pittsburgh pursuing a Master of Science in Information Science specializing in Big Data Analytics. He received his Bachelor of Science degree in Computer Science and Management from the University of the West Indies. Kwesi considers himself a full stack developer and looks forward to applying this knowledge to big data analysis.
-
-`Kwesi Aguillera Linkedin Profile`_
-
-
-Echhit Joshi, BS
-"""""""""""""""""
-**Intern**
-
-Echhit Joshi is a Masters student at the School of Computing and Information at the University of Pittsburgh. His areas of interest are Machine/Deep Learning, Data Mining, and Analytics.
-
-`Echhit Joshi Linkedin Profile`_
-
-Nicolas Leo, BS
-""""""""""""""""
-**Intern**
-
-Nicolas is a rising senior studying computer science at the University of Pittsburgh. His academic interests include databases, machine learning, and application development. After completing his undergraduate degree, he plans to attend graduate school for an MS in Computer Science with a focus on Intelligent Systems.
-
-
-Nikunj Goel, BS
-""""""""""""""""
-**Intern**
-
-Nik is a graduate student at the University of Pittsburgh pursuing a Master of Science in Information Science. He earned his Bachelor of Technology degree in Information Technology in India. He is a data enthusiast, passionate about finding meaning in raw data. In the long term, his goal is to create a breakthrough in Data Science and Deep Learning.
-
-`Nikunj Goel Linkedin Profile`_
-
-Agam Kumar, BS
-""""""""""""""""
-**Research Assistant at CMU**
-
-Agam is a junior at Carnegie Mellon University studying Statistics and Machine Learning and pursuing an additional major in Computer Science. He is a member of the Data Science team in the Health and Human Performance Lab at CMU and has keen interests in software development and data science. His research interests include ML applications in medicine.
-
-`Agam Kumar Linkedin Profile`_
-
-`Agam Kumar Github Profile`_
-
-.. _`Julio Vega Personal Website`: https://juliovega.info/
-.. _`Meng Li Linkedin Profile`: https://www.linkedin.com/in/meng-li-57238414a
-.. _`Meng Li Github Profile`: https://github.com/Meng6
-.. _`Kwesi Aguillera Linkedin Profile`: https://www.linkedin.com/in/kwesi-aguillera-29529823
-.. _`Echhit Joshi Linkedin Profile`: https://www.linkedin.com/in/echhitjoshi/
-.. _`Nikunj Goel Linkedin Profile`: https://www.linkedin.com/in/nikunjgoel95/
-.. _`Agam Kumar Linkedin Profile`: https://www.linkedin.com/in/agam-kumar
-.. _`Agam Kumar Github Profile`: https://github.com/agam-kumar
diff --git a/docs/develop/documentation.rst b/docs/develop/documentation.rst
deleted file mode 100644
index cd9dc06c..00000000
--- a/docs/develop/documentation.rst
+++ /dev/null
@@ -1,237 +0,0 @@
-How to Edit Documentation
-============================
-
-The following is a basic guide for editing the documentation for this project. The documentation is rendered with the Sphinx_ documentation builder.
-
-Quick start up
-----------------------------------
-
-#. Install Sphinx on macOS (``brew install sphinx-doc``) or Linux (Ubuntu) (``apt-get install python3-sphinx``)
-
-#. Go to the docs folder ``cd docs``
-
-#. Change any ``.rst`` file you need to modify
-
-#. To visualize the results locally, run ``make dirhtml`` and check the HTML files in the ``_build/dirhtml`` directory
-
-#. When you are done, push your changes to the git repo.
-
-
-Sphinx Workspace Structure
-----------------------------
-
-All of the files concerned with documentation can be found in the ``docs`` directory. At the top level there is the ``conf.py`` file and an ``index.rst`` file, among others. There should be no need to change the ``conf.py`` file. The ``index.rst`` file is known as the master document and defines the document structure of the documentation (i.e. the menu or table of contents structure). It contains the root of the "table of contents" tree (or toctree) that is used to connect the multiple files into a single hierarchy of documents. The TOC is defined using the ``toctree`` directive, which is used as follows::
-
-    .. toctree::
-       :maxdepth: 2
-       :caption: Getting Started
-
-       usage/introduction
-       usage/installation
-
-The ``toctree`` directive inserts a TOC tree at the current location using the individual TOCs of the documents given in the directive body. In other words, if the files listed in the above example contain ``toctree`` directives of their own, those are also applied to the resulting TOC. Relative document names (not beginning with a slash) are relative to the document the directive occurs in; absolute names are relative to the source directory. Thus, in the example above, the ``usage`` directory is relative to the ``index.rst`` page. The ``:maxdepth:`` parameter defines the depth of the tree for that particular menu. The ``:caption:`` parameter gives a caption for the menu tree at that level. Note that the titles for the links of the menu items under that header are taken from the titles of the referenced documents. For example, the menu item title for ``usage/introduction`` is taken from the main header specified in the ``introduction.rst`` document in the ``usage`` directory. Also note that the document name does not include the extension (i.e. ``.rst``).
-
-Thus the directory structure for the above example is shown below::
-
-    ├── index.rst
-    └── usage
-        ├── introduction.rst
-        └── installation.rst
-
-
-Basic reStructuredText Syntax
--------------------------------
-
-Now we will look at some basic reStructuredText syntax necessary to start editing the .rst files that are used to generate documentation.
-
-Headers
-""""""""
-
-**Section Header**
-
-The following was used to make the header at the top of this page:
-::
-
-    How to Edit Documentation
-    ==========================
-
-**Subsection Header**
-
-The following was used to create the secondary header (e.g. the Sphinx Workspace Structure section header):
-::
-
-    Sphinx Workspace structure
-    ----------------------------
-
-.....
-
-
-Lists
-""""""
-**Bullet List**
-::
-
-    - This is a bullet
-    - This is a bullet
-
-Will produce the following:
-
-- This is a bullet
-- This is a bullet
-
-
-**Numbered List**
-::
-
-    #. This is a numbered list item
-    #. This is a numbered list item
-
-Will produce the following:
-
-#. This is a numbered list item
-#. This is a numbered list item
-
-.....
-
-Inline Markup
-""""""""""""""
-**Emphasis/Italics**
-::
-
-    *This is for emphasis*
-
-Will produce the following:
-
-*This is for emphasis*
-
-
-**Bold**
-::
-
-    **This is bold text**
-
-Will produce the following:
-
-**This is bold text**
-
-.....
-
-**Code Sample**
-::
-
-    ``Backquotes = code sample``
-
-Will produce the following:
-
-``Backquotes = code sample``
-
-**Apostrophes in Text**
-::
-
-    `don't know`
-
-Will produce the following:
-
-`don't know`
-
-
-**Literal blocks**
-
-Literal code blocks are introduced by ending a paragraph with the special marker ``::``. The literal block must be indented (and, like all paragraphs, separated from the surrounding ones by blank lines)::
-
-    This is a normal text paragraph. The next paragraph is a code sample::
-
-        It is not processed in any way, except
-        that the indentation is removed.
-
-        It can span multiple lines.
-
-    This is a normal text paragraph again.
-
-
-The following is produced:
-
-.....
-
-This is a normal text paragraph. The next paragraph is a code sample::
-
-    It is not processed in any way, except
-    that the indentation is removed.
-
-    It can span multiple lines.
-
-This is a normal text paragraph again.
-
-.....
-
-**Doctest blocks**
-
-Doctest blocks are interactive Python sessions cut-and-pasted into docstrings. They do not require the literal blocks syntax. The doctest block must end with a blank line and should not end with an unused prompt:
-
->>> 1 + 1
-2
-
-**External links**
-
-Use `` `Link text <https://domain.invalid/>`_ `` for inline web links: `Link text <https://domain.invalid/>`_. If the link text should be the web address, you don't need special markup at all; the parser finds links and mail addresses in ordinary text. *Important:* There must be a space between the link text and the opening ``<`` for the URL.
-
-You can also separate the link and the target definition, like this:
-::
-
-    This is a paragraph that contains `a link`_.
-
-    .. _a link: https://domain.invalid/
-
-
-Will produce the following:
-
-This is a paragraph that contains `a link`_.
-
-.. _a link: https://domain.invalid/
-
-
-
-**Internal links**
-
-Internal linking is done via a special reST role provided by Sphinx to cross-reference arbitrary locations. For this to work, label names must be unique throughout the entire documentation. There are two ways in which you can refer to labels:
-
-- If you place a label directly before a section title, you can reference it with ``:ref:`label-name```. For example::
-
-    .. _my-reference-label:
-
-    Section to cross-reference
-    --------------------------
-
-    This is the text of the section.
-
-    It refers to the section itself, see :ref:`my-reference-label`.
-
-The ``:ref:`` role would then generate a link to the section, with the link title being "Section to cross-reference".
-This works just as well when section and reference are in different source files. The above produces the following:
-
-.....
-
-.. _my-reference-label:
-
-Section to cross-reference
-"""""""""""""""""""""""""""
-
-This is the text of the section.
-
-It refers to the section itself, see :ref:`my-reference-label`.
-
-.....
-
-- Labels that aren't placed before a section title can still be referenced, but you must give the link an explicit title, using this syntax: ``:ref:`Link title <label-name>```.
-
-
-**Comments**
-
-Every explicit markup block which isn't a valid markup construct is regarded as a comment. For example::
-
-    .. This is a comment.
-
-Go to Sphinx_ for more documentation.
-
-.. _Sphinx: https://www.sphinx-doc.org
-.. _reStructuredText: https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html
-
diff --git a/docs/develop/environments.rst b/docs/develop/environments.rst
deleted file mode 100644
index 76abcb1a..00000000
--- a/docs/develop/environments.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-Manage virtual environments
-=============================
-
-**Add new packages**
-
-Try to install any new package with ``conda install my_package``. If a package is not available in one of conda's channels, you can install it with pip, but make sure your virtual environment is active.
-
-**Update your conda environment.yaml**
-
-After installing a new package, you can use the following command in your terminal to update your ``environment.yaml`` before publishing your pipeline. Note that we ignore the package version for ``libgfortran`` to keep compatibility with Linux:
-
-   ``conda env export --no-builds | sed 's/^.*libgfortran.*$/ - libgfortran/' > environment.yml``
-
-**Update and prune your conda environment from an environment.yaml file**
-
-Execute the following command in your terminal. See https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#updating-an-environment
-
-   ``conda env update --prefix ./env --file environment.yml --prune``
diff --git a/docs/develop/features.rst b/docs/develop/features.rst
deleted file mode 100644
index ad5b86e2..00000000
--- a/docs/develop/features.rst
+++ /dev/null
@@ -1,28 +0,0 @@
-Add new features to RAPIDS
-============================
-
-Take accelerometer features as an example.
-
-#. Add your script to the accelerometer_ folder
-
-   - Copy the signature of the base_accelerometer_features() function_ for your own feature function (see the sketch at the end of this page)
-
-#. Add any parameters you need for your function
-
-   - Add your parameters to the settings_ of the accelerometer sensor in the config file
-   - Add your parameters to the params_ of the accelerometer_features rule in features.snakefile
-
-#. Merge your new features with the existing features
-
-   - Call the function you just created below this line (LINK_) of the accelerometer_features.py script
-
-#. Update the config file
-
-   - Add your new feature names to the ``FEATURES`` list for accelerometer in the config_ file
-
-.. _accelerometer: https://github.com/carissalow/rapids/tree/master/src/features/accelerometer
-.. _function: https://github.com/carissalow/rapids/blob/master/src/features/accelerometer/accelerometer_base.py#L35
-.. _settings: https://github.com/carissalow/rapids/blob/master/config.yaml#L100
-.. _params: https://github.com/carissalow/rapids/blob/master/rules/features.snakefile#L146
-.. _LINK: https://github.com/carissalow/rapids/blob/master/src/features/accelerometer_features.py#L10
-.. _config: https://github.com/carissalow/rapids/blob/master/config.yaml#L102
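-
-As a rough illustration, a feature function typically receives the sensor data frame, the day segment, and the requested feature names, and returns one column per feature. The sketch below is hypothetical: the real signature lives in accelerometer_base.py (see function_ above), and the ``countsamples`` feature, column names and filtering logic here are placeholders, not RAPIDS' actual API::
-
-    import pandas as pd
-
-    def base_my_new_features(acc_data, day_segment, requested_features):
-        # keep only the rows that belong to the day segment being computed
-        if day_segment != "daily":
-            acc_data = acc_data[acc_data["local_day_segment"] == day_segment]
-        features = pd.DataFrame()
-        if "countsamples" in requested_features:
-            # one value per local date; columns are named sensor_segment_feature
-            features["acc_" + day_segment + "_countsamples"] = acc_data.groupby("local_date").size()
-        return features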
diff --git a/docs/develop/remotesupport.rst b/docs/develop/remotesupport.rst
deleted file mode 100644
index 213b1420..00000000
--- a/docs/develop/remotesupport.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-Remote Support
-======================================
-
-We use the Live Share extension of Visual Studio Code to debug bugs when sharing data or database credentials is not possible.
-
-#. Install `Visual Studio Code <https://code.visualstudio.com/>`_
-
-#. Open your rapids folder in a new VSCode window
-
-#. Open a new terminal: ``Terminal > New terminal``
-
-#. Install the `Live Share extension pack `_
-
-#. Press ``Ctrl+P``/``Cmd+P`` and run this command: ``>live share: start collaboration session``
-
-#. Follow the instructions and share the session link you receive
diff --git a/docs/develop/test_cases.rst b/docs/develop/test_cases.rst
deleted file mode 100644
index 593fdbbb..00000000
--- a/docs/develop/test_cases.rst
+++ /dev/null
@@ -1,110 +0,0 @@
-.. _test-cases:
-
-Test Cases
------------
-
-Along with the continued development and the addition of new sensors and features to the RAPIDS pipeline, tests for the currently available sensors and features are being implemented. Since this is a work in progress, this page will be updated with the list of sensors and features for which testing is available. For each of the sensors listed, a description of the data used for testing (the test cases) is outlined. Currently, for testing purposes, ``tests/data/raw/test01/`` contains all the test data files for testing Android data formats and ``tests/data/raw/test02/`` contains all the test data files for testing iOS data formats. The expected (verified) output is contained in ``tests/data/processed/test01/`` and ``tests/data/processed/test02/`` for Android and iOS respectively. ``tests/data/raw/test03/`` and ``tests/data/raw/test04/`` contain data files for testing empty raw data files for Android and iOS respectively.
-
-List of Sensors with Tests
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-The following is a list of the sensors for which testing is currently available.
-
-
-Messages (SMS)
-"""""""""""""""
-
-   - The raw message data file contains data for 2 separate days.
-   - The data for the first day contains 5 records for every ``epoch``.
-   - The second day's data contains 6 records for each of only 2 ``epochs`` (currently ``morning`` and ``evening``).
-   - The raw message data contains records for both ``message_types`` (i.e. ``received`` and ``sent``) in both days in all epochs. The number of records with each ``message_type`` per epoch is randomly distributed. There is at least one record with each ``message_type`` per epoch.
-   - There is one raw message data file each, as described above, for testing both iOS and Android data.
-   - There is also an additional empty data file for both Android and iOS for testing empty data files.
-
-Calls
-"""""""
-   Due to the difference in the format of the raw call data for iOS and Android (see the **Assumptions/Observations** section of :ref:`Calls`), the following are the expected results for ``calls_with_datetime_unified.csv``. This gives a better idea of the use cases being tested, since ``calls_with_datetime_unified.csv`` makes the iOS and Android data comparable.
-
-   - The call data contains data for 2 days.
-   - The data for the first day contains 6 records for every ``epoch``.
-   - The second day's data contains 6 records for each of only 2 ``epochs`` (currently ``morning`` and ``evening``).
-   - The call data contains records for all ``call_types`` (i.e. ``incoming``, ``outgoing`` and ``missed``) in both days in all epochs. The number of records with each of the ``call_types`` per epoch is randomly distributed. There is at least one record with each ``call_type`` per epoch.
-   - There is one call data file each, as described above, for testing both iOS and Android data.
-   - There is also an additional empty data file for both Android and iOS for testing empty data files.
-
-Screen
-""""""""
-   Due to the difference in the format of the raw screen data for iOS and Android (see the **Assumptions/Observations** section of :ref:`Screen`), the following are the expected results for ``screen_deltas.csv``. This gives a better idea of the use cases being tested, since ``screen_deltas.csv`` makes the iOS and Android data comparable. These files are used to calculate the features for the screen sensor.
-
-   - The screen delta data file contains data for 1 day.
-   - The screen delta data contains 1 record to represent an ``unlock`` episode that falls within an ``epoch`` for every ``epoch``.
-   - The screen delta data contains 1 record to represent an ``unlock`` episode that falls across the boundary of 2 epochs. Namely, the ``unlock`` episode starts in one epoch and ends in the next; thus there is a record for ``unlock`` episodes that fall across ``night`` to ``morning``, ``morning`` to ``afternoon`` and finally ``afternoon`` to ``night``.
-   - The testing is done for the ``unlock`` episode_type.
-   - There is one screen data file each for testing both iOS and Android data formats.
-   - There is also an additional empty data file for both Android and iOS for testing empty data files.
-
-Battery
-"""""""""
-   Due to the difference in the format of the raw battery data for iOS and Android, as well as between versions of iOS (see the **Assumptions/Observations** section of :ref:`Battery`), the following are the expected results for ``battery_deltas.csv``. This gives a better idea of the use cases being tested, since ``battery_deltas.csv`` makes the iOS and Android data comparable. These files are used to calculate the features for the battery sensor.
-
-   - The battery delta data file contains data for 1 day.
-   - The battery delta data contains 1 record each for a ``charging`` and a ``discharging`` episode that falls within an ``epoch`` for every ``epoch``. Thus, for the ``daily`` epoch there are multiple ``charging`` and ``discharging`` episodes.
-   - Since either a ``charging`` episode or a ``discharging`` episode (but not both) can occur across epochs, the battery delta data contains alternating ``charging`` and ``discharging`` episodes that fall across ``night`` to ``morning``, ``morning`` to ``afternoon`` and finally ``afternoon`` to ``night``, in order to test episodes that occur across epochs. This starts with a ``discharging`` episode that begins in ``night`` and ends in ``morning``.
-   - There is one battery data file each for testing both iOS and Android data formats.
-   - There is also an additional empty data file for both Android and iOS for testing empty data files.
-
-Bluetooth
-""""""""""
-
-   - The raw Bluetooth data file contains data for 1 day.
-   - The raw Bluetooth data contains at least 2 records for each ``epoch``.
-     Each ``epoch`` has a record with a ``timestamp`` for the beginning boundary of that ``epoch`` and a record with a ``timestamp`` for the ending boundary of that ``epoch`` (e.g. for the ``morning`` epoch there is a record with a ``timestamp`` for ``6:00AM`` and another record with a ``timestamp`` for ``11:59:59AM``; these are to test edge cases).
-   - A set of 5 Bluetooth devices is randomly distributed throughout the data records.
-   - There is one raw Bluetooth data file each for testing both iOS and Android data formats.
-   - There is also an additional empty data file for both Android and iOS for testing empty data files.
-
-WIFI
-"""""
-
-   - There are 2 data files (``wifi_raw.csv`` and ``sensor_wifi_raw.csv``) for each fake participant for each phone platform (see the **Assumptions/Observations** section of :ref:`WIFI`).
-   - The raw WIFI data files contain data for 1 day.
-   - The ``sensor_wifi_raw.csv`` data contains at least 2 records for each ``epoch``. Each ``epoch`` has a record with a ``timestamp`` for the beginning boundary of that ``epoch`` and a record with a ``timestamp`` for the ending boundary of that ``epoch`` (e.g. for the ``morning`` epoch there is a record with a ``timestamp`` for ``6:00AM`` and another record with a ``timestamp`` for ``11:59:59AM``; these are to test edge cases).
-   - The ``wifi_raw.csv`` data contains 3 records with random timestamps for each ``epoch`` to represent visible broadcasting WIFI networks. This file is empty for the iOS phone testing data.
-   - A set of 10 access points is randomly distributed throughout the data records, 5 each for ``sensor_wifi_raw.csv`` and ``wifi_raw.csv``.
-   - There are data files for testing both iOS and Android data formats.
-   - There are also additional empty data files for both Android and iOS for testing empty data files.
-
-Light
-"""""""
-
-   - The raw light data file contains data for 1 day.
-   - The raw light data contains 3 or 4 rows of data for each ``epoch`` except ``night``. The single row of data for ``night`` is for testing features with single-value inputs (for example, testing the standard deviation of one input value).
-   - Since light is only available for Android, there is only one file that contains data, for Android. All other files (i.e. for iPhone) are empty data files.
-
-Application Foreground
-"""""""""""""""""""""""
-
-   - The raw application foreground data file contains data for 1 day.
-   - The raw application foreground data contains 7 - 9 rows of data for each ``epoch``. The records for each ``epoch`` contain apps that are randomly selected from a list of apps from the ``MULTIPLE_CATEGORIES`` and ``SINGLE_CATEGORIES`` (see `testing_config.yaml`_). There are also records in each epoch with apps randomly selected from a list of apps from the ``EXCLUDED_CATEGORIES`` and ``EXCLUDED_APPS``. This is to test that these apps are actually being excluded from the calculation of features. There are also records to test ``SINGLE_APPS`` calculations.
-   - Since application foreground is only available for Android, there is only one file that contains data, for Android. All other files (i.e. for iPhone) are empty data files.
-
-Activity Recognition
-""""""""""""""""""""""
-
-   - The raw Activity Recognition data file contains data for 1 day.
-   - For each ``epoch`` period, the raw Activity Recognition data contains rows that record 2 - 5 different ``activity_types``, so that durations of activities can be tested.
-     Additionally, there are records that mimic the duration of an activity over the time boundary of neighboring epochs (for example, there is a set of records that mimics the participant being ``in_vehicle`` from ``afternoon`` into ``evening``).
-   - There is one file each with raw Activity Recognition data for testing both iOS and Android data formats (plugin_google_activity_recognition_raw.csv for Android and plugin_ios_activity_recognition_raw.csv for iOS).
-   - There is also an additional empty data file for both Android and iOS for testing empty data files.
-
-Conversation
-"""""""""""""
-
-   - The raw conversation data file contains data for 2 days.
-   - The raw conversation data contains records with a sample of both ``datatypes`` (i.e. ``voice/noise`` = ``0`` and ``conversation`` = ``2``) as well as rows with samples of each of the ``inference`` values (i.e. ``silence`` = ``0``, ``noise`` = ``1``, ``voice`` = ``2``, and ``unknown`` = ``3``) for each ``epoch``. The different ``datatype`` and ``inference`` records are randomly distributed throughout the ``epoch``.
-   - Additionally, there are 2 - 5 records for conversations (``datatype`` = 2, and ``inference`` = -1) in each ``epoch``, and for each ``epoch`` except night there is a conversation record with a ``double_convo_start`` ``timestamp`` from the previous ``epoch``. This is to test the calculation of features across ``epochs``.
-   - There is a raw conversation data file for both Android and iOS platforms (``plugin_studentlife_audio_android_raw.csv`` and ``plugin_studentlife_audio_raw.csv`` respectively).
-   - Finally, there are also additional empty data files for both Android and iOS for testing empty data files.
-
-
-
-.. _`testing_config.yaml`: https://github.com/carissalow/rapids/blob/c498b8d2dfd7cc29d1e4d53e978d30cff6cdf3f2/tests/settings/testing_config.yaml#L70
diff --git a/docs/develop/testing.rst b/docs/develop/testing.rst
deleted file mode 100644
index 46d41fc4..00000000
--- a/docs/develop/testing.rst
+++ /dev/null
@@ -1,67 +0,0 @@
-Testing
-==========
-
-The following is a simple guide to testing RAPIDS. All files necessary for testing are stored in the ``tests`` directory:
-
-::
-
-    ├── tests
-    │   ├── data                          <- Replica of the project root data directory for testing.
-    │   │   ├── external                  <- Contains the fake testing participant files.
-    │   │   ├── interim                   <- The expected intermediate data that has been transformed.
-    │   │   ├── processed                 <- The expected final data, canonical data sets for modeling used to test/validate feature calculations.
-    │   │   └── raw                       <- The specially created raw input datasets (fake data) that will be used for testing.
-    │   │
-    │   ├── scripts                       <- Scripts for testing. Add test scripts in this directory.
-    │   │   ├── run_tests.sh              <- The shell script that runs the RAPIDS pipeline on test data and tests the results.
-    │   │   ├── test_sensor_features.py   <- The default test script for testing RAPIDS built-in sensor features.
-    │   │   └── utils.py                  <- Contains any helper functions and methods.
-    │   │
-    │   ├── settings                      <- Contains the config and settings files for testing snakemake.
-    │   │   ├── config.yaml               <- Defines the testing profile configurations for running snakemake.
-    │   │   └── testing_config.yaml       <- Contains the actual snakemake configuration settings for testing.
-    │   │
-    │   └── Snakefile                     <- The Snakefile for testing only. It contains the rules that you would be testing.
-
-
-Steps for Testing
-""""""""""""""""""
-
-#. To begin testing RAPIDS, place the fake raw input data ``csv`` files in ``tests/data/raw/``.
-   The fake participant files should be placed in ``tests/data/external/``. The expected output files of RAPIDS after processing the input data should be placed in ``tests/data/processed/``.
-
-#. The Snakemake rule(s) that are to be tested must be placed in the ``tests/Snakefile`` file. The current ``tests/Snakefile`` is a good example of how to define them. (At the time of writing this documentation, the Snakefile contains rules for messages (SMS), calls and screen.)
-
-#. Edit ``tests/settings/config.yaml``. Add and/or remove the rules to be run for testing from the ``forcerun`` list.
-
-#. Edit ``tests/settings/testing_config.yaml`` with the necessary configuration settings for running the rules to be tested.
-
-#. Add any additional test scripts in ``tests/scripts``.
-
-#. Uncomment or comment out lines in the testing shell script ``tests/scripts/run_tests.sh``.
-
-#. Run the testing shell script.
-
-::
-
-    $ tests/scripts/run_tests.sh
-
-
-The following is a snippet of the output you should see after running your test.
-
-::
-
-    test_sensors_files_exist (test_sensor_features.TestSensorFeatures) ... ok
-    test_sensors_features_calculations (test_sensor_features.TestSensorFeatures) ... FAIL
-
-    ======================================================================
-    FAIL: test_sensors_features_calculations (test_sensor_features.TestSensorFeatures)
-    ----------------------------------------------------------------------
-
-The results above show that the first test ``test_sensors_files_exist`` passed while ``test_sensors_features_calculations`` failed. In addition, you should get the traceback of the failure (not shown here). For more information on how to implement test scripts and use unittest, please see the `Unittest Documentation`_.
-
-Testing of the RAPIDS sensors and features is a work in progress. Please see :ref:`test-cases` for a list of sensors and features for which testing is currently available.
-
-Currently the repository is set up to test a number of sensors out of the box by simply running the ``tests/scripts/run_tests.sh`` command once the RAPIDS python environment is active.
-
-.. _`Unittest Documentation`: https://docs.python.org/3.7/library/unittest.html#command-line-interface
diff --git a/docs/features/extracted.rst b/docs/features/extracted.rst
deleted file mode 100644
index e5f12cee..00000000
--- a/docs/features/extracted.rst
+++ /dev/null
@@ -1,1113 +0,0 @@
-.. _rapids_features:
-
-RAPIDS Features
-===============
-
-*How do I compute any of these features?* In your ``config.yaml``, go to the sensor section you are interested in and set the corresponding ``COMPUTE`` option to ``TRUE``, as well as ``DB_TABLE`` to the sensor's table name in your database (the default table name is the one assigned by Aware). For example:
-::
-
-    MESSAGES:
-        COMPUTE: True
-        DB_TABLE: messages
-        ...
-
-If you want to extract ``phone_valid_sensed_days.csv``, screen features, or location features based on fused location data, don't forget to configure ``[PHONE_VALID_SENSED_BINS][TABLES]`` (see below).
-
-.. _global-sensor-doc:
-
-Global Parameters
-"""""""""""""""""
-
-.. _sensor-list:
-
-.. _pid:
-
-- ``PIDS`` - The list of participant ids to be included in the analysis. These should match the names of the files created in the ``data/external`` directory (:ref:`see more details`).
-
-.. _day-segments:
-
-- ``DAY_SEGMENTS`` - The list of day epochs that features can be segmented into: ``daily``, ``morning`` (6am-12pm), ``afternoon`` (12pm-6pm), ``evening`` (6pm-12am) and ``night`` (12am-6am).
-  This list can be modified globally or on a per-sensor basis. See DAY_SEGMENTS_ in the ``config`` file.
-
-.. _timezone:
-
-- ``TIMEZONE`` - The time zone where data was collected. Use the timezone names from this `List of Timezones`_. Double check your chosen name is correct; for example, US Eastern Time is called America/New_York, not EST.
-
-.. _database_group:

-- ``DATABASE_GROUP`` - The name of your database credentials group; it should match the one in ``.env`` (:ref:`see the database configuration`).
-
-.. _download-dataset:
-
-- ``DOWNLOAD_DATASET``
-
-  - ``GROUP``. Credentials group to connect to the database containing ``SENSORS``. By default it points to ``DATABASE_GROUP``.
-
-.. _readable-datetime:
-
-- ``READABLE_DATETIME`` - Configuration to convert UNIX timestamps into readable date time strings.
-
-  - ``FIXED_TIMEZONE``. See ``TIMEZONE`` above. This assumes that all data of all participants was collected within one time zone.
-  - Support for multiple time zones for each participant is coming soon, based on the ``timezone`` table collected by Aware.
-
-.. _phone-valid-sensed-bins:
-
-- ``PHONE_VALID_SENSED_BINS`` - Contains three attributes: ``COMPUTE``, ``BIN_SIZE`` and ``TABLES``. See the PHONE_VALID_SENSED_BINS_ section in the ``config.yaml`` file.
-
-  Set the ``COMPUTE`` flag to True if you want to get this file (``data/interim/{pid}/phone_sensed_bins``). Phone valid sensed bins is a matrix of days x bins, where we divide every hour of every day into N bins of size ``BIN_SIZE`` (in minutes). Each bin contains the number of rows that were recorded in that interval by all the sensors listed in ``TABLES``. Add as many sensor tables to ``TABLES`` as you have in your database, because valid sensed bins are used to compute ``PHONE_VALID_SENSED_DAYS``, the ``episodepersensedminutes`` feature of :ref:`Screen`, and to resample fused location data if you configure Barnett's/Doryab's location features to use ``RESAMPLE_FUSED``.
-
-  The ``COMPUTE`` flag is automatically ignored (set internally to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features.
-
-.. _phone-valid-sensed-days:
-
-- ``PHONE_VALID_SENSED_DAYS``. Contains three attributes: ``COMPUTE``, ``MIN_VALID_HOURS_PER_DAY``, ``MIN_VALID_BINS_PER_HOUR``. See the PHONE_VALID_SENSED_DAYS_ section in ``config.yaml``.
-
-  On any given day, Aware could have sensed data only for a few minutes or for 24 hours. Daily estimates of features should be considered more reliable the more hours Aware was running and logging data; for example, 10 calls logged on a day when only one hour of data was recorded is a less reliable feature compared to 10 calls on a day when 23 hours of data were recorded.
-
-  Therefore, we define a valid hour as one that contains a minimum number of valid bins. A valid bin is one that contains at least one row of data from any sensor logged within that period (see ``PHONE_VALID_SENSED_BINS`` above). We mark an hour as valid if it contains at least ``MIN_VALID_BINS_PER_HOUR`` valid bins (out of the total possible number of bins that can be captured in an hour based on their length, i.e. 60min/``BIN_SIZE`` bins). In turn, we mark a day as valid if it has at least ``MIN_VALID_HOURS_PER_DAY`` valid hours. ``MIN_VALID_HOURS_PER_DAY`` can be a list; for different thresholds, we get different valid sensed days: ``"data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}h.csv"``. A sketch of this logic follows this list.
-
-  Note that at the moment RAPIDS *DOES NOT* filter your feature files automatically; you need to do this after your features have been extracted, using ``"data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}h.csv"``.
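-
-As a rough illustration of the valid bin/hour/day logic above, here is a hypothetical pandas sketch (RAPIDS' actual implementation may differ; the ``bins`` layout and the default thresholds below are placeholder assumptions)::
-
-    import pandas as pd
-
-    # bins: one row per day, one integer-labeled column per bin of that day,
-    # each cell holding the number of sensor rows logged in that bin
-    def valid_sensed_days(bins, bin_size=5, min_valid_bins_per_hour=6, min_valid_hours_per_day=16):
-        bins_per_hour = 60 // bin_size
-        valid_bins = bins > 0  # a bin is valid if any sensor logged at least one row
-        # group each day's bin columns into its 24 hours and count valid bins per hour
-        per_hour = valid_bins.T.groupby(lambda col: col // bins_per_hour).sum().T
-        valid_hours = (per_hour >= min_valid_bins_per_hour).sum(axis=1)
-        # a day is valid if it reaches the minimum number of valid hours
-        return valid_hours[valid_hours >= min_valid_hours_per_day].index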
-
-.. _individual-sensor-settings:
-
-.. _messages-sensor-doc:
-
-Messages (SMS)
-"""""""""""""""
-
-See `Messages Config Code`_
-
-**Available Epochs (day_segment) :** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Android
-
-**Rule Chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/readable_datetime``
-- Rule ``rules/features.snakefile/messages_features``
-
-.. _messages-parameters:
-
-**Messages Rule Parameters (messages_features):**
-
-============== ===================
-Name           Description
-============== ===================
-messages_type  The particular ``messages_type`` that will be analyzed. The options for this parameter are ``received`` or ``sent``.
-day_segment    The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``.
-features       Features to be computed; see the table below.
-============== ===================
-
-.. _messages-available-features:
-
-**Available Message Features**
-
-========================= ========= =============
-Name                      Units     Description
-========================= ========= =============
-count                     messages  Number of messages of type ``messages_type`` that occurred during a particular ``day_segment``.
-distinctcontacts          contacts  Number of distinct contacts that are associated with a particular ``messages_type`` during a particular ``day_segment``.
-timefirstmessages         minutes   Number of minutes between 12:00am (midnight) and the first ``message`` of a particular ``messages_type``.
-timelastmessages          minutes   Number of minutes between 12:00am (midnight) and the last ``message`` of a particular ``messages_type``.
-countmostfrequentcontact  messages  Number of messages from the contact with the most messages of ``messages_type`` during a ``day_segment`` throughout the whole dataset of each participant. See the sketch at the end of this section.
-========================= ========= =============
-
-**Assumptions/Observations:**
-
-``TYPES`` and ``FEATURES`` keys in ``config.yaml`` need to match. For example, below the ``TYPE`` ``sent`` matches the ``FEATURES`` key ``sent``::
-
-    MESSAGES:
-        ...
-        TYPES: [sent]
-        FEATURES:
-            sent: [count, distinctcontacts, timefirstmessages, timelastmessages, countmostfrequentcontact]
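-
-To make the ``countmostfrequentcontact`` logic above concrete, here is a hypothetical pandas sketch (the ``trace`` and ``local_day_segment`` column names are illustrative, not necessarily RAPIDS' internal ones)::
-
-    import pandas as pd
-
-    def count_most_frequent_contact(messages, day_segment):
-        # the most frequent contact is chosen over the participant's whole dataset...
-        top_contact = messages["trace"].value_counts().idxmax()
-        # ...but the count is restricted to the day segment being computed
-        in_segment = messages[messages["local_day_segment"] == day_segment]
-        return (in_segment["trace"] == top_contact).sum()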
-
-.. _call-sensor-doc:
-
-Calls
-""""""
-
-See `Call Config Code`_
-
-**Available Epochs (day_segment) :** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Android and iOS
-
-**Rule Chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/readable_datetime``
-- Rule ``rules/features.snakefile/call_features``
-
-.. _calls-parameters:
-
-**Call Rule Parameters (call_features):**
-
-============ ===================
-Name         Description
-============ ===================
-call_type    The particular ``call_type`` that will be analyzed. The options for this parameter are ``incoming``, ``outgoing`` or ``missed``.
-day_segment  The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``.
-features     Features to be computed. Note that the same features are available for both ``incoming`` and ``outgoing`` calls, while ``missed`` calls have their own set of features. See the :ref:`Available Incoming and Outgoing Call Features <available-in-and-out-call-features>` table and the :ref:`Available Missed Call Features <available-missed-call-features>` table below.
-============ ===================
-
-.. _available-in-and-out-call-features:
-
-**Available Incoming and Outgoing Call Features**
-
-========================= ========= =============
-Name                      Units     Description
-========================= ========= =============
-count                     calls     Number of calls of a particular ``call_type`` that occurred during a particular ``day_segment``.
-distinctcontacts          contacts  Number of distinct contacts that are associated with a particular ``call_type`` for a particular ``day_segment``.
-meanduration              seconds   The mean duration of all calls of a particular ``call_type`` during a particular ``day_segment``.
-sumduration               seconds   The sum of the duration of all calls of a particular ``call_type`` during a particular ``day_segment``.
-minduration               seconds   The duration of the shortest call of a particular ``call_type`` during a particular ``day_segment``.
-maxduration               seconds   The duration of the longest call of a particular ``call_type`` during a particular ``day_segment``.
-stdduration               seconds   The standard deviation of the duration of all the calls of a particular ``call_type`` during a particular ``day_segment``.
-modeduration              seconds   The mode of the duration of all the calls of a particular ``call_type`` during a particular ``day_segment``.
-entropyduration           nats      The estimate of the Shannon entropy for the duration of all the calls of a particular ``call_type`` during a particular ``day_segment``.
-timefirstcall             minutes   The time in minutes between 12:00am (midnight) and the first call of ``call_type``.
-timelastcall              minutes   The time in minutes between 12:00am (midnight) and the last call of ``call_type``.
-countmostfrequentcontact  calls     The number of calls of a particular ``call_type`` during a particular ``day_segment`` of the most frequent contact throughout the monitored period.
-========================= ========= =============
-
-.. _available-missed-call-features:
-
-**Available Missed Call Features**
-
-========================= ========= =============
-Name                      Units     Description
-========================= ========= =============
-count                     calls     Number of ``missed`` calls that occurred during a particular ``day_segment``.
-distinctcontacts          contacts  Number of distinct contacts that are associated with ``missed`` calls for a particular ``day_segment``.
-timefirstcall             minutes   The time in minutes from 12:00am (midnight) that the first ``missed`` call occurred.
-timelastcall              minutes   The time in minutes from 12:00am (midnight) that the last ``missed`` call occurred.
-countmostfrequentcontact  calls     The number of ``missed`` calls during a particular ``day_segment`` of the most frequent contact throughout the monitored period.
-========================= ========= =============
-
-**Assumptions/Observations:**
-
-Traces for iOS calls are unique even for the same contact calling a participant more than once, which renders ``countmostfrequentcontact`` meaningless and ``distinctcontacts`` equal to the total number of traces.
-
-``TYPES`` and ``FEATURES`` keys in ``config.yaml`` need to match. For example, below the ``TYPE`` ``missed`` matches the ``FEATURES`` key ``missed``::
-
-    CALLS:
-        ...
-        TYPES: [missed]
-        FEATURES:
-            missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
-
-The Aware Android client stores call types 1=incoming, 2=outgoing, 3=missed, while the Aware iOS client stores call statuses 1=incoming, 2=connected, 3=dialing, 4=disconnected. We extract iOS call types based on call status sequences: (1,2,4)=incoming=1, (3,2,4)=outgoing=2, (1,4) or (3,4)=missed=3; a sketch of this mapping follows below. Sometimes (due to a possible bug in Aware) statuses get logged on the exact same timestamp, thus 3-item sequences can be 2,3,4 or 3,2,4. Although iOS stores the duration of the ringing/dialing stages for missed calls, we set it to 0 to match Android.
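-
-A minimal sketch of that status-sequence mapping, assuming one chronologically ordered list of statuses per call trace (a hypothetical helper, not RAPIDS' actual code)::
-
-    def ios_call_type(statuses):
-        # statuses: e.g. [1, 2, 4] for an answered incoming call
-        sequence = tuple(statuses)
-        if sequence == (1, 2, 4):
-            return 1  # incoming
-        # (2, 3, 4) covers statuses logged out of order on the same timestamp
-        if sequence in ((3, 2, 4), (2, 3, 4)):
-            return 2  # outgoing
-        if sequence in ((1, 4), (3, 4)):
-            return 3  # missed
-        return None   # unmatched/incomplete sequence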
-
-.. _bluetooth-sensor-doc:
-
-Bluetooth
-""""""""""
-
-See `Bluetooth Config Code`_
-
-**Available Epochs (day_segment) :** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Android and iOS
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/readable_datetime``
-- Rule ``rules/features.snakefile/bluetooth_features``
-
-.. _bluetooth-parameters:
-
-**Bluetooth Rule Parameters (bluetooth_features):**
-
-============ ===================
-Name         Description
-============ ===================
-day_segment  The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``.
-features     Features to be computed; see the table below.
-============ ===================
-
-.. _bluetooth-available-features:
-
-**Available Bluetooth Features**
-
-=========================== ========= =============
-Name                        Units     Description
-=========================== ========= =============
-countscans                  devices   Number of scanned devices during a ``day_segment``; a device can be detected multiple times over time and these appearances are counted separately.
-uniquedevices               devices   Number of unique devices during a ``day_segment``, as identified by their hardware address.
-countscansmostuniquedevice  scans     Number of scans of the most scanned device during a ``day_segment`` across the whole monitoring period.
-=========================== ========= =============
-
-**Assumptions/Observations:** N/A
-
-
-.. _wifi-sensor-doc:
-
-WiFi
-""""""""""
-
-See `WiFi Config Code`_
-
-**Available Epochs (day_segment) :** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Android and iOS
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/readable_datetime``
-- Rule ``rules/features.snakefile/wifi_features``
-
-.. _wifi-parameters:
-
-**WiFi Rule Parameters (wifi_features):**
-
-============ ===================
-Name         Description
-============ ===================
-day_segment  The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``.
-features     Features to be computed; see the table below.
-============ ===================
-.. _wifi-available-features:
-
-**Available WiFi Features**
-
-=========================== ========= =============
-Name                        Units     Description
-=========================== ========= =============
-countscans                  devices   Number of scanned WiFi access points during a ``day_segment``; an access point can be detected multiple times over time and each of these appearances is counted separately.
-uniquedevices               devices   Number of unique access points during a ``day_segment`` as identified by their hardware address.
-countscansmostuniquedevice  scans     Number of scans of the most scanned access point during a ``day_segment`` across the whole monitoring period.
-=========================== ========= =============
-
-**Assumptions/Observations:**
-Both phone platforms record the wifi networks a phone is connected to in ``sensor_wifi`` and the networks that are being broadcast around a phone in ``wifi``. However, iOS cannot record any broadcasting network due to API restrictions; therefore, iOS wifi data only exists in ``sensor_wifi``.
-
-
-.. _accelerometer-sensor-doc:
-
-Accelerometer
-""""""""""""""
-
-See `Accelerometer Config Code`_
-
-**Available Epochs (day_segment):** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Android and iOS
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/readable_datetime``
-- Rule ``rules/features.snakefile/accelerometer_features``
-
-.. _Accelerometer-parameters:
-
-**Accelerometer Rule Parameters (accelerometer_features):**
-
-============ ===================
-Name         Description
-============ ===================
-day_segment  The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
-features     Features to be computed, see table below
-============ ===================
-
-.. _accelerometer-available-features:
-
-**Available Accelerometer Features**
-
-====================== ============== =============
-Name                   Units          Description
-====================== ============== =============
-maxmagnitude           m/s\ :sup:`2`  The maximum magnitude of acceleration (:math:`\|acceleration\| = \sqrt{x^2 + y^2 + z^2}`).
-minmagnitude           m/s\ :sup:`2`  The minimum magnitude of acceleration.
-avgmagnitude           m/s\ :sup:`2`  The average magnitude of acceleration.
-medianmagnitude        m/s\ :sup:`2`  The median magnitude of acceleration.
-stdmagnitude           m/s\ :sup:`2`  The standard deviation of the magnitude of acceleration.
-sumduration            minutes        Total duration of all exertional or non-exertional activity episodes.
-maxduration            minutes        Longest duration of any exertional or non-exertional activity episode.
-minduration            minutes        Shortest duration of any exertional or non-exertional activity episode.
-avgduration            minutes        Average duration of all exertional or non-exertional activity episodes.
-medianduration         minutes        Median duration of all exertional or non-exertional activity episodes.
-stdduration            minutes        Standard deviation of the duration of all exertional or non-exertional activity episodes.
-====================== ============== =============
-
-**Assumptions/Observations:**
-
-Exertional activity episodes are based on this paper: Panda N, Solsky I, Huang EJ, et al. Using Smartphones to Capture Novel Recovery Metrics After Cancer Surgery. JAMA Surg. 2020;155(2):123–129. doi:10.1001/jamasurg.2019.4702
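-As a sketch of how the parameters above could be configured (key names are assumptions that mirror the ``CALLS`` example; see `Accelerometer Config Code`_ for the authoritative layout)::
-
-    ACCELEROMETER:
-        DAY_SEGMENTS: [daily]
-        FEATURES: [maxmagnitude, minmagnitude, avgmagnitude, medianmagnitude, stdmagnitude, sumduration, avgduration]
-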
-.. _applications-foreground-sensor-doc:
-
-Applications Foreground
-""""""""""""""""""""""""
-
-See `Applications Foreground Config Code`_
-
-**Available Epochs (day_segment):** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Android
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/readable_datetime``
-- Rule ``rules/preprocessing.snakefile/application_genres``
-- Rule ``rules/features.snakefile/applications_foreground_features``
-
-.. _applications-foreground-parameters:
-
-**Applications Foreground Rule Parameters (applications_foreground_features):**
-
-==================== ===================
-Name                 Description
-==================== ===================
-day_segment          The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
-single_categories    App categories to be included in the feature extraction computation. See ``APPLICATION_GENRES`` in this file to add new categories or use the catalogue we provide, and read :ref:`Assumptions and Observations <applications-foreground-observations>` for more information.
-multiple_categories  You can group multiple categories into meta categories, for example ``social: ["socialnetworks", "socialmediatools"]``.
-single_apps          Apps to be included in the feature extraction computation. Use their package name, for example, ``com.google.android.youtube``, or the reserved word ``top1global`` (the most used app by a participant over the whole monitoring study).
-excluded_categories  App categories to be excluded from the feature extraction computation. See ``APPLICATION_GENRES`` in this file to add new categories or use the catalogue we provide, and read :ref:`Assumptions and Observations <applications-foreground-observations>` for more information.
-excluded_apps        Apps to be excluded from the feature extraction computation. Use their package name, for example: ``com.google.android.youtube``
-features             Features to be computed, see table below
-==================== ===================
-
-.. _applications-foreground-available-features:
-
-**Available Applications Foreground Features**
-
-================== ========= =============
-Name               Units     Description
-================== ========= =============
-count              apps      Number of times a single app or apps within a category were used (i.e. they were brought to the foreground either by tapping their icon or switching to them from another app).
-timeoffirstuse     minutes   The time in minutes between 12:00am (midnight) and the first use of a single app or apps within a category during a ``day_segment``.
-timeoflastuse      minutes   The time in minutes between 12:00am (midnight) and the last use of a single app or apps within a category during a ``day_segment``.
-frequencyentropy   nats      The entropy of the used apps within a category during a ``day_segment`` (each app is seen as a unique event; the more apps were used, the higher the entropy). This is especially relevant when computed over all apps. Entropy cannot be obtained for a single app.
-================== ========= =============
-
-.. _applications-foreground-observations:
-
-**Assumptions/Observations:**
-
-Features can be computed by app, by apps grouped under a single category (genre), and by multiple categories grouped together (meta categories). For example, we can get features for Facebook, for Social Network apps (including Facebook and others), or for a meta category called Social formed by the Social Network and Social Media Tools categories; see the sketch below.
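-
-Putting the category parameters together, a hypothetical ``config.yaml`` excerpt could look as follows (key casing and nesting are assumptions; see `Applications Foreground Config Code`_ for the authoritative layout)::
-
-    APPLICATIONS_FOREGROUND:
-        DAY_SEGMENTS: [daily, evening]
-        SINGLE_CATEGORIES: [socialnetworks]
-        MULTIPLE_CATEGORIES:
-            social: [socialnetworks, socialmediatools]
-        SINGLE_APPS: [top1global, com.google.android.youtube]
-        EXCLUDED_CATEGORIES: []
-        EXCLUDED_APPS: []
-        FEATURES: [count, timeoffirstuse, timeoflastuse, frequencyentropy]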
-
-Apps installed by default, like YouTube, are considered system apps on some phones. We do an exact match to exclude apps where "genre" == ``EXCLUDED_CATEGORIES`` or "package_name" == ``EXCLUDED_APPS``.
-
-We provide three ways of classifying an app within a category (genre): a) by automatically scraping its official category from the Google Play Store, b) by using the catalogue created by Stachl et al., which we provide in RAPIDS (``data/external/``), or c) by manually creating a personalized catalogue.
-
-You choose strategy a, b, or c by modifying the ``APPLICATION_GENRES`` keys and values. Set ``CATALOGUE_SOURCE`` to ``FILE`` if you want to use a CSV file as catalogue (strategies b and c) or to ``GOOGLE`` if you want to scrape the genres from the Play Store (strategy a). By default ``CATALOGUE_FILE`` points to the catalogue created by Stachl et al. (strategy b), and you can change this path to your own catalogue that follows the same format (strategy c). In addition, set ``SCRAPE_MISSING_GENRES`` to true if you are using a FILE catalogue and you want to scrape any missing genres from the Play Store, and ``UPDATE_CATALOGUE_FILE`` to true if you want to save those scraped genres back into the FILE.
-
-The genre catalogue we provide was shared as part of the Supplemental Materials of Stachl, C., Au, Q., Schoedel, R., Buschek, D., Völkel, S., Schuwerk, T., … Bühner, M. (2019, June 12). Behavioral Patterns in Smartphone Usage Predict Big Five Personality Traits. https://doi.org/10.31234/osf.io/ks4vd
-
-.. _battery-sensor-doc:
-
-Battery
-"""""""""
-
-See `Battery Config Code`_
-
-**Available Epochs (day_segment):** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Android and iOS
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/readable_datetime``
-- Rule ``rules/features.snakefile/battery_deltas``
-- Rule ``rules/features.snakefile/battery_features``
-
-.. _battery-parameters:
-
-**Battery Rule Parameters (battery_features):**
-
-============ ===================
-Name         Description
-============ ===================
-day_segment  The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
-features     Features to be computed, see table below
-============ ===================
-
-.. _battery-available-features:
-
-**Available Battery Features**
-
-===================== ================= =============
-Name                  Units             Description
-===================== ================= =============
-countdischarge        episodes          Number of discharging episodes.
-sumdurationdischarge  minutes           The total duration of all discharging episodes.
-countcharge           episodes          Number of battery charging episodes.
-sumdurationcharge     minutes           The total duration of all charging episodes.
-avgconsumptionrate    episodes/minutes  The average of all episodes' consumption rates. An episode's consumption rate is defined as the ratio between its battery delta and its duration.
-maxconsumptionrate    episodes/minutes  The highest of all episodes' consumption rates. An episode's consumption rate is defined as the ratio between its battery delta and its duration.
-===================== ================= =============
-
-**Assumptions/Observations:**
-
-For the Aware iOS client V1 we swap battery status 3 to 5 and 1 to 3; client V2 does not have this problem.
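-
-A hypothetical ``config.yaml`` excerpt for this sensor (key names are assumptions mirroring the ``CALLS`` example; see `Battery Config Code`_ for the authoritative layout)::
-
-    BATTERY:
-        DAY_SEGMENTS: [daily, night]
-        FEATURES: [countdischarge, sumdurationdischarge, countcharge, sumdurationcharge, avgconsumptionrate, maxconsumptionrate]
-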
-.. _activity-recognition-sensor-doc:
-
-
-Activity Recognition
-""""""""""""""""""""""""""""
-
-See `Activity Recognition Config Code`_
-
-**Available Epochs (day_segment):** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Android and iOS
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/readable_datetime``
-- Rule ``rules/preprocessing.snakefile/unify_ios_android``
-- Rule ``rules/features.snakefile/google_activity_recognition_deltas``
-- Rule ``rules/features.snakefile/ios_activity_recognition_deltas``
-- Rule ``rules/features.snakefile/activity_features``
-
-.. _activity-recognition-parameters:
-
-**Rule Parameters (activity_features):**
-
-============ ===================
-Name         Description
-============ ===================
-day_segment  The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
-features     Features to be computed, see table below
-============ ===================
-
-.. _activity-recognition-available-features:
-
-**Available Activity Recognition Features**
-
-====================== ============== =============
-Name                   Units          Description
-====================== ============== =============
-count                  rows           Number of episodes.
-mostcommonactivity     activity_type  The most common ``activity_type``. If this feature is not unique, the first ``activity_type`` of the set of most common ``activity_types`` is selected, ordered by ``activity_type``.
-countuniqueactivities  activity_type  Number of unique ``activity_types``.
-durationstationary     minutes        The total duration of episodes of still and tilting (phone) activities.
-durationmobile         minutes        The total duration of episodes of on foot, running, and on bicycle activities.
-durationvehicle        minutes        The total duration of episodes of the in vehicle activity.
-====================== ============== =============
-
-**Assumptions/Observations:**
-
-iOS Activity Recognition data labels are unified with Google Activity Recognition labels: "automotive" to "in_vehicle", "cycling" to "on_bicycle", "walking" and "running" to "on_foot", "stationary" to "still". In addition, iOS activity pairs formed by the "stationary" and "automotive" labels (driving but stopped at a traffic light) are transformed to "automotive" only.
-
-In AWARE, Activity Recognition data for Google (Android) and iOS are stored in two different database tables; RAPIDS (via Snakemake) automatically infers what platform each participant belongs to based on their participant file (``data/external/``), which in turn takes this information from the ``aware_device`` table (see the ``optional_ar_input`` function in ``rules/features.snakefile``).
-
-The activities are mapped to ``activity_type`` codes as follows:
-
-=============== ===============
-Activity Name   Activity Type
-=============== ===============
-in_vehicle      0
-on_bicycle      1
-on_foot         2
-still           3
-unknown         4
-tilting         5
-walking         7
-running         8
-=============== ===============
-
-
-.. _light-doc:
-
-Light
-"""""""
-
-See `Light Config Code`_
-
-**Available Epochs (day_segment):** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Android
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/readable_datetime``
-- Rule ``rules/features.snakefile/light_features``
-
-.. _light-parameters:
-
-**Light Rule Parameters (light_features):**
-
-============ ===================
-Name         Description
-============ ===================
-day_segment  The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
-features     Features to be computed, see table below
-============ ===================
-
-.. _light-available-features:
-
-**Available Light Features**
-
-=========== ========= =============
-Name        Units     Description
-=========== ========= =============
-count       rows      Number of light sensor rows recorded.
-maxlux      lux       The maximum ambient luminance.
-minlux      lux       The minimum ambient luminance.
-avglux      lux       The average ambient luminance.
-medianlux   lux       The median ambient luminance.
-stdlux      lux       The standard deviation of ambient luminance.
-=========== ========= =============
-
-**Assumptions/Observations:** N/A
-
-
-.. _location-sensor-doc:
-
-Location (Barnett’s) Features
-""""""""""""""""""""""""""""""
-Barnett’s location features are based on the concept of flights and pauses. GPS coordinates are converted into a
-sequence of flights (straight-line movements) and pauses (time spent stationary). Data is imputed before features
-are computed. See Ian Barnett, Jukka-Pekka Onnela, Inferring mobility measures from GPS traces with missing data, Biostatistics, Volume 21, Issue 2, April 2020, Pages e98–e112, https://doi.org/10.1093/biostatistics/kxy059. The code for these features was made open source by Ian Barnett (https://scholar.harvard.edu/ibarnett/software/gpsmobility).
-
-See `Location (Barnett’s) Config Code`_
-
-**Available Day Segments (epochs):** only daily periods of EVERY_DAY_INTERVAL or FLEXIBLE_DAY_INTERVAL (periods that start at 00:00:00 and end at 23:59:59 on the same day)
-
-**Available Platforms:** Android and iOS
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset`` (de-duplication and sorting by timestamp)
-- Rule ``rules/preprocessing.snakefile/readable_datetime`` (add local date and time components, add local day segment)
-- Rule ``rules/preprocessing.snakefile/phone_sensed_bins`` (get the periods of time the phone was sensing data, to resample over them)
-- Rule ``rules/preprocessing.snakefile/process_location_types`` (filter gps data or resample fused location; deletes (0,0) coordinates)
-- Rule ``rules/features.snakefile/locations_r_features`` (RAPIDS executes ``barnett_location_features`` from ``src/features/location/barnett/main.R``)
-- Rule ``rules/features.snakefile/join_features_from_providers`` (joins the location features of all python and r providers)
-
-.. _location-parameters:
-
-**Location Rule Parameters (location_barnett_features):**
-
-================= ===================
-Name              Description
-================= ===================
-location_to_use   *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``GPS``, or ``RESAMPLE_FUSED``
-accuracy_limit    This is in meters; location coordinates with an accuracy higher than this are dropped. This number means there is a 68% probability the true location is within the radius specified.
-timezone          The timezone used to calculate location.
-minutes_data_used This is NOT a feature. This is just a quality control check; if set to TRUE, a new column is added to the output file with the number of minutes containing location data that were used to compute all features. The more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
-features          Features to be computed, see table below
-================= ===================
-
-.. _location-available-features:
-
-**Available Location Features**
-
-Description taken from `Beiwe Summary Statistics`_.
-
-================ ========= =============
-Name             Units     Description
-================ ========= =============
-hometime         minutes   Time spent at home in minutes. Home is the most visited significant location between 8 pm and 8 am, including any pauses within a 200-meter radius.
-disttravelled    meters    Total distance travelled over a day (flights).
-rog              meters    The Radius of Gyration (rog) is a measure in meters of the area covered by a person over a day. A centroid is calculated for all the places (pauses) visited during a day, and a weighted distance between all the places and that centroid is computed. The weights are proportional to the time spent in each place.
-maxdiam          meters    The maximum diameter is the largest distance between any two pauses.
-maxhomedist      meters    The maximum distance from home in meters.
-siglocsvisited   locations The number of significant locations visited during the day. Significant locations are computed using k-means clustering over pauses found in the whole monitoring period. The number of clusters is found by iterating k from 1 to 200, stopping when the centroids of two significant locations are within 400 meters of one another.
-avgflightlen     meters    Mean length of all flights.
-stdflightlen     meters    Standard deviation of the length of all flights.
-avgflightdur     seconds   Mean duration of all flights.
-stdflightdur     seconds   The standard deviation of the duration of all flights.
-probpause                  The fraction of a day spent in a pause (as opposed to a flight).
-siglocentropy    nats      Shannon’s entropy measurement based on the proportion of time spent at each significant location visited during a day.
-circdnrtn                  A continuous metric quantifying a person’s circadian routine that can take any value between 0 and 1, where 0 represents a daily routine completely different from any other sensed days and 1 a routine the same as every other sensed day.
-wkenddayrtn                Same as circdnrtn but computed separately for weekends and weekdays.
-================ ========= =============
-
-**Assumptions/Observations:**
-
-*Types of location data to use*
-
-Aware Android and iOS clients can collect location coordinates through the phone's GPS, the network cellular towers around the phone, or Google's fused location API. If you want to use only the GPS provider, set ``location_to_use`` to ``GPS``; if you want to use all providers (not recommended due to the difference in accuracy), set it to ``ALL``; if your Aware client was configured to use fused location only or you want to focus only on this provider, set it to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data, as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days `); this is done because Google's API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one.
-
-There are two parameters associated with resampling fused location in the ``LOCATIONS`` section of the ``config.yaml`` file.
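-As a sketch only (the nesting under ``LOCATIONS`` is an assumption; see `Location (Barnett’s) Config Code`_ for the authoritative layout), these two parameters could appear as::
-
-    LOCATIONS:
-        RESAMPLE_FUSED_CONSECUTIVE_THRESHOLD: 30          # minutes
-        RESAMPLE_FUSED_TIME_SINCE_VALID_LOCATION: 720     # minutes (12 hours)
-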
-``RESAMPLE_FUSED_CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs across which the last known pair is replicated. For example, if participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair is replicated during the first period but not the second; in other words, we assume that we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes. ``RESAMPLE_FUSED_TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) ensures that the last known fused location is not carried over longer than this threshold even if the phone was sensing data continuously (for example, if participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location is only replicated until 9am). If you have suggestions to modify or improve this imputation, let us know.
-
-*Barnett et al.'s features*
-
-These features are based on a Pause-Flight model. A pause is defined as a mobility trace (location pings) within a certain duration and distance (by default 300 seconds and 60 meters). A flight is any mobility trace between two pauses. Data is resampled and imputed before the features are computed. See this paper for more information: https://doi.org/10.1093/biostatistics/kxy059.
-
-In RAPIDS we only expose two parameters for these features (timezone and accuracy). If you wish to change others, you can do so in ``src/features/location/barnett/library/MobilityFeatures.R``
-
-*Significant Locations*
-
-Significant locations are determined using K-means clustering on pauses longer than 10 minutes. The number of clusters (K) is increased until no two clusters are within 400 meters of each other. After this, pauses within a certain range of a cluster (200 meters by default) count as a visit to that significant location. This description was adapted from the Supplementary Materials of https://doi.org/10.1093/biostatistics/kxy059.
-
-
-*The Circadian Calculation*
-
-For a detailed description of how this is calculated, see Canzian, L., & Musolesi, M. (2015, September). Trajectories of depression: unobtrusive monitoring of depressive states by means of smartphone mobility traces analysis. In Proceedings of the 2015 ACM international joint conference on pervasive and ubiquitous computing (pp. 1293-1304). Their procedure was followed using 30-min increments as a bin size. Taken from `Beiwe Summary Statistics`_.
-
-
-Location (Doryab's) Features
-""""""""""""""""""""""""""""""
-Doryab's location features are based on this paper: Doryab, A., Chikarsel, P., Liu, X., & Dey, A. K. (2019). Extraction of Behavioral Features from Smartphone and Wearable Data. ArXiv:1812.10394 [Cs, Stat]. http://arxiv.org/abs/1812.10394
-
-See `Location (Doryab's) Config Code`_
-
-**Available Day Segments (epochs):** any of EVERY_DAY_FREQUENCY, EVERY_DAY_INTERVAL and FLEXIBLE_DAY_INTERVAL
-
-**Available Platforms:** Android and iOS
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset`` (de-duplication and sorting by timestamp)
-- Rule ``rules/preprocessing.snakefile/readable_datetime`` (add local date and time components, add local day segment)
-- Rule ``rules/preprocessing.snakefile/phone_sensed_bins`` (get the periods of time the phone was sensing data, to resample over them)
-- Rule ``rules/preprocessing.snakefile/process_location_types`` (filter gps data or resample fused location; deletes (0,0) coordinates)
-- Rule ``rules/features.snakefile/locations_python_features`` (RAPIDS executes ``doryab_location_features`` from ``src/features/location/doryab/main.py``)
-- Rule ``rules/features.snakefile/join_features_from_providers`` (joins the location features of all python and r providers)
-
-.. _location-doryab-parameters:
-
-**Location Rule Parameters (location_doryab_features):**
-
-=================== ===================
-Name                Description
-=================== ===================
-day_segment         The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
-location_to_use     *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``GPS``, or ``RESAMPLE_FUSED``.
-features            Features to be computed, see table below.
-threshold_static    The threshold value in km/hr used to label a row as Static or Moving.
-dbscan_minsamples   The number of samples (or total weight) in a neighborhood for a point to be considered a core point. This includes the point itself.
-dbscan_eps          The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
-maximum_gap_allowed The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the phone was not sensing.
-minutes_data_used   This is NOT a feature. This is just a quality control check; if set to TRUE, a new column is added to the output file with the number of minutes containing location data that were used to compute all features. The more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
-sampling_frequency  Expected time difference between any two location rows in minutes. If set to ``0``, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps. This parameter impacts all the time calculations.
-=================== ===================
-
-.. _location-doryab-available-features:
-
-**Available Location Features**
-
-============================ ================ =============
-Name                         Units            Description
-============================ ================ =============
-locationvariance             :math:`meters^2` The sum of the variances of the latitude and longitude columns.
-loglocationvariance                           Log of the sum of the variances of the latitude and longitude columns.
-totaldistance                meters           Total distance travelled in a ``day_segment`` using the haversine formula.
-averagespeed                 km/hr            Average speed in a ``day_segment`` considering only the instances labeled as Moving.
-varspeed                     km/hr            Speed variance in a ``day_segment`` considering only the instances labeled as Moving.
-circadianmovement                             "It encodes the extent to which a person’s location patterns follow a 24-hour circadian cycle." (Doryab et al., 2019)
-numberofsignificantplaces    places           Number of significant locations visited. It is calculated using the DBSCAN clustering algorithm, which takes in EPS and MIN_SAMPLES as parameters to identify clusters. Each cluster is a significant place.
-numberlocationtransitions    transitions      Number of movements between any two clusters in a ``day_segment``.
-radiusgyration               meters           Quantifies the area covered by a participant.
-timeattop1location           minutes          Time spent at the most significant location.
-timeattop2location           minutes          Time spent at the 2nd most significant location.
-timeattop3location           minutes          Time spent at the 3rd most significant location.
-movingtostaticratio                           Ratio between the number of rows labeled Moving and the number labeled Static.
-outlierstimepercent                           The number of rows that belong to non-significant clusters divided by the total number of rows in a ``day_segment``.
-maxlengthstayatclusters      minutes          Maximum time spent in a cluster (significant location).
-minlengthstayatclusters      minutes          Minimum time spent in a cluster (significant location).
-meanlengthstayatclusters     minutes          Average time spent in a cluster (significant location).
-stdlengthstayatclusters      minutes          Standard deviation of time spent in a cluster (significant location).
-locationentropy              nats             Shannon entropy computed over the row count of each cluster (significant location); it will be higher the more rows belong to a cluster (i.e. the more time a participant spent at a significant location).
-normalizedlocationentropy    nats             Shannon entropy computed over the row count of each cluster (significant location) divided by the number of clusters; it will be higher the more rows belong to a cluster (i.e. the more time a participant spent at a significant location).
-============================ ================ =============
-
-**Assumptions/Observations:**
-
-*Types of location data to use*
-
-Aware Android and iOS clients can collect location coordinates through the phone's GPS, the network cellular towers around the phone, or Google's fused location API. If you want to use only the GPS provider, set ``location_to_use`` to ``GPS``; if you want to use all providers (not recommended due to the difference in accuracy), set it to ``ALL``; if your Aware client was configured to use fused location only or you want to focus only on this provider, set it to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data, as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days `); this is done because Google's API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one.
-
-There are two parameters associated with resampling fused location in the ``LOCATIONS`` section of the ``config.yaml`` file. ``RESAMPLE_FUSED_CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs across which the last known pair is replicated. For example, if participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair is replicated during the first period but not the second; in other words, we assume that we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes. ``RESAMPLE_FUSED_TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) ensures that the last known fused location is not carried over longer than this threshold even if the phone was sensing data continuously (for example, if participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location is only replicated until 9am). If you have suggestions to modify or improve this imputation, let us know.
-
-*Significant Locations Identified*
-
-Significant locations are determined using DBSCAN clustering on the locations that a participant visits over the course of the period of data collection.
-
-*Circadian Movement Calculation*
-
-"Circadian movement (Saeb et al. 2015) is calculated using the Lomb-Scargle method" (Doryab et al., 2019).
-
-.. _screen-sensor-doc:
-
-Screen
-""""""""
-
-See `Screen Config Code`_
-
-**Available Epochs (day_segment):** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Android and iOS
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/readable_datetime``
-- Rule ``rules/preprocessing.snakefile/unify_ios_android``
-- Rule ``rules/features.snakefile/screen_deltas``
-- Rule ``rules/features.snakefile/screen_features``
-
-.. _screen-parameters:
-
-**Screen Rule Parameters (screen_features):**
-
-============================ ===================
-Name                         Description
-============================ ===================
-day_segment                  The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
-reference_hour_first_use     The reference point from which ``firstuseafter`` is computed; the default is midnight
-ignore_episodes_shorter_than Ignore episodes that are shorter than this threshold (minutes). Set to 0 to disable this filter.
-ignore_episodes_longer_than  Ignore episodes that are longer than this threshold (minutes). Set to 0 to disable this filter.
-features_deltas              Features to be computed, see table below
-episode_types                Currently we only support unlock episodes (from when the phone is unlocked until the screen is off)
-============================ ===================
-
-.. _screen-episodes-available-features:
-
-**Available Screen Episodes Features**
-
-========================= ================= =============
-Name                      Units             Description
-========================= ================= =============
-sumduration               minutes           Total duration of all unlock episodes.
-maxduration               minutes           Longest duration of any unlock episode.
-minduration               minutes           Shortest duration of any unlock episode.
-avgduration               minutes           Average duration of all unlock episodes.
-stdduration               minutes           Standard deviation of the duration of all unlock episodes.
-countepisode              episodes          Number of all unlock episodes.
-episodepersensedminutes   episodes/minute   The total number of episodes in an epoch divided by the total time (in minutes) the phone was sensing data.
-firstuseafter             minutes           Minutes until the first unlock episode (counted from ``reference_hour_first_use``).
-========================= ================= =============
-
-**Assumptions/Observations:**
-
-In Android, ``lock`` events can happen right after an ``off`` event, a few seconds after an ``off`` event, or never happen, depending on the phone's settings; therefore, an ``unlock`` episode is defined as the time between an ``unlock`` and an ``off`` event. In iOS, ``on`` and ``off`` events do not exist, so an ``unlock`` episode is defined as the time between an ``unlock`` and a ``lock`` event.
-
-Events in iOS are recorded reliably, albeit with some duplicated ``lock`` events within milliseconds of each other, so we only keep consecutive unlock/lock pairs. In Android you can find multiple consecutive ``unlock`` or ``lock`` events, so we only keep consecutive unlock/off pairs. In our experiments these cases are less than 10% of the screen events collected, and this happens because ``ACTION_SCREEN_OFF`` and ``ACTION_SCREEN_ON`` are "sent when the device becomes non-interactive which may have nothing to do with the screen turning off". In addition to unlock/off episodes, in Android it is possible to measure the time spent on the lock screen before an ``unlock`` event as well as the total screen time (i.e. ``ON`` to ``OFF``), but these are not implemented at the moment.
-
-To unify the screen processing and use the same code in RAPIDS, we replace LOCKED episodes with OFF episodes (2 with 0) in iOS. However, as mentioned above, this still computes ``unlock`` to ``lock`` episodes.
-
-.. _conversation-sensor-doc:
-
-Conversation
-""""""""""""""
-
-See `Conversation Config Code`_
-
-**Available Epochs (day_segment):** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Android and iOS
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/readable_datetime``
-- Rule ``rules/features.snakefile/conversation_features``
-
-.. _conversation-parameters:
-
-**Conversation Rule Parameters (conversation_features):**
-
-========================= ===================
-Name                      Description
-========================= ===================
-day_segment               The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
-recordingMinutes          Minutes the plugin was recording audio (default 1 min)
-pausedMinutes             Minutes the plugin was NOT recording audio (default 3 min)
-features                  Features to be computed, see table below
-========================= ===================
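-
-A hypothetical ``config.yaml`` excerpt for this sensor (key names and casing are assumptions based on the parameter table above; see `Conversation Config Code`_ for the authoritative layout)::
-
-    CONVERSATION:
-        DAY_SEGMENTS: [daily]
-        RECORDINGMINUTES: 1
-        PAUSEDMINUTES: 3
-        FEATURES: [minutessilence, minutesnoise, minutesvoice, sumconversationduration]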
-
-.. _conversation-available-features:
-
-**Available Conversation Features**
-
-========================= ================= =============
-Name                      Units             Description
-========================= ================= =============
-minutessilence            minutes           Minutes labeled as silence.
-minutesnoise              minutes           Minutes labeled as noise.
-minutesvoice              minutes           Minutes labeled as voice.
-minutesunknown            minutes           Minutes labeled as unknown.
-sumconversationduration   minutes           Total duration of all conversations.
-maxconversationduration   minutes           Longest duration of all conversations.
-minconversationduration   minutes           Shortest duration of all conversations.
-avgconversationduration   minutes           Average duration of all conversations.
-sdconversationduration    minutes           Standard deviation of the duration of all conversations.
-timefirstconversation     minutes           Minutes since midnight when the first conversation for a day segment was detected.
-timelastconversation      minutes           Minutes since midnight when the last conversation for a day segment was detected.
-noisesumenergy            L2-norm           Sum of all energy values when the inference is noise.
-noiseavgenergy            L2-norm           Average of all energy values when the inference is noise.
-noisesdenergy             L2-norm           Standard deviation of all energy values when the inference is noise.
-noiseminenergy            L2-norm           Minimum of all energy values when the inference is noise.
-noisemaxenergy            L2-norm           Maximum of all energy values when the inference is noise.
-voicesumenergy            L2-norm           Sum of all energy values when the inference is voice.
-voiceavgenergy            L2-norm           Average of all energy values when the inference is voice.
-voicesdenergy             L2-norm           Standard deviation of all energy values when the inference is voice.
-voiceminenergy            L2-norm           Minimum of all energy values when the inference is voice.
-voicemaxenergy            L2-norm           Maximum of all energy values when the inference is voice.
-silencesensedfraction                       Ratio between minutessilence and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown).
-noisesensedfraction                         Ratio between minutesnoise and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown).
-voicesensedfraction                         Ratio between minutesvoice and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown).
-unknownsensedfraction                       Ratio between minutesunknown and the sum of (minutessilence, minutesnoise, minutesvoice, minutesunknown).
-silenceexpectedfraction                     Ratio between minutessilence and the number of minutes that in theory should have been sensed based on the record and pause cycle of the plugin (1440 / (recordingMinutes + pausedMinutes)).
-noiseexpectedfraction                       Ratio between minutesnoise and the number of minutes that in theory should have been sensed based on the record and pause cycle of the plugin (1440 / (recordingMinutes + pausedMinutes)).
-voiceexpectedfraction                       Ratio between minutesvoice and the number of minutes that in theory should have been sensed based on the record and pause cycle of the plugin (1440 / (recordingMinutes + pausedMinutes)).
-unknownexpectedfraction                     Ratio between minutesunknown and the number of minutes that in theory should have been sensed based on the record and pause cycle of the plugin (1440 / (recordingMinutes + pausedMinutes)).
-========================= ================= =============
-
-**Assumptions/Observations:**
-With the default cycle (1 recording minute followed by 3 paused minutes), the expected-fraction denominator is 1440 / (1 + 3) = 360 sensed minutes per day.
-
-.. ------------------------------- Begin Fitbit Section ----------------------------------- ..
-
-.. _fitbit-sleep-sensor-doc:
-
-Fitbit: Sleep
-"""""""""""""""""""
-
-See `Fitbit: Sleep Config Code`_
-
-**Available Epochs (day_segment):** daily
-
-**Available Platforms:** Fitbit
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/fitbit_with_datetime``
-- Rule ``rules/features.snakefile/fitbit_sleep_features``
-
-.. _fitbit-sleep-parameters:
-
-**Fitbit: Sleep Rule Parameters (fitbit_sleep_features):**
-
-================================== ===================
-Name                               Description
-================================== ===================
-day_segment                        The particular ``day_segment`` that will be analyzed. For this sensor only ``daily`` is used.
-sleep_types                        The types of sleep provided by Fitbit: ``main``, ``nap``, ``all``.
-daily_features_from_summary_data   The sleep features that can be computed based on Fitbit's summary data. See :ref:`Available Fitbit: Sleep Features <fitbit-sleep-available-features>` Table below
-================================== ===================
-
-.. _fitbit-sleep-available-features:
-
-**Available Fitbit: Sleep Features**
-
-======================== =========== =============
-Name                     Units       Description
-======================== =========== =============
-sumdurationtofallasleep  minutes     Time it took the user to fall asleep for ``sleep_type`` during ``day_segment``.
-sumdurationawake         minutes     Time the user was awake but still in bed for ``sleep_type`` during ``day_segment``.
-sumdurationasleep        minutes     Sleep duration for ``sleep_type`` during ``day_segment``.
-sumdurationafterwakeup   minutes     Time the user stayed in bed after waking up for ``sleep_type`` during ``day_segment``.
-sumdurationinbed         minutes     Total time the user stayed in bed (sumdurationtofallasleep + sumdurationawake + sumdurationasleep + sumdurationafterwakeup) for ``sleep_type`` during ``day_segment``.
-avgefficiency            scores      Sleep efficiency average for ``sleep_type`` during ``day_segment``.
-countepisode             episodes    Number of sleep episodes for ``sleep_type`` during ``day_segment``.
-======================== =========== =============
-
-**Assumptions/Observations:**
-
-Only features from summary data are available at the moment.
-
-The ``fitbit_with_datetime`` rule will extract Summary data (``fitbit_sleep_summary_with_datetime.csv``) and Intraday data (``fitbit_sleep_intraday_with_datetime.csv``). There are two versions of Fitbit's sleep API (`version 1`_ and `version 1.2`_), and each provides raw sleep data in a different format:
-
- - Sleep level. In ``v1``, sleep level is an integer with three possible values (1, 2, 3), while in ``v1.2`` it is a string. We convert the integer levels to the strings ``asleep``, ``restless``, and ``awake``, respectively.
- - Count summaries. For Summary data, ``v1`` contains ``count_awake``, ``duration_awake``, ``count_awakenings``, ``count_restless``, and ``duration_restless`` fields for every sleep record, while ``v1.2`` does not.
- - Types of sleep records. ``v1.2`` has two types of sleep records: ``classic`` and ``stages``. The ``classic`` type contains three sleep levels: ``awake``, ``restless``, and ``asleep``. The ``stages`` type contains four sleep levels: ``wake``, ``deep``, ``light``, and ``rem``. Sleep records from ``v1`` have the same sleep levels as the ``v1.2`` ``classic`` type; therefore, we set their type to ``classic``.
- - Unified level of sleep. For intraday data, we unify the sleep levels of each sleep record with a column named ``unified_level``. Based on `this Fitbit forum post`_, we merge levels into two categories:
-
-   - For the ``classic`` type, ``unified_level`` is one of {0, 1}, where 0 means awake and groups ``awake`` + ``restless``, while 1 means asleep and groups ``asleep``.
-   - For the ``stages`` type, ``unified_level`` is one of {0, 1}, where 0 means awake and groups ``wake``, while 1 means asleep and groups ``deep`` + ``light`` + ``rem``.
-
- - Short Data. In ``v1.2``, records of type ``stages`` contain ``shortData`` in addition to ``data``. We merge both to extract intraday data.
-
-   - ``data`` contains sleep stages and any wake periods > 3 minutes (180 seconds).
-   - ``shortData`` contains short wake periods representing physiological awakenings that are <= 3 minutes (180 seconds).
-
- - The following columns of Summary data are not computed by RAPIDS but taken directly from columns with a similar name provided by Fitbit's API: ``efficiency``, ``minutes_after_wakeup``, ``minutes_asleep``, ``minutes_awake``, ``minutes_to_fall_asleep``, ``minutes_in_bed``, ``is_main_sleep``, and ``type``
- - The following columns of Intraday data are not computed by RAPIDS but taken directly from columns with a similar name provided by Fitbit's API: ``original_level``, ``is_main_sleep``, and ``type``. We compute ``unified_level`` as explained above.
-
-These are examples of intraday and summary data:
-
-- Intraday data (at 30-second intervals for the ``stages`` type or 60-second intervals for the ``classic`` type)
-
-========= ============== ============= ============= ====== =================== ========== =========== ========= ================= ========== ========== ============ =================
-device_id original_level unified_level is_main_sleep type   local_date_time     local_date local_month local_day local_day_of_week local_time local_hour local_minute local_day_segment
-========= ============== ============= ============= ====== =================== ========== =========== ========= ================= ========== ========== ============ =================
-did       wake           0             1             stages 2020-05-20 22:13:30 2020-05-20 5           20        2                 22:13:30   22         13           evening
-did       wake           0             1             stages 2020-05-20 22:14:00 2020-05-20 5           20        2                 22:14:00   22         14           evening
-did       light          1             1             stages 2020-05-20 22:14:30 2020-05-20 5           20        2                 22:14:30   22         14           evening
-did       light          1             1             stages 2020-05-20 22:15:00 2020-05-20 5           20        2                 22:15:00   22         15           evening
-did       light          1             1             stages 2020-05-20 22:15:30 2020-05-20 5           20        2                 22:15:30   22         15           evening
-========= ============== ============= ============= ====== =================== ========== =========== ========= ================= ========== ========== ============ =================
-
-- Summary data
-
-========= ========== ==================== ============== ============= ====================== ============== ============= ====== ===================== =================== ================ ============== ======================= =====================
-device_id efficiency minutes_after_wakeup minutes_asleep minutes_awake minutes_to_fall_asleep minutes_in_bed is_main_sleep type   local_start_date_time local_end_date_time local_start_date local_end_date local_start_day_segment local_end_day_segment
-========= ========== ==================== ============== ============= ====================== ============== ============= ====== ===================== =================== ================ ============== ======================= =====================
-did       90         0                    381            54            0                      435            1             stages 2020-05-20 22:12:00   2020-05-21 05:27:00 2020-05-20       2020-05-21     evening                 night
-did       88         0                    498            86            0                      584            1             stages 2020-05-22 22:03:00   2020-05-23 07:47:03 2020-05-22       2020-05-23     evening                 morning
-========= ========== ==================== ============== ============= ====================== ============== ============= ====== ===================== =================== ================ ============== ======================= =====================
-
-
-.. _fitbit-heart-rate-sensor-doc:
-
-Fitbit: Heart Rate
-"""""""""""""""""""
-
-See `Fitbit: Heart Rate Config Code`_
-
-**Available Epochs (day_segment):** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Fitbit
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/fitbit_with_datetime``
-- Rule ``rules/features.snakefile/fitbit_heartrate_features``
-
-.. _fitbit-heart-rate-parameters:
-
-**Fitbit: Heart Rate Rule Parameters (fitbit_heartrate_features):**
-
-============ ===================
-Name         Description
-============ ===================
-day_segment  The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
-features     The heartrate features that can be computed. See :ref:`Available Fitbit: Heart Rate Features <fitbit-heart-rate-available-features>` Table below
-============ ===================
-
-.. _fitbit-heart-rate-available-features:
-
-**Available Fitbit: Heart Rate Features**
-
-================== =========== =============
-Name               Units       Description
-================== =========== =============
-restingheartrate   beats/min   The number of times the participant's heart beats per minute when they are still and well rested, for the ``daily`` epoch.
-calories           cals        Calories burned during each ``heartrate_zone`` for the ``daily`` epoch.
-maxhr              beats/min   The maximum heart rate during the ``day_segment`` epoch.
-minhr              beats/min   The minimum heart rate during the ``day_segment`` epoch.
-avghr              beats/min   The average heart rate during the ``day_segment`` epoch.
-medianhr           beats/min   The median heart rate during the ``day_segment`` epoch.
-modehr             beats/min   The mode of the heart rate during the ``day_segment`` epoch.
-stdhr              beats/min   The standard deviation of the heart rate during the ``day_segment`` epoch.
-diffmaxmodehr      beats/min   The difference between the maximum and mode heart rate during the ``day_segment`` epoch.
-diffminmodehr      beats/min   The difference between the mode and minimum heart rate during the ``day_segment`` epoch.
-entropyhr          nats        Shannon’s entropy measurement based on heart rate during the ``day_segment`` epoch.
-minutesonZONE      minutes     Number of minutes the user's heartrate fell within each ``heartrate_zone`` during the ``day_segment`` epoch.
-================== =========== =============
-
-**Assumptions/Observations:**
-
-There are four heart rate zones: ``out_of_range``, ``fat_burn``, ``cardio``, and ``peak``. Please refer to the `Fitbit documentation`_ for more information about the way they are computed.
-
-The accuracy of calories depends on the user’s Fitbit profile (weight, height, etc.).
-
-
-.. _fitbit-steps-sensor-doc:
-
-Fitbit: Steps
-"""""""""""""""
-
-See `Fitbit: Steps Config Code`_
-
-**Available Epochs (day_segment):** daily, morning, afternoon, evening, night
-
-**Available Platforms:** Fitbit
-
-**Snakemake rule chain:**
-
-- Rule ``rules/preprocessing.snakefile/download_dataset``
-- Rule ``rules/preprocessing.snakefile/fitbit_with_datetime``
-- Rule ``rules/features.snakefile/fitbit_step_features``
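-
-A hypothetical ``config.yaml`` excerpt combining the parameters described next (the ``STEP`` key name, casing, and nesting are assumptions; see `Fitbit: Steps Config Code`_ for the authoritative layout)::
-
-    STEP:
-        DAY_SEGMENTS: [daily]
-        FEATURES: [sumallsteps, maxallsteps, countepisodesedentarybout, countepisodeactivebout]
-        THRESHOLD_ACTIVE_BOUT: 10     # steps per minute
-        INCLUDE_ZERO_STEP_ROWS: True
-        EXCLUDE_SLEEP: False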
-
-.. _fitbit-steps-parameters:
-
-**Fitbit: Steps Rule Parameters (fitbit_step_features):**
-
-========================== ===================
-Name                       Description
-========================== ===================
-day_segment                The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
-features                   The features that can be computed. See :ref:`Available Fitbit: Steps Features <fitbit-steps-available-features>` Table below
-threshold_active_bout      Every minute with Fitbit step data will be labelled as ``sedentary`` if its step count is below this threshold, otherwise as ``active``.
-include_zero_step_rows     Whether or not to include day segments with a 0 step count.
-exclude_sleep              Whether or not to exclude step rows that happen during sleep.
-exclude_sleep_type         If ``exclude_sleep`` is True, then you can choose between ``FIXED`` or ``FITBIT_BASED``. ``FIXED`` will exclude all step rows that happen between a start and end time (see below). ``FITBIT_BASED`` will exclude step rows that happen during main sleep segments as measured by the Fitbit device (``config[SLEEP][DB_TABLE]`` should be a valid table in your database; it usually is the same table that contains your STEP data)
-exclude_sleep_fixed_start  Start time of the fixed sleep period to exclude. Only relevant if ``exclude_sleep`` is True and ``exclude_sleep_type`` is ``FIXED``
-exclude_sleep_fixed_end    End time of the fixed sleep period to exclude. Only relevant if ``exclude_sleep`` is True and ``exclude_sleep_type`` is ``FIXED``
-========================== ===================
-
-.. _fitbit-steps-available-features:
-
-**Available Fitbit: Steps Features**
-
-========================== ========= =============
-Name                       Units     Description
-========================== ========= =============
-sumallsteps                steps     The total step count during the ``day_segment`` epoch.
-maxallsteps                steps     The maximum step count during the ``day_segment`` epoch.
-minallsteps                steps     The minimum step count during the ``day_segment`` epoch.
-avgallsteps                steps     The average step count during the ``day_segment`` epoch.
-stdallsteps                steps     The standard deviation of the step count during the ``day_segment`` epoch.
-countepisodesedentarybout  bouts     Number of sedentary bouts during the ``day_segment`` epoch.
-sumdurationsedentarybout   minutes   Total duration of all sedentary bouts during the ``day_segment`` epoch.
-maxdurationsedentarybout   minutes   The maximum duration of any sedentary bout during the ``day_segment`` epoch.
-mindurationsedentarybout   minutes   The minimum duration of any sedentary bout during the ``day_segment`` epoch.
-avgdurationsedentarybout   minutes   The average duration of sedentary bouts during the ``day_segment`` epoch.
-stddurationsedentarybout   minutes   The standard deviation of the duration of sedentary bouts during the ``day_segment`` epoch.
-countepisodeactivebout     bouts     Number of active bouts during the ``day_segment`` epoch.
-sumdurationactivebout      minutes   Total duration of all active bouts during the ``day_segment`` epoch.
-maxdurationactivebout      minutes   The maximum duration of any active bout during the ``day_segment`` epoch.
-mindurationactivebout      minutes   The minimum duration of any active bout during the ``day_segment`` epoch.
-avgdurationactivebout      minutes   The average duration of active bouts during the ``day_segment`` epoch.
-stddurationactivebout      minutes   The standard deviation of the duration of active bouts during the ``day_segment`` epoch.
-========================== ========= =============
-
-**Assumptions/Observations:**
-
-Active and sedentary bouts: if the step count per minute is smaller than ``THRESHOLD_ACTIVE_BOUT`` (the default value is 10), that minute is labelled as ``sedentary``; otherwise, it is labelled as ``active``. Active and sedentary bouts are periods of consecutive minutes labelled as ``active`` or ``sedentary``.
-
-The ``validsensedminutes`` feature is not available for the Step sensor because we cannot determine the valid minutes based on the raw Fitbit step data.
-
-
-.. -------------------------Links ------------------------------------ ..
-
-.. _PHONE_VALID_SENSED_BINS: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L30
-.. _`Messages Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L43
-.. _AWARE: https://awareframework.com/what-is-aware/
-.. _`List of Timezones`: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
-.. _DAY_SEGMENTS: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L6
-.. _PHONE_VALID_SENSED_DAYS: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L37
-.. _`Call Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L53
-.. _`WiFi Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L172
-.. _`Bluetooth Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L84
-.. _`Accelerometer Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L118
-.. _`Applications Foreground Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L128
-.. _`Battery Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L98
-.. _`Activity Recognition Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L90
-.. _`Light Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L112
-.. _`Location (Barnett’s) Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L74
-.. _`Location (Doryab's) Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L74
-.. _`Screen Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L104
-.. _`Fitbit: Sleep Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L165
-.. _`version 1`: https://dev.fitbit.com/build/reference/web-api/sleep-v1/
-.. _`version 1.2`: https://dev.fitbit.com/build/reference/web-api/sleep/
-.. _`Conversation Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L191
-.. _`this Fitbit forum post`: https://community.fitbit.com/t5/Alta/What-does-Restless-mean-in-sleep-tracking/td-p/2989011
-.. _shortData: https://dev.fitbit.com/build/reference/web-api/sleep/#interpreting-the-sleep-stage-and-short-data
-.. _`Fitbit: Heart Rate Config Code`: https://github.com/carissalow/rapids/blob/4bdc30ffa4e13987b398a2354746d1a1977bef27/config.yaml#L141
-.. _`Fitbit: Steps Config Code`: https://github.com/carissalow/rapids/blob/29b04b0601b62379fbdb76de685f3328b8dde2a2/config.yaml#L148
-.. _top1global: https://github.com/carissalow/rapids/blob/765bb462636d5029a05f54d4c558487e3786b90b/config.yaml#L136
-.. _`Beiwe Summary Statistics`: http://wiki.beiwe.org/wiki/Summary_Statistics
-.. _`Pause-Flight Model`: https://academic.oup.com/biostatistics/advance-article/doi/10.1093/biostatistics/kxy059/5145908
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000..5c200d6a
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,40 @@
+# Welcome to RAPIDS documentation
+
+Reproducible Analysis Pipeline for Data Streams (RAPIDS) allows you to process smartphone and wearable data to extract **behavioral features** (a.k.a. digital biomarkers/phenotypes).
+
+RAPIDS is open source, documented, modular, tested, and reproducible. At the moment we support smartphone data collected with [AWARE](https://awareframework.com/) and wearable data from Fitbit devices.
+
+:material-slack: Questions or feedback can be posted on \#rapids in AWARE Framework\'s [slack](http://awareframework.com:3000/).
+
+:material-github: Bugs should be reported on [Github](https://github.com/carissalow/rapids/issues).
+
+:fontawesome-solid-tasks: Join our discussions on our algorithms and assumptions for feature [processing](https://github.com/carissalow/rapids/issues?q=is%3Aissue+is%3Aopen+label%3Adiscussion).
+
+## How does it work?
+
+RAPIDS is formed by R and Python scripts orchestrated by [Snakemake](https://snakemake.readthedocs.io/en/stable/). We suggest you read Snakemake's docs, but in short: every link in the analysis chain is atomic and has files as input and output. Behavioral features are processed per sensor and per participant.
+
+## What are the benefits of using RAPIDS?
+
+1. **Consistent analysis**. Every participant's sensor dataset is analyzed in exactly the same way and in isolation from the others.
+2. **Efficient analysis**. Every analysis step is executed only once. Whenever your data or configuration changes, only the affected files are updated.
+3. **Timezone aware**. Your data is adjusted to the specified timezone (multiple timezone support *coming soon*).
+4. **Flexible day segments**. You can extract behavioral features on time windows of any length (e.g. 5 minutes, 3 hours, 2 days), on every day or on particular days (e.g. weekends, Mondays, the 1st of each month, etc.), or around events of interest (e.g. surveys or clinical relapses).
+5. **Parallel execution**. Thanks to Snakemake, your analysis can be executed over multiple cores without changing your code.
+6. **Extensible code**. You can easily add your own behavioral features in R or Python and keep authorship and citations.
+7. **Tested code**. We are constantly adding tests to make sure our behavioral features are correct.
+8. **Reproducible code**. You can be sure your code will run on other computers as intended thanks to R and Python virtual environments. You can share your analysis code alongside your publications without any overhead.
+9. **Private**. All your data is processed locally.
+
+
+
+## How is it organized?
+
+The `config.yaml` file is the only file that you will have to modify. It includes parameters to manage participants, data sources, sensor data, visualizations, and more.
+
+All data is saved in `data/`. 
The `data/external/` folder stores any data imported by the user, `data/raw/` stores sensor data as imported from your database, `data/interim/` has intermediate files necessary to compute behavioral features from raw data, and `data/processed/` has all the final files with the behavioral features per sensor and participant. + +All the source code is saved in `src/`. The `src/data/` folder stores scripts to download, clean and pre-process sensor data, `src/features` has scripts to extract behavioral features organized in their respective subfolders , `src/models/` can host any script to create models or statistical analyses with the behavioral features you extract, and `src/visualization/` has scripts to create plots of the raw and processed data. + +There are other important files and folders but only relevant if you are interested in extending RAPIDS (e.g. virtual env files, docs, tests, Dockerfile, the Snakefile, etc.). \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 9eb010f7..00000000 --- a/docs/index.rst +++ /dev/null @@ -1,50 +0,0 @@ -.. moshi-aware documentation master file, created by - sphinx-quickstart. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -RAPIDS -====== - -**R**\ eproducible **A**\ nalysis **Pi**\ peline for **D**\ ata **S**\ treams - -Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_ - -Contents: - -.. toctree:: - :maxdepth: 2 - :caption: Getting Started - - usage/introduction - usage/installation - usage/quick_rule - usage/example - usage/snakemake_docs - usage/faq - -.. toctree:: - :maxdepth: 2 - :caption: Features - - features/extracted - -.. toctree:: - :maxdepth: 2 - :caption: Visualization - - visualization/data_exploration - -.. toctree:: - :maxdepth: 2 - :caption: Developers - - develop/remotesupport - develop/documentation - develop/features - develop/environments - develop/contributors - develop/testing - develop/test_cases - -.. _slack: http://awareframework.com:3000/ diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 910e7117..00000000 --- a/docs/make.bat +++ /dev/null @@ -1,190 +0,0 @@ -@ECHO OFF - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set BUILDDIR=_build -set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . -set I18NSPHINXOPTS=%SPHINXOPTS% . -if NOT "%PAPER%" == "" ( - set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% - set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% -) - -if "%1" == "" goto help - -if "%1" == "help" ( - :help - echo.Please use `make ^` where ^ is one of - echo. html to make standalone HTML files - echo. dirhtml to make HTML files named index.html in directories - echo. singlehtml to make a single large HTML file - echo. pickle to make pickle files - echo. json to make JSON files - echo. htmlhelp to make HTML files and a HTML help project - echo. qthelp to make HTML files and a qthelp project - echo. devhelp to make HTML files and a Devhelp project - echo. epub to make an epub - echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter - echo. text to make text files - echo. man to make manual pages - echo. texinfo to make Texinfo files - echo. gettext to make PO message catalogs - echo. changes to make an overview over all changed/added/deprecated items - echo. 
linkcheck to check all external links for integrity - echo. doctest to run all doctests embedded in the documentation if enabled - goto end -) - -if "%1" == "clean" ( - for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i - del /q /s %BUILDDIR%\* - goto end -) - -if "%1" == "html" ( - %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/html. - goto end -) - -if "%1" == "dirhtml" ( - %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. - goto end -) - -if "%1" == "singlehtml" ( - %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. - goto end -) - -if "%1" == "pickle" ( - %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the pickle files. - goto end -) - -if "%1" == "json" ( - %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the JSON files. - goto end -) - -if "%1" == "htmlhelp" ( - %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can run HTML Help Workshop with the ^ -.hhp project file in %BUILDDIR%/htmlhelp. - goto end -) - -if "%1" == "qthelp" ( - %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can run "qcollectiongenerator" with the ^ -.qhcp project file in %BUILDDIR%/qthelp, like this: - echo.^> qcollectiongenerator %BUILDDIR%\qthelp\moshi-aware.qhcp - echo.To view the help file: - echo.^> assistant -collectionFile %BUILDDIR%\qthelp\moshi-aware.ghc - goto end -) - -if "%1" == "devhelp" ( - %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. - goto end -) - -if "%1" == "epub" ( - %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The epub file is in %BUILDDIR%/epub. - goto end -) - -if "%1" == "latex" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "text" ( - %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The text files are in %BUILDDIR%/text. - goto end -) - -if "%1" == "man" ( - %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The manual pages are in %BUILDDIR%/man. - goto end -) - -if "%1" == "texinfo" ( - %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. - goto end -) - -if "%1" == "gettext" ( - %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The message catalogs are in %BUILDDIR%/locale. - goto end -) - -if "%1" == "changes" ( - %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes - if errorlevel 1 exit /b 1 - echo. - echo.The overview file is in %BUILDDIR%/changes. 
- goto end
-)
-
-if "%1" == "linkcheck" (
- %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
- if errorlevel 1 exit /b 1
- echo.
- echo.Link check complete; look for any errors in the above output ^
-or in %BUILDDIR%/linkcheck/output.txt.
- goto end
-)
-
-if "%1" == "doctest" (
- %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
- if errorlevel 1 exit /b 1
- echo.
- echo.Testing of doctests in the sources finished, look at the ^
-results in %BUILDDIR%/doctest/output.txt.
- goto end
-)
-
-:end
diff --git a/docs/setup/configuration.md b/docs/setup/configuration.md
new file mode 100644
index 00000000..bb1c039f
--- /dev/null
+++ b/docs/setup/configuration.md
@@ -0,0 +1,388 @@
+
+# Initial Configuration
+
+You need to follow these steps to configure your RAPIDS deployment before you can extract behavioral features:
+
+1. Add your [database credentials](#database-credentials)
+2. Choose the [timezone of your study](#timezone-of-your-study)
+3. Create your [participants files](#participant-files)
+4. Select what [day segments](#day-segments) you want to extract features on
+5. Modify your [device data configuration](#device-data-configuration)
+6. Select what [sensors and features](#sensor-and-features-to-process) you want to process
+
+When you are done with this initial configuration, go to [executing RAPIDS]().
+
+!!! hint
+    Every time you see `config["KEY"]` or `[KEY]` in these docs we are referring to the corresponding key in the `config.yaml` file.
+
+---
+## Database credentials
+
+1. Create an empty file called `#!bash .env` in your RAPIDS root directory
+2. Add the following lines and replace your database-specific credentials (user, password, host, and database):
+
+    ``` yaml
+    [MY_GROUP]
+    user=MY_USER
+    password=MY_PASSWORD
+    host=MY_HOST
+    port=3306
+    database=MY_DATABASE
+    ```
+
+!!! warning
+    The label `MY_GROUP` is arbitrary, but it has to match the following `config.yaml` key:
+
+    ```yaml
+    DATABASE_GROUP: &database_group
+      MY_GROUP
+    ```
+
+!!! note
+    You can ignore this step if you are only processing Fitbit data in CSV files.
+---
+
+## Timezone of your study
+
+### Single timezone
+
+If your study only happened in a single time zone, select the appropriate code from this [list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) and change the following config key. Double-check your timezone code; for example, US Eastern Time is `America/New_York`, not `EST`.
+
+``` yaml
+TIMEZONE: &timezone
+  America/New_York
+```
+
+### Multiple timezones
+
+Support coming soon.
+
+---
+
+## Participant files
+
+Participant files link together multiple devices (smartphones and wearables) to specific participants and identify them throughout RAPIDS. You can create these files manually or [automatically](#automatic-creation-of-participant-files). Participant files are stored in `data/external/participant_files/pxx.yaml` and follow a unified structure:
+
+```yaml
+# This is the content of a participant file (data/external/participant_files/pxx.yaml)
+PHONE:
+  DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524, dsadas-2324-fgsf-sdwr-gdfgs4rfsdf43]
+  PLATFORMS: [android,ios]
+  LABEL: test01
+  START_DATE: 2020-04-23
+  END_DATE: 2020-10-28
+FITBIT:
+  DEVICE_IDS: [fitbit1]
+  LABEL: test01
+  START_DATE: 2020-04-23
+  END_DATE: 2020-10-28
+
+```
+
hint "Optional: Migrating participants files with the old format" + If you were using the pre-release version of RAPIDS with participant files in plain text (as opposed to yaml), you can run the following command and your old files will be converted into yaml files stored in `data/external/participant_files/` + + ```bash + python tools/update_format_participant_files.py + ``` + +!!! tip + Attributes of the `[PHONE]` and `[FITBIT]` sections are optional which allows you to analyze data from participants that only carried smartphones, only Fitbit devices, or both. + +### Structure of participants files + +**For `[PHONE]`** + +| Key | Description | +|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `[DEVICE_IDS]` | An array of the strings that uniquely identify each smartphone, you can have more than one for when participants changed phones in the middle of the study, in this case, data from all their devices will be joined and relabeled with the last 1 on this list. | +| `[PLATFORMS]` | An array that specifies the OS of each smartphone in `[DEVICE_IDS]` , use a combination of `android` or `ios` (we support participants that changed platforms in the middle of your study!). If you have an `aware_device` table in your database you can set `[PLATFORMS]: [multiple]` and RAPIDS will infer them automatically. | +| `[LABEL]` | A string that is used in reports and visualizations. | +| `[START_DATE]` | A string with format `YYY-MM-DD` . Only data collected *after* this date will be included in the analysis | +| `[END_DATE]` | A string with format `YYY-MM-DD` . Only data collected *before* this date will be included in the analysis | + +**For `[FITBIT]`** + +| Key | Description | +|------------------|-----------------------------------------------------------------------------------------------------------| +| `[DEVICE_IDS]` | An array of the strings that uniquely identify each Fitbit, you can have more than one in case the participant changed devices in the middle of the study, in this case, data from all devices will be joined and relabeled with the last `device_id` on this list. | +| `[LABEL]` | A string that is used in reports and visualizations. | +| `[START_DATE]` | A string with format `YYY-MM-DD` . Only data collected *after* this date will be included in the analysis | +| `[END_DATE]` | A string with format `YYY-MM-DD` . Only data collected *before* this date will be included in the analysis | + +### Automatic creation of participant files + +You have two options a) use the `aware_device` table in your database or b) use a CSV file. In either case, in your `config.yaml`, set `[PHONE_SECTION][ADD]` or `[FITBIT_SECTION][ADD]` to `TRUE` depending on what devices you used in your study. Set `[DEVICE_ID_COLUMN]` to the name of the column that uniquely identifies each device and include any device ids you want to ignore in `[IGNORED_DEVICE_IDS]`. 
+### Automatic creation of participant files
+
+You have two options: a) use the `aware_device` table in your database, or b) use a CSV file. In either case, in your `config.yaml`, set `[PHONE_SECTION][ADD]` or `[FITBIT_SECTION][ADD]` to `TRUE` depending on what devices you used in your study. Set `[DEVICE_ID_COLUMN]` to the name of the column that uniquely identifies each device and include any device ids you want to ignore in `[IGNORED_DEVICE_IDS]`.
+
+=== "aware_device table"
+
+    Set the following keys in your `config.yaml`.
+
+    ```yaml
+    CREATE_PARTICIPANT_FILES:
+      SOURCE:
+        TYPE: AWARE_DEVICE_TABLE
+        DATABASE_GROUP: *database_group
+        CSV_FILE_PATH: ""
+        TIMEZONE: *timezone
+      PHONE_SECTION:
+        ADD: TRUE # or FALSE
+        DEVICE_ID_COLUMN: device_id # column name
+        IGNORED_DEVICE_IDS: []
+      FITBIT_SECTION:
+        ADD: TRUE # or FALSE
+        DEVICE_ID_COLUMN: fitbit_id # column name
+        IGNORED_DEVICE_IDS: []
+    ```
+
+    Then run
+
+    ```bash
+    snakemake -j1 create_participants_files
+    ```
+
+=== "CSV file"
+
+    Set the following keys in your `config.yaml`.
+
+    ```yaml
+    CREATE_PARTICIPANT_FILES:
+      SOURCE:
+        TYPE: CSV_FILE
+        DATABASE_GROUP: ""
+        CSV_FILE_PATH: "your_path/to_your.csv"
+        TIMEZONE: *timezone
+      PHONE_SECTION:
+        ADD: TRUE # or FALSE
+        DEVICE_ID_COLUMN: device_id # column name
+        IGNORED_DEVICE_IDS: []
+      FITBIT_SECTION:
+        ADD: TRUE # or FALSE
+        DEVICE_ID_COLUMN: fitbit_id # column name
+        IGNORED_DEVICE_IDS: []
+    ```
+    Your CSV file (`[SOURCE][CSV_FILE_PATH]`) should have the following columns, but you can omit any values you don't have in each column:
+
+    | Column | Description |
+    |------------------|-----------------------------------------------------------------------------------------------------------|
+    | phone device id  | The name of this column has to match `[PHONE_SECTION][DEVICE_ID_COLUMN]`. Separate multiple ids with `;`   |
+    | fitbit device id | The name of this column has to match `[FITBIT_SECTION][DEVICE_ID_COLUMN]`. Separate multiple ids with `;`  |
+    | pid              | Unique identifiers with the format pXXX (your participant files will be named with this string)            |
+    | platform         | Use `android`, `ios` or `multiple` as explained above, separate values with `;`                            |
+    | label            | A human readable string that is used in reports and visualizations.                                        |
+    | start_date       | A string with format `YYYY-MM-DD`.                                                                         |
+    | end_date         | A string with format `YYYY-MM-DD`.                                                                         |
+
+    !!! example
+
+        ```csv
+        device_id,pid,label,platform,start_date,end_date,fitbit_id
+        a748ee1a-1d0b-4ae9-9074-279a2b6ba524;dsadas-2324-fgsf-sdwr-gdfgs4rfsdf43,p01,julio,android;ios,2020-01-01,2021-01-01,fitbit1
+        4c4cf7a1-0340-44bc-be0f-d5053bf7390c,p02,meng,ios,2021-01-01,2022-01-01,fitbit2
+        ```
+
+    Then run
+
+    ```bash
+    snakemake -j1 create_participants_files
+    ```
+
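+The conversion from a CSV row to a participant file is mechanical, and the sketch below illustrates it. This is a simplified, hypothetical example (the actual `create_participants_files` rule handles more cases, such as inferring platforms from `aware_device`); column names match the CSV example above and the input path is made up:
+
+```python
+# Turn each CSV row into a pxx.yaml participant file; multiple device ids
+# are separated by ";" as described in the table above.
+import csv
+import yaml
+
+with open("data/external/example_participants.csv") as f:  # hypothetical path
+    for row in csv.DictReader(f):
+        participant = {
+            "PHONE": {
+                "DEVICE_IDS": row["device_id"].split(";"),
+                "PLATFORMS": row["platform"].split(";"),
+                "LABEL": row["label"],
+                "START_DATE": row["start_date"],
+                "END_DATE": row["end_date"],
+            },
+            "FITBIT": {
+                "DEVICE_IDS": row["fitbit_id"].split(";"),
+                "LABEL": row["label"],
+                "START_DATE": row["start_date"],
+                "END_DATE": row["end_date"],
+            },
+        }
+        with open(f"data/external/participant_files/{row['pid']}.yaml", "w") as out:
+            yaml.dump(participant, out, default_flow_style=False)
+```
+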
+---
+
+## Day Segments
+
+Day segments (or epochs) are the time windows on which you want to extract behavioral features. For example, you might want to process data on every day, every morning, or only during weekends. RAPIDS offers three categories of day segments that are flexible enough to cover most use cases: **frequency** (short time windows every day), **periodic** (arbitrary time windows on any day), and **event** (arbitrary time windows around events of interest). See also our [examples](#segment-examples).
+
+=== "Frequency Segments"
+
+    These segments are computed on every day and all have the same duration (for example 30 minutes). Set the following keys in your `config.yaml`.
+
+    ```yaml
+    DAY_SEGMENTS: &day_segments
+      TYPE: FREQUENCY
+      FILE: "data/external/your_frequency_segments.csv"
+      INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE
+    ```
+
+    The file pointed to by `[DAY_SEGMENTS][FILE]` should have the following format and can only have one row.
+
+    | Column | Description |
+    |--------|----------------------------------------------------------------------|
+    | label  | A string that is used as a prefix in the name of your day segments   |
+    | length | An integer representing the duration of your day segments in minutes |
+
+    !!! example
+
+        ```csv
+        label,length
+        thirtyminutes,30
+        ```
+
+        This configuration will compute 48 day segments for every day when any data from any participant was sensed. For example:
+
+        ```csv
+        start_time,length,label
+        00:00,30,thirtyminutes0000
+        00:30,30,thirtyminutes0001
+        01:00,30,thirtyminutes0002
+        01:30,30,thirtyminutes0003
+        ...
+        ```
+
+=== "Periodic Segments"
+
+    These segments can be computed every day, or on specific days of the week, month, quarter, and year. Their minimum duration is 1 minute, but they can be as long as you want. Set the following keys in your `config.yaml`.
+
+    ```yaml
+    DAY_SEGMENTS: &day_segments
+      TYPE: PERIODIC
+      FILE: "data/external/your_periodic_segments.csv"
+      INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # or TRUE
+    ```
+
+    If `[INCLUDE_PAST_PERIODIC_SEGMENTS]` is set to `TRUE`, RAPIDS will consider instances of your segments that start far enough back in the past to include the first row of data of each participant. For example, if the first row of data from a participant happened on Saturday March 7th 2020 and the requested segment duration is 7 days starting on every Sunday, the first segment to be considered would start on Sunday March 1st if `[INCLUDE_PAST_PERIODIC_SEGMENTS]` is `TRUE`, or on Sunday March 8th if `FALSE`.
+
+    The file pointed to by `[DAY_SEGMENTS][FILE]` should have the following format and can have multiple rows.
+
+    | Column | Description |
+    |---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+    | label         | A string that is used as a prefix in the name of your day segments. It has to be **unique** between rows |
+    | start_time    | A string with format `HH:MM:SS` representing the starting time of this segment on any day |
+    | length        | A string representing the length of this segment. It can have one or more of the following strings **`XXD XXH XXM XXS`** to represent days, hours, minutes, and seconds. For example `7D 23H 59M 59S` |
+    | repeats_on    | One of the following options: `every_day`, `wday`, `qday`, `mday`, and `yday`. The last four represent a day of the week, quarter, month, and year |
+    | repeats_value | An integer complementing `repeats_on`. If you set `repeats_on` to `every_day` set this to `0`, otherwise `1-7` represent a `wday` starting from Mondays, `1-31` represent a `mday`, `1-91` represent a `qday`, and `1-366` represent a `yday` |
+
+    !!! example
+
+        ```csv
+        label,start_time,length,repeats_on,repeats_value
+        daily,00:00:00,23H 59M 59S,every_day,0
+        morning,06:00:00,5H 59M 59S,every_day,0
+        afternoon,12:00:00,5H 59M 59S,every_day,0
+        evening,18:00:00,5H 59M 59S,every_day,0
+        night,00:00:00,5H 59M 59S,every_day,0
+        ```
+
+        This configuration will create five segment instances (`daily`, `morning`, `afternoon`, `evening`, `night`) on any given day (`every_day` set to 0). The `daily` segment will start at midnight and last `23:59:59`; the other four segments will start at 6am, 12pm, 6pm, and 12am respectively, and last for `05:59:59`.
+
+=== "Event segments"
+
+    These segments can be computed before or after an event of interest (defined as any UNIX timestamp). Their minimum duration is 1 minute, but they can be as long as you want. The start of each segment can be shifted backwards or forwards from the specified timestamp. Set the following keys in your `config.yaml`.
+
+    ```yaml
+    DAY_SEGMENTS: &day_segments
+      TYPE: EVENT
+      FILE: "data/external/your_event_segments.csv"
+      INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # or TRUE
+    ```
+
+    The file pointed to by `[DAY_SEGMENTS][FILE]` should have the following format and can have multiple rows.
+
+    | Column | Description |
+    |-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+    | label           | A string that is used as a prefix in the name of your day segments. If a label is unique, its segment is completely independent; if two segments have the same label, their data will be considered together when computing features like the `most frequent contact` for calls (the most frequent contact will be computed across these segments) |
+    | event_timestamp | A UNIX timestamp that marks the event of interest (in milliseconds, as in the example below) |
+    | length          | A string representing the length of this segment. It can have one or more of the following `XXD XXH XXM XXS` to represent days, hours, minutes, and seconds. For example `7D 23H 59M 59S` |
+    | shift           | A string with the same format as `length` representing how much the start of the segment is shifted from `event_timestamp` |
+    | shift_direction | An integer: `1` to shift the start forwards, `-1` to shift it backwards, and `0` for no shift |
+    | device_id       | The device id (as in the participant file) that this event belongs to |
+
+    !!! example
+        ```csv
+        label,event_timestamp,length,shift,shift_direction,device_id
+        stress1,1587661220000,1H,5M,1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
+        stress2,1587747620000,4H,4H,-1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
+        stress3,1587906020000,3H,5M,1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
+        stress4,1584291600000,7H,4H,-1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
+        stress5,1588172420000,9H,5M,-1,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
+        mood,1587661220000,1H,0,0,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
+        mood,1587747620000,1D,0,0,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
+        mood,1587906020000,7D,0,0,a748ee1a-1d0b-4ae9-9074-279a2b6ba524
+        ```
+
+        This example will create eight segments for a single participant (`a748ee1a...`): five independent `stressX` segments with various lengths (1, 4, 3, 7, and 9 hours). Segments `stress1`, `stress3`, and `stress5` are shifted forwards by 5 minutes, and `stress2` and `stress4` are shifted backwards by 4 hours (that is, if the `stress4` event happened on March 15th at 1pm EST (`1584291600000`), the day segment will start on that day at 9am and end at 4pm).
+
+        The three `mood` segments are 1 hour, 1 day, and 7 days long and have no shift. In addition, these `mood` segments are grouped together, meaning that although RAPIDS will compute features on each one of them, some necessary information to compute a few of such features will be extracted from all three segments, for example the phone contact that called a participant the most or the location clusters visited by a participant.
+
+### Segment Examples
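+
+As a quick sanity check of the event arithmetic above, the `stress4` window can be recomputed by hand. This is an illustrative sketch only (assuming Python 3.9+ for `zoneinfo` and the single study timezone configured earlier):
+
+```python
+# Recompute the stress4 example: event at 1584291600000 ms, shifted 4H
+# backwards (shift_direction -1), with a 7H length.
+from datetime import datetime, timedelta
+from zoneinfo import ZoneInfo  # Python 3.9+
+
+tz = ZoneInfo("America/New_York")
+event = datetime.fromtimestamp(1584291600000 / 1000, tz)
+start = event - timedelta(hours=4)  # shifted backwards by 4 hours
+end = start + timedelta(hours=7)    # segment length is 7 hours
+
+print(event.strftime("%Y-%m-%d %H:%M"))                    # 2020-03-15 13:00
+print(start.strftime("%H:%M"), "-", end.strftime("%H:%M"))  # 09:00 - 16:00
+```
+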
+
+---
+## Device Data Configuration
+
+You might need to modify the following config keys in your `config.yaml` depending on what devices your participants used and where you are storing your data.
+
+!!! hint
+    You can ignore `[SENSOR_DATA][PHONE]` or `[SENSOR_DATA][FITBIT]` if you are not working with either device.
+
+```yaml
+SENSOR_DATA:
+  PHONE:
+    SOURCE:
+      TYPE: DATABASE
+      DATABASE_GROUP: *database_group
+      DEVICE_ID_COLUMN: device_id # column name
+    TIMEZONE:
+      TYPE: SINGLE
+      VALUE: *timezone
+  FITBIT:
+    SOURCE:
+      TYPE: DATABASE # DATABASE or FILES (set each FITBIT_SENSOR TABLE attribute accordingly with a table name or a file path)
+      DATABASE_GROUP: *database_group
+      DEVICE_ID_COLUMN: fitbit_id # column name
+    TIMEZONE:
+      TYPE: SINGLE # Fitbit only supports SINGLE timezones
+      VALUE: *timezone
+
+```
+
+**For `[SENSOR_DATA][PHONE]`**
+
+| Key | Description |
+|-------------------------------|------------------------------------------------------------------------------------------------------------------------------|
+| `[SOURCE]` `[TYPE]`           | Only `DATABASE` is supported (phone data will be pulled from a database)                                                      |
+| `[SOURCE]` `[DATABASE_GROUP]` | `*database_group` points to the value defined before in [Database credentials](#database-credentials)                         |
+| `[SOURCE]` `[DEVICE_ID_COLUMN]` | The column that has strings that uniquely identify smartphones. For data collected with AWARE this is usually `device_id`   |
+| `[TIMEZONE]` `[TYPE]`         | Only `SINGLE` is supported                                                                                                     |
+| `[TIMEZONE]` `[VALUE]`        | `*timezone` points to the value defined before in [Timezone of your study](#timezone-of-your-study)                           |
+
+**For `[SENSOR_DATA][FITBIT]`**
+
+| Key | Description |
+|------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `[SOURCE]` `[TYPE]`                | `DATABASE` or `FILES` (set each `[FITBIT_SENSOR]` `[TABLE]` attribute accordingly with a table name or a file path)                                                |
+| `[SOURCE]` `[DATABASE_GROUP]`      | `*database_group` points to the value defined before in [Database credentials](#database-credentials). Only used if `[TYPE]` is `DATABASE`.                        |
+| `[SOURCE]` `[DEVICE_ID_COLUMN]`    | The column that has strings that uniquely identify Fitbit devices.                                                                                                  |
+| `[TIMEZONE]` `[TYPE]`              | Only `SINGLE` is supported (Fitbit devices always store data in local time).                                                                                        |
+| `[TIMEZONE]` `[VALUE]`             | `*timezone` points to the value defined before in [Timezone of your study](#timezone-of-your-study)                                                                |
+
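+One practical consequence of these settings, sketched below: AWARE phone tables store UNIX timestamps in milliseconds that RAPIDS shifts into `[TIMEZONE][VALUE]`, while Fitbit devices already record data in local time, which is why only `SINGLE` timezones are supported for Fitbit. The snippet is illustrative only (assuming Python 3.9+ and the timezone configured earlier, with a made-up timestamp):
+
+```python
+# Convert a raw phone timestamp (UTC milliseconds) into the study timezone.
+from datetime import datetime, timezone
+from zoneinfo import ZoneInfo
+
+study_tz = ZoneInfo("America/New_York")  # the *timezone anchor in config.yaml
+
+def to_study_time(timestamp_ms: int) -> datetime:
+    utc = datetime.fromtimestamp(timestamp_ms / 1000, timezone.utc)
+    return utc.astimezone(study_tz)
+
+print(to_study_time(1587661220000))  # 2020-04-23 13:00:20-04:00
+```
+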
+
+---
+
+## Sensor and Features to Process
+
+Finally, you need to modify the `config.yaml` section of each sensor you want to process. All sensors follow the same naming nomenclature `DEVICE_SENSOR` and have the following basic attributes (we will use `PHONE_MESSAGES` as an example).
+
+!!! hint
+    Every time you change any sensor parameter, all the necessary files will be updated as soon as you execute RAPIDS. Some sensors have specific attributes (like `MESSAGES_TYPES`), so refer to each sensor's documentation.
+
+```yaml
+PHONE_MESSAGES:
+  TABLE: messages
+  PROVIDERS:
+    RAPIDS:
+      COMPUTE: True
+      MESSAGES_TYPES: [received, sent]
+      FEATURES:
+        received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
+        sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
+      SRC_LANGUAGE: "r"
+      SRC_FOLDER: "rapids" # inside src/features/phone_messages
+```
+
+| Key | Description |
+|-------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `[TABLE]`                     | The name of the table in your database that stores this sensor's data. |
+| `[PROVIDERS]`                 | A collection of `providers`. A provider is an author or group of authors that created specific features for the sensor at hand. The provider for all the features implemented by our team is called `RAPIDS`, but we have also included contributions from other researchers (for example `DORYAB` for location features). |
+| `[PROVIDER]` `[COMPUTE]`      | Set this to `TRUE` if you want to process features for this `provider`. |
+| `[PROVIDER]` `[FEATURES]`     | A list of all the features available for the `provider`. Delete those that you don't want to compute. |
+| `[PROVIDER]` `[SRC_LANGUAGE]` | The programming language (`r` or `python`) in which the features of this `provider` are implemented. |
+| `[PROVIDER]` `[SRC_FOLDER]`   | The folder where the script(s) to compute the features of this `provider` are stored. This folder is always inside `src/features/[DEVICE_SENSOR]/` |
+
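+Because execution is driven entirely by these flags, you can preview what will be computed before running the pipeline. The sketch below is illustrative only, not part of RAPIDS (it assumes PyYAML and mirrors, in spirit, how the Snakefile scans `config.yaml` for providers with `COMPUTE` set to `True`):
+
+```python
+# List every sensor provider that RAPIDS would compute features for.
+import yaml
+
+with open("config.yaml") as f:
+    config = yaml.safe_load(f)
+
+for sensor, settings in config.items():
+    if isinstance(settings, dict) and "PROVIDERS" in settings:
+        for provider, params in (settings["PROVIDERS"] or {}).items():
+            if params.get("COMPUTE"):
+                print(f"{sensor} / {provider}: {params.get('FEATURES')}")
+```
+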
diff --git a/docs/setup/installation.md b/docs/setup/installation.md
new file mode 100644
index 00000000..10629a87
--- /dev/null
+++ b/docs/setup/installation.md
@@ -0,0 +1,188 @@
+# Installation
+
+You can install RAPIDS using Docker (the fastest), or follow the native instructions for MacOS and Ubuntu
+
+=== "Docker"
+
+    1. Install [Docker](https://docs.docker.com/desktop/)
+
+    2. Pull our RAPIDS container
+        ``` bash
+        docker pull agamk/rapids:latest
+        ```
+
+    3. Run RAPIDS' container (after this step is done you should see a
+    prompt in the main RAPIDS folder with its python environment active)
+
+        ``` bash
+        docker run -it agamk/rapids:latest
+        ```
+
+    4. Pull the latest version of RAPIDS
+
+        ``` bash
+        git pull
+        ```
+
+    5. Check that RAPIDS is working
+        ``` bash
+        ./rapids -j1
+        ```
+    6. *Optional*. You can edit RAPIDS files with vim, but we recommend using Visual Studio Code and its Remote Containers extension
+
+        ??? info "How to configure Remote Containers extension"
+
+            - Make sure the RAPIDS container is running
+            - Install the [Remote - Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
+            - Go to the `Remote Explorer` panel on the left hand sidebar
+            - On the top right dropdown menu choose `Containers`
+            - Double click on the `agamk/rapids` container in the `CONTAINERS` tree
+            - A new VS Code session should open on RAPIDS main folder inside the container.
+
+=== "MacOS"
+    We tested these instructions on Catalina
+
+    1. Install [brew](https://brew.sh/)
+
+    2. Install MySQL
+
+        ``` bash
+        brew install mysql
+        brew services start mysql
+        ```
+
+    3. Install R 4.0, pandoc and rmarkdown. If you have other instances of R, we recommend uninstalling them
+
+        ``` bash
+        brew install r
+        brew install pandoc
+        Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'
+        ```
+
+    4. Install miniconda (restart your terminal afterwards)
+
+        ``` bash
+        brew cask install miniconda
+        conda init zsh # (or conda init bash)
+        ```
+
+    5. Clone our repo
+
+        ``` bash
+        git clone https://github.com/carissalow/rapids
+        ```
+
+    6. Create a python virtual environment
+
+        ``` bash
+        cd rapids
+        conda env create -f environment.yml -n rapids
+        conda activate rapids
+        ```
+
+    7. Install R packages and virtual environment:
+
+        ``` bash
+        snakemake -j1 renv_install
+        snakemake -j1 renv_restore
+
+        ```
+
+        !!! note
+            This step could take several minutes to complete, especially if you have less than 3Gb of RAM or packages need to be compiled from source. Please be patient and let it run until completion.
+
+    8. Check that RAPIDS is working
+        ``` bash
+        ./rapids -j1
+        ```
+
+=== "Ubuntu"
+
+    We tested on Ubuntu 18.04 & 20.04
+
+    1. Install dependencies
+
+        ``` bash
+        sudo apt install libcurl4-openssl-dev
+        sudo apt install libssl-dev
+        sudo apt install libxml2-dev
+        ```
+
+    2. Install MySQL
+
+        ``` bash
+        sudo apt install libmysqlclient-dev
+        sudo apt install mysql-server
+        ```
+
+    3. Add the key for R's repository.
+
+        ``` bash
+        sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
+        ```
+
+    4. Add R's repository
+
+        1. For 18.04
+            ``` bash
+            sudo add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/'
+            ```
+
+        2. For 20.04
+            ``` bash
+            sudo add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/'
+            ```
+
+    5. Install R 4.0. If you have other instances of R, we recommend uninstalling them
+
+        ``` bash
+        sudo apt update
+        sudo apt install r-base
+        ```
+
+    6. Install Pandoc and rmarkdown
+
+        ``` bash
+        sudo apt install pandoc
+        Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'
+        ```
+
+    7. Install git
+
+        ``` bash
+        sudo apt install git
+        ```
+
+    8. Install [miniconda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html)
+
+    9. Restart your current shell
+
+    10. Clone our repo:
+
+        ``` bash
+        git clone https://github.com/carissalow/rapids
+        ```
+
+    11. Create a python virtual environment:
+
+        ``` bash
+        cd rapids
+        conda env create -f environment.yml -n MY_ENV_NAME
+        conda activate MY_ENV_NAME
+        ```
+
+    12. Install R packages and virtual environment:
+
+        ``` bash
+        snakemake -j1 renv_install
+        snakemake -j1 renv_restore
+
+        ```
+
+        !!! note
+            This step could take several minutes to complete, especially if you have less than 3Gb of RAM or packages need to be compiled from source. Please be patient and let it run until completion.
+
+    13. Check that RAPIDS is working
+        ``` bash
+        ./rapids -j1
+        ```
diff --git a/docs/usage/example.rst b/docs/usage/example.rst
deleted file mode 100644
index 79be53ac..00000000
--- a/docs/usage/example.rst
+++ /dev/null
@@ -1,49 +0,0 @@
-.. _analysis-workflow-example:
-
-Analysis Workflow Example
-==========================
-
-This is a quick guide for creating and running a simple pipeline to analyze an example dataset with 2 participants.
-
-#. Install RAPIDS. See :ref:`Installation Section <install-page>`.
-
-#. Configure your database credentials (see the example below or step 1 of :ref:`Usage Section <usage-section>` for more information).
- - - Create an ``.env`` file at the root of RAPIDS folder - - Your MySQL user must have write permissions because we will restore our example database - - Name your credentials group ``MY_GROUP``. - - If you are trying to connect to a local MySQL server from our docker container set your host according to this link_. - - You can name your database any way you want, for example ``rapids_example`` - - .. code-block:: bash - - [MY_GROUP] - user=rapids - password=rapids - host=127.0.0.1 - port=3306 - database=rapids_example - -#. Make sure your conda environment is active (the environment is already active in our docker container). See step 6 of :ref:`install-page`. - -#. If you installed RAPIDS from GitHub (did not use docker) you need to download the `example db backup `_ and save it to ``data/external/rapids_example.sql``. - -#. Run the following command to restore database from ``rapids_example.sql`` file:: - - snakemake -j1 restore_sql_file - -#. Create example participants files with the following command:: - - snakemake -j1 create_example_participant_files - -#. Run the following command to analysis the example dataset. - - - Execute over a single core:: - - snakemake -j1 --profile example_profile - - - Execute over multiple cores (here, we use 8 cores):: - - snakemake -j8 --profile example_profile - -.. _link: https://stackoverflow.com/questions/24319662/from-inside-of-a-docker-container-how-do-i-connect-to-the-localhost-of-the-mach diff --git a/docs/usage/faq.rst b/docs/usage/faq.rst deleted file mode 100644 index b1d0f9c7..00000000 --- a/docs/usage/faq.rst +++ /dev/null @@ -1,182 +0,0 @@ -Frequently Asked Questions -============================ - -1. Cannot connect to the MySQL server -""""""""""""""""""""""""""""""""""""""" -**Error in .local(drv, ...) :** -**Failed to connect to database: Error: Can't initialize character set unknown (path: compiled_in)** -:: - - Calls: dbConnect -> dbConnect -> .local -> .Call - Execution halted - [Tue Mar 10 19:40:15 2020] - Error in rule download_dataset: - jobid: 531 - output: data/raw/p60/locations_raw.csv - - RuleException: - CalledProcessError in line 20 of /home/ubuntu/rapids/rules/preprocessing.snakefile: - Command 'set -euo pipefail; Rscript --vanilla /home/ubuntu/rapids/.snakemake/scripts/tmp_2jnvqs7.download_dataset.R' returned non-zero exit status 1. - File "/home/ubuntu/rapids/rules/preprocessing.snakefile", line 20, in __rule_download_dataset - File "/home/ubuntu/anaconda3/envs/moshi-env/lib/python3.7/concurrent/futures/thread.py", line 57, in run - Shutting down, this might take some time. - Exiting because a job execution failed. Look above for error message - -**Solution:** - -Please make sure the ``DATABASE_GROUP`` in ``config.yaml`` matches your DB credentials group in ``.env``. - - - -2. Cannot start mysql in linux via ``brew services start mysql`` -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -Use the following command instead: - -``mysql.server start`` - - -3. Every time I run ``snakemake -R download_dataset`` all rules are executed -"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -This is expected behavior. The advantage of using ``snakemake`` under the hood is that every time a file containing data is modified every rule that depends on that file will be re-executed to update their results. 
In this case, since ``download_dataset`` updates all the raw data, and you are forcing the rule with the flag ``-R`` every single rule that depends on those raw files will be executed. - - -4. Got an error ``Table XXX doesn't exist`` while running the download_dataset rule. -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -:: - - Error in .local(conn, statement, ...) : - could not run statement: Table 'db_name.table_name' doesn't exist - Calls: colnames ... .local -> dbSendQuery -> dbSendQuery -> .local -> .Call - Execution halted - -**Solution:** -Please make sure the sensors listed in ``[PHONE_VALID_SENSED_BINS][TABLES]`` and each sensor section you activated in ``config.yaml`` match your database tables. - - - -5. How do I install on Ubuntu 16.04 -"""""""""""""""""""""""""""""""""""" - -#. Install dependencies (Homebrew - if not installed): - - - ``sudo apt-get install libmariadb-client-lgpl-dev libxml2-dev libssl-dev`` - - Install brew_ for linux and add the following line to ~/.bashrc: ``export PATH=$HOME/.linuxbrew/bin:$PATH`` - - ``source ~/.bashrc`` - -#. Install MySQL - - - ``brew install mysql`` - - ``brew services start mysql`` - -#. Install R, pandoc and rmarkdown: - - - ``brew install r`` - - ``brew install gcc@6`` (needed due to this bug_) - - ``HOMEBREW_CC=gcc-6 brew install pandoc`` - -#. Install miniconda using these instructions_ - -#. Clone our repo: - - - ``git clone https://github.com/carissalow/rapids`` - -#. Create a python virtual environment: - - - ``cd rapids`` - - ``conda env create -f environment.yml -n MY_ENV_NAME`` - - ``conda activate MY_ENV_NAME`` - -#. Install R packages and virtual environment: - - - ``snakemake renv_install`` - - ``snakemake renv_init`` - - ``snakemake renv_restore`` - - This step could take several minutes to complete. Please be patient and let it run until completion. - -#. See :ref:`Usage section `. - - - -6. Configuration failed for package ``RMySQL`` -"""""""""""""""""""""""""""""""""""""""""""""""" -:: - - --------------------------[ ERROR MESSAGE ]---------------------------- - :1:10: fatal error: mysql.h: No such file or directory - compilation terminated. - ----------------------------------------------------------------------- - ERROR: configuration failed for package 'RMySQL' - -Run ``sudo apt install libmariadbclient-dev`` - - - -7. No package ``libcurl`` found -""""""""""""""""""""""""""""""""" - -The ``libcurl`` needs to installed using the following command - -Run ``sudo apt install libcurl4-openssl-dev`` - - - -8. Configuration failed because ``openssl`` was not found. -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" - -Install the ``openssl`` library using the following command - -Run ``sudo apt install libssl-dev`` - - -9. Configuration failed because ``libxml-2.0`` was not found -""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" - -Install the ``xml`` library using the following command - -Run ``sudo apt install libxml2-dev`` - -10. SSL connection error when running RAPIDS -"""""""""""""""""""""""""""""""""""""""""""""" - -You are getting the following error message when running RAPIDS: - -``Error: Failed to connect: SSL connection error: error:1425F102:SSL routines:ssl_choose_client_version:unsupported protocol``. - -This is a bug in Ubuntu 20.04 when trying to connect to an old MySQL server with MySQL client 8.0. You should get the same error message if you try to connect from the command line. 
There you can add the option ``--ssl-mode=DISABLED`` but we can't do this from the R connector. - -If you can't update your server, the quickest solution would be to import your database to another server or to a local environment. Alternatively, you could replace ``mysql-client`` and ``libmysqlclient-dev`` with ``mariadb-client`` and ``libmariadbclient-dev`` and reinstall renv. More info about this issue here https://bugs.launchpad.net/ubuntu/+source/mysql-8.0/+bug/1872541 - -11. ``DB_TABLES`` key not found -"""""""""""""""""""""""""""""""" - -If you get the following error ``KeyError in line 43 of preprocessing.smk: 'DB_TABLES'``, means that the indentation of the key ``DB_TABLES`` is not matching the other child elements of ``PHONE_VALID_SENSED_BINS`` and you need to add or remove any leading whitespaces as needed. - -:: - - PHONE_VALID_SENSED_BINS: - COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features - BIN_SIZE: &bin_size 5 # (in minutes) - # Add as many sensor tables as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS. - # If you are extracting screen or Barnett's location features, screen and locations tables are mandatory. - DB_TABLES: [] - -12. Error while updating your conda environment in Ubuntu -""""""""""""""""""""""""""""""""""""""""""""""""""""""""" - -If you get the following error try reinstalling conda. - -:: - - CondaMultiError: CondaVerificationError: The package for tk located at /home/ubuntu/miniconda2/pkgs/tk-8.6.9-hed695b0_1003 - appears to be corrupted. The path 'include/mysqlStubs.h' - specified in the package manifest cannot be found. - ClobberError: This transaction has incompatible packages due to a shared path. - packages: conda-forge/linux-64::llvm-openmp-10.0.0-hc9558a2_0, anaconda/linux-64::intel-openmp-2019.4-243 - path: 'lib/libiomp5.so' - - -.. ------------------------ Links --------------------------- .. - -.. _bug: https://github.com/Homebrew/linuxbrew-core/issues/17812 -.. _instructions: https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html -.. _brew: https://docs.brew.sh/Homebrew-on-Linux diff --git a/docs/usage/installation.rst b/docs/usage/installation.rst deleted file mode 100644 index ff51b3c4..00000000 --- a/docs/usage/installation.rst +++ /dev/null @@ -1,209 +0,0 @@ -.. _install-page: - -Installation -=============== - -These instructions have been tested on macOS (Catalina and Mojave) and Ubuntu 16.04. If you find a problem, please create a GitHub issue or contact us. If you want to test RAPIDS quickly try our docker image or follow the Linux instructions on a virtual machine. - -Docker (the fastest and easiest way) ------------------------------------- - -#. Install docker - -#. Pull RAPIDS' container - - ``docker pull agamk/rapids:latest`` - -#. Run RAPIDS' container (after this step is done you should see a prompt in the main RAPIDS folder with its python environment active) - - ``docker run -it agamk/rapids:latest`` - -#. Pull the latest version of RAPIDS - - ``git pull`` - -#. Optional. 
You can start editing files with vim but we recommend using Visual Studio Code and its Remote extension - - - Make sure RAPIDS container is running - - Install the Remote - Containers extension: https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers - - Go to the ``Remote Explorer`` panel on the left hand sidebar - - On the top right dropdown menu choose ``Containers`` - - Double click on the ``agamk/rapids`` container in the ``CONTAINERS`` tree - - A new VS Code session should open on RAPIDS main folder inside the container. - -#. See Usage section below. - - -macOS (tested on Catalina 10.15) --------------------------------- - -#. Install dependencies (Homebrew if not installed): - - - Install brew_ for Mac: ``/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"`` - -#. Install MySQL - - - ``brew install mysql`` - - ``brew services start mysql`` - -#. Install R 4.0, pandoc and rmarkdown. If you have other instances of R, we recommend uninstalling them. - - - ``brew install r`` - - ``brew install pandoc`` - - ``Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'`` - -#. Install miniconda: - - - ``brew cask install miniconda`` - - ``conda init zsh`` or ``conda init bash`` - - Restart terminal if necessary - -#. Clone our repo: - - - ``git clone https://github.com/carissalow/rapids`` - -#. Create a python virtual environment: - - - ``cd rapids`` - - ``conda env create -f environment.yml -n rapids`` - - ``conda activate rapids`` - -#. Install R packages and virtual environment: - - - ``snakemake -j1 renv_install`` - - ``snakemake -j1 renv_restore`` - - - This step could take several minutes to complete, especially if you have less than 3Gb of RAM or packages need to be compiled from source. Please be patient and let it run until completion. - -#. See Usage section below. - - -Linux (tested on Ubuntu 18.04 & 20.04) ---------------------------------------- - -#. Install dependencies : - - - ``sudo apt install libcurl4-openssl-dev`` - - ``sudo apt install libssl-dev`` - - ``sudo apt install libxml2-dev`` - -#. Install MySQL - - - ``sudo apt install libmysqlclient-dev`` - - ``sudo apt install mysql-server`` - - -#. Install R 4.0 . If you have other instances of R, we recommend uninstalling them. - - - ``sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9`` - - Add R's repository: - - - For 18.04 do: ``sudo add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/'`` - - For 20.04 do: ``sudo add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/'`` - - ``sudo apt update`` - - ``sudo apt install r-base`` - -#. Install Pandoc and rmarkdown - - - ``sudo apt install pandoc`` - - ``Rscript --vanilla -e 'install.packages("rmarkdown", repos="http://cran.us.r-project.org")'`` - -#. Install GIT - - - ``sudo apt install git`` - -#. Install miniconda using these instructions_ - -#. Restart your current shell - -#. Clone our repo: - - - ``git clone https://github.com/carissalow/rapids`` - -#. Create a python virtual environment: - - - ``cd rapids`` - - ``conda env create -f environment.yml -n MY_ENV_NAME`` - - ``conda activate MY_ENV_NAME`` - -#. 
Install R packages and virtual environment: - - - ``snakemake -j1 renv_install`` - - ``snakemake -j1 renv_restore`` - - - This step could take several minutes to complete, especially if you have less than 3Gb of RAM or packages need to be compiled from source. Please be patient and let it run until completion. - -#. See Usage section below. - - -.. _usage-section: - -Usage -====== -Once RAPIDS is installed, follow these steps to start processing mobile data. - -.. _db-configuration: - -#. Configure the database connection: - - - Create an empty file called `.env` in the root directory (``rapids/``) - - Add the following lines and replace your database-specific credentials (user, password, host, and database): - - .. code-block:: bash - - [MY_GROUP] - user=MY_USER - password=MY_PASSWORD - host=MY_HOST - port=3306 - database=MY_DATABASE - - .. note:: - - ``MY_GROUP`` is a custom label for your credentials. It has to match ``DATABASE_GROUP`` in the ``config.yaml`` file_. It is not related to your database configuration. - -#. Setup the participants' devices whose data you want to analyze, for this you have two options: - - #. **Automatically**. You can automatically include all devices that are stored in the ``aware_device`` table. If you want to control what devices and dates are included, see the Manual configuration:: - - snakemake -j1 download_participants - - #. **Manually**. Create one file per participant in the ``rapids/data/external/`` directory. The file should NOT have an extension (i.e., no .txt). The name of the file will become the label for that participant in the pipeline. - - - The first line of the file should be the Aware ``device_id`` for that participant. If one participant has multiple device_ids (i.e. Aware had to be re-installed), add all device_ids separated by commas. - - The second line should list the device's operating system (``android`` or ``ios``). If a participant used more than one device (i.e., the participant changed phones and/or platforms mid-study) you can a) list each platform matching the order of the first line (``android,ios``), b) use ``android`` or ``ios`` if all phones belong to the same platform, or c) if you have an ``aware_device`` table in your database, set this line to ``multiple`` and RAPIDS will infer the multiple platforms automatically. - - The third line is an optional human-friendly label that will appear in any plots for that participant. - - The fourth line is optional and contains a start and end date separated by a comma ``YYYYMMDD,YYYYMMDD`` (e.g., ``20201301,20202505``). If these dates are specified, only data within this range will be processed, otherwise, all data from the device(s) will be used. - - For example, let's say participant `p01` had two AWARE device_ids and they were running Android between February 1st 2020 and March 3rd 2020. Their participant file would be named ``p01`` and contain: - - .. code-block:: bash - - 3a7b0d0a-a9ce-4059-ab98-93a7b189da8a,44f20139-50cc-4b13-bdde-0d5a3889e8f9 - android - Participant01 - 2020/02/01,2020/03/03 - -#. Choose what features to extract: - - - See :ref:`Minimal Working Example`. - -#. Execute RAPIDS - - - Standard execution over a single core:: - - snakemake -j1 - - - Standard execution over multiple cores:: - - snakemake -j8 - - - Force a rule (useful if you modify your code and want to update its results):: - - snakemake -j1 -R RULE_NAME - -.. _bug: https://github.com/Homebrew/linuxbrew-core/issues/17812 -.. 
_instructions: https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html -.. _brew: https://docs.brew.sh/Homebrew-on-Linux -.. _AWARE: https://awareframework.com/what-is-aware/ -.. _file: https://github.com/carissalow/rapids/blob/master/config.yaml#L22 diff --git a/docs/usage/introduction.rst b/docs/usage/introduction.rst deleted file mode 100644 index b14d8743..00000000 --- a/docs/usage/introduction.rst +++ /dev/null @@ -1,44 +0,0 @@ -Quick Introduction -================== - -The goal of this pipeline is to standardize the data cleaning, feature extraction, analysis, and evaluation of mobile sensing projects. It leverages Conda_, Cookiecutter_, SciPy_, Snakemake_, Sphinx_, and R_ to create an end-to-end reproducible environment that can be published along with research papers. - -At the moment, mobile data can be collected using different sensing frameworks (AWARE_, Beiwe_) and hardware (Fitbit_). The pipeline is agnostic to these data sources and can unify their analysis. The current implementation only handles data collected with AWARE_ and Fitbit_. However, it can be easily extended to other providers. - -We recommend reading Snakemake_ docs, but the main idea behind the pipeline is that every link in the analysis chain is a rule with an input and an output. Input and output are files, which can be manipulated using any programming language (although Snakemake_ has wrappers for Julia_, Python_, and R_ that can make development slightly more comfortable). Snakemake_ also allows the pipeline rules to be executed in parallel on multiple cores without any code changes. This can drastically reduce the time needed to complete an analysis. - -Do you want to keep up to date with new functionality or have a question? Join the #rapids channel in AWARE Framework's slack_ - -Available features: - -- :ref:`accelerometer-sensor-doc` -- :ref:`applications-foreground-sensor-doc` -- :ref:`battery-sensor-doc` -- :ref:`bluetooth-sensor-doc` -- :ref:`wifi-sensor-doc` -- :ref:`call-sensor-doc` -- :ref:`activity-recognition-sensor-doc` -- :ref:`light-doc` -- :ref:`location-sensor-doc` -- :ref:`screen-sensor-doc` -- :ref:`messages-sensor-doc` -- :ref:`fitbit-sleep-sensor-doc` -- :ref:`fitbit-heart-rate-sensor-doc` -- :ref:`fitbit-steps-sensor-doc` - -We are updating these docs constantly, but if you think something needs clarification, feel free to reach out or submit a pull request on GitHub. - - -.. _Conda: https://docs.conda.io/en/latest/ -.. _Cookiecutter: http://drivendata.github.io/cookiecutter-data-science/ -.. _SciPy: https://www.scipy.org/index.html -.. _Snakemake: https://snakemake.readthedocs.io/en/stable/ -.. _Sphinx: https://www.sphinx-doc.org/en/master/ -.. _R: https://www.r-project.org/ - -.. _AWARE: https://awareframework.com/what-is-aware/ -.. _Beiwe: https://www.beiwe.org/ -.. _Fitbit: https://www.fitbit.com/us/home -.. _Python: https://www.python.org/ -.. _Julia: https://julialang.org/ -.. _slack: http://awareframework.com:3000/ diff --git a/docs/usage/quick_rule.rst b/docs/usage/quick_rule.rst deleted file mode 100644 index 6a2d800e..00000000 --- a/docs/usage/quick_rule.rst +++ /dev/null @@ -1,42 +0,0 @@ -.. _minimal-working-example: - -Minimal Working Example -======================== - -This is a quick guide for creating and running a simple pipeline to extract call features for daily and night epochs of one participant monitored on the US East coast. - -#. Make sure your database connection credentials in ``.env`` are correct. See step 1 of :ref:`Usage Section `. 
diff --git a/docs/usage/quick_rule.rst b/docs/usage/quick_rule.rst
deleted file mode 100644
index 6a2d800e..00000000
--- a/docs/usage/quick_rule.rst
+++ /dev/null
@@ -1,42 +0,0 @@
.. _minimal-working-example:

Minimal Working Example
========================

This is a quick guide for creating and running a simple pipeline to extract call features for daily and night epochs of one participant monitored on the US East coast.

#. Make sure your database connection credentials in ``.env`` are correct. See step 1 of the :ref:`Usage Section <usage-section>`.

#. Create at least one participant file ``p01`` under ``data/external/``. See step 2 of the :ref:`Usage Section <usage-section>`.

#. Make sure your Conda (Python) environment is active. See step 6 of :ref:`install-page`.

#. Modify the following settings in the ``config.yaml`` file with the values shown below (leave all other settings as they are)::

    PIDS: [p01]

    DAY_SEGMENTS: &day_segments
        [daily, night]

    TIMEZONE: &timezone
        America/New_York

    DATABASE_GROUP: &database_group
        MY_GROUP (change this if you added your DB credentials to .env with a different label)

    CALLS:
        COMPUTE: True
        DB_TABLE: calls (only change DB_TABLE if your database calls table has a different name)

   For more information on the ``calls`` sensor see :ref:`call-sensor-doc`.

#. Run the following command to execute RAPIDS::

    snakemake -j1

#. Daily and night call features will be written to files under the ``data/processed/p01/`` directory; illustrative example paths are shown after this list.
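The exact output file names depend on the configured day segments and the call types computed; purely as an illustration (these names are hypothetical, not guaranteed), the output directory for this example might contain::

    data/processed/p01/calls_incoming_daily.csv
    data/processed/p01/calls_incoming_night.csv
    data/processed/p01/calls_outgoing_daily.csv
    data/processed/p01/calls_outgoing_night.csv
    data/processed/p01/calls_missed_daily.csv
    data/processed/p01/calls_missed_night.csv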
diff --git a/docs/usage/snakemake_docs.rst b/docs/usage/snakemake_docs.rst
deleted file mode 100644
index c2fed4f1..00000000
--- a/docs/usage/snakemake_docs.rst
+++ /dev/null
@@ -1,238 +0,0 @@
.. _rapids-structure:

RAPIDS Structure
=================

.. _the-config-file:

The ``config.yaml`` File
------------------------

RAPIDS configuration settings are defined in ``config.yaml`` (see `config.yaml`_). This is the only file you need to understand in order to compute the features that RAPIDS ships with.

It contains global settings such as ``PIDS`` and ``DAY_SEGMENTS``, among others (see :ref:`global-sensor-doc` for more information), as well as per-sensor settings; for example, for the :ref:`messages-sensor-doc`::

    MESSAGES:
        COMPUTE: True
        DB_TABLE: messages
        ...

.. _the-snakefile-file:

The ``Snakefile`` File
----------------------
The ``Snakefile`` file (see the actual `Snakefile`_) pulls the entire system together. The first line in this file identifies the configuration file. Next is a list of include directives that import the rules used to pull, clean, process, analyze and report data. The file compiles the list of ``files_to_compute`` by scanning the config file for sensors with a ``COMPUTE`` flag equal to ``True``. Then, the ``all`` rule is called with this list, which prompts Snakemake to execute the pipeline (producing raw files, intermediate files, feature files, reports, etc.). A minimal sketch of this pattern follows the Includes list below.

.. _includes-section:

Includes
"""""""""
There are 5 included files in the ``Snakefile`` file:

  - ``renv.smk`` - Rules to create, back up and restore the R renv virtual environment for RAPIDS (see `renv`_)
  - ``preprocessing.smk`` - Rules used to preprocess the data, such as downloading, cleaning and formatting (see `preprocessing`_)
  - ``features.smk`` - Rules used for behavioral feature extraction (see `features`_)
  - ``models.smk`` - Rules used to build models from the features extracted from the sensor data (see `models`_)
  - ``reports.smk`` - Rules used to produce reports and visualizations (see `reports`_)

Includes are relative to the root directory.
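The following is a minimal sketch of the ``Snakefile`` pattern just described. It is illustrative only, not the actual file: the real ``Snakefile`` covers every sensor and many more rules, and the output path here is simplified::

    # Minimal sketch of the Snakefile pattern (illustrative, not the real file).
    configfile: "config.yaml"

    include: "rules/preprocessing.smk"
    include: "rules/features.smk"

    files_to_compute = []
    for sensor in ["MESSAGES", "CALLS"]:  # the real list covers every sensor
        if config[sensor]["COMPUTE"]:
            files_to_compute.extend(
                expand("data/raw/{pid}/{table}_raw.csv",
                       pid=config["PIDS"],
                       table=config[sensor]["DB_TABLE"]))

    # 'all' asks for every file in the list; Snakemake works backwards from
    # these targets to decide which rules must run.
    rule all:
        input: files_to_compute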
.. _rule-all-section:

``Rule all:``
"""""""""""""
In RAPIDS, the ``all`` rule lists the output files we expect the pipeline to compute. Before the ``all`` rule is called, Snakemake checks the ``config.yaml`` file and adds the outputs of every rule whose sensor's ``COMPUTE`` parameter is ``True``. The ``expand`` function allows us to generate a list of file paths that share a common structure except for PIDS or other parameters. Consider the following::

    files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"]))

If ``pids = ['p01','p02']`` and ``config["MESSAGES"]["DB_TABLE"] = messages``, then the above directive would produce::

    ["data/raw/p01/messages_raw.csv", "data/raw/p02/messages_raw.csv"]

This allows us to define all the desired output files without manually listing each path for every participant and every sensor. Snakemake then looks for the rule that produces each desired output file and executes that rule. For more information on ``expand`` see `The Expand Function`_.


.. _the-env-file:

The ``.env`` File
-------------------
Your database credentials are stored in the ``.env`` file (see :ref:`install-page`)::

    [MY_GROUP_NAME]
    user=MyUSER
    password=MyPassword
    host=MyIP/DOMAIN
    port=3306

.. _rules-syntax:

The ``Rules`` Directory
------------------------

The ``rules`` directory contains the ``snakefiles`` that are included in the main ``Snakefile`` file. A short description of these files is given in the :ref:`includes-section` section.


Rules
""""""

A Snakemake workflow is defined by rules (see the features_ snakefile for an actual example). Rules decompose the workflow into small steps by specifying what output files should be created by running a script on a set of input files. Snakemake automatically determines the dependencies between rules by matching file names. A rule consists of a name, input files, output files, and a command (or script) that generates the output from the input. The following is the basic structure of a Snakemake rule::

    rule NAME:
        input: "path/to/inputfile", "path/to/other/inputfile"
        output: "path/to/outputfile", "path/to/another/outputfile"
        script: "path/to/somescript.R"


A sample rule from the RAPIDS source code is shown below::

    rule messages_features:
        input:
            expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])
        params:
            messages_type = "{messages_type}",
            day_segment = "{day_segment}",
            features = lambda wildcards: config["MESSAGES"]["FEATURES"][wildcards.messages_type]
        output:
            "data/processed/{pid}/messages_{messages_type}_{day_segment}.csv"
        script:
            "../src/features/messages_features.R"


The ``rule`` directive specifies the name of the rule being defined. ``params`` defines additional parameters for the rule's script; in the example above, they are passed to the ``messages_features.R`` script as a dictionary (a sketch of how a script receives these values follows this section). Instead of ``script``, a ``shell`` command can be used by replacing the ``script`` directive of the rule with::

    shell: "somecommand {input} {output}"

Note that rules can be defined without input and output, as seen in ``renv.smk``. For more information see the `Rules documentation`_, and for an actual example see the `renv`_ snakefile.
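For illustration, Snakemake injects a global ``snakemake`` object into any script invoked through the ``script`` directive, exposing the rule's ``input``, ``params``, and ``output``. The sketch below is a hypothetical Python counterpart to ``messages_features.R`` (RAPIDS' actual script is written in R and does much more; the column name is made up):

.. code-block:: python

    # Hypothetical Python version of a feature script run via ``script:``.
    # Snakemake injects a global ``snakemake`` object; no import is needed for it.
    import pandas as pd

    messages = pd.read_csv(snakemake.input[0])       # first input file
    day_segment = snakemake.params.day_segment       # named params by attribute
    features_to_compute = snakemake.params.features

    # "local_day_segment" is an illustrative column name, not RAPIDS' schema.
    segment_rows = messages[messages["local_day_segment"] == day_segment]
    segment_rows.to_csv(snakemake.output[0], index=False)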
.. _wildcards:

Wildcards
""""""""""
There are times when the same rule should be applied to different participants and day segments. For this we use wildcards (``{my_wildcard}``). All wildcards are inferred from the files listed in the ``all`` rule of the ``Snakefile`` file, and therefore from the output of any rule::

    rule messages_features:
        input:
            expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])
        params:
            messages_type = "{messages_type}",
            day_segment = "{day_segment}",
            features = lambda wildcards: config["MESSAGES"]["FEATURES"][wildcards.messages_type]
        output:
            "data/processed/{pid}/messages_{messages_type}_{day_segment}.csv"
        script:
            "../src/features/messages_features.R"

If the rule's output matches a requested file, the substrings matched by the wildcards are propagated to the ``input`` and ``params`` directives. For example, if another rule in the workflow requires the file ``data/processed/p01/messages_sent_daily.csv``, Snakemake recognizes that the above rule can produce it by setting ``pid=p01``, ``messages_type=sent`` and ``day_segment=daily``. It therefore requests ``data/raw/p01/messages_with_datetime.csv`` as input, sets ``messages_type=sent`` and ``day_segment=daily`` in the ``params`` directive, and executes the script ``../src/features/messages_features.R``. See the preprocessing_ snakefile for an actual example. You can also trigger this resolution yourself, as shown below.
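You can test this wildcard matching directly from the command line by asking Snakemake for a concrete file; with ``-n`` (dry run) it only prints the rules it would execute instead of running them::

    snakemake -j1 -n data/processed/p01/messages_sent_daily.csv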
.. _the-data-directory:

The ``data`` Directory
-----------------------

This directory contains the data files for the project, organized as follows:

    - ``external`` - This directory stores the participant ``pxxx`` files as well as data from third-party sources (see the :ref:`install-page` page).
    - ``raw`` - This directory contains the original, immutable data dump from your database.
    - ``interim`` - This directory contains intermediate data that has been transformed but does not represent features.
    - ``processed`` - This directory contains all behavioral features.


.. _the-src-directory:

The ``src`` Directory
----------------------

The ``src`` directory holds all the scripts used by the pipeline for data manipulation. These scripts can be written in any programming language, including but not limited to Python_, R_ and Julia_. This directory is organized into the following subdirectories:

    - ``data`` - Scripts used to download and preprocess the raw data that will be used in the analysis. See the `data directory`_.
    - ``features`` - Scripts to extract behavioral features. See the `features directory`_.
    - ``models`` - Scripts for building and training models. See the `models directory`_.
    - ``visualization`` - Scripts to create plots and reports. See the `visualization directory`_.


.. _RAPIDS_directory_structure:

::

    ├── LICENSE
    ├── Makefile           <- Makefile with commands like `make data` or `make train`
    ├── README.md          <- The top-level README for developers using this project.
    ├── config.yaml        <- The configuration settings for the pipeline.
    ├── environment.yml    <- Conda environment settings (channels and dependencies installed in the env).
    ├── data
    │   ├── external       <- Data from third-party sources.
    │   ├── interim        <- Intermediate data that has been transformed.
    │   ├── processed      <- The final, canonical data sets for modeling.
    │   └── raw            <- The original, immutable data dump.
    │
    ├── docs               <- A default Sphinx project; see sphinx-doc.org for details.
    │
    ├── models             <- Trained and serialized models, model predictions, or model summaries.
    │
    ├── notebooks          <- Jupyter notebooks. Naming convention is a number (for ordering),
    │                         the creator's initials, and a short `-` delimited description, e.g.
    │                         `1.0-jqp-initial-data-exploration`.
    │
    ├── packrat            <- Installed R dependencies. (Packrat is a dependency management
    │                         system for R; deprecated, replaced by renv.)
    ├── references         <- Data dictionaries, manuals, and all other explanatory materials.
    │
    ├── renv.lock          <- List of R packages and dependencies installed for the pipeline.
    │
    ├── reports            <- Generated analysis as HTML, PDF, LaTeX, etc.
    │   └── figures        <- Generated graphics and figures to be used in reporting.
    │
    ├── rules
    │   ├── features       <- Rules to process the feature data pulled into the pipeline.
    │   ├── models         <- Rules for building models.
    │   ├── mystudy        <- Rules added by you that are specifically tailored to your project/study.
    │   ├── packrat        <- Rules for setting up packrat (deprecated, replaced by renv).
    │   ├── preprocessing  <- Preprocessing rules to clean data before processing.
    │   ├── renv           <- Rules for setting up renv and R packages.
    │   └── reports        <- Snakefiles used to produce reports.
    │
    ├── setup.py           <- Makes the project pip installable (pip install -e .) so src can be imported.
    ├── Snakefile          <- The root snakefile (the equivalent of a Makefile).
    ├── src                <- Source code for use in this project. Can be in any language,
    │   │                     e.g. Python, R, Julia, etc.
    │   │
    │   ├── data           <- Scripts to download or generate data.
    │   │
    │   ├── features       <- Scripts to turn raw data into features for modeling.
    │   │
    │   ├── models         <- Scripts to train models and then use trained models to make predictions.
    │   │
    │   └── visualization  <- Scripts to create exploratory and results-oriented visualizations.
    ├── tests
    │   ├── data           <- Replication of the project root data directory for testing.
    │   ├── scripts        <- Scripts for testing.
    │   ├── settings       <- The config and settings files for running tests.
    │   └── Snakefile      <- The Snakefile for testing only.
    │
    └── tox.ini            <- tox file with settings for running tox; see tox.testrun.org.


.. _Python: https://www.python.org/
.. _Julia: https://julialang.org/
.. _R: https://www.r-project.org/
.. _`List of Timezone`: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
.. _`The Expand Function`: https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#the-expand-function
.. _`example snakefile`: https://github.com/carissalow/rapids/blob/master/rules/features.snakefile
.. _renv: https://github.com/carissalow/rapids/blob/master/rules/renv.snakefile
.. _preprocessing: https://github.com/carissalow/rapids/blob/master/rules/preprocessing.snakefile
.. _features: https://github.com/carissalow/rapids/blob/master/rules/features.snakefile
.. _models: https://github.com/carissalow/rapids/blob/master/rules/models.snakefile
.. _reports: https://github.com/carissalow/rapids/blob/master/rules/reports.snakefile
.. _mystudy: https://github.com/carissalow/rapids/blob/master/rules/mystudy.snakefile
.. _`Rules documentation`: https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#rules
.. _`data directory`: https://github.com/carissalow/rapids/tree/master/src/data
.. _`features directory`: https://github.com/carissalow/rapids/tree/master/src/features
.. _`models directory`: https://github.com/carissalow/rapids/tree/master/src/models
.. _`visualization directory`: https://github.com/carissalow/rapids/tree/master/src/visualization
.. _`config.yaml`: https://github.com/carissalow/rapids/blob/master/config.yaml
.. _`Snakefile`: https://github.com/carissalow/rapids/blob/master/Snakefile

diff --git a/docs/visualization/data_exploration.rst b/docs/visualization/data_exploration.rst
deleted file mode 100644
index 89cbbd4b..00000000
--- a/docs/visualization/data_exploration.rst
+++ /dev/null
@@ -1,216 +0,0 @@
.. _data_exploration:

Data Exploration
================

These plots are in beta; if you get an error while computing them, please let us know.

.. _histogram-of-valid-sensed-hours:

Histogram of valid sensed hours
"""""""""""""""""""""""""""""""

See the `Histogram of Valid Sensed Hours Config Code`_ (an illustrative config snippet follows this section).

**Rule Chain:**

- Rule: ``rules/preprocessing.smk/download_dataset``
- Rule: ``rules/preprocessing.smk/readable_datetime``
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
- Rule: ``rules/reports.smk/histogram_valid_sensed_hours``

.. _figure1-parameters:

**Parameters of the histogram_valid_sensed_hours Rule:**

======================= =======================
Name                    Description
======================= =======================
plot                    Whether the rule is executed or not. The available options are ``True`` and ``False``.
min_valid_bins_per_hour The minimum number of valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS` for more information.
min_valid_hours_per_day The minimum number of valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS` for more information.
======================= =======================

**Observations:**

This histogram shows the valid sensed hours of all participants processed in RAPIDS (see the valid sensed :ref:`bins` and :ref:`days` sections). It can be used as a rough indication of the AWARE client's monitoring coverage during a study for all participants. See Figure 1.

.. figure:: figures/Figure1.png
    :scale: 90 %
    :align: center

    Figure 1 Histogram of valid sensed hours for all participants
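For reference, the relevant section of ``config.yaml`` looks roughly like the snippet below. The key names here are illustrative assumptions, so check the linked config code above for the authoritative ones::

    HISTOGRAM_VALID_SENSED_HOURS:
        PLOT: True
        MIN_VALID_BINS_PER_HOUR: 6
        MIN_VALID_HOURS_PER_DAY: 16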
.. _heatmap-of-phone-sensed-bins:

Heatmap of phone sensed bins
""""""""""""""""""""""""""""

See the `Heatmap of Phone Sensed Bins Config Code`_

**Rule Chain:**

- Rule: ``rules/preprocessing.smk/download_dataset``
- Rule: ``rules/preprocessing.smk/readable_datetime``
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
- Rule: ``rules/reports.smk/heatmap_sensed_bins``

.. _figure2-parameters:

**Parameters of the heatmap_sensed_bins Rule:**

======================= =======================
Name                    Description
======================= =======================
plot                    Whether the rule is executed or not. The available options are ``True`` and ``False``.
bin_size                Every hour is divided into N bins of size ``BIN_SIZE`` (in minutes). It modifies the way we compute the ``data/interim/pXX/phone_sensed_bins.csv`` file.
======================= =======================

**Observations:**

In this heatmap rows are dates, columns are sensed bins for a participant, and each cell's color shows the number of mobile sensors that logged at least one row of data during that bin. This plot shows the periods of time without data for a participant and can be used as a rough indication of whether time-based sensors were following their sensing schedule (e.g., if location was being sensed every 2 minutes). See Figure 2. A toy sketch of the underlying bin computation follows this section.

.. figure:: figures/Figure2.png
    :scale: 90 %
    :align: center

    Figure 2 Heatmap of phone sensed bins for a single participant


.. _heatmap-of-days-by-sensors:

Heatmap of days by sensors
""""""""""""""""""""""""""

See the `Heatmap of Days by Sensors Config Code`_

**Rule Chain:**

- Rule: ``rules/preprocessing.smk/download_dataset``
- Rule: ``rules/preprocessing.smk/readable_datetime``
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
- Rule: ``rules/reports.smk/heatmap_days_by_sensors``

.. _figure3-parameters:

**Parameters of the heatmap_days_by_sensors Rule:**

======================= =======================
Name                    Description
======================= =======================
plot                    Whether the rule is executed or not. The available options are ``True`` and ``False``.
min_valid_bins_per_hour The minimum number of valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS` for more information.
min_valid_hours_per_day The minimum number of valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS` for more information.
expected_num_of_days    The number of days of data to show, starting from the first day of each participant.
db_tables               List of sensor tables used to compute valid bins and hours.
======================= =======================

**Observations:**

In this heatmap rows are sensors, columns are days, and each cell's color shows the normalized (0 to 1) number of valid sensed hours (see the valid sensed :ref:`bins` and :ref:`days` sections) collected by a sensor during a day for a participant. The user can decide how many days of data to show, starting from the first day of each participant. This plot can be used to judge missing data on a per-participant, per-sensor basis, as well as the number of valid sensed hours (usable data) for each day. See Figure 3.

.. figure:: figures/Figure3.png
    :scale: 90 %
    :align: center

    Figure 3 Heatmap of days by sensors for a participant
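To make the bin vocabulary concrete, here is a toy sketch of how rows of sensor data can be grouped into bins and hours. It assumes epoch-millisecond timestamps and pandas, and it is not RAPIDS' implementation:

.. code-block:: python

    # Toy sketch of the valid-bin idea; not RAPIDS' implementation.
    import pandas as pd

    def valid_bins_per_hour(timestamps_ms, bin_size=5):
        """Count, for every hour, how many bin_size-minute bins have >= 1 row."""
        idx = pd.to_datetime(timestamps_ms, unit="ms")
        rows = pd.Series(1, index=idx)
        rows_per_bin = rows.resample(f"{bin_size}min").count()   # rows in each bin
        return (rows_per_bin > 0).resample("1h").sum()           # valid bins per hour

    # An hour is then "valid" when its count >= MIN_VALID_BINS_PER_HOUR, and a
    # day is "valid" when it has >= MIN_VALID_HOURS_PER_DAY valid hours.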
.. _overall-compliance-heatmap:

Overall compliance heatmap
""""""""""""""""""""""""""

See the `Overall Compliance Heatmap Config Code`_

**Rule Chain:**

- Rule: ``rules/preprocessing.smk/download_dataset``
- Rule: ``rules/preprocessing.smk/readable_datetime``
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
- Rule: ``rules/reports.smk/overall_compliance_heatmap``

.. _figure4-parameters:

**Parameters of the overall_compliance_heatmap Rule:**

======================= =======================
Name                    Description
======================= =======================
plot                    Whether the rule is executed or not. The available options are ``True`` and ``False``.
only_show_valid_days    Whether the plot only shows valid days or not. The available options are ``True`` and ``False``.
expected_num_of_days    The number of days to show before today.
bin_size                Every hour is divided into N bins of size ``BIN_SIZE`` (in minutes). It modifies the way we compute the ``data/interim/pXX/phone_sensed_bins.csv`` file.
min_valid_bins_per_hour The minimum number of valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS` for more information.
min_valid_hours_per_day The minimum number of valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS` for more information.
======================= =======================

**Observations:**

In this heatmap rows are participants, columns are days, and each cell's color shows the valid sensed hours for a participant during a day (see the valid sensed :ref:`bins` and :ref:`days` sections). This plot can be configured to show a certain number of days before today using the ``EXPECTED_NUM_OF_DAYS`` parameter (-1 by default, showing all days for every participant). As different participants might join the study on different dates, the x-axis shows a day index instead of a date. This plot gives the user a quick overview of the amount of data collected per person and is complementary to the histogram of valid sensed hours, as it is broken down per participant and per day. See Figure 4.

.. figure:: figures/Figure4.png
    :scale: 90 %
    :align: center

    Figure 4 Overall compliance heatmap for all participants


.. _heatmap-of-correlation-matrix-between-features:

Heatmap of correlation matrix between features
""""""""""""""""""""""""""""""""""""""""""""""

See the `Heatmap of Correlation Matrix Config Code`_

**Rule Chain:**

- Rules to extract features
- Rule: ``rules/preprocessing.smk/download_dataset``
- Rule: ``rules/preprocessing.smk/readable_datetime``
- Rule: ``rules/preprocessing.smk/phone_sensed_bins``
- Rule: ``rules/preprocessing.smk/phone_valid_sensed_days``
- Rule: ``rules/reports.smk/heatmap_features_correlations``

.. _figure5-parameters:

**Parameters of the heatmap_features_correlations Rule:**

======================= ==============
Name                    Description
======================= ==============
plot                    Whether the rule is executed or not. The available options are ``True`` and ``False``.
min_valid_bins_per_hour The minimum number of valid bins an hour should have to be considered valid. A valid bin has at least 1 row of data. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_BINS` for more information.
min_valid_hours_per_day The minimum number of valid hours a day should have to be considered valid. It modifies the way we compute phone valid days. Read :ref:`PHONE_VALID_SENSED_DAYS` for more information.
corr_method             Method of correlation. The available options are ``pearson``, ``kendall`` and ``spearman``.
min_rows_ratio          Minimum number of observations required per pair of columns to have a valid correlation coefficient. Currently only available for the ``pearson`` and ``spearman`` correlations.
phone_features          The list of phone features.
fitbit_features         The list of Fitbit features.
corr_threshold          Only correlation coefficients larger than ``corr_threshold`` are shown in the heatmap.
======================= ==============

**Observations:**

Columns and rows are features computed in RAPIDS, and each cell's color represents the correlation coefficient between all days of data for every pair of features of all participants. The user can specify a minimum number of observations required to compute the correlation between two features using the ``MIN_ROWS_RATIO`` parameter (0.5 by default). In addition, this plot can be configured to display only correlation coefficients above a threshold using the ``CORR_THRESHOLD`` parameter (0.1 by default). See Figure 5. A small pandas sketch of these two parameters follows this section.

.. figure:: figures/Figure5.png
    :scale: 90 %
    :align: center

    Figure 5 Correlation matrix heatmap for all the data of all participants
.. _`Histogram of Valid Sensed Hours Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L221
.. _`Heatmap of Phone Sensed Bins Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L233
.. _`Heatmap of Days by Sensors Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L226
.. _`Overall Compliance Heatmap Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L237
.. _`Heatmap of Correlation Matrix Config Code`: https://github.com/carissalow/rapids/blob/master/config.yaml#L211

diff --git a/docs/visualization/figures/Figure1.png b/docs/visualization/figures/Figure1.png
deleted file mode 100644
index c4d47637..00000000
Binary files a/docs/visualization/figures/Figure1.png and /dev/null differ
diff --git a/docs/visualization/figures/Figure2.png b/docs/visualization/figures/Figure2.png
deleted file mode 100644
index af22ccde..00000000
Binary files a/docs/visualization/figures/Figure2.png and /dev/null differ
diff --git a/docs/visualization/figures/Figure3.png b/docs/visualization/figures/Figure3.png
deleted file mode 100644
index 09fd60b3..00000000
Binary files a/docs/visualization/figures/Figure3.png and /dev/null differ
diff --git a/docs/visualization/figures/Figure4.png b/docs/visualization/figures/Figure4.png
deleted file mode 100644
index 6bfeb752..00000000
Binary files a/docs/visualization/figures/Figure4.png and /dev/null differ
diff --git a/docs/visualization/figures/Figure5.png b/docs/visualization/figures/Figure5.png
deleted file mode 100644
index 93eeeee4..00000000
Binary files a/docs/visualization/figures/Figure5.png and /dev/null differ
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 00000000..9e2155d1
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,51 @@
site_name: RAPIDS
# theme: 'material'
markdown_extensions:
  - toc:
      permalink: True
  - admonition
  - smarty
  - wikilinks
  - codehilite:
      linenums: True
  # - urlize # requires: pip install git+https://github.com/r0wb0t/markdown-urlize.git
  - pymdownx.arithmatex
  - pymdownx.betterem:
      smart_enable: all
  - pymdownx.caret
  - pymdownx.critic
  - pymdownx.details
  - pymdownx.emoji:
      emoji_index: !!python/name:materialx.emoji.twemoji
      emoji_generator: !!python/name:materialx.emoji.to_svg
  - pymdownx.highlight
  - pymdownx.inlinehilite
  - pymdownx.magiclink
  - pymdownx.mark
  - pymdownx.smartsymbols
  - pymdownx.superfences
  - pymdownx.tabbed
  - pymdownx.tasklist:
      custom_checkbox: True
  - pymdownx.tilde
  - attr_list
site_favicon: material/air-filter
extra:
  social:
    - icon: fontawesome/brands/twitter
      link: 'https://twitter.com/julio_ui'
repo_name: 'carissalow/rapids'
repo_url: 'https://github.com/carissalow/rapids'
copyright: 'Released under AGPL'
theme:
  name: material
  palette:
    primary: blue
  icon:
    logo: material/air-filter

pages:
  - Home: 'index.md'
  - Setup:
    - Installation: 'setup/installation.md'
    - Initial Configuration: setup/configuration.md
\ No newline at end of file