Fix questions that were slightly different in the morning.

Use refactored methods.
Fix trailing whitespace.
2023-07-03 21:29:09 +02:00 · 2023-07-03 21:18:15 +02:00 · 2023-07-03 21:17:40 +02:00 · 2023-07-03 21:13:50 +02:00 · 2023-07-03 21:01:15 +02:00 · 2023-07-03 20:52:08 +02:00
96 changed files with 15252 additions and 309 deletions
--- a/.flake8
+++ b/.flake8
@ -0,0 +1,9 @@
+[flake8]
+max-line-length = 88
+extend-ignore =
+    E203,
+    # E501 line too long for docstrings
+    D501
+per-file-ignores =
+    exploration/*.py:E501
+docstring-convention = numpy
--- a/.gitignore
+++ b/.gitignore
@ -5,3 +5,22 @@ __pycache__/
 /exploration/*.ipynb
 /config/*.ipynb
 /statistical_analysis/*.ipynb
+/presentation/*.ipynb
+/machine_learning/intermediate_results/
+/data/features/
+/data/baseline/
+/data/*input*.csv
+/data/daily*
+/data/intradaily*
+/data/raw
+/data/stressfulness_event*
+/data/30min*
+/presentation/*scores.csv
+/presentation/Results.ods
+/presentation/results/
+.Rproj.user
+.Rhistory
+/presentation/*.nb.html
+presentation/event_stressful_detection_half_loso.csv
+presentation/event_stressful_detection_loso.csv
+/statistical_analysis/scale_reliability.nb.html
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,4 @@
+[submodule "rapids"]
+	path = rapids
+	url = https://repo.ijs.si/junoslukan/rapids.git
+	branch = master
--- a/.idea/codeStyles/Project.xml
+++ b/.idea/codeStyles/Project.xml
@ -0,0 +1,6 @@
+<component name="ProjectCodeStyleConfiguration">
+  <code_scheme name="Project" version="173">
+    <option name="RIGHT_MARGIN" value="150" />
+    <option name="SOFT_MARGINS" value="88" />
+  </code_scheme>
+</component>
--- a/.idea/codeStyles/codeStyleConfig.xml
+++ b/.idea/codeStyles/codeStyleConfig.xml
@ -0,0 +1,5 @@
+<component name="ProjectCodeStyleConfiguration">
+  <state>
+    <option name="USE_PER_PROJECT_SETTINGS" value="true" />
+  </state>
+</component>
--- a/.idea/dictionaries/junos.xml
+++ b/.idea/dictionaries/junos.xml
@ -0,0 +1,3 @@
+<component name="ProjectDictionaryState">
+  <dictionary name="junos" />
+</component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,7 +1,31 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (straw2analysis)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="straw2analysis" project-jdk-type="Python SDK" />
+  <component name="PyCharmDSProjectLayout">
+    <option name="id" value="JupyterRightHiddenStructureLayout" />
+  </component>
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
+  <component name="RMarkdownSettings">
+    <option name="renderProfiles">
+      <map>
+        <entry key="file://$PROJECT_DIR$/rapids/src/visualization/merge_heatmap_sensors_per_minute_per_time_segment.Rmd">
+          <value>
+            <RMarkdownRenderProfile>
+              <option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/rapids/src/visualization" />
+            </RMarkdownRenderProfile>
+          </value>
+        </entry>
+        <entry key="file://$PROJECT_DIR$/statistical_analysis/scale_reliability.rmd">
+          <value>
+            <RMarkdownRenderProfile>
+              <option name="lastOutput" value="$PROJECT_DIR$/statistical_analysis/scale_reliability.nb.html" />
+              <option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/statistical_analysis" />
+            </RMarkdownRenderProfile>
+          </value>
+        </entry>
+      </map>
+    </option>
+  </component>
 </project>
--- a/.idea/rGraphicsSettings.xml
+++ b/.idea/rGraphicsSettings.xml
@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="RGraphicsSettings">
+    <option name="height" value="600" />
+    <option name="resolution" value="75" />
+    <option name="version" value="2" />
+    <option name="width" value="960" />
+  </component>
+</project>
--- a/.idea/rMarkdownGraphicsSettings.xml
+++ b/.idea/rMarkdownGraphicsSettings.xml
@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="RMarkdownGraphicsSettings">
+    <option name="globalResolution" value="75" />
+    <option name="version" value="2" />
+  </component>
+</project>
--- a/.idea/rSettings.xml
+++ b/.idea/rSettings.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="RSettings">
+    <option name="interpreterPath" value="C:\Program Files\R\R-4.3.1\bin\R.exe" />
+  </component>
+</project>
--- a/.idea/snakemake-settings.xml
+++ b/.idea/snakemake-settings.xml
@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="SmkProjectSettings" sdk="Python 3.10 (snakemake)" enabled="true" />
+</project>
--- a/.idea/straw2analysis.iml
+++ b/.idea/straw2analysis.iml
@ -5,7 +5,7 @@
      <excludeFolder url="file://$MODULE_DIR$/config/.ipynb_checkpoints" />
      <excludeFolder url="file://$MODULE_DIR$/exploration/.ipynb_checkpoints" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.9 (straw2analysis)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="straw2analysis" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -2,5 +2,6 @@
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/rapids" vcs="Git" />
  </component>
 </project>
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,30 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: isort (python)
+  - repo: https://github.com/psf/black
+    rev: 23.3.0
+    hooks:
+      - id: black
+        language_version: python3
+  - repo: https://github.com/pycqa/flake8
+    rev: 6.0.0
+    hooks:
+      - id: flake8
+#  - repo: https://github.com/mwouts/jupytext
+#    rev: v1.14.7
+#    hooks:
+#      - id: jupytext
+#        args: [ --from, "py:percent", --to, "ipynb" ]
+#        additional_dependencies:
+#          - isort==5.12.0 # Matches hook
+#          - black==23.3.0
+#          - flake8==6.0.0
--- a/README.md
+++ b/README.md
@ -27,9 +27,135 @@ To install:
   ipython kernel install --user --name=straw2analysis
   ```

-2. Provide an .env file to be used by `python-dotenv` which should be placed in the top folder of the application 
+2. Provide a file called `.env` to be used by `python-dotenv` which should be placed in the top folder of the application 
   and should have the form:
   
   ```
   DB_PASSWORD=database-password
   ```
+   
+# RAPIDS
+
+To install RAPIDS, follow the [instructions on their webpage](https://www.rapids.science/1.6/setup/installation/). 
+
+Here, I include additional information related to the installation and specific to the STRAW2analysis project.
+The installation was tested on Windows using Ubuntu 20.04 on Windows Subsystem for Linux ([WSL2](https://docs.microsoft.com/en-us/windows/wsl/install)).
+
+## Custom configuration
+### Credentials
+
+As mentioned under [Database in RAPIDS documentation](https://www.rapids.science/1.6/snippets/database/), a `credentials.yaml` file is needed to connect to a database.
+It should contain:
+
+```yaml
+PSQL_STRAW:
+  database: staw
+  host: 212.235.208.113
+  password: password
+  port: 5432
+  user: staw_db
+```
+
+where `password` needs to be replaced with the actual database password.
+
+## Possible installation issues
+### Missing dependencies for RPostgres
+
+To install `RPostgres` R package (used to connect to the PostgreSQL database), an error might occur:
+
+```text
+------------------------- ANTICONF ERROR ---------------------------
+Configuration failed because libpq was not found. Try installing:
+   * deb: libpq-dev (Debian, Ubuntu, etc)
+   * rpm: postgresql-devel (Fedora, EPEL)
+   * rpm: postgreql8-devel, psstgresql92-devel, postgresql93-devel, or postgresql94-devel (Amazon Linux)
+   * csw: postgresql_dev (Solaris)
+   * brew: libpq (OSX)
+If libpq is already installed, check that either:
+  (i)  'pkg-config' is in your PATH AND PKG_CONFIG_PATH contains a libpq.pc file; or
+  (ii) 'pg_config' is in your PATH.
+If neither can detect , you can set INCLUDE_DIR
+and LIB_DIR manually via:
+  R CMD INSTALL --configure-vars='INCLUDE_DIR=... LIB_DIR=...'
+--------------------------[ ERROR MESSAGE ]----------------------------
+  <stdin>:1:10: fatal error: libpq-fe.h: No such file or directory
+compilation terminated.
+```
+
+The library requires `libpq` for compiling from source, so install accordingly.
+
+### Timezone environment variable for tidyverse (relevant for WSL2)
+
+One of the R packages, `tidyverse`, might need access to the `TZ` environment variable during the installation.
+On Ubuntu 20.04 on WSL2 this triggers the following error:
+
+```text
+> install.packages('tidyverse')
+
+ERROR: configuration failed for package ‘xml2’
+System has not been booted with systemd as init system (PID 1). Can't operate.
+Failed to create bus connection: Host is down
+Warning in system("timedatectl", intern = TRUE) :
+  running command 'timedatectl' had status 1
+Error in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]) :
+  namespace ‘xml2’ 1.3.1 is already loaded, but >= 1.3.2 is required
+Calls: <Anonymous> ... namespaceImportFrom -> asNamespace -> loadNamespace
+Execution halted
+ERROR: lazy loading failed for package ‘tidyverse’
+```
+
+This happens because WSL2 does not use the `timedatectl` service, which provides this variable.
+
+```bash
+~$ timedatectl
+System has not been booted with systemd as init system (PID 1). Can't operate.
+Failed to create bus connection: Host is down
+```
+
+and later 
+
+```bash 
+Warning message:
+In system("timedatectl", intern = TRUE) :
+  running command 'timedatectl' had status 1
+Execution halted
+```
+
+This can be amended by setting the environment variable manually before attempting to install `tidyverse`:
+
+```bash
+export TZ='Europe/Ljubljana'
+```
+
+## Possible runtime issues
+### Unix end of line characters
+
+Upon running rapids, an error might occur:
+
+```bash
+/usr/bin/env: ‘python3\r’: No such file or directory
+```
+
+This is due to Windows style end of line characters. 
+To amend this, I added a `.gitattributes` file to force `git` to check out `rapids` using Unix EOL characters.
+If this still fails, `dos2unix` can be used to change them.
+
+### System has not been booted with systemd as init system (PID 1)
+
+See [the installation issue above](#timezone-environment-variable-for-tidyverse-relevant-for-wsl2).
+
+## Update RAPIDS
+
+To update RAPIDS, first pull and merge [origin](https://github.com/carissalow/rapids), for example with:
+
+```commandline
+git fetch --progress "origin" refs/heads/master
+git merge --no-ff origin/master
+```
+
+Next, update the conda and R virtual environments.
+
+```bash
+R -e 'renv::restore(repos = c(CRAN = "https://packagemanager.rstudio.com/all/__linux__/focal/latest"))'
+```
+
--- a/config/environment.yml
+++ b/config/environment.yml
@ -1,22 +1,28 @@
 name: straw2analysis
 channels:
-  - defaults
  - conda-forge
 dependencies:
-  - python=3.9
+  - python=3.11
  - black
  - isort
  - flake8
+  - flake8-docstrings
+  - imbalanced-learn=0.10.0
  - jupyterlab
  - jupytext
+  - lightgbm
  - mypy
  - nodejs
  - pandas
  - psycopg2 >= 2.9.1
+  - pre-commit
  - python-dotenv
  - pytz
+  - pyprojroot
+  - pyyaml
  - seaborn
  - scikit-learn
  - sqlalchemy
  - statsmodels
  - tabulate
+  - xgboost
--- a/data/app_categories.csv
+++ b/data/app_categories.csv
--- a/data/input_PANAS_NA.csv
+++ b/data/input_PANAS_NA.csv
--- a/data/input_PANAS_negative_affect_mean.csv
+++ b/data/input_PANAS_negative_affect_mean.csv
--- a/data/z_input_PANAS_NA.csv
+++ b/data/z_input_PANAS_NA.csv
--- a/exploration/all_sensors_sequential_addition_scores.xlsx
+++ b/exploration/all_sensors_sequential_addition_scores.xlsx
--- a/exploration/debug_heatmap.py
+++ b/exploration/debug_heatmap.py
@ -0,0 +1,337 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+import pandas as pd
+
+from rapids.src.features.utils.utils import chunk_episodes
+
+# %%
+phone_data_yield = pd.read_csv(  # input paths are hard-coded to the p011 interim files
+    "../rapids/data/interim/p011/phone_yielded_timestamps_with_datetime.csv",
+    parse_dates=["local_date_time"],  # parse this column as datetimes on load
+)
+time_segments_labels = pd.read_csv(
+    "../rapids/data/interim/time_segments/p011_time_segments_labels.csv"
+)
+
+# %%
+phone_data_yield["assigned_segments"] = phone_data_yield[
+    "assigned_segments"
+].str.replace(r"_RR\d+SS#", "#", regex=True)  # drop every "_RR<digits>SS" suffix that directly precedes "#"
+time_segments_labels["label"] = time_segments_labels["label"].str.replace(
+    r"_RR\d+SS$", "", regex=True  # same suffix, here anchored at the end of the label
+)
+
+
+# %% tags=[]
+def filter_data_by_segment(data, time_segment_current):  # return the rows of `data` assigned to `time_segment_current`, with "local_segment" and "timestamps_segment" columns
+    data.dropna(subset=["assigned_segments"], inplace=True)  # NOTE: inplace=True also removes these rows from the caller's DataFrame
+    if data.shape[0] == 0:  # data is empty
+        data["local_segment"] = data["timestamps_segment"] = None
+        return data
+
+    datetime_regex = (
+        r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"  # date and time; the separator class accepts "-", "/" and also "|"
+    )
+    timestamps_regex = r"[0-9]{13}"  # 13-digit number; presumably an epoch timestamp in milliseconds — TODO confirm
+    segment_regex = r"\[({}#{},{};{},{})\]".format(  # "[<label>#<datetime>,<datetime>;<timestamp>,<timestamp>]", capturing the text inside the brackets
+        time_segment_current,  # inserted verbatim, so regex metacharacters in the label are not escaped
+        datetime_regex,
+        datetime_regex,
+        timestamps_regex,
+        timestamps_regex,
+    )
+    data["local_segment"] = data["assigned_segments"].str.extract(  # first match of the capture group; NaN where nothing matches
+        segment_regex, expand=True
+    )
+    data = data.drop(columns=["assigned_segments"])
+    data = data.dropna(subset=["local_segment"])  # keep only rows that matched the requested segment
+    if (
+        data.shape[0] == 0
+    ):  # there are no rows belonging to time_segment after dropping NA
+        data["timestamps_segment"] = None
+    else:
+        data[["local_segment", "timestamps_segment"]] = data["local_segment"].str.split(
+            pat=";", n=1, expand=True  # split once on ";": left part stays in "local_segment", right part becomes "timestamps_segment"
+        )
+
+    # chunk episodes (only for non-empty data that has both start_timestamp and end_timestamp columns)
+    if (
+        (not data.empty)
+        and ("start_timestamp" in data.columns)
+        and ("end_timestamp" in data.columns)
+    ):
+        data = chunk_episodes(data)  # helper imported from rapids; its behaviour is not visible here
+
+    return data
+
+
+# %% tags=[]
+time_segment = "daily"  # segment label that is being debugged
+phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
+
+# %%
+phone_data_yield.tail()  # NOTE: filter_data_by_segment drops rows with NaN "assigned_segments" in place, so this frame may already be modified
+
+# %%
+phone_data_yield_per_segment.tail()
+
+
+# %%
+def getDataForPlot(phone_data_yield_per_segment):  # reshape per-minute rows into one "sensor" row per segment start, with minutes after the start as columns
+    # calculate the length (in minutes) of each segment instance
+    phone_data_yield_per_segment["length"] = (  # NOTE: item assignment also adds this column to the caller's DataFrame
+        phone_data_yield_per_segment["timestamps_segment"]
+        .str.split(",")
+        .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))  # (second - first) / 60000, truncated; assumes "start,end" in epoch milliseconds — TODO confirm
+    )
+    # per (segment, length, date, hour, minute) group keep the max of "sensor" and "local_date_time"; presumably "sensor" is a per-row sensor count — TODO confirm
+    phone_data_yield_per_segment = (
+        phone_data_yield_per_segment.groupby(
+            ["local_segment", "length", "local_date", "local_hour", "local_minute"]
+        )[["sensor", "local_date_time"]]
+        .max()
+        .reset_index()
+    )
+    # extract local start datetime of the segment from "local_segment" column
+    phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
+        phone_data_yield_per_segment["local_segment"].apply(
+            lambda x: x.split("#")[1].split(",")[0]  # text between "#" and the first ","
+        )
+    )
+    # calculate the number of minutes after local start datetime of the segment
+    phone_data_yield_per_segment["minutes_after_segment_start"] = (
+        (
+            phone_data_yield_per_segment["local_date_time"]
+            - phone_data_yield_per_segment["local_segment_start_datetimes"]
+        )
+        / pd.Timedelta(minutes=1)
+    ).astype("int")  # fractional minutes are truncated
+
+    # build the full (segment start, minute) index and impute missing rows with 0
+    columns_for_full_index = phone_data_yield_per_segment[
+        ["local_segment_start_datetimes", "length"]
+    ].drop_duplicates(keep="first")
+    columns_for_full_index = columns_for_full_index.apply(
+        lambda row: [
+            [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)  # minutes 0..length inclusive
+        ],
+        axis=1,
+    )
+    full_index = []
+    for columns in columns_for_full_index:
+        full_index = full_index + columns  # flatten the per-segment lists into one list of [start, minute] pairs
+    full_index = pd.MultiIndex.from_tuples(
+        full_index,
+        names=("local_segment_start_datetimes", "minutes_after_segment_start"),
+    )
+    phone_data_yield_per_segment = (
+        phone_data_yield_per_segment.set_index(
+            ["local_segment_start_datetimes", "minutes_after_segment_start"]
+        )
+        .reindex(full_index)  # NOTE(review): relies on unique (start, minute) pairs; get_data_for_plot drops duplicates before this step
+        .reset_index()
+        .fillna(0)  # fills NaN in every column, not only "sensor"
+    )
+
+    # transpose the dataframe per local start datetime of the segment
+    # and discard the useless index layer
+    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
+        "local_segment_start_datetimes"
+    )[["minutes_after_segment_start", "sensor"]].apply(
+        lambda x: x.set_index("minutes_after_segment_start").transpose()  # one "sensor" row per group, minutes as columns
+    )
+    phone_data_yield_per_segment.index = (
+        phone_data_yield_per_segment.index.get_level_values(
+            "local_segment_start_datetimes"  # keep only the segment-start level of the resulting index
+        )
+    )
+    return phone_data_yield_per_segment
+
+
+# %%
+data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
+
+# %%
+# calculate the length (in minutes) of each segment instance
+phone_data_yield_per_segment["length"] = (
+    phone_data_yield_per_segment["timestamps_segment"]
+    .str.split(",")
+    .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))  # (second - first) / 60000, truncated; assumes "start,end" in epoch milliseconds — TODO confirm
+)
+
+# %%
+phone_data_yield_per_segment.tail()
+
+# %%
+# per (segment, length, date, hour, minute) group keep the max of "sensor" and "local_date_time"; presumably "sensor" is a per-row sensor count — TODO confirm
+phone_data_yield_per_segment = (
+    phone_data_yield_per_segment.groupby(
+        ["local_segment", "length", "local_date", "local_hour", "local_minute"]
+    )[["sensor", "local_date_time"]]
+    .max()
+    .reset_index()
+)
+
+# %%
+# extract local start datetime of the segment from "local_segment" column
+phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
+    phone_data_yield_per_segment["local_segment"].apply(
+        lambda x: x.split("#")[1].split(",")[0]  # text between "#" and the first ","
+    )
+)
+
+# %%
+# calculate the number of minutes after local start datetime of the segment
+phone_data_yield_per_segment["minutes_after_segment_start"] = (
+    (
+        phone_data_yield_per_segment["local_date_time"]
+        - phone_data_yield_per_segment["local_segment_start_datetimes"]
+    )
+    / pd.Timedelta(minutes=1)
+).astype("int")  # fractional minutes are truncated
+
+# %%
+columns_for_full_index = phone_data_yield_per_segment[
+    ["local_segment_start_datetimes", "length"]
+].drop_duplicates(keep="first")
+columns_for_full_index = columns_for_full_index.apply(
+    lambda row: [
+        [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)  # minutes 0..length inclusive
+    ],
+    axis=1,
+)
+
+# %%
+full_index = []
+for columns in columns_for_full_index:
+    full_index = full_index + columns  # flatten the per-segment lists into one list of [start, minute] pairs
+full_index = pd.MultiIndex.from_tuples(
+    full_index, names=("local_segment_start_datetimes", "minutes_after_segment_start")
+)
+
+# %%
+phone_data_yield_per_segment.tail()
+
+# %% [markdown]
+# # A workaround
+
+# %%
+phone_data_yield_per_segment[
+    "local_segment_start_datetimes", "minutes_after_segment_start"  # NOTE(review): this is a tuple key (one column label), unlike the two-column list selection on the right — confirm this is intended
+] = phone_data_yield_per_segment[
+    ["local_segment_start_datetimes", "minutes_after_segment_start"]
+].drop_duplicates(
+    keep="first"
+)
+
+# %%
+phone_data_yield_per_segment.set_index(
+    ["local_segment_start_datetimes", "minutes_after_segment_start"],
+    verify_integrity=True,  # asks pandas to check the new index for duplicates
+).reindex(full_index)  # result is only displayed, not assigned back
+
+# %%
+phone_data_yield_per_segment.head()
+
+
+# %% [markdown]
+# # Retry
+
+
+# %%
+def get_data_for_plot(phone_data_yield_per_segment):  # same pipeline as getDataForPlot, but duplicated (start, minute) rows are dropped before reindexing
+    # calculate the length (in minutes) of each segment instance
+    phone_data_yield_per_segment["length"] = (  # NOTE: item assignment also adds this column to the caller's DataFrame
+        phone_data_yield_per_segment["timestamps_segment"]
+        .str.split(",")
+        .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))  # (second - first) / 60000, truncated; assumes "start,end" in epoch milliseconds — TODO confirm
+    )
+    # per (segment, length, date, hour, minute) group keep the max of "sensor" and "local_date_time"; presumably "sensor" is a per-row sensor count — TODO confirm
+    phone_data_yield_per_segment = (
+        phone_data_yield_per_segment.groupby(
+            ["local_segment", "length", "local_date", "local_hour", "local_minute"]
+        )[["sensor", "local_date_time"]]
+        .max()
+        .reset_index()
+    )
+    # extract local start datetime of the segment from "local_segment" column
+    phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
+        phone_data_yield_per_segment["local_segment"].apply(
+            lambda x: x.split("#")[1].split(",")[0]  # text between "#" and the first ","
+        )
+    )
+    # calculate the number of minutes after local start datetime of the segment
+    phone_data_yield_per_segment["minutes_after_segment_start"] = (
+        (
+            phone_data_yield_per_segment["local_date_time"]
+            - phone_data_yield_per_segment["local_segment_start_datetimes"]
+        )
+        / pd.Timedelta(minutes=1)
+    ).astype("int")  # fractional minutes are truncated
+
+    # build the full (segment start, minute) index and impute missing rows with 0
+    columns_for_full_index = phone_data_yield_per_segment[
+        ["local_segment_start_datetimes", "length"]
+    ].drop_duplicates(keep="first")
+    columns_for_full_index = columns_for_full_index.apply(
+        lambda row: [
+            [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)  # minutes 0..length inclusive
+        ],
+        axis=1,
+    )
+    full_index = []
+    for columns in columns_for_full_index:
+        full_index = full_index + columns  # flatten the per-segment lists into one list of [start, minute] pairs
+    full_index = pd.MultiIndex.from_tuples(
+        full_index,
+        names=("local_segment_start_datetimes", "minutes_after_segment_start"),
+    )
+    phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(  # keep the first row per (start, minute) pair so the index built below is unique
+        subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
+        keep="first",
+    )
+    phone_data_yield_per_segment = (
+        phone_data_yield_per_segment.set_index(
+            ["local_segment_start_datetimes", "minutes_after_segment_start"]
+        )
+        .reindex(full_index)
+        .reset_index()
+        .fillna(0)  # fills NaN in every column, not only "sensor"
+    )
+
+    # transpose the dataframe per local start datetime of the segment
+    # and discard the useless index layer
+    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
+        "local_segment_start_datetimes"
+    )[["minutes_after_segment_start", "sensor"]].apply(
+        lambda x: x.set_index("minutes_after_segment_start").transpose()  # one "sensor" row per group, minutes as columns
+    )
+    phone_data_yield_per_segment.index = (
+        phone_data_yield_per_segment.index.get_level_values(
+            "local_segment_start_datetimes"  # keep only the segment-start level of the resulting index
+        )
+    )
+    return phone_data_yield_per_segment
+
+
+# %%
+phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)  # rebuild the per-segment frame, since the cells above overwrote it
+
+# %%
+data_for_plot_per_segment = get_data_for_plot(phone_data_yield_per_segment)
+
+# %%
--- a/exploration/ex_ml_pipeline.py
+++ b/exploration/ex_ml_pipeline.py
--- a/exploration/expl_app_categories.py
+++ b/exploration/expl_app_categories.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.11.4
+#       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -74,3 +74,29 @@ rows_os_manufacturer = df_category_not_found["package_name"].str.contains(
 # %%
 with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_category_not_found.loc[~rows_os_manufacturer])
+
+# %% [markdown]
+# # Export categories
+
+# %% [markdown]
+# Rename all of "not_found" to "system" or "other".
+
+# %%
+df_app_categories_to_export = df_app_categories.copy()
+rows_os_manufacturer_full = (df_app_categories_to_export["package_name"].str.contains(
+    "|".join(manufacturers + custom_rom + other), case=False
+)) & (df_app_categories_to_export["play_store_genre"] == "not_found")
+df_app_categories_to_export.loc[rows_os_manufacturer_full, "play_store_genre"] = "System"
+
+# %%
+rows_not_found = (df_app_categories_to_export["play_store_genre"] == "not_found")
+df_app_categories_to_export.loc[rows_not_found, "play_store_genre"] = "Other"
+
+# %%
+df_app_categories_to_export["play_store_genre"].value_counts()
+
+# %%
+df_app_categories_to_export.rename(columns={"play_store_genre": "genre"},inplace=True)
+df_app_categories_to_export.to_csv("../data/app_categories.csv", columns=["package_hash","genre"],index=False)
+
+# %%
--- a/exploration/expl_esm_labels.py
+++ b/exploration/expl_esm_labels.py
@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.11.2
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -15,17 +15,33 @@
 # ---

 # %%
-import os
-import sys
+import datetime

 import seaborn as sns

-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
 import participants.query_db
-from features.esm import *
-from features.esm_JCQ import *
+from features.esm import (
+    QUESTIONNAIRE_IDS,
+    clean_up_esm,
+    get_esm_data,
+    increment_answers,
+    preprocess_esm,
+    reassign_question_ids,
+)
+from features.esm_COPE import DICT_COPE_QUESTION_IDS
+from features.esm_JCQ import reverse_jcq_demand_control_scoring
+from features.esm_SAM import DICT_SAM_QUESTION_IDS, extract_stressful_events
+
+# import os
+# import sys
+# nb_dir = os.path.split(os.getcwd())[0]
+# if nb_dir not in sys.path:
+#     sys.path.append(nb_dir)
+
+
+# %%
+save_figs = False
+export_data = True

 # %%
 participants_inactive_usernames = participants.query_db.get_usernames(
@ -41,8 +57,14 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)

 # %%
 df_esm_PANAS = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] == 8)
-    | (df_esm_preprocessed["questionnaire_id"] == 9)
+    (
+        df_esm_preprocessed["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["PANAS_positive_affect"]
+    )
+    | (
+        df_esm_preprocessed["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["PANAS_negative_affect"]
+    )
 ]
 df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)

@ -63,35 +85,47 @@ df_esm_PANAS_daily_means = (
 # %%
 df_esm_PANAS_summary_participant = (
    df_esm_PANAS_daily_means.groupby(["participant_id", "questionnaire_id"])
-    .agg(["mean", "median", "std"])
+    .esm_numeric_mean.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
-df_esm_PANAS_summary_participant.columns = df_esm_PANAS_summary_participant.columns.get_level_values(
-    1
-)
 df_esm_PANAS_summary_participant[
-    "PANAS_subscale"
+    "PANAS subscale"
 ] = df_esm_PANAS_daily_means.questionnaire_id.astype("category").cat.rename_categories(
-    {8.0: "PA", 9.0: "NA"}
+    {8.0: "positive affect", 9.0: "negative affect"}
 )

 # %%
-sns.displot(
-    data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS_subscale", binwidth=0.2
+df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["mean"]
+
+# %%
+df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["std"]
+
+# %%
+df_esm_PANAS_summary_participant.query("std == 0")
+
+# %%
+fig1 = sns.displot(
+    data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS subscale", binwidth=0.2
 )
+fig1.set_axis_labels(x_var="participant mean", y_var="frequency")
+if save_figs:
+    fig1.figure.savefig("PANAS_mean_participant.pdf", dpi=300)

 # %%
 sns.displot(
    data=df_esm_PANAS_summary_participant,
    x="median",
-    hue="PANAS_subscale",
+    hue="PANAS subscale",
    binwidth=0.2,
 )

 # %%
-sns.displot(
-    data=df_esm_PANAS_summary_participant, x="std", hue="PANAS_subscale", binwidth=0.05
+fig2 = sns.displot(
+    data=df_esm_PANAS_summary_participant, x="std", hue="PANAS subscale", binwidth=0.05
 )
+fig2.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
+if save_figs:
+    fig2.figure.savefig("PANAS_std_participant.pdf", dpi=300)

 # %%
 df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
@ -99,10 +133,22 @@ df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
 # %% [markdown]
 # # Stress appraisal measure

+# %%
+df_SAM_all = extract_stressful_events(df_esm_inactive)
+
+# %%
+df_SAM_all.head()
+
 # %%
 df_esm_SAM = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 87)
-    & (df_esm_preprocessed["questionnaire_id"] <= 93)
+    (
+        df_esm_preprocessed["questionnaire_id"]
+        >= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
+    )
+    & (
+        df_esm_preprocessed["questionnaire_id"]
+        <= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
+    )
 ]
 df_esm_SAM_clean = clean_up_esm(df_esm_SAM)

@ -110,9 +156,10 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
 # ## Stressful events

 # %%
-df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign(
-    stressful_event=lambda x: (x.esm_user_answer_numeric > 0)
-)
+df_esm_SAM_event = df_esm_SAM_clean[
+    df_esm_SAM_clean["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
+].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0))

 # %%
 df_esm_SAM_daily_events = (
@ -123,20 +170,22 @@ df_esm_SAM_daily_events = (
 )

 # %% [markdown]
-# Calculate the daily mean of YES (1) or NO (0) answers to the question about a stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
+# Calculate the daily mean of YES (1) or NO (0) answers to the question about stressful events. This is then the daily ratio of EMA sessions that included a stressful event.

 # %%
 df_esm_SAM_event_summary_participant = (
    df_esm_SAM_daily_events.groupby(["participant_id"])
-    .agg(["mean", "median", "std"])
+    .SAM_event_ratio.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
-df_esm_SAM_event_summary_participant.columns = df_esm_SAM_event_summary_participant.columns.get_level_values(
-    1
-)

 # %%
-sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
+fig6 = sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
+fig6.set_axis_labels(
+    x_var="participant proportion of stressful events", y_var="frequency"
+)
+if save_figs:
+    fig6.figure.savefig("SAM_events_mean_participant.pdf", dpi=300)

 # %%
 sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
@ -147,7 +196,12 @@ sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
 # %% [markdown]
 # * Example of threat: "Did this event make you feel anxious?"
 # * Example of challenge: "How eager are you to tackle this event?"
-# * Possible answers: 0 - Not at all, 1 - Slightly, 2 - Moderately, 3 - Considerably, 4 - Extremely
+# * Possible answers:
+#   0 - Not at all,
+#   1 - Slightly,
+#   2 - Moderately,
+#   3 - Considerably,
+#   4 - Extremely

 # %%
 df_esm_SAM_daily = (
@ -159,27 +213,45 @@ df_esm_SAM_daily = (

 # %%
 df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
-    (df_esm_SAM_daily["questionnaire_id"] == 88)
-    | (df_esm_SAM_daily["questionnaire_id"] == 89)
+    (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
+    | (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
 ]

 # %%
 df_esm_SAM_summary_participant = (
    df_esm_SAM_daily.groupby(["participant_id", "questionnaire_id"])
-    .agg(["mean", "median", "std"])
+    .esm_numeric_mean.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
-df_esm_SAM_summary_participant.columns = df_esm_SAM_summary_participant.columns.get_level_values(
-    1
+
+# %%
+df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[
+    df_esm_SAM_summary_participant["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
+]
+df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"]
+
+# %%
+df_esm_SAM_event_stressfulness_summary_participant.describe()["std"]
+
+# %%
+sns.displot(
+    data=df_esm_SAM_event_stressfulness_summary_participant, x="mean", binwidth=0.2
 )

 # %%
 df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
-    (df_esm_SAM_summary_participant["questionnaire_id"] == 88)
-    | (df_esm_SAM_summary_participant["questionnaire_id"] == 89)
+    (
+        df_esm_SAM_summary_participant["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["appraisal_threat"]
+    )
+    | (
+        df_esm_SAM_summary_participant["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["appraisal_challenge"]
+    )
 ]
 df_esm_SAM_threat_challenge_summary_participant[
-    "event_subscale"
+    "event subscale"
 ] = df_esm_SAM_threat_challenge_summary_participant.questionnaire_id.astype(
    "category"
 ).cat.rename_categories(
@ -190,26 +262,84 @@ df_esm_SAM_threat_challenge_summary_participant[
 sns.displot(
    data=df_esm_SAM_threat_challenge_summary_participant,
    x="mean",
-    hue="event_subscale",
+    hue="event subscale",
    binwidth=0.2,
 )

 # %%
-sns.displot(
+fig3 = sns.displot(
    data=df_esm_SAM_threat_challenge_summary_participant,
    x="std",
-    hue="event_subscale",
+    hue="event subscale",
    binwidth=0.1,
 )
+fig3.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
+if save_figs:
+    fig3.figure.savefig("SAM_std_participant.pdf", dpi=300)
+
+# %%
+df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
+    "mean"
+]
+
+# %%
+df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
+    "std"
+]
+
+# %%
+df_esm_SAM_clean.columns
+
+# %%
+df_esm_SAM_clean.esm_status.value_counts()
+
+# %%
+if export_data:
+    df_esm_SAM_fixed = reassign_question_ids(df_esm_SAM_clean, DICT_SAM_QUESTION_IDS)
+    df_esm_SAM_fixed = increment_answers(df_esm_SAM_fixed)
+    # Select the export columns and sort in a single chained expression.
+    # sort_values returns a new frame here, so the column selection taken from
+    # df_esm_SAM_fixed is never modified in place.
+    df_esm_SAM_for_export = df_esm_SAM_fixed[
+        [
+            "participant_id",
+            "username",
+            "device_id",
+            "_id",
+            "esm_trigger",
+            "esm_session",
+            "esm_notification_id",
+            "question_id",
+            "questionnaire_id",
+            "esm_instructions",
+            "double_esm_user_answer_timestamp",
+            "datetime_lj",
+            "date_lj",
+            "time",
+            "esm_user_answer",
+            "esm_user_answer_numeric",
+        ]
+    ].sort_values(by=["participant_id", "device_id", "_id"], ignore_index=True)
+    print(df_esm_SAM_for_export.head())
+    df_esm_SAM_for_export.to_csv(
+        "../data/raw/df_esm_SAM_threat_challenge.csv", index=False
+    )

 # %% [markdown]
 # ## Stressfulness of period

 # %%
 df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
-    df_esm_SAM_summary_participant["questionnaire_id"] == 93
+    df_esm_SAM_summary_participant["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
 ]

+# %%
+df_esm_SAM_period_summary_participant.describe()["mean"]
+
+# %%
+df_esm_SAM_period_summary_participant.describe()["std"]
+
 # %%
 sns.displot(data=df_esm_SAM_period_summary_participant, x="mean", binwidth=0.2)

@ -221,8 +351,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1)

 # %%
 df_esm_JCQ_demand_control = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 10)
-    & (df_esm_preprocessed["questionnaire_id"] <= 11)
+    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"])
+    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"])
 ]
 df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)

@ -242,14 +372,11 @@ df_esm_JCQ_daily = (
 )
 df_esm_JCQ_summary_participant = (
    df_esm_JCQ_daily.groupby(["participant_id", "questionnaire_id"])
-    .agg(["mean", "median", "std"])
+    .esm_score_mean.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
-df_esm_JCQ_summary_participant.columns = df_esm_JCQ_summary_participant.columns.get_level_values(
-    1
-)
 df_esm_JCQ_summary_participant[
-    "JCQ_subscale"
+    "JCQ subscale"
 ] = df_esm_JCQ_summary_participant.questionnaire_id.astype(
    "category"
 ).cat.rename_categories(
@ -257,11 +384,71 @@ df_esm_JCQ_summary_participant[
 )

 # %%
-sns.displot(
-    data=df_esm_JCQ_summary_participant, x="mean", hue="JCQ_subscale", binwidth=0.1,
-)
+df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["mean"]

 # %%
-sns.displot(
-    data=df_esm_JCQ_summary_participant, x="std", hue="JCQ_subscale", binwidth=0.05,
+df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["std"]
+
+# %%
+fig4 = sns.displot(
+    data=df_esm_JCQ_summary_participant,
+    x="mean",
+    hue="JCQ subscale",
+    binwidth=0.1,
 )
+fig4.set_axis_labels(x_var="participant mean", y_var="frequency")
+if save_figs:
+    fig4.figure.savefig("JCQ_mean_participant.pdf", dpi=300)
+
+# %%
+fig5 = sns.displot(
+    data=df_esm_JCQ_summary_participant,
+    x="std",
+    hue="JCQ subscale",
+    binwidth=0.05,
+)
+# Label the figure created in this cell (fig5), not the earlier fig6.
+fig5.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
+if save_figs:
+    fig5.figure.savefig("JCQ_std_participant.pdf", dpi=300)
+
+# %% [markdown]
+# # COPE Inventory
+
+# %%
+df_esm_COPE = df_esm_preprocessed[
+    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"])
+    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"])
+]
+
+# %%
+df_esm_COPE_clean = clean_up_esm(df_esm_COPE)
+df_esm_COPE_clean = increment_answers(df_esm_COPE_clean)
+df_esm_COPE_fixed = reassign_question_ids(df_esm_COPE_clean, DICT_COPE_QUESTION_IDS)
+
+# %%
+if export_data:
+    # Select the export columns and sort in a single chained expression.
+    # sort_values returns a new frame here, so the column selection taken from
+    # df_esm_COPE_fixed is never modified in place.
+    df_esm_COPE_for_export = df_esm_COPE_fixed[
+        [
+            "participant_id",
+            "username",
+            "device_id",
+            "_id",
+            "esm_trigger",
+            "esm_session",
+            "esm_notification_id",
+            "question_id",
+            "questionnaire_id",
+            "esm_instructions",
+            "double_esm_user_answer_timestamp",
+            "datetime_lj",
+            "date_lj",
+            "time",
+            "esm_user_answer",
+            "esm_user_answer_numeric",
+        ]
+    ].sort_values(by=["participant_id", "device_id", "_id"], ignore_index=True)
+    print(df_esm_COPE_for_export.head())
+    df_esm_COPE_for_export.to_csv("../data/raw/df_esm_COPE.csv", index=False)
--- a/exploration/expl_features_analysis.py
+++ b/exploration/expl_features_analysis.py
@ -0,0 +1,318 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %matplotlib inline
+
+import os, sys, math
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+from sklearn.tree import DecisionTreeClassifier
+from sklearn import tree
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
+
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+def calc_entropy(column):
+    """
+    Return the base-2 Shannon entropy of the values in `column`.
+
+    `column` (a pandas series, list, or numpy array) is handed to
+    `np.bincount`, so it has to contain non-negative integers.
+    """
+    # Relative frequency of every integer from 0 up to the largest value
+    value_shares = np.bincount(column) / len(column)
+
+    weighted_log_sum = 0
+    # Values that never occur have a share of 0 and are left out
+    for share in value_shares[value_shares > 0]:
+        weighted_log_sum += share * math.log(share, 2)
+
+    return -weighted_log_sum
+
+
+def calc_information_gain(data, split_name, target_name):
+    """
+    Calculate the information gain of splitting a data set on one column.
+
+    The data set is partitioned by every distinct value of the split column
+    (not only the first two), so constant columns yield a gain of 0 instead of
+    raising an IndexError and the partition weights of multi-valued columns
+    add up correctly.
+
+    Parameters
+    ----------
+    data : pandas.DataFrame
+        Data set containing both columns.
+    split_name : str
+        Column to split on.
+    target_name : str
+        Column whose entropy is measured with calc_entropy.
+
+    Returns
+    -------
+    float
+        Entropy of the target minus the weighted entropy of the partitions.
+    """
+    # Calculate the original entropy
+    original_entropy = calc_entropy(data[target_name])
+    n_rows = data.shape[0]
+
+    # Weighted entropy that remains after the split. Rows with a missing split
+    # value never match a partition and therefore do not contribute.
+    to_subtract = 0
+    for value in data[split_name].dropna().unique():
+        subset = data[data[split_name] == value]
+        prob = subset.shape[0] / n_rows
+        to_subtract += prob * calc_entropy(subset[target_name])
+
+    # Return information gain
+    return original_entropy - to_subtract
+
+
+def get_information_gains(data, target_name):
+    """
+    Compute the information gain of every column of `data` for the target.
+
+    Returns a dictionary that maps each column name (the target column
+    included) to the value returned by calc_information_gain.
+    """
+    return {
+        column: calc_information_gain(data, column, target_name)
+        for column in data.columns
+    }
+
+def n_features_with_highest_info_gain(info_gain_dict, n=None):
+    """
+    Return the `n` entries of `info_gain_dict` with the largest values.
+
+    The result is a dictionary ordered from the highest to the lowest value;
+    every entry is returned when `n` is None.
+    """
+    import heapq
+
+    how_many = len(info_gain_dict) if n is None else n
+    top_items = heapq.nlargest(
+        how_many, info_gain_dict.items(), key=lambda item: item[1]
+    )
+    return dict(top_items)
+
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
+
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+categorical_features = model_input[categorical_feature_colnames].copy()
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
+model_input = pd.concat([numerical_features, categorical_features], axis=1)
+
+# Binarize the target
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True) 
+print(model_input['target'].value_counts(), edges)
+
+# %%
+info_gains = get_information_gains(model_input, 'target')
+
+# %% [markdown]
+# Present the feature importance results
+
+# %%
+print("Total columns:", len(info_gains))
+print(pd.Series(info_gains).value_counts())
+
+n_features_with_highest_info_gain(info_gains, n=189)
+
+# %%
+def compute_impurity(feature, impurity_criterion):
+    """
+    Calculate the impurity of a feature from its relative value frequencies.
+
+    Parameters
+    ----------
+    feature : pandas.Series
+        Values whose impurity is measured.
+    impurity_criterion : str
+        Either 'entropy' or 'gini'.
+
+    Returns
+    -------
+    float
+        The impurity of the feature.
+
+    Raises
+    ------
+    ValueError
+        If the impurity criterion is not supported.
+    """
+    value_shares = feature.value_counts(normalize=True)
+
+    if impurity_criterion == 'gini':
+        return 1 - np.sum(np.square(value_shares))
+    if impurity_criterion == 'entropy':
+        return -1 * np.sum(np.log2(value_shares) * value_shares)
+    raise ValueError('Unknown impurity criterion')
+
+
+def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
+    """
+    Calculate the information gain of splitting a dataset on a descriptive feature.
+
+    The gain is the impurity of the target minus the weighted impurity of the
+    partitions obtained for each level of the descriptive feature. It is
+    computed from the exact partition impurities and weights; rounding to three
+    decimals is applied to the printed values only.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        Dataset containing the target and the descriptive feature.
+    target : str
+        Name of the target column.
+    descriptive_feature : str
+        Name of the column to split on.
+    split_criterion : str
+        Impurity criterion passed to compute_impurity: 'entropy' or 'gini'.
+    print_flag : bool, default False
+        Print the intermediate results.
+
+    Returns
+    -------
+    float
+        The information gain of the split.
+    """
+    if print_flag:
+        print('target feature:', target)
+        print('descriptive_feature:', descriptive_feature)
+        print('split criterion:', split_criterion)
+
+    target_entropy = compute_impurity(df[target], split_criterion)
+
+    # entropy_list stores the impurity of each partition,
+    # weight_list the relative number of observations in each partition
+    entropy_list = []
+    weight_list = []
+
+    # partition the dataset by each level of the descriptive feature
+    for level in df[descriptive_feature].unique():
+        df_feature_level = df[df[descriptive_feature] == level]
+        entropy_list.append(compute_impurity(df_feature_level[target], split_criterion))
+        weight_list.append(len(df_feature_level) / len(df))
+
+    feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
+
+    information_gain = target_entropy - feature_remaining_impurity
+
+    if print_flag:
+        # Rounded for readability; the gain itself uses the exact values
+        print('impurity of partitions:', [round(entropy, 3) for entropy in entropy_list])
+        print('weights of partitions:', [round(weight, 3) for weight in weight_list])
+        print('remaining impurity:', feature_remaining_impurity)
+        print('information gain:', information_gain)
+        print('====================')
+
+    return information_gain
+
+
+def calc_information_gain_2(data, split_name, target_name, split_criterion):
+    """
+    Calculate the information gain of splitting a data set on one column.
+
+    The data set is partitioned by every distinct value of the split column
+    (not only the first two), so constant columns yield a gain of 0 instead of
+    raising an IndexError and the partition weights of multi-valued columns
+    add up correctly.
+
+    Parameters
+    ----------
+    data : pandas.DataFrame
+        Data set containing both columns.
+    split_name : str
+        Column to split on.
+    target_name : str
+        Column whose impurity is measured.
+    split_criterion : str
+        Impurity criterion passed to compute_impurity: 'entropy' or 'gini'.
+
+    Returns
+    -------
+    float
+        Impurity of the target minus the weighted impurity of the partitions.
+    """
+    # Calculate the original impurity
+    original_impurity = compute_impurity(data[target_name], split_criterion)
+    n_rows = data.shape[0]
+
+    # Weighted impurity that remains after the split. Rows with a missing split
+    # value never match a partition and therefore do not contribute.
+    to_subtract = 0
+    for value in data[split_name].dropna().unique():
+        subset = data[data[split_name] == value]
+        prob = subset.shape[0] / n_rows
+        to_subtract += prob * compute_impurity(subset[target_name], split_criterion)
+
+    # Return information gain
+    return original_impurity - to_subtract
+
+
+def get_information_gains_2(data, target_name, split_criterion):
+    """
+    Compute the information gain of every column of `data` for the target.
+
+    Parameters
+    ----------
+    data : pandas.DataFrame
+        Data set containing the features and the target.
+    target_name : str
+        Name of the target column.
+    split_criterion : str
+        Impurity criterion passed on to calc_information_gain_2.
+
+    Returns
+    -------
+    dict
+        Maps each column name (the target column included) to its
+        information gain.
+    """
+    information_gains = {}
+
+    for feature in data.columns:
+        # Split on the feature and measure the impurity of the target, using
+        # the data set passed in rather than the global model_input.
+        information_gains[feature] = calc_information_gain_2(
+            data, feature, target_name, split_criterion
+        )
+
+    return information_gains
+
+# %% [markdown]
+# Present the feature importance results from other methods
+
+# %%
+split_criterion = 'entropy'
+print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
+information_gains = get_information_gains_2(model_input, 'target', split_criterion)
+print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
+n_features_with_highest_info_gain(information_gains)
+
+# %%
+# Present the feature importance using a decision tree (impurity measure set by split_criterion below)
+split_criterion = 'entropy'
+print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
+
+X, y  = model_input.drop(columns=['target', 'pid']), model_input['target']
+imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+X = imputer.fit_transform(X)
+X, _, y, _ =  train_test_split(X, y, random_state=19, test_size=0.25)
+
+
+clf = DecisionTreeClassifier(criterion=split_criterion)
+clf.fit(X, y)
+
+feat_importance = clf.tree_.compute_feature_importances(normalize=False)
+print("feat importance = ", feat_importance)
+print("shape", feat_importance.shape)
+tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
+info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
+info_gains_dict[info_gains_dict > 0]
+
+# %%
+# Bin the tree information gain values
+bins = [-0.1, 0, 0.1] # bins for target's correlations with features
+cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True) 
+plt.title(f"Tree information gains by value ({split_criterion})")
+cut_info_gains.value_counts().plot(kind='bar', color='purple')
+plt.xticks(rotation=45, ha='right')
+print(cut_info_gains.value_counts())
+
+
+pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))
+
+# %%
+# Plot feature importance tree graph 
+plt.figure(figsize=(12,12))
+tree.plot_tree(clf,
+               feature_names = list(model_input.drop(columns=['target', 'pid']).columns), 
+               class_names=True,
+               filled = True, fontsize=5, max_depth=3)
+
+plt.savefig('tree_high_dpi', dpi=800)
+
+
+# %% [markdown]
+# Present the feature importance by correlation with target
+
+# %%
+# Absolute correlation of every feature with the binarized target
+corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
+# corrs.sort_values(ascending=False)
+
+# Bin the correlation values
+bins = [0, 0.1, 0.2, 0.3] # bins for target's correlations with features
+cut_corrs = pd.cut(corrs, bins=bins, labels=['very weak (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True)
+plt.title("Target's correlations with features")
+cut_corrs.value_counts().plot(kind='bar')
+plt.xticks(rotation=45, ha='right')
+print(cut_corrs.value_counts())
+print(corrs[corrs > 0.1]) # or corrs < -0.1])
+# %%
+
+# %%
--- a/exploration/expl_features_groups_analysis.py
+++ b/exploration/expl_features_groups_analysis.py
@ -0,0 +1,328 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %matplotlib inline
+
+import os, sys, math
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.impute import SimpleImputer
+from sklearn.naive_bayes import GaussianNB  
+from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
+from sklearn import metrics 
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
+
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+categorical_features = model_input[categorical_feature_colnames].copy()
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
+model_input = pd.concat([numerical_features, categorical_features], axis=1)
+
+# Binarize the target
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True) 
+
+print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
+print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
+
+
+# %%
+# Add prefix to demographical features
+demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile', 
+                'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
+
+new_names = [(col, "demo_"+col) for col in demo_features]
+model_input.rename(columns=dict(new_names), inplace=True)
+
+demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio', 
+                'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M', 
+                'demo_startlanguage_nl', 'demo_startlanguage_sl']
+
+# %%
+# Get phone and non-phone columns
+import warnings
+
+def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=None, print_flag=False):
+    """
+    Cross-validate a Gaussian naive Bayes classifier on groups of feature columns.
+
+    For each entry of `groups_substrings`, the feature columns are the columns of
+    `df` whose name contains the substring (`include_group=True`) or does not
+    contain it (`include_group=False`); an entry of None selects all columns.
+    The 'pid' and 'target' columns are never used as features and the columns in
+    `with_cols` are always added. A stratified 80 % subsample of the rows is
+    taken, missing values are replaced by column medians and the classifier is
+    evaluated with shuffled, stratified 5-fold cross-validation.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        Model input containing the feature columns plus 'pid' and 'target'.
+    groups_substrings : iterable of (str or None)
+        Substrings identifying the feature groups to evaluate.
+    include_group : bool, default True
+        Whether to predict with (True) or without (False) each group.
+    with_cols : list of str, optional
+        Additional feature columns used in every evaluation.
+    print_flag : bool, default False
+        Print the scores of every evaluated group.
+
+    Returns
+    -------
+    tuple
+        (best_sensor, best_recall, best_f1, best_recall_std, best_f1_std) of the
+        group with the highest mean recall; on ties the first such group wins.
+        The scores are None when `groups_substrings` is empty.
+    """
+    # Avoid a shared mutable default argument
+    with_cols = [] if with_cols is None else list(with_cols)
+
+    best_sensor = None
+    best_recall_score, best_f1_score = None, None
+    best_recall_score_std, best_f1_score_std = None, None
+
+    for fgroup_substr in groups_substrings:
+        if fgroup_substr is None:
+            feature_group_cols = list(df.columns)
+            feature_group_cols.remove("pid")
+            feature_group_cols.remove("target")
+        elif include_group:
+            feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
+        else:
+            feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
+
+        # dict.fromkeys drops columns selected twice while keeping their order
+        selected_cols = list(dict.fromkeys(feature_group_cols + with_cols))
+
+        X, y = df.drop(columns=['target', 'pid'])[selected_cols], df['target']
+        # Keep a stratified 80 % subsample of the rows
+        X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
+
+        # NOTE(review): the medians are computed on all rows before the
+        # cross-validation split, so the folds are not fully independent.
+        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+
+        nb = GaussianNB()
+        with warnings.catch_warnings():
+            # NOTE(review): with n_jobs=-1 the scoring may run in worker
+            # processes where this filter might not apply - confirm.
+            warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
+            model_cv = cross_validate(
+                nb,
+                X=imputer.fit_transform(X),
+                y=y,
+                cv=StratifiedKFold(n_splits=5, shuffle=True),
+                n_jobs=-1,
+                scoring=('accuracy', 'precision', 'recall', 'f1')
+            )
+
+        if print_flag:
+            if include_group:
+                print("\nPrediction with", fgroup_substr)
+            else:
+                print("\nPrediction without", fgroup_substr)
+
+        acc = np.mean(model_cv['test_accuracy'])
+        acc_std = np.std(model_cv['test_accuracy'])
+
+        prec = np.mean(model_cv['test_precision'])
+        prec_std = np.std(model_cv['test_precision'])
+
+        rec = np.mean(model_cv['test_recall'])
+        rec_std = np.std(model_cv['test_recall'])
+
+        f1 = np.mean(model_cv['test_f1'])
+        f1_std = np.std(model_cv['test_f1'])
+
+        if print_flag:
+            print("************************************************")
+            print(f"Accuracy: {acc} (sd={acc_std})")
+            print(f"Precison: {prec} (sd={prec_std})")
+            print(f"Recall: {rec} (sd={rec_std})")
+            print(f"F1: {f1} (sd={f1_std})\n")
+
+        # Compare against None explicitly: a recall of 0.0 is a valid best score
+        if best_recall_score is None or rec > best_recall_score:
+            best_sensor = fgroup_substr
+            best_recall_score, best_f1_score = rec, f1
+            best_recall_score_std, best_f1_score_std = rec_std, f1_std
+
+    return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std
+
+# %% [markdown]
+# ### sensor big feature groups (phone, empatica, demographical)
+
+# %%
+# Evaluate the model without each of the three big feature groups in turn
+big_groups_substr = ["phone_", "empatica_", "demo_"]
+make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)
+
+# %% [markdown]
+# ### Empatica sensor groups
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
+# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
+
+# %% [markdown]
+# ### Phone sensor groups
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
+# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", 
+#                 "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
+
+# %%
+# List all the sensor groups (phone, empatica); the other (demographic) columns are kept separate
+
+sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
+                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
+# %%
+def find_sensor_group_features_importance(model_input, sensor_groups_strings):
+    """
+    Rank sensor groups by adding them to the feature set one at a time.
+
+    In every round, make_predictions_with_sensor_groups is called with the
+    groups that have not been chosen yet, while the columns starting with the
+    names of the already chosen groups are passed along as `with_cols`. The
+    group it returns as best is recorded and removed from the remaining
+    groups. Both arguments are copied, so the caller's objects are not changed.
+
+    Returns
+    -------
+    list of tuple
+        One (sensor, recall, f1, recall_std, f1_std) tuple per round, in the
+        order in which the groups were chosen.
+    """
+    data = model_input.copy()
+    remaining_groups = sensor_groups_strings.copy()
+    selection_history = []
+
+    for _ in range(len(remaining_groups)):
+        chosen_prefixes = [entry[0] for entry in selection_history]
+        kept_cols = [
+            name
+            for name in data.columns
+            if any(name.startswith(prefix) for prefix in chosen_prefixes)
+        ]
+
+        winner, recall, f1, recall_sd, f1_sd = make_predictions_with_sensor_groups(
+            data,
+            groups_substrings=remaining_groups,
+            include_group=True,
+            with_cols=kept_cols,
+        )
+        selection_history.append((winner, recall, f1, recall_sd, f1_sd))
+        print(f"\nAdded sensor: {winner}\n")
+        remaining_groups.remove(winner)
+
+    return selection_history
+
+
+# %%
+# Function for splitting a list of tuples into 5 lists
+def sort_tuples_to_lists(list_of_tuples):
+    """
+    Split a list of 5-element tuples into five parallel lists.
+
+    The first list holds the first element of every tuple unchanged. The other
+    four lists (recall, F-score, recall std, F-score std) hold the second to
+    fifth elements, each rounded to 4 decimal places.
+
+    Returns
+    -------
+    tuple of list
+        (xs, y_recall, y_fscore, recall_std, fscore_std)
+    """
+    rows = list(list_of_tuples)
+    xs = [row[0] for row in rows]
+    y_recall, y_fscore, recall_std, fscore_std = (
+        [round(row[position], 4) for row in rows] for position in range(1, 5)
+    )
+    return xs, y_recall, y_fscore, recall_std, fscore_std
+
+def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+                                                        title="Sequential addition of features and its F1, and recall scores"):
+    """
+    Plot the recall and F1 scores obtained while features are added one by one.
+
+    Two subplots sharing the x axis are drawn: recall scores on top and
+    F1-scores below. Each shows the mean score together with dotted lines one
+    standard deviation above and below it, and marks the highest mean score
+    with a black dot. The figure is shown with plt.show().
+
+    Parameters
+    ----------
+    xs : list
+        Labels for the x axis.
+    y_recall, y_fscore : list of float
+        Mean recall and F1 scores, one per entry of `xs`.
+    recall_std, fscore_std : list of float
+        Standard deviations of the recall and F1 scores.
+    title : str
+        Title of the whole figure.
+    """
+    fig, (recall_ax, fscore_ax) = plt.subplots(nrows=2, sharex=True)
+
+    # Recall: upper band, mean, lower band, then the best mean
+    recall_mean, recall_sd = np.array(y_recall), np.array(recall_std)
+    recall_ax.plot(xs, recall_mean + recall_sd, linestyle=":", color='m')
+    recall_ax.plot(xs, y_recall, color='red')
+    recall_ax.plot(xs, recall_mean - recall_sd, linestyle=":", color='m')
+    best_recall_at = np.argmax(y_recall)
+    recall_ax.plot(xs[best_recall_at], y_recall[best_recall_at], "-o", color='black')
+    recall_ax.legend(["Upper std", "Mean Recall", "Lower std"])
+
+    # F1-score: same layout as above
+    fscore_mean, fscore_sd = np.array(y_fscore), np.array(fscore_std)
+    fscore_ax.plot(xs, fscore_mean + fscore_sd, linestyle=":", color='c')
+    fscore_ax.plot(xs, y_fscore)
+    fscore_ax.plot(xs, fscore_mean - fscore_sd, linestyle=":", color='c')
+    best_fscore_at = np.argmax(y_fscore)
+    fscore_ax.plot(xs[best_fscore_at], y_fscore[best_fscore_at], "-o", color='black')
+    fscore_ax.legend(["Upper std", "Mean F1-score", "Lower std"])
+
+    fig.set_size_inches(18.5, 10.5)
+
+    recall_ax.title.set_text('Recall scores')
+    fscore_ax.title.set_text('F1-scores')
+    plt.suptitle(title, fontsize=14)
+    plt.xticks(rotation=90)
+    plt.show()
+
+# %%
+sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
+                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
+
+# sensors_features_groups = ["phone_", "empatica_", "demo_"]
+
+# %%
+# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
+sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
+xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)
+
+# %% [markdown]
+# ### Visualize sensors groups F1 and recall scores
+print(sensor_groups_importance_scores)
+plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+                                                    title="Sequential addition of sensors and its F1, and recall scores")
+
+# %%
+# Take the most important feature group and investigate it feature-by-feature
+best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
+best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
+
+# best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
+
+# xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(best_sensor_features_scores)
+
+# %% [markdown]
+# ### Visualize best sensor's F1 and recall scores
+# print(best_sensor_features_scores)
+# plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+#                                                     title="Best sensor addition it's features with F1 and recall scores")
+
+# %%
+# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
+# It also saves the sequence of scores for all sensors' features in excel file
+# Columns of the exported table; the two std columns are listed explicitly because they are written as well.
+seq_columns = ["sensor_name", "feature_sequence", "recall", "f1_score", "recall_std", "f1_std"]
+# One record per sensor group. The DataFrame is built once after the loop instead of
+# concatenating onto an initially empty DataFrame in every iteration.
+feature_sequence_rows = []
+for sensor_group in sensor_groups_importance_scores:
+    sensor_name = sensor_group[0]  # first tuple element is the sensor group's column prefix
+
+    current_sensor_features = [col for col in model_input if col.startswith(sensor_name)]
+    current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
+    xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(current_sensor_features_scores)
+    # Each cell holds the whole sequence (a list) for this sensor group.
+    feature_sequence_rows.append({"sensor_name": sensor_name, "feature_sequence": xs, "recall": y_recall,
+                                  "f1_score": y_fscore, "recall_std": recall_std, "f1_std": fscore_std})
+
+    plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+    title=f"Sequential addition of features for {sensor_name} and its F1, and recall scores")
+
+feature_sequence = pd.DataFrame(feature_sequence_rows, columns=seq_columns)
+feature_sequence.to_excel("all_sensors_sequential_addition_scores.xlsx", index=False)
+
+# %%
+# TODO: add a method that reads the data from the Excel file written above, and a method
+# that keeps only the features whose score is at most a threshold [%] below the maximum
+# (primarily for recall, possibly for F1). The latter should take the threshold as a parameter.
+
+# %%
+
--- a/exploration/expl_proximity.py
+++ b/exploration/expl_proximity.py
@ -16,6 +16,7 @@
 # %%
 # %matplotlib inline
 import datetime
+import importlib
 import os
 import sys

@ -32,13 +33,16 @@ import participants.query_db
 TZ_LJ = timezone("Europe/Ljubljana")

 # %%
-from features.proximity import *
+from features import helper, proximity
+
+# %%
+importlib.reload(proximity)

 # %% [markdown]
 # # Basic characteristics

 # %%
-df_proximity_nokia = get_proximity_data(["nokia_0000003"])
+df_proximity_nokia = proximity.get_proximity_data(["nokia_0000003"])
 print(df_proximity_nokia)

 # %%
@ -53,7 +57,7 @@ df_proximity_nokia.double_proximity.value_counts()

 # %%
 participants_inactive_usernames = participants.query_db.get_usernames()
-df_proximity_inactive = get_proximity_data(participants_inactive_usernames)
+df_proximity_inactive = proximity.get_proximity_data(participants_inactive_usernames)

 # %%
 df_proximity_inactive.double_proximity.describe()
@ -110,3 +114,13 @@ df_proximity_combinations[
    (df_proximity_combinations[5.0] != 0)
    & (df_proximity_combinations[5.00030517578125] != 0)
 ]
+
+# %% [markdown]
+# # Features
+
+# %%
+df_proximity_inactive = helper.get_date_from_timestamp(df_proximity_inactive)
+
+# %%
+df_proximity_features = proximity.count_proximity(df_proximity_inactive, ["date_lj"])
+display(df_proximity_features)
--- a/exploration/expl_stress_event.py
+++ b/exploration/expl_stress_event.py
@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+import os
+import sys
+import datetime
+import math
+
+import seaborn as sns
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+import participants.query_db
+from features.esm import *
+from features.esm_JCQ import *
+from features.esm_SAM import *
+
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+# %%
+participants_inactive_usernames = participants.query_db.get_usernames(
+    collection_start=datetime.date.fromisoformat("2020-08-01")
+)
+df_esm_inactive = get_esm_data(participants_inactive_usernames)
+
+# %%
+df_esm_preprocessed = preprocess_esm(df_esm_inactive)
+
+
+# %% [markdown]
+# Investigate stressfulness events
+# %%
+extracted_ers = df_esm_preprocessed.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
+extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
+session_start_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].min().to_frame().rename(columns={'timestamp': 'session_start_timestamp'}) # questionnaire start timestamp
+session_end_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
+se_time = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
+se_duration = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
+
+# Make se_durations to the appropriate lengths
+
+# Extracted 3 targets that will be transferred in the CSV file to the cleaning script.
+df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].columns
+se_stressfulness_event_tg = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'appraisal_stressfulness_event'})
+
+# All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count)
+extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
+                                .join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
+                                .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
+                                .join(se_time, on=['device_id', 'esm_session'], how='left') \
+                                .join(se_duration, on=['device_id', 'esm_session'], how='left') \
+
+# Filter-out the sessions that are not useful. Because of the ambiguity this excludes: 
+# (1) straw event times that are marked as "0 - I don't remember"
+# (2) straw event durations that are marked as "0 - I don't remember" 
+extracted_ers = extracted_ers[(~extracted_ers.se_time.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("Removed "))]
+extracted_ers.reset_index(drop=True, inplace=True)
+
+# Add a default duration in case the participant answered that no stressful event occurred
+
+# Prepare data to fit the data structure in the CSV file ...
+# Use the start of the questionnaire as the event time if no stress event occurred
+extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])
+# The value is either an int (timestamp [ms]), which stays the same, or a datetime str, which is converted to a timestamp in milliseconds
+extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
+extracted_ers['shift_direction'] = -1
+
+""">>>>> begin section (could be optimized) <<<<<"""
+
+# Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire
+# is taken as end time of the segment. Else the user input duration is taken. 
+extracted_ers['temp_duration'] = extracted_ers['se_duration']
+extracted_ers['se_duration'] = \
+    np.where(
+        extracted_ers['se_duration'].astype(str).str.startswith("1 - "),
+        extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'], 
+        extracted_ers['se_duration']
+    )
+
+# This converts both the rows holding a duration in milliseconds and the rows holding a datetime string to a duration in seconds.
+
+extracted_ers['se_duration'] = \
+    extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else abs(pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60)
+
+# Check whether min se_duration is at least the same duration as the ioi. Filter-out the rest.
+
+""">>>>> end section <<<<<"""
+
+# %% [markdown]
+# Count negative values of duration
+print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
+print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
+print("Count negative durations (invalid se_time user input):", extracted_ers[extracted_ers['se_duration'] < 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
+print("Count 0 durations:", extracted_ers[extracted_ers['se_duration'] == 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
+extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]
+extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']]
+
+ax = extracted_ers.hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)
+hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna())
+hist
+bin_edges
+
+extracted_ers = extracted_ers[extracted_ers['se_duration'] >= 0]
+
+# %%
+# bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos'  ..... right=False
+bins = [-100000000, -0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
+
+extracted_ers['bins'], edges = pd.cut(extracted_ers.se_duration, bins=bins, labels=['neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
+sns.displot(
+    data=extracted_ers.dropna(),
+    x="bins",
+    binwidth=0.1,
+)
+
+# %% [markdown]
+extracted_ers[extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'] >= 0]
+extracted_ers['se_time'].value_counts()
+pd.set_option('display.max_rows', 100)
+# Here we are interested in how far the stressful event times are from the end of the questionnaire.
+# .copy() makes the filtered frame independent, so the column assignments below do not act on a slice of the old frame.
+extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()].copy() # Remove no stress events
+# Both timestamps are in milliseconds; convert the difference to seconds so it matches the second-based edges of bins2.
+extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp']) / 1000
+
+print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
+# Report the number of rows (like the other two counts), not the rows themselves.
+print("Count negative durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] < 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
+print("Count 0 durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] == 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
+
+extracted_ers[extracted_ers['diff_se_time_session_end'] < 0]['diff_se_time_session_end']
+# extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)]
+bins2 = [-100000, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more' (edges in seconds)
+extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
+extracted_ers['bins2']
+sns.displot(
+    data=extracted_ers.dropna(),
+    x="bins2",
+    binwidth=0.1,
+)
+
+extracted_ers.shape
+extracted_ers.dropna().shape
+
+print()
+
+
+# %%
+extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int)
+print("duration-target (corr):", extracted_ers['se_duration'].corr(extracted_ers['appraisal_stressfulness_event_num']))
+
+# %%
+# Explore groupby participants?
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@ -0,0 +1,93 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% 
+import sys, os
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import recall_score, f1_score
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+from machine_learning.cross_validation import CrossValidation
+from machine_learning.preprocessing import Preprocessing
+from machine_learning.feature_selection import FeatureSelection
+
+# %% 
+df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+df.set_index(index_columns, inplace=True)
+
+# Create binary target 
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
+
+
+nan_cols = df.columns[df.isna().any()].tolist()
+df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
+
+cv = CrossValidation(data=df, cv_method="logo")
+
+categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
+interval_feature_list, other_feature_list = [], []
+
+# %%
+# Exploratory run of the whole pipeline: one-hot encoding, a baseline prediction,
+# feature selection and a second prediction. The `break` at the end of the body
+# stops the loop after the first cross-validation split.
+for split in cv.get_splits():
+    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
+    # One-hot encode the categorical columns via the project's Preprocessing helper
+    # and read the transformed train/test sets back.
+    pre = Preprocessing(train_X, train_y, test_X, test_y)
+    pre.one_hot_encode_train_and_test_sets(categorical_columns)
+    train_X, train_y, test_X, test_y = pre.get_train_test_sets()
+    
+    
+    print(train_X.shape, test_X.shape)
+    # Predict before feature selection
+    # (small forest of 10 trees trained on all features)
+    rfc = RandomForestClassifier(n_estimators=10)
+    rfc.fit(train_X, train_y)
+    predictions = rfc.predict(test_X)
+    
+    print("Recall:", recall_score(test_y, predictions))
+    print("F1:", f1_score(test_y, predictions))
+    
+    # Feature selection on train set
+    # (only train_groups is used below; test_groups is unpacked but unused)
+    train_groups, test_groups = cv.get_groups_sets(split)
+
+    fs = FeatureSelection(train_X, train_y, train_groups) 
+    selected_features = fs.select_features(n_min=20, n_max=29, k=40,
+                                           ml_type="classification_bin", 
+                                           metric="recall", n_tolerance=20)
+    
+    # Restrict both sets to the columns chosen on the train set.
+    train_X = train_X[selected_features]
+    test_X = test_X[selected_features]
+    
+    print(selected_features)
+    print(len(selected_features))
+    
+    # Predict after feature selection    
+    # NOTE: this forest uses 500 trees versus 10 above, so the before/after scores
+    # differ in model size as well as in the feature set.
+    rfc = RandomForestClassifier(n_estimators=500)
+    rfc.fit(train_X, train_y)
+    predictions = rfc.predict(test_X)
+    
+    print("Recall:", recall_score(test_y, predictions))
+    print("F1:", f1_score(test_y, predictions))
+    
+    break
+
+# %%
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@ -0,0 +1,134 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# from IPython.core.interactiveshell import InteractiveShell
+from pathlib import Path
+
+# matplotlib inline
+# import os
+# import sys
+import pandas as pd
+
+from machine_learning.helper import (
+    impute_encode_categorical_features,
+    prepare_cross_validator,
+    prepare_sklearn_data_format,
+    run_all_classification_models,
+)
+
+# InteractiveShell.ast_node_interactivity = "all"
+#
+# nb_dir = os.path.split(os.getcwd())[0]
+# if nb_dir not in sys.path:
+#     sys.path.append(nb_dir)
+
+
+# %%
+CV_METHOD = "logo"  # logo, half_logo, 5kfold
+# Cross-validation method (could be regarded as a hyperparameter)
+print("CV_METHOD: " + CV_METHOD)
+N_SL = 3  # Number of largest/smallest accuracies (of particular CV) outputs
+UNDERSAMPLING = False
+# (bool) If True this will train and test data on balanced dataset
+# (using undersampling method)
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+PATH_BASE = Path("E:/STRAWresults/20230415")
+
+SEGMENT_TYPE = "period"
+print("SEGMENT_TYPE: " + SEGMENT_TYPE)
+SEGMENT_LENGTH = "30_minutes_before"
+print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
+TARGET_VARIABLE = "JCQ_job_control"
+print("TARGET_VARIABLE: " + TARGET_VARIABLE)
+
+if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
+    TARGET_VARIABLE += "_"
+    TARGET_VARIABLE += SEGMENT_TYPE
+
+PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
+
+model_input = pd.read_csv(PATH_FULL)
+
+if SEGMENT_LENGTH == "daily":
+    DAY_LENGTH = "daily"  # or "working"
+    print(DAY_LENGTH)
+    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+model_input["target"].value_counts()
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# bins = [-10, 0, 10] # bins for z-scored targets
+BINS = [-1, 0, 4]  # bins for stressfulness (0-4) target
+print("BINS: ", BINS)
+# Discretize the numeric target into labelled intervals (right edge inclusive).
+model_input["target"], edges = pd.cut(
+    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
+)  # ['low', 'medium', 'high']
+print(model_input["target"].value_counts())
+REMOVE_MEDIUM = True
+if REMOVE_MEDIUM:
+    # `"medium" in <Series>` tests the index labels, not the values, so the
+    # values are compared explicitly.
+    is_medium = model_input["target"].astype(str) == "medium"
+    if is_medium.any():
+        model_input = model_input[~is_medium].copy()
+    # Binary encoding (low -> 0, high -> 1), as expected by the undersampling
+    # cell that follows. Rows whose target fell outside BINS stay NaN.
+    model_input["target"] = (
+        model_input["target"].astype(object).map({"low": 0, "high": 1})
+    )
+else:
+    # Three-class encoding, used only when the medium class is kept.
+    model_input["target"] = model_input["target"].map(
+        {"low": 0, "medium": 1, "high": 2}
+    )
+    print(model_input["target"].value_counts())
+
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# UnderSampling
+if UNDERSAMPLING:
+    no_stress = model_input[model_input["target"] == 0]
+    stress = model_input[model_input["target"] == 1]
+
+    no_stress = no_stress.sample(n=len(stress))
+    model_input = pd.concat([stress, no_stress], axis=0)
+
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+model_input_encoded = impute_encode_categorical_features(model_input)
+# %%
+data_x, data_y, data_groups = prepare_sklearn_data_format(
+    model_input_encoded, CV_METHOD
+)
+cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
+
+# %%
+data_y.head()
+
+# %%
+data_y.tail()
+# %%
+data_y.shape
+# %%
+scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
+# %%
+PATH_OUTPUT = Path("..") / Path("presentation/results")
+path_output_full = PATH_OUTPUT / (
+    TARGET_VARIABLE
+    + "_"
+    + SEGMENT_LENGTH
+    + "_classification"
+    + str(BINS)
+    + "_"
+    + CV_METHOD
+    + ".csv"
+)
+scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_classification_composite.py
+++ b/exploration/ml_pipeline_classification_composite.py
@ -0,0 +1,177 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+from pathlib import Path
+
+import pandas as pd
+import seaborn as sns
+from sklearn.decomposition import PCA
+
+from machine_learning.helper import (
+    impute_encode_categorical_features,
+    prepare_cross_validator,
+    prepare_sklearn_data_format,
+    run_all_classification_models,
+)
+
+# %%
+CV_METHOD = "logo"  # logo, half_logo, 5kfold
+# Cross-validation method (could be regarded as a hyperparameter)
+print("CV_METHOD: " + CV_METHOD)
+N_SL = 3  # Number of largest/smallest accuracies (of particular CV) outputs
+UNDERSAMPLING = False
+# (bool) If True this will train and test data on balanced dataset
+# (using undersampling method)
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+PATH_BASE = Path("E:/STRAWresults/20230415")
+
+SEGMENT_TYPE = "period"
+print("SEGMENT_TYPE: " + SEGMENT_TYPE)
+SEGMENT_LENGTH = "30_minutes_before"
+print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
+
+PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"
+
+all_features_with_baseline = pd.read_csv(PATH_FULL)
+
+# %%
+TARGETS = [
+    "PANAS_negative_affect_mean",
+    "PANAS_positive_affect_mean",
+    "JCQ_job_demand_mean",
+    "JCQ_job_control_mean",
+    "appraisal_stressfulness_period_mean",
+]
+
+# %%
+all_features_cleaned = pd.DataFrame()
+for target in TARGETS:
+    PATH_FULL = (
+        PATH_BASE
+        / SEGMENT_LENGTH
+        / "features"
+        / ("all_sensor_features_cleaned_straw_py_(" + target + ").csv")
+    )
+    current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
+    if all_features_cleaned.empty:
+        all_features_cleaned = current_features
+    else:
+        all_features_cleaned = all_features_cleaned.join(
+            current_features[("phone_esm_straw_" + target)],
+            how="inner",
+            rsuffix="_" + target,
+        )
+    print(all_features_cleaned.shape)
+
+# %%
+pca = PCA(n_components=1)
+TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]
+pca.fit(all_features_cleaned[TARGETS_PREFIXED])
+print(pca.explained_variance_ratio_)
+
+# %%
+model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
+model_input["target"] = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])
+
+# %%
+sns.histplot(data=model_input, x="target")
+
+# %%
+model_input.target.quantile(0.6)
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# bins = [-10, 0, 10] # bins for z-scored targets
+BINS = [-10, 0, 10]  # bins for the composite target (first principal component)
+print("BINS: ", BINS)
+# Discretize the composite score into labelled intervals (right edge inclusive).
+model_input["target"], edges = pd.cut(
+    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
+)  # ['low', 'medium', 'high']
+print(model_input["target"].value_counts())
+REMOVE_MEDIUM = True
+if REMOVE_MEDIUM:
+    # `"medium" in <Series>` tests the index labels, not the values, so the
+    # values are compared explicitly.
+    is_medium = model_input["target"].astype(str) == "medium"
+    if is_medium.any():
+        model_input = model_input[~is_medium].copy()
+    # Binary encoding: "low" -> 0, every other label -> 1.
+    model_input["target"] = (
+        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
+    )
+else:
+    # Three-class encoding, used only when the medium class is kept.
+    model_input["target"] = model_input["target"].map(
+        {"low": 0, "medium": 1, "high": 2}
+    )
+    print(model_input["target"].value_counts())
+
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# UnderSampling
+if UNDERSAMPLING:
+    no_stress = model_input[model_input["target"] == 0]
+    stress = model_input[model_input["target"] == 1]
+
+    no_stress = no_stress.sample(n=len(stress))
+    model_input = pd.concat([stress, no_stress], axis=0)
+
+
+# %%
+TARGET_VARIABLE = "PANAS_negative_affect"
+print("TARGET_VARIABLE: " + TARGET_VARIABLE)
+
+PATH_FULL_HELP = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
+
+model_input_with_baseline = pd.read_csv(PATH_FULL_HELP, index_col="local_segment")
+
+# %%
+baseline_col_names = [
+    col for col in model_input_with_baseline.columns if col not in model_input.columns
+]
+print(baseline_col_names)
+
+# %%
+model_input = model_input.join(
+    model_input_with_baseline[baseline_col_names], how="left"
+)
+model_input.reset_index(inplace=True)
+
+# %%
+model_input_encoded = impute_encode_categorical_features(model_input)
+
+# %%
+data_x, data_y, data_groups = prepare_sklearn_data_format(
+    model_input_encoded, CV_METHOD
+)
+cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
+
+# %%
+data_y.head()
+
+# %%
+data_y.tail()
+# %%
+data_y.shape
+# %%
+scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
+# %%
+PATH_OUTPUT = Path("..") / Path("presentation/results")
+path_output_full = PATH_OUTPUT / (
+    "composite_"
+    + SEGMENT_LENGTH
+    + "_classification"
+    + str(BINS)
+    + "_"
+    + CV_METHOD
+    + ".csv"
+)
+scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@ -0,0 +1,239 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.cluster import KMeans
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_validate
+
+from machine_learning.classification_models import ClassificationModels
+
+# %%
+# ## Set script's parameters
+N_CLUSTERS = 4  # Number of clusters (could be regarded as a hyperparameter)
+CV_METHOD = "logo"  # logo, half_logo, 5kfold
+# Cross-validation method (could be regarded as a hyperparameter)
+# Any value other than "logo" or "half_logo" falls back to a shuffled
+# 5-fold StratifiedKFold further below.
+N_SL = 1  # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %%
+PATH_BASE = Path("E:/STRAWresults/20230415")
+
+SEGMENT_TYPE = "period"
+print("SEGMENT_TYPE: " + SEGMENT_TYPE)
+SEGMENT_LENGTH = "30_minutes_before"
+print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
+TARGET_VARIABLE = "appraisal_stressfulness"
+print("TARGET_VARIABLE: " + TARGET_VARIABLE)
+
+if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
+    TARGET_VARIABLE += "_"
+    TARGET_VARIABLE += SEGMENT_TYPE
+
+PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
+
+model_input = pd.read_csv(PATH_FULL)
+
+if SEGMENT_LENGTH == "daily":
+    DAY_LENGTH = "daily"  # or "working"
+    print(DAY_LENGTH)
+    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
+
+# %% jupyter={"source_hidden": true}
+index_columns = [
+    "local_segment",
+    "local_segment_label",
+    "local_segment_start_datetime",
+    "local_segment_end_datetime",
+]
+
+CLUST_COL = "limesurvey_demand_control_ratio_quartile"
+print("CLUST_COL: " + CLUST_COL)
+
+BINS = [-1, 0, 4]
+print("BINS: " + str(BINS))
+
+model_input[CLUST_COL].describe()
+
+
+# %%
+model_input["target"].value_counts()
+
+# %% jupyter={"source_hidden": true}
+# Filter-out outlier rows by clust_col
+# model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+
+uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
+uniq = uniq.dropna()
+plt.bar(uniq["pid"], uniq[CLUST_COL])
+
+# %% jupyter={"source_hidden": true}
+# Get clusters by cluster col & and merge the clusters to main df
+km = KMeans(n_clusters=N_CLUSTERS).fit_predict(uniq.set_index("pid"))
+np.unique(km, return_counts=True)
+uniq["cluster"] = km
+
+model_input = model_input.merge(uniq[["pid", "cluster"]])
+
+# %%
+model_input[["cluster", "target"]].value_counts().sort_index()
+
+# %% jupyter={"source_hidden": true}
+model_input.set_index(index_columns, inplace=True)
+
+# %% jupyter={"source_hidden": true}
+# Create dict with classification ml models
+cm = ClassificationModels()
+cmodels = cm.get_cmodels()
+
+# %% jupyter={"source_hidden": true}
+for k in range(N_CLUSTERS):
+    model_input_subset = model_input[model_input["cluster"] == k].copy()
+    model_input_subset.loc[:, "target"] = pd.cut(
+        model_input_subset.loc[:, "target"],
+        bins=BINS,
+        labels=["low", "high"],
+        right=True,
+    )  # ['low', 'medium', 'high']
+    model_input_subset["target"].value_counts()
+    # model_input_subset = model_input_subset[model_input_subset["target"] != "medium"]
+    model_input_subset["target"] = (
+        model_input_subset["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
+    )
+
+    print(model_input_subset["target"].value_counts())
+
+    if CV_METHOD == "half_logo":
+        # Split every participant's rows into a first and a second half and
+        # use "<pid>_<half>" as the cross-validation group.
+        model_input_subset["pid_index"] = model_input_subset.groupby("pid").cumcount()
+        model_input_subset["pid_count"] = model_input_subset.groupby("pid")[
+            "pid"
+        ].transform("count")
+
+        # cumcount / count lies in [0, 1); adding 1 and rounding gives 1 or 2.
+        model_input_subset["pid_index"] = (
+            model_input_subset["pid_index"] / model_input_subset["pid_count"] + 1
+        ).round()
+        model_input_subset["pid_half"] = (
+            model_input_subset["pid"]
+            + "_"
+            + model_input_subset["pid_index"].astype(int).astype(str)
+        )
+
+        # pid_count is a helper column too: drop it with the others so that the
+        # per-participant row count is not used as a feature.
+        data_x, data_y, data_groups = (
+            model_input_subset.drop(
+                ["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1
+            ),
+            model_input_subset["target"],
+            model_input_subset["pid_half"],
+        )
+    else:
+        # Whole participants are the cross-validation groups.
+        data_x, data_y, data_groups = (
+            model_input_subset.drop(["target", "pid"], axis=1),
+            model_input_subset["target"],
+            model_input_subset["pid"],
+        )
+
+    # Treat categorical features
+    categorical_feature_colnames = ["gender", "startlanguage"]
+    additional_categorical_features = [
+        col
+        for col in data_x.columns
+        if "mostcommonactivity" in col or "homelabel" in col
+    ]
+    categorical_feature_colnames += additional_categorical_features
+
+    categorical_features = data_x[categorical_feature_colnames].copy()
+    mode_categorical_features = categorical_features.mode().iloc[0]
+
+    # fillna with mode
+    categorical_features = categorical_features.fillna(mode_categorical_features)
+
+    # one-hot encoding
+    categorical_features = categorical_features.apply(
+        lambda col: col.astype("category")
+    )
+    if not categorical_features.empty:
+        categorical_features = pd.get_dummies(categorical_features)
+
+    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+    train_x = pd.concat([numerical_features, categorical_features], axis=1)
+
+    # Establish cv method
+    cv_method = StratifiedKFold(
+        n_splits=5, shuffle=True
+    )  # Defaults to 5 k-folds in cross_validate method
+    if CV_METHOD == "logo" or CV_METHOD == "half_logo":
+        cv_method = LeaveOneGroupOut()
+        cv_method.get_n_splits(
+            train_x,
+            data_y,
+            groups=data_groups,
+        )
+
+    imputer = SimpleImputer(missing_values=np.nan, strategy="median")
+
+    for model_title, model in cmodels.items():
+        classifier = cross_validate(
+            model["model"],
+            X=imputer.fit_transform(train_x),
+            y=data_y,
+            groups=data_groups,
+            cv=cv_method,
+            n_jobs=-1,
+            error_score="raise",
+            scoring=("accuracy", "precision", "recall", "f1"),
+        )
+
+        print("\n-------------------------------------\n")
+        print("Current cluster:", k, end="\n")
+        print("Current model:", model_title, end="\n")
+        print("Acc", np.mean(classifier["test_accuracy"]))
+        print("Precision", np.mean(classifier["test_precision"]))
+        print("Recall", np.mean(classifier["test_recall"]))
+        print("F1", np.mean(classifier["test_f1"]))
+        print(
+            f"Largest {N_SL} ACC:",
+            np.sort(-np.partition(-classifier["test_accuracy"], N_SL)[:N_SL])[::-1],
+        )
+        print(
+            f"Smallest {N_SL} ACC:",
+            np.sort(np.partition(classifier["test_accuracy"], N_SL)[:N_SL]),
+        )
+
+        cmodels[model_title]["metrics"][0] += np.mean(classifier["test_accuracy"])
+        cmodels[model_title]["metrics"][1] += np.mean(classifier["test_precision"])
+        cmodels[model_title]["metrics"][2] += np.mean(classifier["test_recall"])
+        cmodels[model_title]["metrics"][3] += np.mean(classifier["test_f1"])
+
+# %% jupyter={"source_hidden": true}
+# Get overall results
+scores = cm.get_total_models_scores(n_clusters=N_CLUSTERS)
+
+# %%
+# Save the aggregated scores; the file name encodes target, segment length,
+# CV method, bins, clustering column and number of clusters.
+PATH_OUTPUT = Path("..") / "presentation/results"
+path_output_full = PATH_OUTPUT / (
+    f"{TARGET_VARIABLE}_{SEGMENT_LENGTH}_classification_{CV_METHOD}{BINS}"
+    f"_clust_{CLUST_COL}{N_CLUSTERS}.csv"
+)
+scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_classification_with_clustering_2_class.py
+++ b/exploration/ml_pipeline_classification_with_clustering_2_class.py
@ -0,0 +1,194 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from scipy import stats
+from sklearn.cluster import KMeans
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+from sklearn.model_selection import train_test_split
+
+from machine_learning.classification_models import ClassificationModels
+from machine_learning.helper import impute_encode_categorical_features
+
+# %% [markdown]
+# ## Set script's parameters
+#
+
+# %%
+n_clusters = 3  # Number of clusters (could be regarded as a hyperparameter)
+n_sl = 3  # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %%
+PATH_BASE = Path("E:/STRAWresults/20230415")
+
+SEGMENT_TYPE = "period"
+print("SEGMENT_TYPE: " + SEGMENT_TYPE)
+SEGMENT_LENGTH = "30_minutes_before"
+print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
+TARGET_VARIABLE = "appraisal_stressfulness"
+print("TARGET_VARIABLE: " + TARGET_VARIABLE)
+
+if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
+    TARGET_VARIABLE += "_"
+    TARGET_VARIABLE += SEGMENT_TYPE
+
+PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
+
+model_input = pd.read_csv(PATH_FULL)
+
+if SEGMENT_LENGTH == "daily":
+    DAY_LENGTH = "daily"  # or "working"
+    print(DAY_LENGTH)
+    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
+
+# %% jupyter={"source_hidden": true}
+CLUST_COL = "limesurvey_demand_control_ratio"
+print("CLUST_COL: " + CLUST_COL)
+
+BINS = [-1, 0, 4]
+print("BINS: " + str(BINS))
+
+index_columns = [
+    "local_segment",
+    "local_segment_label",
+    "local_segment_start_datetime",
+    "local_segment_end_datetime",
+]
+
+model_input[CLUST_COL].describe()
+
+
+# %% jupyter={"source_hidden": true}
+# Filter-out outlier rows by clust_col
+model_input = model_input[(np.abs(stats.zscore(model_input[CLUST_COL])) < 3)]
+
+uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
+plt.bar(uniq["pid"], uniq[CLUST_COL])
+
+# %% jupyter={"source_hidden": true}
+# Get clusters by cluster col & and merge the clusters to main df
+km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index("pid"))
+np.unique(km, return_counts=True)
+uniq["cluster"] = km
+print(uniq)
+
+model_input = model_input.merge(uniq[["pid", "cluster"]])
+
+# %% jupyter={"source_hidden": true}
+model_input.set_index(index_columns, inplace=True)
+
+# %% jupyter={"source_hidden": true}
+# Create dict with classification ml models
+cm = ClassificationModels()
+cmodels = cm.get_cmodels()
+
+# %%
+model_input["target"].value_counts()
+
+# %% jupyter={"source_hidden": true}
+for k in range(n_clusters):
+    model_input_subset = model_input[model_input["cluster"] == k].copy()
+
+    # Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, separated by z-score of 0.
+    # model_input_subset['numerical_target'] = model_input_subset['target']
+
+    model_input_subset.loc[:, "target"] = pd.cut(
+        model_input_subset.loc[:, "target"], bins=BINS, labels=[0, 1], right=True
+    )
+
+    # p15 = np.percentile(model_input_subset['numerical_target'], 15)
+    # p85 = np.percentile(model_input_subset['numerical_target'], 85)
+
+    # Treat categorical features
+    model_input_subset = impute_encode_categorical_features(model_input_subset)
+
+    # Split to train, validate, and test subsets
+    # train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
+    # test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
+    train_set, test_set = train_test_split(
+        model_input_subset,
+        test_size=0.3,
+        stratify=model_input_subset["pid"],
+        random_state=42,
+    )
+
+    print(train_set["target"].value_counts())
+    print(test_set["target"].value_counts())
+
+    train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
+
+    validate_x, test_x, validate_y, test_y = train_test_split(
+        test_set.drop(["target", "pid"], axis=1),
+        test_set["target"],
+        test_size=0.50,
+        random_state=42,
+    )
+
+    # Impute missing values
+    imputer = SimpleImputer(missing_values=np.nan, strategy="median")
+
+    # Learn the medians on the training data only and reuse them for the
+    # validation and test sets. Re-fitting on those sets would impute with
+    # their own statistics and could drop a different set of all-NaN columns,
+    # leaving the matrices with mismatched feature counts.
+    train_x = imputer.fit_transform(train_x)
+    validate_x = imputer.transform(validate_x)
+    test_x = imputer.transform(test_x)
+
+    for model_title, model in cmodels.items():
+        model["model"].fit(train_x, train_y)
+        y_pred = model["model"].predict(validate_x)
+
+        acc = accuracy_score(validate_y, y_pred)
+        prec = precision_score(validate_y, y_pred)
+        rec = recall_score(validate_y, y_pred)
+        f1 = f1_score(validate_y, y_pred)
+
+        print("\n-------------------------------------\n")
+        print("Current cluster:", k, end="\n")
+        print("Current model:", model_title, end="\n")
+        print("Acc", acc)
+        print("Precision", prec)
+        print("Recall", rec)
+        print("F1", f1)
+
+        cmodels[model_title]["metrics"][0] += acc
+        cmodels[model_title]["metrics"][1] += prec
+        cmodels[model_title]["metrics"][2] += rec
+        cmodels[model_title]["metrics"][3] += f1
+
+# %% jupyter={"source_hidden": true}
+# Get overall results
+scores = cm.get_total_models_scores(n_clusters=n_clusters)
+
+# %%
+print(scores)
+
+# %%
+# Save the aggregated scores; the file name encodes target, segment length,
+# bins, clustering column and number of clusters.
+PATH_OUTPUT = Path("..") / Path("presentation/results")
+path_output_full = PATH_OUTPUT / (
+    TARGET_VARIABLE
+    + "_"
+    + SEGMENT_LENGTH
+    + "_classification"
+    + str(BINS)
+    + "_CLUST_"
+    + CLUST_COL
+    # A doubled "+" here was a unary plus on a str and raised a TypeError.
+    + str(n_clusters)
+    + ".csv"
+)
+scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_regression.py
+++ b/exploration/ml_pipeline_regression.py
@ -0,0 +1,66 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+import os
+import sys
+
+import pandas as pd
+
+# Put the parent of the working directory on sys.path *before* importing the
+# project package, so that the import below can make use of it.
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+from machine_learning.helper import (  # noqa: E402
+    impute_encode_categorical_features,
+    prepare_cross_validator,
+    prepare_sklearn_data_format,
+    run_all_regression_models,
+)
+
+# %%
+model_input = pd.read_csv(
+    "../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
+)
+
+# %%
+model_input = model_input[model_input["local_segment"].str.contains("daily")]
+
+# %%
+CV_METHOD = "logo"  # logo, half_logo, 5kfold
+
+model_input_encoded = impute_encode_categorical_features(model_input)
+# %%
+data_x, data_y, data_groups = prepare_sklearn_data_format(
+    model_input_encoded, CV_METHOD
+)
+cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
+# %%
+data_y.head()
+
+# %%
+data_y.tail()
+
+# %%
+data_y.shape
+
+# %%
+scores = run_all_regression_models(data_x, data_y, data_groups, cross_validator)
+
+# %%
+# Write the regression scores to the presentation folder, one file per CV method.
+# NOTE(review): this script reads input_JCQ_job_demand_mean.csv, but the results
+# are saved under a "JCQ_supervisor_support" name - confirm which target the
+# file name should carry.
+scores.to_csv(
+    "../presentation/JCQ_supervisor_support_regression_" + CV_METHOD + ".csv",
+    index=False,
+)
--- a/exploration/ml_pipeline_stress_event_cleaned.py
+++ b/exploration/ml_pipeline_stress_event_cleaned.py
@ -0,0 +1,359 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+import yaml
+from pyprojroot import here
+from sklearn import linear_model, svm, kernel_ridge, gaussian_process
+from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut, cross_val_score, cross_validate
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.impute import SimpleImputer
+from sklearn.dummy import DummyRegressor
+import xgboost as xg
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.features_sensor
+import machine_learning.labels
+import machine_learning.model
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## Appraisal of stressfulness (event)
+
+# %% jupyter={"source_hidden": true}
+model_input = pd.read_csv("../data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
+
+# %% jupyter={"source_hidden": true}
+
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+
+model_input.set_index(index_columns, inplace=True)
+
+cv_method = 'half_logo'
+if cv_method == 'logo':
+    # Whole participants are the cross-validation groups.
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+else:
+    # Split every participant's rows into a first and a second half and use
+    # "<pid>_<half>" as the cross-validation group.
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+
+    # cumcount / count lies in [0, 1); adding 1 and rounding gives 1 or 2.
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+
+    # pid_count is a helper column too: drop it with the others so that the
+    # per-participant row count is not used as a feature.
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
+
+# %% jupyter={"source_hidden": true}
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+# %% jupyter={"source_hidden": true}
+categorical_features = data_x[categorical_feature_colnames].copy()
+
+# %% jupyter={"source_hidden": true}
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# %% jupyter={"source_hidden": true}
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# %% jupyter={"source_hidden": true}
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+# %% jupyter={"source_hidden": true}
+numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+
+# %% jupyter={"source_hidden": true}
+train_x = pd.concat([numerical_features, categorical_features], axis=1)
+
+# %% jupyter={"source_hidden": true}
+train_x.dtypes
+
+# %% jupyter={"source_hidden": true}
+logo = LeaveOneGroupOut()
+logo.get_n_splits(
+    train_x,
+    data_y,
+    groups=data_groups,
+)
+
+# Defaults to 5 k folds in cross_validate method
+if cv_method != 'logo' and cv_method != 'half_logo':
+    logo = None
+
+# %% jupyter={"source_hidden": true}
+sum(data_y.isna())
+
+# %% [markdown]
+# ### Baseline: Dummy Regression (mean)
+# %%
+dummy_regr = DummyRegressor(strategy="mean")
+
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %% jupyter={"source_hidden": true}
+lin_reg_scores = cross_validate(
+    dummy_regr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.nanmedian(lin_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.nanmedian(lin_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.nanmedian(lin_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.nanmedian(lin_reg_scores['test_r2']))
+
+# %% [markdown]
+# ### Linear Regression
+
+# %% jupyter={"source_hidden": true}
+lin_reg_rapids = linear_model.LinearRegression()
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %% jupyter={"source_hidden": true}
+lin_reg_scores = cross_validate(
+    lin_reg_rapids,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.nanmedian(lin_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.nanmedian(lin_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.nanmedian(lin_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.nanmedian(lin_reg_scores['test_r2']))
+
+# %% [markdown]
+# ### XGBRegressor Linear Regression
+# %% jupyter={"source_hidden": true}
+xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10)
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %% jupyter={"source_hidden": true}
+xgb_reg_scores = cross_validate(
+    xgb_r,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.nanmedian(xgb_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.nanmedian(xgb_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.nanmedian(xgb_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.nanmedian(xgb_reg_scores['test_r2']))
+
+# %% [markdown]
+# ### XGBRegressor Pseudo Huber Error Regression
+# %% jupyter={"source_hidden": true}
+xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10)
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %% jupyter={"source_hidden": true}
+xgb_psuedo_huber_reg_scores = cross_validate(
+    xgb_psuedo_huber_r,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.nanmedian(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.nanmedian(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.nanmedian(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.nanmedian(xgb_psuedo_huber_reg_scores['test_r2']))
+
+# %% [markdown]
+# ### Ridge regression
+
+# %% jupyter={"source_hidden": true}
+ridge_reg = linear_model.Ridge(alpha=.5)
+
+# %% tags=[] jupyter={"source_hidden": true}
+ridge_reg_scores = cross_validate(
+    ridge_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.nanmedian(ridge_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.nanmedian(ridge_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.nanmedian(ridge_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.nanmedian(ridge_reg_scores['test_r2']))
+
+# %% [markdown]
+# ### Lasso
+
+# %% jupyter={"source_hidden": true}
+lasso_reg = linear_model.Lasso(alpha=0.1)
+
+# %% jupyter={"source_hidden": true}
+lasso_reg_score = cross_validate(
+    lasso_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.nanmedian(lasso_reg_score['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.nanmedian(lasso_reg_score['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.nanmedian(lasso_reg_score['test_neg_root_mean_squared_error']))
+print("R2", np.nanmedian(lasso_reg_score['test_r2']))
+
+# %% [markdown]
+# ### Bayesian Ridge
+
+# %% jupyter={"source_hidden": true}
+bayesian_ridge_reg = linear_model.BayesianRidge()
+
+# %% jupyter={"source_hidden": true}
+bayesian_ridge_reg_score = cross_validate(
+    bayesian_ridge_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.nanmedian(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.nanmedian(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.nanmedian(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
+print("R2", np.nanmedian(bayesian_ridge_reg_score['test_r2']))
+
+# %% [markdown]
+# ### RANSAC (outlier robust regression)
+
+# %% jupyter={"source_hidden": true}
+ransac_reg = linear_model.RANSACRegressor()
+
+# %% jupyter={"source_hidden": true}
+ransac_reg_scores = cross_validate(
+    ransac_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.nanmedian(ransac_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.nanmedian(ransac_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.nanmedian(ransac_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.nanmedian(ransac_reg_scores['test_r2']))
+
+# %% [markdown]
+# ### Support vector regression
+
+# %% jupyter={"source_hidden": true}
+svr = svm.SVR()
+
+# %% jupyter={"source_hidden": true}
+svr_scores = cross_validate(
+    svr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.nanmedian(svr_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.nanmedian(svr_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.nanmedian(svr_scores['test_neg_root_mean_squared_error']))
+print("R2", np.nanmedian(svr_scores['test_r2']))
+
+# %% [markdown]
+# ### Kernel Ridge regression
+
+# %% jupyter={"source_hidden": true}
+kridge = kernel_ridge.KernelRidge()
+
+# %% jupyter={"source_hidden": true}
+kridge_scores = cross_validate(
+    kridge,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.nanmedian(kridge_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.nanmedian(kridge_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.nanmedian(kridge_scores['test_neg_root_mean_squared_error']))
+print("R2", np.nanmedian(kridge_scores['test_r2']))
+
+# %% [markdown]
+# ### Gaussian Process Regression
+
+# %% jupyter={"source_hidden": true}
+gpr = gaussian_process.GaussianProcessRegressor()
+
+# %% jupyter={"source_hidden": true}
+
+gpr_scores = cross_validate(
+    gpr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.nanmedian(gpr_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.nanmedian(gpr_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.nanmedian(gpr_scores['test_neg_root_mean_squared_error']))
+print("R2", np.nanmedian(gpr_scores['test_r2']))
+
+# %%
--- a/exploration/test_JCQ_reversal.py
+++ b/exploration/test_JCQ_reversal.py
@ -0,0 +1,217 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+import pandas as pd
+
+from features.esm_JCQ import DICT_JCQ_DEMAND_CONTROL_REVERSE
+
+# %%
+limesurvey_questions = pd.read_csv(
+    "E:/STRAWbaseline/survey637813+question_text.csv", header=None
+).T
+
+# %%
+limesurvey_questions
+
+# %%
+limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(
+    r"\.\s", expand=True, n=1
+)
+
+# %%
+limesurvey_questions
+
+# %%
+demand_reverse_lime_rows = (
+    limesurvey_questions["text"].str.startswith(" [Od mene se ne zahteva,")
+    | limesurvey_questions["text"].str.startswith(" [Imam dovolj časa, da končam")
+    | limesurvey_questions["text"].str.startswith(
+        " [Pri svojem delu se ne srečujem s konfliktnimi"
+    )
+)
+control_reverse_lime_rows = limesurvey_questions["text"].str.startswith(
+    " [Moje delo vključuje veliko ponavljajočega"
+) | limesurvey_questions["text"].str.startswith(
+    " [Pri svojem delu imam zelo malo svobode"
+)
+
+# %%
+demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows]
+demand_reverse_lime.loc[:, "qid"] = demand_reverse_lime["code"].str.extract(
+    r"\[(\d+)\]"
+)
+control_reverse_lime = limesurvey_questions[control_reverse_lime_rows]
+control_reverse_lime.loc[:, "qid"] = control_reverse_lime["code"].str.extract(
+    r"\[(\d+)\]"
+)
+
+# %%
+limesurvey_questions.loc[89, "text"]
+
+# %%
+limesurvey_questions[limesurvey_questions["code"].str.startswith("JobEisen")]
+
+# %%
+demand_reverse_lime
+
+# %%
+control_reverse_lime
+
+# %%
+participant_info = pd.read_csv(
+    "C:/Users/junos/Documents/FWO-ARRS/Analysis/straw2analysis/rapids/data/raw/p031/participant_baseline_raw.csv",
+    parse_dates=["date_of_birth"],
+)
+
+# %%
+participant_info_t = participant_info.T
+
+# %%
+rows_baseline = participant_info_t.index
+
+# %%
+rows_demand = rows_baseline.str.startswith("JobEisen") & ~rows_baseline.str.endswith(
+    "Time"
+)
+
+# %%
+rows_baseline[rows_demand]
+
+# %%
+limesurvey_control = (
+    participant_info_t[rows_demand]
+    .reset_index()
+    .rename(columns={"index": "question", 0: "score_original"})
+)
+
+# %%
+limesurvey_control
+
+# %%
+limesurvey_control["qid"] = (
+    limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
+)
+
+# %%
+limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
+
+# %%
+limesurvey_control["score"] = limesurvey_control["score_original"]
+
+# %%
+limesurvey_control["qid"][0]
+
+# %%
+rows_demand_reverse = limesurvey_control["qid"].isin(
+    DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
+)
+limesurvey_control.loc[rows_demand_reverse, "score"] = (
+    4 + 1 - limesurvey_control.loc[rows_demand_reverse, "score_original"]
+)
+
+# %%
+JCQ_DEMAND = "JobEisen"
+JCQ_CONTROL = "JobControle"
+# Text prefixes of the items to be reversed, keyed by question number
+# within each scale (demand and control).
+# They use the same " [" prefix form as the str.startswith filters above.
+dict_JCQ_demand_control_reverse = {
+    JCQ_DEMAND: {
+        3: " [Od mene se ne zahteva,",
+        4: " [Imam dovolj časa, da končam",
+        5: " [Pri svojem delu se ne srečujem s konfliktnimi",
+    },
+    JCQ_CONTROL: {
+        2: " [Moje delo vključuje veliko ponavljajočega",
+        6: " [Pri svojem delu imam zelo malo svobode",
+    },
+}
+
+# %%
+limesurvey_control
+
+# %%
+test = pd.DataFrame(
+    data={"question": "one", "score_original": 3, "score": 3, "qid": 10}, index=[0]
+)
+
+# %%
+pd.concat([test, limesurvey_control]).reset_index()
+
+# %%
+limesurvey_control["score"].sum()
+
+# %%
+rows_demand_reverse
+
+# %%
+dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
+
+# %%
+limesurvey_control
+
+# %%
+DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
+DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
+
+JCQ_NORMS = {
+    "F": {
+        0: DEMAND_CONTROL_RATIO_MIN,
+        1: 0.45,
+        2: 0.52,
+        3: 0.62,
+        4: DEMAND_CONTROL_RATIO_MAX,
+    },
+    "M": {
+        0: DEMAND_CONTROL_RATIO_MIN,
+        1: 0.41,
+        2: 0.48,
+        3: 0.56,
+        4: DEMAND_CONTROL_RATIO_MAX,
+    },
+}
+
+# %%
+JCQ_NORMS[participant_info.loc[0, "gender"]][0]
+
+# %%
+participant_info_t.index.str.startswith("JobControle")
+
+# %%
+columns_baseline = participant_info.columns
+
+# %%
+columns_demand = columns_baseline.str.startswith(
+    "JobControle"
+) & ~columns_baseline.str.endswith("Time")
+
+# %%
+columns_baseline[columns_demand]
+
+# %%
+participant_control = participant_info.loc[:, columns_demand]
+
+# %%
+participant_control["id"] = participant_control.index
+
+# %%
+participant_control
+
+# %%
+pd.wide_to_long(
+    participant_control,
+    stubnames="JobControle",
+    i="id",
+    j="qid",
+    sep="[",
+    suffix="(\\d+)]",
+)
--- a/exploration/tree_high_dpi.png
+++ b/exploration/tree_high_dpi.png
--- a/features/communication.py
+++ b/features/communication.py
@ -8,14 +8,21 @@ from setup import db_engine, session
 call_types = {1: "incoming", 2: "outgoing", 3: "missed"}
 sms_types = {1: "received", 2: "sent"}

-FEATURES_CALLS = (
-    ["no_calls_all"]
-    + ["no_" + call_type for call_type in call_types.values()]
-    + ["duration_total_" + call_types.get(1), "duration_total_" + call_types.get(2)]
-    + ["duration_max_" + call_types.get(1), "duration_max_" + call_types.get(2)]
-    + ["no_" + call_types.get(1) + "_ratio", "no_" + call_types.get(2) + "_ratio"]
-    + ["no_contacts_calls"]
-)
+FILL_NA_CALLS = {
+    "no_calls_all": 0,
+    "no_" + call_types.get(1): 0,
+    "no_" + call_types.get(2): 0,
+    "no_" + call_types.get(3): 0,
+    "duration_total_" + call_types.get(1): 0,
+    "duration_total_" + call_types.get(2): 0,
+    "duration_max_" + call_types.get(1): 0,
+    "duration_max_" + call_types.get(2): 0,
+    "no_" + call_types.get(1) + "_ratio": 1 / 3,  # Three different types
+    "no_" + call_types.get(2) + "_ratio": 1 / 3,
+    "no_contacts_calls": 0,
+}
+
+FEATURES_CALLS = list(FILL_NA_CALLS.keys())

 # FEATURES_CALLS =
 # ["no_calls_all",
@ -23,21 +30,26 @@ FEATURES_CALLS = (
 # "duration_total_incoming", "duration_total_outgoing",
 # "duration_max_incoming", "duration_max_outgoing",
 # "no_incoming_ratio", "no_outgoing_ratio",
-# "no_contacts"]
+# "no_contacts_calls"]
+
+FILL_NA_SMS = {
+    "no_sms_all": 0,
+    "no_" + sms_types.get(1): 0,
+    "no_" + sms_types.get(2): 0,
+    "no_" + sms_types.get(1) + "_ratio": 1 / 2,  # Two different types
+    "no_" + sms_types.get(2) + "_ratio": 1 / 2,
+    "no_contacts_sms": 0,
+}
+
+FEATURES_SMS = list(FILL_NA_SMS.keys())

-FEATURES_SMS = (
-    ["no_sms_all"]
-    + ["no_" + sms_type for sms_type in sms_types.values()]
-    + ["no_" + sms_types.get(1) + "_ratio", "no_" + sms_types.get(2) + "_ratio"]
-    + ["no_contacts_sms"]
-)
 # FEATURES_SMS =
 # ["no_sms_all",
 #  "no_received", "no_sent",
 #  "no_received_ratio", "no_sent_ratio",
-#  "no_contacts"]
+#  "no_contacts_sms"]

-FEATURES_CONTACT = [
+FEATURES_CALLS_SMS_PROP = [
    "proportion_calls_all",
    "proportion_calls_incoming",
    "proportion_calls_outgoing",
@ -45,6 +57,15 @@ FEATURES_CONTACT = [
    "proportion_calls_missed_sms_received",
 ]

+FILL_NA_CALLS_SMS_PROP = {
+    key: 1 / 2 for key in FEATURES_CALLS_SMS_PROP
+}  # All of the form of a / (a + b).
+
+FEATURES_CALLS_SMS_ALL = FEATURES_CALLS + FEATURES_SMS + FEATURES_CALLS_SMS_PROP
+
+FILL_NA_CALLS_SMS_ALL = FILL_NA_CALLS | FILL_NA_SMS | FILL_NA_CALLS_SMS_PROP
+# As per PEP-584 a union for dicts was implemented in Python 3.9.0.
+

 def get_call_data(usernames: Collection) -> pd.DataFrame:
    """
--- a/features/esm.py
+++ b/features/esm.py
@ -20,11 +20,47 @@ ANSWER_DAY_OFF = "DayOff3421"
 ANSWER_SET_EVENING = "DayFinishedSetEvening"

 MAX_MORNING_LENGTH = 3
-# When the participants was not yet at work at the time of the first (morning) EMA,
+# When the participant was not yet at work at the time of the first (morning) EMA,
 # only three items were answered.
 # Two sleep related items and one indicating NOT starting work yet.
 # Daytime EMAs are all longer, in fact they always consist of at least 6 items.

+QUESTIONNAIRE_IDS = {
+    "sleep_quality": 1,
+    "PANAS_positive_affect": 8,
+    "PANAS_negative_affect": 9,
+    "JCQ_job_demand": 10,
+    "JCQ_job_control": 11,
+    "JCQ_supervisor_support": 12,
+    "JCQ_coworker_support": 13,
+    "PFITS_supervisor": 14,
+    "PFITS_coworkers": 15,
+    "UWES_vigor": 16,
+    "UWES_dedication": 17,
+    "UWES_absorption": 18,
+    "COPE_active": 19,
+    "COPE_support": 20,
+    "COPE_emotions": 21,
+    "balance_life_work": 22,
+    "balance_work_life": 23,
+    "recovery_experience_detachment": 24,
+    "recovery_experience_relaxation": 25,
+    "symptoms": 26,
+    "appraisal_stressfulness_event": 87,
+    "appraisal_threat": 88,
+    "appraisal_challenge": 89,
+    "appraisal_event_time": 90,
+    "appraisal_event_duration": 91,
+    "appraisal_event_work_related": 92,
+    "appraisal_stressfulness_period": 93,
+    "late_work": 94,
+    "work_hours": 95,
+    "left_work": 96,
+    "activities": 97,
+    "coffee_breaks": 98,
+    "at_work_yet": 99,
+}
+

 def get_esm_data(usernames: Collection) -> pd.DataFrame:
    """
@ -52,6 +88,8 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame:

 def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
+    Convert timestamps and expand JSON column.
+
    Convert timestamps into human-readable datetimes and dates
        and expand the JSON column into several Pandas DF columns.

@ -63,7 +101,8 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
-        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
+        A dataframe with added columns: datetime in Ljubljana timezone
+            and all fields from ESM_JSON column.
    """
    df_esm = helper.get_date_from_timestamp(df_esm)

@ -76,31 +115,39 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
 def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.
-    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE
+
+    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
+        and SESSION_STATUS_COMPLETE

    This is done in three steps.

    First, the esm_status is considered.
-    If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
+    If any of the ESMs in a session has a status *other than* "answered",
+        then this session is taken as unfinished.

    Second, the sessions which do not represent full questionnaires are identified.
-    These are sessions where participants only marked they are finished with the day or have not yet started working.
+    These are sessions where participants only marked they are finished with the day
+        or have not yet started working.

    Third, the sessions with only one item are marked with their trigger.
-    We never offered questionnaires with single items, so we can be sure these are unfinished.
+    We never offered questionnaires with single items,
+        so we can be sure these are unfinished.

    Finally, all sessions that remain are marked as completed.
-    By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.
+    By going through different possibilities in expl_esm_adherence.ipynb,
+        this turned out to be a reasonable option.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts: pd.Dataframe
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
+            with their statuses and the number of items.
    """
    sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)

@ -155,17 +202,22 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat

 def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening.)
+    Classify EMA sessions into morning, workday, or evening.
+
+    For each EMA session, determine the time of the first user answer
+        and its time type (morning, workday, or evening).

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).

    Returns
    -------
    df_session_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
+            with their time type and timestamp of first answer.
    """
    df_session_time = (
        df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
@ -179,13 +231,17 @@ def classify_sessions_by_completion_time(
    df_esm_preprocessed: pd.DataFrame,
 ) -> pd.DataFrame:
    """
-    The point of this function is to not only classify sessions by using the previously defined functions.
+    Classify sessions and correct the time type.
+
+    The point of this function is to not only classify sessions
+        by using the previously defined functions.
    It also serves to "correct" the time type of some EMA sessions.

    A morning questionnaire could seamlessly transition into a daytime questionnaire,
        if the participant was already at work.
    In this case, the "time" label changed mid-session.
-    Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
+    Because of the way classify_sessions_by_time works,
+        this questionnaire was classified as "morning".
    But for all intents and purposes, it can be treated as a "daytime" EMA.

    The way this scenario is differentiated from a true "morning" questionnaire,
@ -194,13 +250,16 @@ def classify_sessions_by_completion_time(
    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
-            their time type (with some morning EMAs reclassified) and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
+            the number of items,
+            their time type (with some morning EMAs reclassified)
+            and timestamp of first answer.

    """
    df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
@ -219,7 +278,8 @@ def classify_sessions_by_completion_time(

 def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    This function eliminates invalid ESM responses.
+    Eliminate invalid ESM responses.
+
    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".

@ -256,3 +316,100 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
        )
    )
    return df_esm_clean
+
+
+def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1):
+    """
+    Increment answers to keep in line with original scoring.
+
+    We always used 0 for the lowest value of user answer.
+    Some scales originally used other scoring, such as starting from 1.
+    This restores original scoring so that the values are comparable to references.
+
+    Parameters
+    ----------
+    df_esm_clean: pd.DataFrame
+        A cleaned ESM dataframe, which must also include esm_user_answer_numeric.
+    increment_by:
+        A number to add to the user answer.
+
+    Returns
+    -------
+    df_esm_clean: pd.DataFrame
+        The same df with addition of a column 'esm_user_score'.
+        If esm_user_answer_numeric is missing, a hint is printed
+            and the dataframe is returned unchanged.
+
+    """
+    try:
+        df_esm_clean = df_esm_clean.assign(
+            esm_user_score=lambda x: x.esm_user_answer_numeric + increment_by
+        )
+    except AttributeError as e:
+        # Accessing a missing column as an attribute raises AttributeError;
+        # this is reported instead of propagated, so no score column is added.
+        print("Please, clean the dataframe first using features.esm.clean_up_esm.")
+        print(e)
+    return df_esm_clean
+
+
+def reassign_question_ids(
+    df_esm_cleaned: pd.DataFrame, question_ids_content: dict
+) -> pd.DataFrame:
+    """
+    Fix question IDs to match their actual content.
+
+    Unfortunately, when altering the protocol to adapt to COVID pandemic,
+    we did not retain original question IDs.
+    This means that for participants before 2021, the IDs differ
+    from those of the remaining participants.
+    This function reassigns question IDs by matching the question strings.
+
+    Parameters
+    ----------
+    df_esm_cleaned: pd.DataFrame
+        A cleaned up dataframe,
+            which must include question_id and esm_instructions.
+    question_ids_content: dict
+        A dictionary, linking question IDs with their content ("instructions").
+        Each value is a string or a tuple of strings
+            and is matched as a prefix (str.startswith) of esm_instructions.
+
+    Returns
+    -------
+    df_esm_fixed: pd.DataFrame
+        The same dataframe but with fixed question IDs.
+        Rows whose instructions match none of the prefixes get None as question_id.
+
+    Raises
+    ------
+    KeyError
+        If the instructions found under one of the listed question IDs
+            do not start with any of the prefixes expected for that ID.
+    """
+    # Tabulate all question texts occurring under each question ID.
+    # rename() clears the name of the counts series, presumably so that
+    # reset_index() does not clash with the esm_instructions index level.
+    df_esm_unique_questions = (
+        df_esm_cleaned.groupby("question_id")
+        .esm_instructions.value_counts()
+        .rename()
+        .reset_index()
+    )
+
+    # First, check that we anticipated all esm instructions.
+    for q_id, expected_prefixes in question_ids_content.items():
+        # Look for all questions ("instructions") occurring in the dataframe
+        # under this question ID.
+        actual_questions = df_esm_unique_questions.loc[
+            df_esm_unique_questions["question_id"] == q_id,
+            "esm_instructions",
+        ]
+        # See if they are expected, i.e. included in the dictionary.
+        questions_matches = actual_questions.str.startswith(expected_prefixes)
+        # Test the matches (not the raw strings) and use `not`, since `~`
+        # is a bitwise operator and is always truthy on a plain Python bool.
+        if not questions_matches.all():
+            print("One of the questions that occur in the data was undefined.")
+            print("This were the questions found in the data: ")
+            # In case there is an unexpected question, raise an exception.
+            raise KeyError(actual_questions[~questions_matches])
+
+    # Next, replace question IDs with the first ID whose prefixes match.
+    df_esm_fixed = df_esm_cleaned.copy()
+    df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].apply(
+        lambda x: next(
+            (
+                key
+                for key, values in question_ids_content.items()
+                if x.startswith(values)
+            ),
+            None,
+        )
+    )
+
+    return df_esm_fixed
--- a/features/esm_COPE.py
+++ b/features/esm_COPE.py
@ -0,0 +1,125 @@
+COPE_ORIGINAL_MAX = 4
+COPE_ORIGINAL_MIN = 1
+
+DICT_COPE_QUESTION_IDS = {
+    164: (
+        "I took additional action to try to get rid of the problem",
+        "Ik deed extra mijn best om er iets aan te doen",
+        "Vložila sem dodaten napor, da bi rešila problem",
+        "Vložil sem dodaten napor, da bi rešil problem",
+    ),
+    165: (
+        "I concentrated my efforts on doing something about it",
+        "Ik probeerde de situatie te verbeteren",
+        "Svoje sile sem usmerila v reševanje nastale situacije",
+        "Svoje sile sem usmeril v reševanje nastale situacije",
+    ),
+    166: (
+        "I did what had to be done, one step at a time",
+        "Ik deed stap voor stap wat nodig was",
+        "Naredila sem, kar je bilo potrebno – korak za korakom",
+        "Naredil sem, kar je bilo potrebno – korak za korakom",
+    ),
+    167: (
+        "I took direct action to get around the problem",
+        "Ik handelde vlug om het probleem te verhelpen",
+        "Nekaj sem naredila, da sem zaobšla problem",
+        "Nekaj sem naredil, da sem zaobšel problem",
+    ),
+    168: (
+        "I tried to come up with a strategy about what to do",
+        "Ik probeerde te verzinnen wat ik er aan kon doen",
+        "Skušala sem najti ustrezen način za rešitev situacije",
+        "Skušal sem najti ustrezen način za rešitev situacije",
+    ),
+    169: (
+        "I made a plan of action",
+        "Ik maakte een plan",
+        "Naredila sem načrt za delovanje",
+        "Naredil sem načrt za delovanje",
+    ),
+    170: (
+        "I thought hard about what steps to take",
+        "Ik dacht hard na over wat ik moest doen",
+        "Dobro sem premislila, katere korake moram narediti, da rešim problem",
+        "Dobro sem premislil, katere korake moram narediti, da rešim problem",
+    ),
+    171: (
+        "I thought about how I might best handle the problem",
+        "lk dacht na over hoe ik het probleem het best kon aanpakken",
+        "Razmišljala sem, kaj bi bilo najbolje narediti s problemom",
+        "Razmišljal sem, kaj bi bilo najbolje narediti s problemom",
+    ),
+    172: (
+        "I asked people who have had similar experiences what they did",
+        "Ik vroeg aan mensen met dergelijke ervaringen hoe zij reageerden",
+        "Vprašala sem posameznike s podobnimi izkušnjami, kaj so storili",
+        "Vprašal sem posameznike s podobnimi izkušnjami, kaj so storili",
+    ),
+    173: (
+        "I tried to get advice from someone about what to do",
+        "lk vroeg advies aan iemand",
+        "Pri drugih sem poskušala dobiti nasvet, kaj naj storim",
+        "Pri drugih sem poskušal dobiti nasvet, kaj naj storim",
+    ),
+    174: (
+        "I talked to someone to find out more about the situation",
+        "Ik sprak met iemand om meer te weten te komen over de situatie",
+        "Z nekom sem se pogovorila, da bi izvedela še kaj o svojem problemu",
+        "Z nekom sem se pogovoril, da bi izvedel še kaj o svojem problemu",
+    ),
+    175: (
+        "I talked to someone who could do something concrete about the problem",
+        "Ik sprak met iemand die iets aan het probleem kon doen",
+        "Pogovorila sem se s kom, ki bi lahko naredil kaj konkretnega",
+        "Pogovoril sem se s kom, ki bi lahko naredil kaj konkretnega",
+    ),
+    176: (
+        "I talked to someone about how I felt",
+        "Ik sprak met iemand over hoe ik mij voelde",
+        "Z nekom sem se pogovorila o tem, kako sem se počutila",
+        "Z nekom sem se pogovoril o tem, kako sem se počutil",
+    ),
+    177: (
+        "I tried to get emotional support from friends or relatives",
+        "Ik zocht steun bij vrienden of familie",
+        "Skušala sem dobiti čustveno podporo prijateljev ali sorodnikov",
+        "Skušal sem dobiti čustveno podporo prijateljev ali sorodnikov",
+    ),
+    178: (
+        "I discussed my feelings with someone",
+        "lk besprak mijn gevoelens met iemand",
+        "O svojih občutkih sem se z nekom pogovorila",
+        "O svojih občutkih sem se z nekom pogovoril",
+    ),
+    179: (
+        "I got sympathy and understanding from someone",
+        "Ik vroeg medeleven en begrip van iemand",
+        "Poiskala sem naklonjenost in razumevanje drugih",
+        "Poiskal sem naklonjenost in razumevanje drugih",
+    ),
+    180: (
+        "I got upset and let my emotions out",
+        "Ik raakte van streek",
+        "Razburila sem se in to tudi pokazala",
+        "Razburil sem se in to tudi pokazal",
+    ),
+    181: (
+        "I let my feelings out",
+        "Ik toonde mijn gevoelens",
+        "Svojim čustvom sem dala prosto pot",
+        "Svojim čustvom sem dal prosto pot",
+    ),
+    182: (
+        "I felt a lot of emotional distress and I found myself expressing",
+        "lk liet duidelijk blijken hoe ellendig ik mij voelde",
+        "Doživljala sem veliko stresa in opažala, da sem čustva",
+        "Doživljal sem veliko stresa in opažal, da sem čustva",
+    ),
+    183: (
+        "I got upset, and I was really aware of it",
+        "Ik merkte dat ik erg van streek was",
+        "Razburila sem se in razmišljala samo o tem",
+        "Razburil sem se in razmišljal samo o tem",
+    ),
+}
--- a/features/esm_JCQ.py
+++ b/features/esm_JCQ.py
@ -1,9 +1,11 @@
 import pandas as pd

+from features.esm import increment_answers
+
 JCQ_ORIGINAL_MAX = 4
 JCQ_ORIGINAL_MIN = 1

-dict_JCQ_demand_control_reverse = {
+DICT_JCQ_DEMAND_CONTROL_REVERSE = {
    75: (
        "I was NOT asked",
        "Men legde mij geen overdreven",
@ -40,10 +42,14 @@ def reverse_jcq_demand_control_scoring(
    df_esm_jcq_demand_control: pd.DataFrame,
 ) -> pd.DataFrame:
    """
-    This function recodes answers in Job content questionnaire by first incrementing them by 1,
-    to be in line with original (1-4) scoring.
-    Then, some answers are reversed (i.e. 1 becomes 4 etc.), because the questions are negatively phrased.
-    These answers are listed in dict_JCQ_demand_control_reverse and identified by their question ID.
+    Reverse JCQ demand and control answers.
+
+    This function recodes answers in Job content questionnaire
+        by first incrementing them by 1, to be in line with original (1-4) scoring.
+    Then, some answers are reversed (i.e. 1 becomes 4 etc.),
+        because the questions are negatively phrased.
+    These answers are listed in DICT_JCQ_DEMAND_CONTROL_REVERSE
+        and identified by their question ID.
    However, the existing data is checked against literal phrasing of these questions
        to protect against wrong numbering of questions (differing question IDs).

@ -55,7 +61,8 @@ def reverse_jcq_demand_control_scoring(
    Returns
    -------
    df_esm_jcq_demand_control: pd.DataFrame
-        The same dataframe with a column esm_user_score containing answers recoded and reversed.
+        The same dataframe with a column esm_user_score
+            containing answers recoded and reversed.
    """
    df_esm_jcq_demand_control_unique_answers = (
        df_esm_jcq_demand_control.groupby("question_id")
@ -64,7 +71,7 @@ def reverse_jcq_demand_control_scoring(
        .reset_index()
    )
    # Tabulate all possible answers to each question (group by question ID).
-    for q_id in dict_JCQ_demand_control_reverse.keys():
+    for q_id in DICT_JCQ_DEMAND_CONTROL_REVERSE.keys():
        # Look through all answers that need to be reversed.
        possible_answers = df_esm_jcq_demand_control_unique_answers.loc[
            df_esm_jcq_demand_control_unique_answers["question_id"] == q_id,
@ -72,7 +79,7 @@ def reverse_jcq_demand_control_scoring(
        ]
        # These are all answers to a given question (by q_id).
        answers_matches = possible_answers.str.startswith(
-            dict_JCQ_demand_control_reverse.get(q_id)
+            DICT_JCQ_DEMAND_CONTROL_REVERSE.get(q_id)
        )
        # See if they are expected, i.e. included in the dictionary.
        if ~answers_matches.all():
@ -82,18 +89,16 @@ def reverse_jcq_demand_control_scoring(
            # In case there is an unexpected answer, raise an exception.

    try:
-        df_esm_jcq_demand_control = df_esm_jcq_demand_control.assign(
-            esm_user_score=lambda x: x.esm_user_answer_numeric + 1
-        )
-        # Increment the original answer by 1
-        # to keep in line with traditional scoring (JCQ_ORIGINAL_MIN - JCQ_ORIGINAL_MAX).
+        df_esm_jcq_demand_control = increment_answers(df_esm_jcq_demand_control)
+        # Increment the original answer by 1 to keep in line
+        # with traditional scoring (from JCQ_ORIGINAL_MIN to JCQ_ORIGINAL_MAX).
        df_esm_jcq_demand_control[
            df_esm_jcq_demand_control["question_id"].isin(
-                dict_JCQ_demand_control_reverse.keys()
+                DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
            )
        ] = df_esm_jcq_demand_control[
            df_esm_jcq_demand_control["question_id"].isin(
-                dict_JCQ_demand_control_reverse.keys()
+                DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
            )
        ].assign(
            esm_user_score=lambda x: JCQ_ORIGINAL_MAX
--- a/features/esm_SAM.py
+++ b/features/esm_SAM.py
@ -3,6 +3,9 @@ import pandas as pd

 import features.esm

+SAM_ORIGINAL_MAX = 5
+SAM_ORIGINAL_MIN = 1
+
 QUESTIONNAIRE_ID_SAM = {
    "event_stress": 87,
    "event_threat": 88,
@ -20,10 +23,107 @@ GROUP_QUESTIONNAIRES_BY = [
    "device_id",
    "esm_session",
 ]
-# Each questionnaire occurs only once within each esm_session on the same device within the same participant.
+# Each questionnaire occurs only once within each esm_session on the same device
+# within the same participant.
+
+
+DICT_SAM_QUESTION_IDS = {
+    87: (
+        "Was there a particular event that created tension in you?",
+        "Was er een bepaalde gebeurtenis die spanning veroorzaakte?",
+        "Je prišlo do kakega dogodka, ki je v vas ustvaril napetost?",
+    ),
+    88: (
+        "Did this event make you feel anxious?",
+        "Voelde je je angstig door deze gebeurtenis?",
+        "Ste se zaradi tega dogodka počutili tesnob",
+    ),
+    89: (
+        "Will the outcome of this event be negative?",
+        "Zal de uitkomst van deze gebeurtenis negatief zijn?",
+        "Bo izid tega dogodka negativen?",
+    ),
+    90: (
+        "How threatening was this event?",
+        "Hoe bedreigend was deze gebeurtenis?",
+        "Kako grozeč je bil ta dogodek?",
+    ),
+    91: (
+        "Is this going to have a negative impact on you?",
+        "Zal dit een negatieve impact op je hebben?",
+        "Ali bo to negativno vplivalo na vas?",
+    ),
+    92: (
+        "Is this going to have a positive impact on you?",
+        "Zal dit een positief effect op je hebben?",
+        "Ali bo to pozitivno vplivalo na vas?",
+    ),
+    93: (
+        "How eager are you to tackle this event?",
+        "Hoe graag wil je deze gebeurtenis aanpakken?",
+        "Kako zagnani ste bili",
+    ),
+    94: (
+        "To what extent can you become a stronger person because of this event?",
+        "In welke mate kan je een sterkere persoon worden door deze gebeurtenis?",
+        "V kolikšni meri lahko zaradi tega dogodka postanete močnejša oseba?",
+    ),
+    95: (
+        "To what extent are you excited thinking about the outcome of this event?",
+        "In welke mate ben je enthousiast bij de gedachte",
+        "V kolikšni meri vas misel na izid tega dogodka navdušuje?",
+    ),
+    96: (
+        "At what time did this event occur?",
+        "Hoe laat vond deze gebeurtenis plaats?",
+        "Kdaj se je ta dogodek zgodil?",
+    ),
+    97: (
+        "How long did this event last?",
+        "Hoe lang duurde deze gebeurtenis?",
+        "Kako dolgo je trajal ta dogodek?",
+    ),
+    98: (
+        "Was/is this event work-related?",
+        "Was/is deze gebeurtenis werkgerelateerd?",
+        "Je (bil) ta dogodek povezan s službo?",
+        "Je bil ali je ta dogodek povezan s službo?",
+    ),
+    99: (
+        "Did this overall period create tension in you?",
+        "Heeft deze globale periode spanning veroorzaakt?",
+        "Je to obdobje kot celota v vas ustvarilo napetost?",
+        "Je to celo obdobje v vas ustvarilo napetost?",
+    ),
+    100: (
+        "To what extent do you perceive this overall period as stressful?",
+        "In welke mate ervaar je deze globale periode als stressvol?",
+        "V kolikšni meri ste to obdobje dojemali kot stresno?",
+        "V kolikšni meri ste celo to obdobje dojemali kot stresno?",
+    ),
+}


 def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
+    """
+    Extract information about stressful events.
+
+    Participants were asked: "Was there a particular event that created tension in you?"
+    Then a subset of questions related to this event followed.
+    This function goes through the follow-up questions one by one
+        and preprocesses them, so that it adds new columns to the dataframe.
+
+    Parameters
+    ----------
+    df_esm: pd.DataFrame
+        A raw dataframe of all ESM data.
+
+    Returns
+    -------
+    df_esm_events: pd.DataFrame
+        A cleaned up df of Stress Appraisal Measure items with additional columns.
+
+    """
    # 0. Select only questions from Stress Appraisal Measure.
    df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
    df_esm_sam = df_esm_preprocessed[
@ -78,7 +178,8 @@ def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:

 def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
-    This function calculates challenge and threat (two Stress Appraisal Measure subscales) means,
+    This function calculates challenge and threat
+        (two Stress Appraisal Measure subscales) means,
        for each ESM session (within participants and devices).
    It creates a grouped dataframe with means in two columns.

@ -90,7 +191,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
    Returns
    -------
    df_esm_event_threat_challenge_mean_wide: pd.DataFrame
-        A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
+        A dataframe of unique ESM sessions (by participants and devices)
+        with threat and challenge means.
    """
    # Select only threat and challenge assessments for events
    df_esm_event_threat_challenge = df_esm_sam_clean[
@ -112,8 +214,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
        aggfunc="mean",
    )
    # Drop unnecessary column values.
-    df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
-        1
+    df_esm_event_threat_challenge_mean_wide.columns = (
+        df_esm_event_threat_challenge_mean_wide.columns.get_level_values(1)
    )
    df_esm_event_threat_challenge_mean_wide.columns.name = None
    df_esm_event_threat_challenge_mean_wide.rename(
@ -189,10 +291,12 @@ def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:

 def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
-    This function only serves to convert the string datetime answer into a real datetime type.
-    Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
-    NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
-        the NaTs can be interpreted to mean this.
+    This function only serves to convert the string datetime answer
+        into a real datetime type.
+    Errors during this conversion are coerced, meaning that non-datetime answers
+        are assigned Not a Time (NaT).
+    NOTE: Since the only available non-datetime answer to this question was
+        "0 - I do not remember", the NaTs can be interpreted to mean this.

    Parameters
    ----------
@ -208,9 +312,13 @@ def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
        df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
    ].assign(
        event_time=lambda x: pd.to_datetime(
-            x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
+            x.esm_user_answer,
+            errors="coerce",
+            format="%Y-%m-%d %H:%M:%S %z",
+            exact=True,
        )
    )
+    # Example answer: 2020-09-29 00:05:00 +0200
    return df_esm_event_time


@ -241,9 +349,12 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
        == QUESTIONNAIRE_ID_SAM.get("event_duration")
    ].assign(
        event_duration=lambda x: pd.to_datetime(
-            x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
+            x.esm_user_answer.str.slice(start=0, stop=-6),
+            errors="coerce",
+            format="%Y-%m-%d %H:%M:%S",
        ).dt.time
    )
+    # Example answer: 2020-09-29 00:05:00 +0200
    # TODO Explore the values recorded in event_duration and possibly fix mistakes.
    # For example, participants reported setting 23:50:00 instead of 00:50:00.

@ -251,7 +362,7 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    # we can determine whether:
    #   - this event is still going on ("1 - It is still going on")
    #   - the participant couldn't remember its duration ("0 - I do not remember")
-    # Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
+    # Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm
    # but only the numeric types of questions and answers.
    # Since this was of "datetime" type, convert these specific answers here again.
    df_esm_event_duration["event_duration_info"] = np.nan
@ -264,4 +375,5 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    return df_esm_event_duration


-# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
+# TODO: How many questions about the stressfulness of the period were asked
+#  and how does this relate to events?
--- a/features/proximity.py
+++ b/features/proximity.py
@ -5,7 +5,12 @@ import pandas as pd
 from config.models import Participant, Proximity
 from setup import db_engine, session

-FEATURES_PROXIMITY = ["freq_prox_near", "prop_prox_near"]
+FILL_NA_PROXIMITY = {
+    "freq_prox_near": 0,
+    "prop_prox_near": 1 / 2,  # Of the form of a / (a + b).
+}
+
+FEATURES_PROXIMITY = list(FILL_NA_PROXIMITY.keys())


 def get_proximity_data(usernames: Collection) -> pd.DataFrame:
@ -78,11 +83,11 @@ def count_proximity(
        A dataframe with the count of "near" proximity values and their relative count.
    """
    if group_by is None:
-        group_by = ["participant_id"]
+        group_by = []
    if "bool_prox_near" not in df_proximity:
        df_proximity = recode_proximity(df_proximity)
    df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"]
-    df_proximity_features = df_proximity.groupby(group_by).sum()[
+    df_proximity_features = df_proximity.groupby(["participant_id"] + group_by).sum()[
        ["bool_prox_near", "bool_prox_far"]
    ]
    df_proximity_features = df_proximity_features.assign(
--- a/features/timezone.py
+++ b/features/timezone.py
@ -0,0 +1,30 @@
+from collections.abc import Collection
+
+import pandas as pd
+
+from config.models import Participant, Timezone
+from setup import db_engine, session
+
+
+def get_timezone_data(usernames: Collection) -> pd.DataFrame:
+    """
+    Read the Timezone records of the given participants into a dataframe.
+
+    Parameters
+    ----------
+    usernames: Collection
+        A list of usernames to put into the WHERE condition.
+
+    Returns
+    -------
+    df_timezone: pd.DataFrame
+        A dataframe of timezone data, with the participant's username added.
+    """
+    query_timezone = (
+        session.query(Timezone, Participant.username)
+        .filter(Participant.id == Timezone.participant_id)
+        .filter(Participant.username.in_(usernames))
+    )
+    with db_engine.connect() as connection:
+        df_timezone = pd.read_sql(query_timezone.statement, connection)
+    return df_timezone
--- a/images/dag_calls_nokia_example.svg
+++ b/images/dag_calls_nokia_example.svg
@ -0,0 +1,205 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: snakemake_dag Pages: 1 -->
+<svg width="548pt" height="625pt"
+ viewBox="0.00 0.00 548.00 625.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 621)">
+<title>snakemake_dag</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-621 544,-621 544,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<path fill="none" stroke="#565bd8" stroke-width="2" d="M202,-36C202,-36 172,-36 172,-36 166,-36 160,-30 160,-24 160,-24 160,-12 160,-12 160,-6 166,0 172,0 172,0 202,0 202,0 208,0 214,-6 214,-12 214,-12 214,-24 214,-24 214,-30 208,-36 202,-36"/>
+<text text-anchor="middle" x="187" y="-15.5" font-family="sans" font-size="10.00">all</text>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<path fill="none" stroke="#56d8a9" stroke-width="2" d="M100,-617C100,-617 12,-617 12,-617 6,-617 0,-611 0,-605 0,-605 0,-588 0,-588 0,-582 6,-576 12,-576 12,-576 100,-576 100,-576 106,-576 112,-582 112,-588 112,-588 112,-605 112,-605 112,-611 106,-617 100,-617"/>
+<text text-anchor="middle" x="56" y="-605" font-family="sans" font-size="10.00">pull_phone_data</text>
+<text text-anchor="middle" x="56" y="-594" font-family="sans" font-size="10.00">pid: nokia_0000003</text>
+<text text-anchor="middle" x="56" y="-583" font-family="sans" font-size="10.00">sensor: calls</text>
+</g>
+<!-- 1&#45;&gt;0 -->
+<g id="edge1" class="edge">
+<title>1&#45;&gt;0</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M47.83,-575.78C37.21,-548.32 20,-496.76 20,-451 20,-451 20,-451 20,-161 20,-114.96 38.83,-102.85 73,-72 95.21,-51.94 126.33,-38.17 150.45,-29.7"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="151.61,-33 159.97,-26.5 149.38,-26.37 151.61,-33"/>
+</g>
+<!-- 2 -->
+<g id="node3" class="node">
+<title>2</title>
+<path fill="none" stroke="#56d863" stroke-width="2" d="M124,-540C124,-540 60,-540 60,-540 54,-540 48,-534 48,-528 48,-528 48,-516 48,-516 48,-510 54,-504 60,-504 60,-504 124,-504 124,-504 130,-504 136,-510 136,-516 136,-516 136,-528 136,-528 136,-534 130,-540 124,-540"/>
+<text text-anchor="middle" x="92" y="-519.5" font-family="sans" font-size="10.00">calls_episodes</text>
+</g>
+<!-- 1&#45;&gt;2 -->
+<g id="edge9" class="edge">
+<title>1&#45;&gt;2</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M65.84,-575.69C69.87,-567.56 74.6,-558.03 78.92,-549.33"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="82.09,-550.83 83.4,-540.32 75.82,-547.72 82.09,-550.83"/>
+</g>
+<!-- 2&#45;&gt;0 -->
+<g id="edge2" class="edge">
+<title>2&#45;&gt;0</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M85.12,-503.83C75.18,-477.44 58,-425.14 58,-379 58,-379 58,-379 58,-161 58,-105.34 112.96,-61.84 151.14,-38.34"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="153.16,-41.21 159.96,-33.08 149.58,-35.2 153.16,-41.21"/>
+</g>
+<!-- 3 -->
+<g id="node4" class="node">
+<title>3</title>
+<path fill="none" stroke="#d8a456" stroke-width="2" d="M187.5,-468C187.5,-468 98.5,-468 98.5,-468 92.5,-468 86.5,-462 86.5,-456 86.5,-456 86.5,-444 86.5,-444 86.5,-438 92.5,-432 98.5,-432 98.5,-432 187.5,-432 187.5,-432 193.5,-432 199.5,-438 199.5,-444 199.5,-444 199.5,-456 199.5,-456 199.5,-462 193.5,-468 187.5,-468"/>
+<text text-anchor="middle" x="143" y="-453" font-family="sans" font-size="10.00">resample_episodes</text>
+<text text-anchor="middle" x="143" y="-442" font-family="sans" font-size="10.00">sensor: phone_calls</text>
+</g>
+<!-- 2&#45;&gt;3 -->
+<g id="edge10" class="edge">
+<title>2&#45;&gt;3</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M104.61,-503.7C110.6,-495.47 117.88,-485.48 124.48,-476.42"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="127.48,-478.25 130.54,-468.1 121.82,-474.13 127.48,-478.25"/>
+</g>
+<!-- 3&#45;&gt;0 -->
+<g id="edge3" class="edge">
+<title>3&#45;&gt;0</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M140.43,-432C136.64,-405.4 130,-352.3 130,-307 130,-307 130,-307 130,-161 130,-117.8 153,-72.19 169.78,-44.66"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="172.83,-46.37 175.19,-36.04 166.91,-42.65 172.83,-46.37"/>
+</g>
+<!-- 4 -->
+<g id="node5" class="node">
+<title>4</title>
+<path fill="none" stroke="#56d8d8" stroke-width="2" d="M357.5,-396C357.5,-396 194.5,-396 194.5,-396 188.5,-396 182.5,-390 182.5,-384 182.5,-384 182.5,-372 182.5,-372 182.5,-366 188.5,-360 194.5,-360 194.5,-360 357.5,-360 357.5,-360 363.5,-360 369.5,-366 369.5,-372 369.5,-372 369.5,-384 369.5,-384 369.5,-390 363.5,-396 357.5,-396"/>
+<text text-anchor="middle" x="276" y="-375.5" font-family="sans" font-size="10.00">resample_episodes_with_datetime</text>
+</g>
+<!-- 3&#45;&gt;4 -->
+<g id="edge11" class="edge">
+<title>3&#45;&gt;4</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M175.54,-431.88C193.25,-422.55 215.35,-410.92 234.32,-400.94"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="236.12,-403.94 243.34,-396.19 232.86,-397.75 236.12,-403.94"/>
+</g>
+<!-- 4&#45;&gt;0 -->
+<g id="edge4" class="edge">
+<title>4&#45;&gt;0</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M250.68,-359.83C218.76,-335.92 168,-289.36 168,-235 168,-235 168,-235 168,-161 168,-120.86 175.55,-74.9 181.13,-46.4"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="184.61,-46.88 183.16,-36.39 177.74,-45.5 184.61,-46.88"/>
+</g>
+<!-- 8 -->
+<g id="node9" class="node">
+<title>8</title>
+<path fill="none" stroke="#68d856" stroke-width="2" d="M353.5,-324C353.5,-324 248.5,-324 248.5,-324 242.5,-324 236.5,-318 236.5,-312 236.5,-312 236.5,-300 236.5,-300 236.5,-294 242.5,-288 248.5,-288 248.5,-288 353.5,-288 353.5,-288 359.5,-288 365.5,-294 365.5,-300 365.5,-300 365.5,-312 365.5,-312 365.5,-318 359.5,-324 353.5,-324"/>
+<text text-anchor="middle" x="301" y="-309" font-family="sans" font-size="10.00">phone_calls_r_features</text>
+<text text-anchor="middle" x="301" y="-298" font-family="sans" font-size="10.00">provider_key: rapids</text>
+</g>
+<!-- 4&#45;&gt;8 -->
+<g id="edge15" class="edge">
+<title>4&#45;&gt;8</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M282.18,-359.7C285,-351.81 288.39,-342.3 291.52,-333.55"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="294.82,-334.7 294.89,-324.1 288.23,-332.34 294.82,-334.7"/>
+</g>
+<!-- 5 -->
+<g id="node6" class="node">
+<title>5</title>
+<path fill="none" stroke="#afd856" stroke-width="2" d="M475.5,-468C475.5,-468 364.5,-468 364.5,-468 358.5,-468 352.5,-462 352.5,-456 352.5,-456 352.5,-444 352.5,-444 352.5,-438 358.5,-432 364.5,-432 364.5,-432 475.5,-432 475.5,-432 481.5,-432 487.5,-438 487.5,-444 487.5,-444 487.5,-456 487.5,-456 487.5,-462 481.5,-468 475.5,-468"/>
+<text text-anchor="middle" x="420" y="-453" font-family="sans" font-size="10.00">process_time_segments</text>
+<text text-anchor="middle" x="420" y="-442" font-family="sans" font-size="10.00">pid: nokia_0000003</text>
+</g>
+<!-- 5&#45;&gt;4 -->
+<g id="edge12" class="edge">
+<title>5&#45;&gt;4</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M384.77,-431.88C365.42,-422.47 341.23,-410.71 320.57,-400.67"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="321.89,-397.41 311.36,-396.19 318.83,-403.71 321.89,-397.41"/>
+</g>
+<!-- 5&#45;&gt;8 -->
+<g id="edge16" class="edge">
+<title>5&#45;&gt;8</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M415.13,-431.72C409.07,-412.57 397.25,-381.55 379,-360 369.03,-348.23 355.82,-337.94 343.12,-329.64"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="344.79,-326.55 334.45,-324.21 341.08,-332.49 344.79,-326.55"/>
+</g>
+<!-- 6 -->
+<g id="node7" class="node">
+<title>6</title>
+<path fill="none" stroke="#d86656" stroke-width="2" stroke-dasharray="5,2" d="M322.5,-468C322.5,-468 229.5,-468 229.5,-468 223.5,-468 217.5,-462 217.5,-456 217.5,-456 217.5,-444 217.5,-444 217.5,-438 223.5,-432 229.5,-432 229.5,-432 322.5,-432 322.5,-432 328.5,-432 334.5,-438 334.5,-444 334.5,-444 334.5,-456 334.5,-456 334.5,-462 328.5,-468 322.5,-468"/>
+<text text-anchor="middle" x="276" y="-447.5" font-family="sans" font-size="10.00">prepare_tzcodes_file</text>
+</g>
+<!-- 6&#45;&gt;4 -->
+<g id="edge13" class="edge">
+<title>6&#45;&gt;4</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M276,-431.7C276,-423.98 276,-414.71 276,-406.11"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="279.5,-406.1 276,-396.1 272.5,-406.1 279.5,-406.1"/>
+</g>
+<!-- 7 -->
+<g id="node8" class="node">
+<title>7</title>
+<path fill="none" stroke="#56d86b" stroke-width="2" stroke-dasharray="5,2" d="M370,-540C370,-540 182,-540 182,-540 176,-540 170,-534 170,-528 170,-528 170,-516 170,-516 170,-510 176,-504 182,-504 182,-504 370,-504 370,-504 376,-504 382,-510 382,-516 382,-516 382,-528 382,-528 382,-534 376,-540 370,-540"/>
+<text text-anchor="middle" x="276" y="-519.5" font-family="sans" font-size="10.00">query_usernames_device_empatica_ids</text>
+</g>
+<!-- 7&#45;&gt;6 -->
+<g id="edge14" class="edge">
+<title>7&#45;&gt;6</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M276,-503.7C276,-495.98 276,-486.71 276,-478.11"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="279.5,-478.1 276,-468.1 272.5,-478.1 279.5,-478.1"/>
+</g>
+<!-- 8&#45;&gt;0 -->
+<g id="edge5" class="edge">
+<title>8&#45;&gt;0</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M264.63,-287.8C250.06,-279.08 234.51,-267.11 225,-252 184.07,-186.97 182.71,-92.23 184.91,-46.17"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="188.41,-46.26 185.49,-36.07 181.42,-45.85 188.41,-46.26"/>
+</g>
+<!-- 9 -->
+<g id="node10" class="node">
+<title>9</title>
+<path fill="none" stroke="#d87556" stroke-width="2" d="M382,-252C382,-252 246,-252 246,-252 240,-252 234,-246 234,-240 234,-240 234,-228 234,-228 234,-222 240,-216 246,-216 246,-216 382,-216 382,-216 388,-216 394,-222 394,-228 394,-228 394,-240 394,-240 394,-246 388,-252 382,-252"/>
+<text text-anchor="middle" x="314" y="-237" font-family="sans" font-size="10.00">join_features_from_providers</text>
+<text text-anchor="middle" x="314" y="-226" font-family="sans" font-size="10.00">sensor_key: phone_calls</text>
+</g>
+<!-- 8&#45;&gt;9 -->
+<g id="edge17" class="edge">
+<title>8&#45;&gt;9</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M304.21,-287.7C305.65,-279.98 307.37,-270.71 308.96,-262.11"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="312.44,-262.58 310.82,-252.1 305.56,-261.3 312.44,-262.58"/>
+</g>
+<!-- 9&#45;&gt;0 -->
+<g id="edge6" class="edge">
+<title>9&#45;&gt;0</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M294.15,-215.87C283.81,-206.16 271.58,-193.31 263,-180 235.01,-136.57 243.3,-118.11 220,-72 215.36,-62.81 209.61,-53.14 204.23,-44.62"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="207.17,-42.72 198.81,-36.21 201.29,-46.51 207.17,-42.72"/>
+</g>
+<!-- 10 -->
+<g id="node11" class="node">
+<title>10</title>
+<path fill="none" stroke="#56d8d0" stroke-width="2" d="M526,-180C526,-180 284,-180 284,-180 278,-180 272,-174 272,-168 272,-168 272,-156 272,-156 272,-150 278,-144 284,-144 284,-144 526,-144 526,-144 532,-144 538,-150 538,-156 538,-156 538,-168 538,-168 538,-174 532,-180 526,-180"/>
+<text text-anchor="middle" x="405" y="-159.5" font-family="sans" font-size="10.00">merge_sensor_features_for_individual_participants</text>
+</g>
+<!-- 9&#45;&gt;10 -->
+<g id="edge18" class="edge">
+<title>9&#45;&gt;10</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M336.49,-215.7C347.96,-206.88 362.06,-196.03 374.48,-186.47"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="376.97,-188.98 382.76,-180.1 372.7,-183.43 376.97,-188.98"/>
+</g>
+<!-- 10&#45;&gt;0 -->
+<g id="edge7" class="edge">
+<title>10&#45;&gt;0</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M366.3,-143.85C346.21,-134.31 321.62,-121.63 301,-108 280.21,-94.25 277.55,-87.46 258,-72 245.35,-62 231.16,-51.3 218.81,-42.16"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="220.72,-39.22 210.59,-36.1 216.57,-44.85 220.72,-39.22"/>
+</g>
+<!-- 11 -->
+<g id="node12" class="node">
+<title>11</title>
+<path fill="none" stroke="#56d892" stroke-width="2" d="M528,-108C528,-108 322,-108 322,-108 316,-108 310,-102 310,-96 310,-96 310,-84 310,-84 310,-78 316,-72 322,-72 322,-72 528,-72 528,-72 534,-72 540,-78 540,-84 540,-84 540,-96 540,-96 540,-102 534,-108 528,-108"/>
+<text text-anchor="middle" x="425" y="-87.5" font-family="sans" font-size="10.00">merge_sensor_features_for_all_participants</text>
+</g>
+<!-- 10&#45;&gt;11 -->
+<g id="edge19" class="edge">
+<title>10&#45;&gt;11</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M409.94,-143.7C412.17,-135.9 414.85,-126.51 417.33,-117.83"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="420.73,-118.68 420.11,-108.1 414,-116.76 420.73,-118.68"/>
+</g>
+<!-- 11&#45;&gt;0 -->
+<g id="edge8" class="edge">
+<title>11&#45;&gt;0</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M367.08,-71.97C322.5,-58.85 262.21,-41.12 223.96,-29.87"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="224.84,-26.48 214.26,-27.02 222.87,-33.2 224.84,-26.48"/>
+</g>
+</g>
+</svg>
--- a/images/dag_full_nokia_example.svg
+++ b/images/dag_full_nokia_example.svg
--- a/images/dag_participants_files.svg
+++ b/images/dag_participants_files.svg
@ -0,0 +1,68 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: snakemake_dag Pages: 1 -->
+<svg width="414pt" height="396pt"
+ viewBox="0.00 0.00 414.00 396.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 392)">
+<title>snakemake_dag</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-392 410,-392 410,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<text text-anchor="start" x="81" y="-71.6" font-family="sans" font-weight="bold" font-size="18.00">create_participants_files</text>
+<text text-anchor="start" x="81" y="-47.8" font-family="sans" font-size="10.00"> </text>
+<text text-anchor="start" x="85" y="-47.8" font-family="sans" font-weight="bold" font-size="14.00">↪ input</text>
+<text text-anchor="start" x="143" y="-47.8" font-family="sans" font-size="10.00"> </text>
+<text text-anchor="start" x="81" y="-28" font-family="monospace" font-size="10.00">data/external/example_participants.csv</text>
+<text text-anchor="start" x="319" y="-10" font-family="sans" font-size="10.00"> &#160;</text>
+<polygon fill="#acd957" stroke="#acd957" points="75,-62 75,-62 333,-62 333,-62 75,-62"/>
+<polygon fill="#acd957" stroke="#acd957" points="75,-22 75,-22 333,-22 333,-22 75,-22"/>
+<polygon fill="none" stroke="#acd957" stroke-width="2" points="74.5,-1 74.5,-91 331.5,-91 331.5,-1 74.5,-1"/>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<text text-anchor="start" x="77" y="-221.6" font-family="sans" font-weight="bold" font-size="18.00">prepare_participants_csv</text>
+<text text-anchor="start" x="77" y="-197.8" font-family="sans" font-size="10.00"> </text>
+<text text-anchor="start" x="81" y="-197.8" font-family="sans" font-weight="bold" font-size="14.00">↪ input</text>
+<text text-anchor="start" x="139" y="-197.8" font-family="sans" font-size="10.00"> </text>
+<text text-anchor="start" x="77" y="-178" font-family="monospace" font-size="10.00">data/external/example_usernames.csv</text>
+<text text-anchor="start" x="251" y="-157.8" font-family="sans" font-size="10.00"> </text>
+<text text-anchor="start" x="255" y="-157.8" font-family="sans" font-weight="bold" font-size="14.00">output →</text>
+<text text-anchor="start" x="325" y="-157.8" font-family="sans" font-size="10.00"> </text>
+<text text-anchor="start" x="77" y="-138" font-family="monospace" font-size="10.00">data/external/example_participants.csv</text>
+<polygon fill="#57d99e" stroke="#57d99e" points="71,-212 71,-212 336,-212 336,-212 71,-212"/>
+<polygon fill="#57d99e" stroke="#57d99e" points="71,-172 71,-172 336,-172 336,-172 71,-172"/>
+<polygon fill="none" stroke="#57d99e" stroke-width="2" points="71,-129 71,-241 335,-241 335,-129 71,-129"/>
+</g>
+<!-- 1&#45;&gt;0 -->
+<g id="edge1" class="edge">
+<title>1&#45;&gt;0</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M203,-127.88C203,-119.48 203,-110.81 203,-102.42"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="206.5,-102.36 203,-92.36 199.5,-102.36 206.5,-102.36"/>
+</g>
+<!-- 2 -->
+<g id="node3" class="node">
+<title>2</title>
+<text text-anchor="start" x="7" y="-367.6" font-family="sans" font-weight="bold" font-size="18.00">query_usernames_device_empatica_ids</text>
+<text text-anchor="start" x="7" y="-346" font-family="sans" font-size="10.00"> &#160;</text>
+<text text-anchor="start" x="321" y="-325.8" font-family="sans" font-size="10.00"> </text>
+<text text-anchor="start" x="325" y="-325.8" font-family="sans" font-weight="bold" font-size="14.00">output →</text>
+<text text-anchor="start" x="395" y="-325.8" font-family="sans" font-size="10.00"> </text>
+<text text-anchor="start" x="7" y="-306" font-family="monospace" font-size="10.00">data/external/example_usernames.csv</text>
+<text text-anchor="start" x="7" y="-288" font-family="monospace" font-size="10.00">data/external/timezone.csv</text>
+<polygon fill="#86d957" stroke="#86d957" points="1,-358 1,-358 406,-358 406,-358 1,-358"/>
+<polygon fill="#86d957" stroke="#86d957" points="1,-340 1,-340 406,-340 406,-340 1,-340"/>
+<polygon fill="none" stroke="#86d957" stroke-width="2" points="1,-279 1,-387 405,-387 405,-279 1,-279"/>
+</g>
+<!-- 2&#45;&gt;1 -->
+<g id="edge2" class="edge">
+<title>2&#45;&gt;1</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M203,-277.63C203,-269.45 203,-260.93 203,-252.53"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="206.5,-252.36 203,-242.36 199.5,-252.36 206.5,-252.36"/>
+</g>
+</g>
+</svg>
--- a/machine_learning/classification_models.py
+++ b/machine_learning/classification_models.py
@ -0,0 +1,123 @@
+import pandas as pd
+import xgboost as xg
+from lightgbm import LGBMClassifier
+from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
+from sklearn.dummy import DummyClassifier
+
+
+class ClassificationModels:
+    """
+    Registry of classification models together with their metric accumulators.
+
+    Each entry of ``cmodels`` maps a model name to a dictionary with two keys:
+    ``"model"`` (an estimator instance) and ``"metrics"`` (a list of four numbers).
+    ``get_total_models_scores`` reports the four slots, in order, as
+    accuracy, precision, recall and F1.
+    """
+
+    def __init__(self):
+        """Build the model dictionary and store it in ``self.cmodels``."""
+        self.cmodels = self.init_classification_models()
+
+    def get_cmodels(self):
+        """Return the dictionary of models and metrics held in ``self.cmodels``."""
+        return self.cmodels
+
+    def init_classification_models(self):
+        """
+        Create a fresh dictionary of classifiers with zeroed metrics.
+
+        Returns
+        -------
+        cmodels: dict
+            Maps a model name to ``{"model": estimator, "metrics": [0, 0, 0, 0]}``.
+            All estimators use their default arguments, except for
+            the dummy classifier (``strategy="most_frequent"``)
+            and logistic regression (``max_iter=1000``).
+        """
+        cmodels = {
+            "dummy_classifier": {
+                "model": DummyClassifier(strategy="most_frequent"),
+                "metrics": [0, 0, 0, 0],
+            },
+            "logistic_regression": {
+                "model": linear_model.LogisticRegression(max_iter=1000),
+                "metrics": [0, 0, 0, 0],
+            },
+            "support_vector_machine": {"model": svm.SVC(), "metrics": [0, 0, 0, 0]},
+            "gaussian_naive_bayes": {
+                "model": naive_bayes.GaussianNB(),
+                "metrics": [0, 0, 0, 0],
+            },
+            "stochastic_gradient_descent_classifier": {
+                "model": linear_model.SGDClassifier(),
+                "metrics": [0, 0, 0, 0],
+            },
+            "knn": {"model": neighbors.KNeighborsClassifier(), "metrics": [0, 0, 0, 0]},
+            "decision_tree": {
+                "model": tree.DecisionTreeClassifier(),
+                "metrics": [0, 0, 0, 0],
+            },
+            "random_forest_classifier": {
+                "model": ensemble.RandomForestClassifier(),
+                "metrics": [0, 0, 0, 0],
+            },
+            "gradient_boosting_classifier": {
+                "model": ensemble.GradientBoostingClassifier(),
+                "metrics": [0, 0, 0, 0],
+            },
+            "lgbm_classifier": {"model": LGBMClassifier(), "metrics": [0, 0, 0, 0]},
+            "XGBoost_classifier": {
+                "model": xg.sklearn.XGBClassifier(),
+                "metrics": [0, 0, 0, 0],
+            },
+        }
+
+        return cmodels
+
+    def get_total_models_scores(self, n_clusters=1):
+        """
+        Print and collect the stored metrics of every model.
+
+        Every stored metric value is divided by ``n_clusters`` before it is
+        printed and written to the result.
+        NOTE(review): the metrics are presumably sums accumulated over
+            ``n_clusters`` runs elsewhere, which this turns into means - confirm.
+
+        Parameters
+        ----------
+        n_clusters: int
+            The divisor applied to each stored metric value. Defaults to 1.
+
+        Returns
+        -------
+        scores: pd.DataFrame
+            A long-format dataframe with columns "method", "metric", and "mean".
+            It holds four rows per model: test_accuracy, test_precision,
+            test_recall, and test_f1.
+            The index is not reset between models, so labels 0-3 repeat.
+        """
+        scores = pd.DataFrame(columns=["method", "metric", "mean"])
+        for model_title, model in self.cmodels.items():
+            # A per-model frame, started empty for every model.
+            scores_df = pd.DataFrame(columns=["method", "metric", "mean"])
+            print("\n************************************\n")
+            print("Current model:", model_title, end="\n")
+            # metrics[0] is reported as accuracy.
+            print("Acc:", model["metrics"][0] / n_clusters)
+            scores_df = pd.concat(
+                [
+                    scores_df,
+                    pd.DataFrame(
+                        {
+                            "method": model_title,
+                            "metric": "test_accuracy",
+                            "mean": model["metrics"][0] / n_clusters,
+                        },
+                        index=[0],
+                    ),
+                ],
+                ignore_index=True,
+            )
+            # metrics[1] is reported as precision.
+            print("Precision:", model["metrics"][1] / n_clusters)
+            scores_df = pd.concat(
+                [
+                    scores_df,
+                    pd.DataFrame(
+                        {
+                            "method": model_title,
+                            "metric": "test_precision",
+                            "mean": model["metrics"][1] / n_clusters,
+                        },
+                        index=[0],
+                    ),
+                ],
+                ignore_index=True,
+            )
+            # metrics[2] is reported as recall.
+            print("Recall:", model["metrics"][2] / n_clusters)
+            scores_df = pd.concat(
+                [
+                    scores_df,
+                    pd.DataFrame(
+                        {
+                            "method": model_title,
+                            "metric": "test_recall",
+                            "mean": model["metrics"][2] / n_clusters,
+                        },
+                        index=[0],
+                    ),
+                ],
+                ignore_index=True,
+            )
+            # metrics[3] is reported as F1.
+            print("F1:", model["metrics"][3] / n_clusters)
+            scores_df = pd.concat(
+                [
+                    scores_df,
+                    pd.DataFrame(
+                        {
+                            "method": model_title,
+                            "metric": "test_f1",
+                            "mean": model["metrics"][3] / n_clusters,
+                        },
+                        index=[0],
+                    ),
+                ],
+                ignore_index=True,
+            )
+            # No ignore_index here: each model keeps its own index labels 0-3.
+            scores = pd.concat([scores, scores_df])
+        return scores
--- a/machine_learning/config/minimal_features.yaml
+++ b/machine_learning/config/minimal_features.yaml
@ -0,0 +1,5 @@
+grouping_variable: date_lj
+features:
+  proximity:
+    all
+participants_usernames: [nokia_0000003]
--- a/machine_learning/config/minimal_labels.yaml
+++ b/machine_learning/config/minimal_labels.yaml
@ -0,0 +1,6 @@
+grouping_variable: date_lj
+labels:
+  PANAS:
+    - PA
+    - NA
+participants_usernames: [nokia_0000003]
--- a/machine_learning/config/prox_comm_PANAS_features.yaml
+++ b/machine_learning/config/prox_comm_PANAS_features.yaml
@ -0,0 +1,6 @@
+grouping_variable: date_lj
+features:
+  proximity:
+    all
+  communication:
+    all
--- a/machine_learning/config/prox_comm_PANAS_labels.yaml
+++ b/machine_learning/config/prox_comm_PANAS_labels.yaml
@ -0,0 +1,5 @@
+grouping_variable: date_lj
+labels:
+  PANAS:
+    - PA
+    - NA
--- a/machine_learning/cross_validation.py
+++ b/machine_learning/cross_validation.py
@ -0,0 +1,128 @@
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
+
+class CrossValidation():
+    """This code implements a CrossValidation class for creating cross validation splits.
+    """
+    
+    
+    def __init__(self, data=None, cv_method='logo'):
+        """This method initializes the cv_method argument and optionally prepares the data if supplied.
+
+        Args:
+            cv_method (str, optional): String of cross validation method; options are 'logo', 'half_logo' and '5kfold'. 
+                Defaults to 'logo'.
+            data (DataFrame, optional): Pandas DataFrame with target, pid columns and other features as columns. 
+                Defaults to None.
+        """
+        
+        self.initialize_cv_method(cv_method)
+        
+        if data is not None:
+            self.prepare_data(data)
+        
+        
+    def prepare_data(self, data):
+        """Prepares the data ready to be passed to the cross-validation algorithm, depending on the cv_method type. 
+            For example, if cv_method is set to 'half_logo' new columns 'pid_index', 'pid_count', 'pid_half' 
+            are added and used in the process.
+
+        Args:
+            data (_type_): Pandas DataFrame with target, pid columns and other features as columns.
+        """
+        self.data = data
+        if self.cv_method == "logo":
+            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
+            
+        elif self.cv_method == "half_logo":
+            data['pid_index'] = data.groupby('pid').cumcount()
+            data['pid_count'] = data.groupby('pid')['pid'].transform('count')
+
+            data["pid_index"] = (data['pid_index'] / data['pid_count'] + 1).round()
+            data["pid_half"] = data["pid"] + "_" +  data["pid_index"].astype(int).astype(str)
+
+            data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
+           
+        elif self.cv_method == "Stratified5kfold":
+            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None
+
+        self.X, self.y, self.groups = data_X, data_y, data_groups
+
+
+    def initialize_cv_method(self, cv_method):
+        """Initializes the given cv_method type. Depending on the type, the appropriate splitting technique is used.
+        
+        Args:
+            cv_method (str): The type of cross-validation method to use; options are 'logo', 'half_logo' and '5kfold'.
+
+        Raises:
+            ValueError: If cv_method is not in the list of available methods, it raises an ValueError.
+        """
+        
+        self.cv_method = cv_method
+        if self.cv_method not in ["logo", "half_logo", "5kfold"]:
+            raise ValueError("Invalid cv_method input. Correct values are: 'logo', 'half_logo', '5kfold'")
+        
+        if self.cv_method in ["logo", "half_logo"]:
+            self.cv = LeaveOneGroupOut()
+        elif self.cv_method == "Stratified5kfold":
+            self.cv = StratifiedKFold(n_splits=5, shuffle=True)
+
+
+    def get_splits(self):
+        """Returns a generator object containing the cross-validation splits. 
+
+        Raises:
+            ValueError: Raises ValueError if no data has been set.
+
+        """
+        if not self.data.empty:
+            return self.cv.split(self.X, self.y, self.groups)
+        else: 
+            raise ValueError("No data has been set. Use 'prepare_data(data)' method to set the data.")
+        
+        
+    def get_data(self):
+        """data getter
+
+        Returns:
+            Pandas DataFrame: Returns the data from the class instance.
+        """
+        return self.data
+    
+    
+    def get_x_y_groups(self):
+        """X, y, and groups data getter
+
+        Returns:
+            Pandas DataFrame: Returns the data from the class instance.
+        """
+        return self.X, self.y, self.groups
+    
+    
+    def get_train_test_sets(self, split):
+        """Gets train and test sets, dependent on the split parameter. This method can be used in a specific splitting context,
+        where by index we can get train and test sets.
+
+        Args:
+            split (tuple of indices): It represents one iteration of the split generator (see get_splits method). 
+
+        Returns:
+            tuple of Pandas DataFrames: This method returns train_X, train_y, test_X, test_y, with correctly indexed rows by split param.
+        """
+        return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
+    
+    def get_groups_sets(self, split):
+        
+        if self.groups is None:
+            return None, None
+        else:
+            return self.groups.iloc[split[0]], self.groups.iloc[split[1]]
+    
+    
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@ -0,0 +1,245 @@
+import os
+import sys
+import warnings
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
+from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold
+from sklearn.naive_bayes import GaussianNB
+from sklearn.linear_model import Lasso 
+
+
+""" Feature selection pipeline: methods that can be used in the wrapper method alongside other wrapper contents (hyperparameter tuning etc.).
+
+(1) Establish methods for each of the steps in feature selection protocol.
+(2) Ensure that above methods are given only a part of data and use appropriate random seeds - to later simulate use case in production. 
+(3) Implement a method which gives graphical exploration of (1) (a) and (b) steps of the feature selection.
+(4) Prepare a core method that can be fit into a wrapper (see sklearn wrapper methods) and integrates methods from (1)
+
+"""
+
+class FeatureSelection:
+
+    def __init__(self, X, y, groups):
+        self.X = X
+        self.y = y
+        self.groups = groups
+
+    
+    def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
+        """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
+        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular 
+        feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
+        specified as a parameter.
+
+        Args:
+            df (DataFrame): Input data on which the predictions will be made.
+            features (list): List of features to select the best/worst from
+            method (str, optional): remove or add features.  Defaults to "remove".
+            ml_category (str, optional): Either classification or regression ml problem controls the ML algorithm and  metric. 
+                Defaults to "classification".
+            ml_subcategory (str, optional): In case of classification '_bin' for binary classification 
+                and 'multi' for multiclass classification. For regression an empty string '' is sufficient. 
+                Defaults to "bin".
+            metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
+            stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to [].
+
+        Raises:
+            ValueError: Raises if classification or regression metrics are not recognised if a specific ml_type is selected.
+            ValueError: If unknown ml_type is chosen. 
+            
+        Returns:
+            tuple: name of the best feature, best feature score, best feature score standard deviation.
+        """
+        
+        best_feature = None
+        
+        # Validacije tipov ML in specificiranimi metrikami
+        if ml_category == "classification":
+            if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
+                raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
+            elif ml_subcategory == "multi":
+                ml_subcategory_error = False
+                if metric != "accuracy" and "_" in metric:          
+                    metric_s, metric_t = metric.split("_")
+                    if metric_s not in ['accuracy', 'precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
+                        ml_subcategory_error = True
+                else:
+                    ml_subcategory_error = True
+                    
+                if ml_subcategory_error:
+                    raise ValueError(""""Classification metric for multi-class classification must be specified precisely.
+                                     Available metric are: 'accuracy', 'precision', 'recall' and 'f1'.
+                                     Only accuracy must be specified as 'accuracy'.
+                                     For others please add appropriate suffixes: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'""")
+        elif ml_category == "regression" and metric not in ['r2']:
+            raise ValueError("Regression metric not recognized. Please choose 'r2'")
+
+        for feat in features:
+            if method == "remove":
+                pred_features = [col for col in self.X.columns if feat != col] # All but feat
+            elif method == "add":
+                pred_features = [feat] + stored_features # Feat with stored features
+            
+            X  = self.X[pred_features].copy()
+            
+            if self.groups is not None:
+                cv = GroupKFold(n_splits=5)
+            else:
+                cv = StratifiedKFold(n_splits=5, shuffle=True)
+                
+            # See link about scoring for multiclassfication
+            # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
+            if ml_category == "classification":
+                nb = GaussianNB()
+                model_cv = cross_validate(
+                    nb,
+                    X=X,
+                    y=self.y,
+                    cv=cv,
+                    groups=self.groups,
+                    n_jobs=-1,
+                    scoring=(metric)
+                )
+                
+                                       
+            elif ml_category == "regression":
+                lass = Lasso()
+                model_cv = cross_validate(
+                    lass,
+                    X=X,
+                    y=y,
+                    cv=cv,
+                    groups=self.groups,
+                    n_jobs=-1,
+                    scoring=('r2')
+                )
+
+            else:
+                raise ValueError("ML type not yet implemented!")
+            
+            # Section of metrics' scores comparison. 
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
+
+                metric_score = np.nanmean(model_cv["test_score"])
+                metric_score_std = np.nanstd(model_cv["test_score"])
+                
+                if not best_feature or (metric_score > best_metric_score):
+                    best_feature = feat
+                    best_metric_score = metric_score
+                    best_metric_score_std = metric_score_std
+                    
+        return best_feature, best_metric_score, best_metric_score_std
+    
+    
+    def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
+        """This method selects a set of features and returns them as a list. It returns number of features 
+        determined in the interval of [n_min, n_max]. 
+        
+        The method consists of two steps: 
+        (1) The method uses sklearn kBest method which selects k best features dependent on the ml_type parameter.
+        (2) The sequential features removal procedure is executed. Using the remaing features from (1).
+            The best score is detected using a removal procedure. The procedure sequentially removes the features 
+            that attribute the least to the choosen evaluation metric. If in this sequence the score ML score is 
+            improved the next feature is remove otherwise there is a tolerance criteria (n_tolerance) 
+            with which the next n removed features are inspected whether currently best score is improved.     
+
+        Args:
+            n_min (int, optional): Minimal amount of features returned.
+            n_max (int, optional): Maximal amount of features returned.
+            k (int, optional): Determines the k in the k-best features method. 
+                If None, SelectKBest feature selection does not execute.
+            ml_type(str, optional): Type of ML problem. Currently implemented options: 
+                'classification_bin', 'classification_multi', and 'regression_'
+            method (str, optional): "remove" or "add" features.  Defaults to "remove".
+            n_tolerance (int, optional): If the best score is not improved in n that is specified by this parameter
+                the method returns index of feature with current best score as a tipping point feature.
+            
+        Returns:
+            list: list of selected features
+        """        
+
+        if k is not None and k <= n_max:
+            raise ValueError("The k parameter needs to be greater than the n_max parameter.")
+        
+        # Select k-best feature dependent on the type of ML task
+        ml_category, ml_subcategory = ml_type.split("_")
+
+        if k is not None:
+            if ml_category == "classification":
+                if ml_subcategory== "bin":
+                    selector = SelectKBest(mutual_info_classif, k=k)
+                elif ml_subcategory== "multi":
+                    selector = SelectKBest(f_classif, k=k)
+                else:
+                    raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
+            elif ml_category == "regression":
+                selector = SelectKBest(f_regression, k=k)
+            else:
+                raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
+            
+            selector.fit(self.X, self.y)
+            cols_idxs = selector.get_support(indices=True)
+            self.X = self.X.iloc[:,cols_idxs]
+        
+        print("All columns (after SelectKBest method):")
+        print(self.X.columns)
+        
+        # Sequential feature addition / removal
+        n_features = self.X.shape[1]
+        if n_max >= n_features:
+            n_max = n_features-1 # The algorithm removes at least one feature
+            
+        if n_min > n_features:
+            raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")
+        
+        if n_max < n_min:
+            raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
+        
+        features = self.X.columns.tolist()
+        feature_importance = []
+        if method == "remove":
+            best_score = 0
+            best_feature_indx = None
+            i_worse = 0
+            for i in reversed(range(n_features)):
+                
+                if i+1 == n_min:
+                    break
+                
+                best_feature, best_metric_score, best_metric_score_std = \
+                    self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric)
+                    
+                feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
+                
+                features.remove(best_feature)
+                print("Features left:", i) 
+                
+                if i <= n_max:
+                    if best_metric_score >= best_score:
+                        best_score = best_metric_score
+                        best_feature_indx = i+1
+                        i_worse = 0
+                    else:
+                        i_worse += 1
+                    
+                    if i_worse == n_tolerance: 
+                        break  
+                
+            feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
+
+            print(feature_importance_df)
+            print("best_feature_indx", best_feature_indx)
+            print("best_score", best_score)
+
+            features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
+            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]    
+            
+            return selected_features
+        
+        else:
+            raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
--- a/machine_learning/features_sensor.py
+++ b/machine_learning/features_sensor.py
@ -0,0 +1,231 @@
+import datetime
+import warnings
+from pathlib import Path
+from typing import Collection
+
+import pandas as pd
+from pyprojroot import here
+
+import participants.query_db
+from features import communication, helper, proximity
+from machine_learning.helper import (
+    read_csv_with_settings,
+    safe_outer_merge_on_index,
+    to_csv_with_settings,
+)
+
+WARNING_PARTICIPANTS_LABEL = (
+    "Before calculating features, please set participants label using self.set_participants_label() "
+    "to be used as a filename prefix when exporting data. "
+    "The filename will be of the form: %participants_label_%grouping_variable_%data_type.csv"
+)
+
+
+class SensorFeatures:
+    """
+    A class to represent all sensor (AWARE) features.
+
+    Attributes
+    ----------
+    grouping_variable: str
+        The name of the variable by which to group (segment) data, e.g. date_lj.
+    features: dict
+        A dictionary of sensors (data types) and features to calculate.
+        See config/minimal_features.yaml for an example.
+    participants_usernames: Collection
+        A list of usernames for which to calculate features.
+        If None, use all participants.
+
+    Methods
+    -------
+    set_sensor_data():
+        Query the database for data types defined by self.features.
+    get_sensor_data(data_type): pd.DataFrame
+        Returns the dataframe of sensor data for specified data_type.
+    calculate_features():
+        Calls appropriate functions from features/ and joins them in a single dataframe, df_features_all.
+    get_features(data_type, feature_names): pd.DataFrame
+        Returns the dataframe of specified features for selected sensor.
+
+    construct_export_path():
+        Construct a path for exporting the features as csv files.
+    set_participants_label(label):
+        Sets a label for the usernames subset. This is used to distinguish feature exports.
+    """
+
+    def __init__(
+        self,
+        grouping_variable: str,
+        features: dict,
+        participants_usernames: Collection = None,
+    ) -> None:
+        """
+        Specifies the grouping variable and usernames for which to calculate features.
+        Sets other (implicit) attributes used in other methods.
+        If participants_usernames=None, this queries the usernames which belong to the main part of the study,
+            i.e. from 2020-08-01 on.
+
+        Parameters
+        ----------
+        grouping_variable: str
+            The name of the variable by which to group (segment) data, e.g. date_lj.
+        features: dict
+            A dictionary of sensors (data types) and features to calculate.
+            See config/minimal_features.yaml for an example.
+        participants_usernames: Collection
+            A list of usernames for which to calculate features.
+            If None, use all participants.
+
+        Returns
+        -------
+        None
+        """
+        # The name is kept as a plain string for file names,
+        # and wrapped in a list to be passed on as the grouping argument.
+        self.grouping_variable_name = grouping_variable
+        self.grouping_variable = [grouping_variable]
+
+        # Only the keys (data types) of the features dictionary are stored.
+        self.data_types = features.keys()
+
+        # The label stays empty for a custom list of usernames
+        # until set_participants_label() is called.
+        self.participants_label: str = ""
+        if participants_usernames is None:
+            participants_usernames = participants.query_db.get_usernames(
+                collection_start=datetime.date.fromisoformat("2020-08-01")
+            )
+            self.participants_label = "all"
+        self.participants_usernames = participants_usernames
+
+        self.df_features_all = pd.DataFrame()
+
+        self.df_proximity = pd.DataFrame()
+        self.df_proximity_counts = pd.DataFrame()
+
+        self.df_calls = pd.DataFrame()
+        self.df_sms = pd.DataFrame()
+        self.df_calls_sms = pd.DataFrame()
+
+        self.folder: Path = Path()
+        self.filename_prefix = ""
+        # This issues a UserWarning if the participants label is still empty at this point.
+        self.construct_export_path()
+        print("SensorFeatures initialized.")
+
+    def set_sensor_data(self) -> None:
+        """
+        Query the database for the data types listed in self.data_types.
+
+        Proximity data are stored in self.df_proximity after being passed through
+        helper.get_date_from_timestamp() and proximity.recode_proximity().
+        For communication, calls and sms data are stored in self.df_calls and self.df_sms,
+        each after being passed through helper.get_date_from_timestamp().
+
+        Returns
+        -------
+        None
+        """
+        print("Querying database ...")
+        if "proximity" in self.data_types:
+            self.df_proximity = proximity.get_proximity_data(
+                self.participants_usernames
+            )
+            print("Got proximity data from the DB.")
+            self.df_proximity = helper.get_date_from_timestamp(self.df_proximity)
+            self.df_proximity = proximity.recode_proximity(self.df_proximity)
+        if "communication" in self.data_types:
+            self.df_calls = communication.get_call_data(self.participants_usernames)
+            self.df_calls = helper.get_date_from_timestamp(self.df_calls)
+            print("Got calls data from the DB.")
+
+            self.df_sms = communication.get_sms_data(self.participants_usernames)
+            self.df_sms = helper.get_date_from_timestamp(self.df_sms)
+            print("Got sms data from the DB.")
+
+    def get_sensor_data(self, data_type: str) -> pd.DataFrame:
+        """
+        Return the dataframe stored for the specified data type.
+
+        Parameters
+        ----------
+        data_type: str
+            Either "proximity" or "communication".
+
+        Returns
+        -------
+        pd.DataFrame
+            self.df_proximity for "proximity" and self.df_calls_sms for "communication".
+
+        Raises
+        ------
+        KeyError
+            If data_type is neither "proximity" nor "communication".
+        """
+        if data_type == "proximity":
+            return self.df_proximity
+        elif data_type == "communication":
+            # NOTE(review): df_calls_sms is only assigned in calculate_features(),
+            # while set_sensor_data() fills df_calls and df_sms - confirm this is the intended dataframe.
+            return self.df_calls_sms
+        else:
+            raise KeyError("This data type has not been implemented.")
+
+    def calculate_features(self, cached: bool = True) -> None:
+        """
+        Calculate (or read from file) the features for all data types in self.data_types
+        and join them in self.df_features_all.
+
+        For each data type, the features are first read from a csv file.
+        If the file is not found or cached is False, they are calculated and exported to a csv file.
+        Finally, missing values in self.df_features_all are filled in
+        using proximity.FILL_NA_PROXIMITY and communication.FILL_NA_CALLS_SMS_ALL.
+
+        Parameters
+        ----------
+        cached: bool
+            If False, do not read the features from a file, even if it exists.
+
+        Returns
+        -------
+        None
+
+        Raises
+        ------
+        ValueError
+            If the participants label has not been set.
+        """
+        print("Calculating features ...")
+        if not self.participants_label:
+            raise ValueError(WARNING_PARTICIPANTS_LABEL)
+        self.df_features_all = pd.DataFrame()
+
+        if "proximity" in self.data_types:
+            try:
+                if not cached:  # Do not use the file, even if it exists.
+                    raise FileNotFoundError
+                self.df_proximity_counts = read_csv_with_settings(
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="prox",
+                    grouping_variable=self.grouping_variable,
+                )
+                print("Read proximity features from the file.")
+            except FileNotFoundError:
+                # We need to recalculate the features in this case.
+                self.df_proximity_counts = proximity.count_proximity(
+                    self.df_proximity, self.grouping_variable
+                )
+                print("Calculated proximity features.")
+                to_csv_with_settings(
+                    self.df_proximity_counts,
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="prox",
+                )
+            finally:
+                # The merge is done regardless of where the features came from.
+                self.df_features_all = safe_outer_merge_on_index(
+                    self.df_features_all, self.df_proximity_counts
+                )
+
+        if "communication" in self.data_types:
+            try:
+                if not cached:  # Do not use the file, even if it exists.
+                    raise FileNotFoundError
+                self.df_calls_sms = read_csv_with_settings(
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="comm",
+                    grouping_variable=self.grouping_variable,
+                )
+                print("Read communication features from the file.")
+            except FileNotFoundError:
+                # We need to recalculate the features in this case.
+                self.df_calls_sms = communication.calls_sms_features(
+                    df_calls=self.df_calls,
+                    df_sms=self.df_sms,
+                    group_by=self.grouping_variable,
+                )
+                print("Calculated communication features.")
+                to_csv_with_settings(
+                    self.df_calls_sms,
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="comm",
+                )
+            finally:
+                # The merge is done regardless of where the features came from.
+                self.df_features_all = safe_outer_merge_on_index(
+                    self.df_features_all, self.df_calls_sms
+                )
+
+        # Both fill-ins are applied regardless of which data types were requested.
+        self.df_features_all.fillna(
+            value=proximity.FILL_NA_PROXIMITY, inplace=True, downcast="infer",
+        )
+        self.df_features_all.fillna(
+            value=communication.FILL_NA_CALLS_SMS_ALL, inplace=True, downcast="infer",
+        )
+
+    def get_features(self, data_type: str, feature_names) -> pd.DataFrame:
+        """
+        Return the dataframe of specified features for the selected data type.
+
+        Parameters
+        ----------
+        data_type: str
+            Either "proximity", "communication", or "all".
+            For "all", self.df_features_all is returned and feature_names is ignored.
+        feature_names
+            The column labels to select, or "all" to select proximity.FEATURES_PROXIMITY
+            or communication.FEATURES_CALLS_SMS_ALL, respectively.
+
+        Returns
+        -------
+        pd.DataFrame
+            The selected columns of the features dataframe.
+
+        Raises
+        ------
+        KeyError
+            If data_type is not one of the implemented options.
+        """
+        if data_type == "proximity":
+            if feature_names == "all":
+                feature_names = proximity.FEATURES_PROXIMITY
+            return self.df_proximity_counts[feature_names]
+        elif data_type == "communication":
+            if feature_names == "all":
+                feature_names = communication.FEATURES_CALLS_SMS_ALL
+            return self.df_calls_sms[feature_names]
+        elif data_type == "all":
+            return self.df_features_all
+        else:
+            raise KeyError("This data type has not been implemented.")
+
+    def construct_export_path(self) -> None:
+        """
+        Set the folder and the filename prefix used for exporting the features as csv files.
+
+        The prefix is of the form %participants_label_%grouping_variable.
+        A UserWarning is issued if the participants label is empty.
+
+        Returns
+        -------
+        None
+        """
+        if not self.participants_label:
+            warnings.warn(WARNING_PARTICIPANTS_LABEL, UserWarning)
+        self.folder = here("machine_learning/intermediate_results/features", warn=True)
+        self.filename_prefix = (
+            self.participants_label + "_" + self.grouping_variable_name
+        )
+
+    def set_participants_label(self, label: str) -> None:
+        """
+        Set a label for the usernames subset and reconstruct the export path accordingly.
+
+        Parameters
+        ----------
+        label: str
+            The label to be used as the filename prefix when exporting data.
+
+        Returns
+        -------
+        None
+        """
+        self.participants_label = label
+        self.construct_export_path()
--- a/machine_learning/helper.py
+++ b/machine_learning/helper.py
@ -0,0 +1,730 @@
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from sklearn import (
+    ensemble,
+    gaussian_process,
+    kernel_ridge,
+    linear_model,
+    naive_bayes,
+    svm,
+)
+from sklearn.dummy import DummyClassifier, DummyRegressor
+from sklearn.metrics import confusion_matrix
+from sklearn.model_selection import (
+    BaseCrossValidator,
+    LeaveOneGroupOut,
+    StratifiedKFold,
+    cross_validate,
+)
+from xgboost import XGBClassifier, XGBRegressor
+
+
+def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
+    """Outer-join two dataframes on their indexes, tolerating empty inputs.
+
+    When one side is empty the other side is returned unchanged (the same
+    object, not a copy). Otherwise the frames are merged index-to-index and
+    pandas validates that the join is one-to-one.
+    """
+    if left.empty:
+        return right
+    if right.empty:
+        return left
+    return left.merge(
+        right,
+        how="outer",
+        left_index=True,
+        right_index=True,
+        validate="one_to_one",
+    )
+
+
+def to_csv_with_settings(
+    df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str
+) -> None:
+    """Write a dataframe to ``<folder>/<filename_prefix>_<data_type>.csv``.
+
+    The file is comma separated, UTF-8 encoded, includes the header and the
+    index, and represents missing values as "NA". The target path is printed
+    after the export.
+    """
+    target = construct_full_path(folder, filename_prefix, data_type)
+    csv_settings = {
+        "sep": ",",
+        "na_rep": "NA",
+        "header": True,
+        "index": True,
+        "encoding": "utf-8",
+    }
+    df.to_csv(path_or_buf=target, **csv_settings)
+    print(f"Exported the dataframe to {target}")
+
+
+def read_csv_with_settings(
+    folder: Path, filename_prefix: str, data_type: str, grouping_variable: list
+) -> pd.DataFrame:
+    """Read a CSV file named ``<filename_prefix>_<data_type>.csv`` from ``folder``.
+
+    The columns "participant_id" plus those listed in ``grouping_variable``
+    become the index, "NA" is parsed as a missing value and date parsing of
+    the index is requested.
+    """
+    source = construct_full_path(folder, filename_prefix, data_type)
+    index_columns = ["participant_id"] + grouping_variable
+    # NOTE(review): infer_datetime_format is reportedly deprecated in pandas 2.x;
+    # confirm against the pinned pandas version before removing it.
+    return pd.read_csv(
+        filepath_or_buffer=source,
+        sep=",",
+        header=0,
+        na_values="NA",
+        encoding="utf-8",
+        index_col=index_columns,
+        parse_dates=True,
+        infer_datetime_format=True,
+        cache_dates=True,
+    )
+
+
+def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path:
+    """Return ``folder / "<filename_prefix>_<data_type>.csv"``."""
+    return folder / f"{filename_prefix}_{data_type}.csv"
+
+
+def insert_row(df, row):
+    """Return a new dataframe with ``row`` appended and the index renumbered.
+
+    ``row`` is interpreted positionally against ``df.columns``; ``df`` itself
+    is not modified.
+    """
+    row_frame = pd.DataFrame([row], columns=df.columns)
+    return pd.concat([df, row_frame], ignore_index=True)
+
+
+def impute_encode_categorical_features(model_input: pd.DataFrame) -> pd.DataFrame:
+    """Impute categorical columns with their mode and one-hot encode them.
+
+    The categorical columns are "gender", "startlanguage",
+    "limesurvey_demand_control_ratio_quartile" and every column whose name
+    contains "mostcommonactivity" or "homelabel". All other columns are
+    passed through unchanged.
+
+    Parameters
+    ----------
+    model_input : pd.DataFrame
+        Table that contains at least the three fixed categorical columns.
+
+    Returns
+    -------
+    pd.DataFrame
+        The remaining columns followed by the dummy-encoded categorical
+        columns. For an input without rows the categorical columns are
+        returned unencoded.
+
+    Raises
+    ------
+    KeyError
+        If one of the three fixed categorical columns is missing.
+    """
+    categorical_feature_col_names = [
+        "gender",
+        "startlanguage",
+        "limesurvey_demand_control_ratio_quartile",
+    ]
+    categorical_feature_col_names += [
+        col
+        for col in model_input.columns
+        if "mostcommonactivity" in col or "homelabel" in col
+    ]
+
+    categorical_features = model_input[categorical_feature_col_names].copy()
+
+    # mode() has no rows when there is nothing to count (no rows at all, or only
+    # missing values); indexing its first row unconditionally would raise
+    # IndexError, so imputation is skipped in that case.
+    modes = categorical_features.mode()
+    if not modes.empty:
+        # fillna with mode
+        categorical_features = categorical_features.fillna(modes.iloc[0])
+    # one-hot encoding
+    categorical_features = categorical_features.apply(
+        lambda col: col.astype("category")
+    )
+    if not categorical_features.empty:
+        categorical_features = pd.get_dummies(categorical_features)
+
+    numerical_features = model_input.drop(categorical_feature_col_names, axis=1)
+
+    return pd.concat([numerical_features, categorical_features], axis=1)
+
+
+def prepare_sklearn_data_format(
+    model_input: pd.DataFrame, cv_method: str = "logo"
+) -> tuple:
+    """Split the model input into features, target and cross-validation groups.
+
+    Parameters
+    ----------
+    model_input : pd.DataFrame
+        Table with the four ``local_segment*`` columns, a "pid" column, a
+        "target" column and the feature columns. It is modified in place: the
+        segment columns become its index and, for "half_logo", the helper
+        columns "pid_index", "pid_count" and "pid_half" are added.
+    cv_method : str
+        With "half_logo" the rows of every pid are split in two halves, in
+        row order, and each half forms its own group ("<pid>_1", "<pid>_2").
+        With any other value the groups are the pids themselves.
+
+    Returns
+    -------
+    tuple
+        ``(data_x, data_y, data_groups)``.
+    """
+    index_columns = [
+        "local_segment",
+        "local_segment_label",
+        "local_segment_start_datetime",
+        "local_segment_end_datetime",
+    ]
+    model_input.set_index(index_columns, inplace=True)
+
+    if cv_method == "half_logo":
+        model_input["pid_index"] = model_input.groupby("pid").cumcount()
+        model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
+
+        # Relative position within the pid, shifted to [1, 2) and rounded, so the
+        # result is 1 for the first half and 2 for the second half.
+        model_input["pid_index"] = (
+            model_input["pid_index"] / model_input["pid_count"] + 1
+        ).round()
+        model_input["pid_half"] = (
+            model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
+        )
+
+        # All helper columns are removed from the features; "pid_count" was
+        # previously left in and ended up as a feature in data_x.
+        helper_columns = ["pid", "pid_index", "pid_count", "pid_half"]
+        data_x, data_y, data_groups = (
+            model_input.drop(["target"] + helper_columns, axis=1),
+            model_input["target"],
+            model_input["pid_half"],
+        )
+    else:
+        data_x, data_y, data_groups = (
+            model_input.drop(["target", "pid"], axis=1),
+            model_input["target"],
+            model_input["pid"],
+        )
+    return data_x, data_y, data_groups
+
+
+def prepare_cross_validator(
+    data_x: pd.DataFrame,
+    data_y: pd.DataFrame,
+    data_groups: pd.DataFrame,
+    cv_method: str = "logo",
+) -> BaseCrossValidator:
+    """Create the cross validator that corresponds to ``cv_method``.
+
+    "logo" and "half_logo" give a ``LeaveOneGroupOut`` splitter; any other
+    value gives a shuffled 5-fold ``StratifiedKFold``. No random state is set
+    for the latter.
+    """
+    if cv_method not in ("logo", "half_logo"):
+        return StratifiedKFold(n_splits=5, shuffle=True)
+
+    cv = LeaveOneGroupOut()
+    # The returned count is not used; the call is kept so that whatever checks
+    # it performs on the groups still run here.
+    cv.get_n_splits(data_x, data_y, groups=data_groups)
+    return cv
+
+
+def aggregate_and_transpose(df: pd.DataFrame, statistics=None) -> pd.DataFrame:
+    """Aggregate every column of ``df`` and return one row per column.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Table whose columns are aggregated.
+    statistics : list, optional
+        Aggregations passed to ``DataFrame.agg``; ["max", "mean"] by default.
+
+    Returns
+    -------
+    pd.DataFrame
+        A "test_metric" column holding the original column names, followed
+        by one column per statistic.
+    """
+    chosen = ["max", "mean"] if statistics is None else statistics
+    per_column = df.agg(chosen).transpose()
+    flat = per_column.reset_index()
+    return flat.rename(columns={"index": "test_metric"})
+
+
+def run_all_regression_models(
+    data_x: pd.DataFrame,
+    data_y: pd.DataFrame,
+    data_groups: pd.DataFrame,
+    cross_validator: BaseCrossValidator,
+) -> pd.DataFrame:
+    """Cross-validate a fixed set of regressors and tabulate their test scores.
+
+    Every model is evaluated with ``sklearn.model_selection.cross_validate`` on
+    R^2, negative mean absolute error and negative root mean squared error.
+    A progress message is printed after each model; for the dummy and the
+    linear regression the median R^2 is printed as well.
+
+    Parameters
+    ----------
+    data_x, data_y, data_groups
+        Features, target and group labels, passed on to ``cross_validate``.
+    cross_validator : BaseCrossValidator
+        Splitter used for every model.
+
+    Returns
+    -------
+    pd.DataFrame
+        Columns "method", "test_metric", "max" and "nanmedian"; one row per
+        model and metric. The index is not reset between models.
+    """
+    metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
+    test_metrics = ["test_" + metric for metric in metrics]
+    scores = pd.DataFrame(columns=["method", "test_metric", "max", "nanmedian"])
+
+    # (method label, progress message, print median R^2?, estimator factory).
+    # Factories postpone construction so that only one estimator exists at a time.
+    models = [
+        ("dummy", "Dummy model:", True, lambda: DummyRegressor(strategy="mean")),
+        ("linear_reg", "Linear regression:", True, linear_model.LinearRegression),
+        ("ridge_reg", "Ridge regression", False, lambda: linear_model.Ridge(alpha=0.5)),
+        ("lasso_reg", "Lasso regression", False, lambda: linear_model.Lasso(alpha=0.1)),
+        ("bayesian_ridge", "Bayesian Ridge", False, linear_model.BayesianRidge),
+        (
+            "RANSAC",
+            "RANSAC (outlier robust regression)",
+            False,
+            linear_model.RANSACRegressor,
+        ),
+        ("SVR", "Support vector regression", False, svm.SVR),
+        ("kernel_ridge", "Kernel Ridge regression", False, kernel_ridge.KernelRidge),
+        (
+            "gaussian_proc",
+            "Gaussian Process Regression",
+            False,
+            gaussian_process.GaussianProcessRegressor,
+        ),
+        (
+            "random_forest",
+            "Random Forest Regression",
+            False,
+            lambda: ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1),
+        ),
+        ("XGBoost", "XGBoost Regressor", False, XGBRegressor),
+        ("ADA_boost", "ADA Boost Regressor", False, ensemble.AdaBoostRegressor),
+    ]
+
+    # Validate models
+    for method, message, report_r2, make_model in models:
+        cv_scores = cross_validate(
+            make_model(),
+            X=data_x,
+            y=data_y,
+            groups=data_groups,
+            cv=cross_validator,
+            n_jobs=-1,
+            scoring=metrics,
+        )
+        print(message)
+        if report_r2:
+            print("R^2: ", np.nanmedian(cv_scores["test_r2"]))
+
+        scores_df = pd.DataFrame(cv_scores)[test_metrics]
+        scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+        scores_df["method"] = method
+        scores = pd.concat([scores, scores_df])
+
+    return scores
+
+
+def confusion_matrix_scorer(clf, X, y):
+    """Score a fitted classifier by the four cells of its confusion matrix.
+
+    Parameters
+    ----------
+    clf
+        Fitted classifier; its ``classes_`` attribute, when present, fixes the
+        row and column order of the confusion matrix.
+    X, y
+        Test features and true labels.
+
+    Returns
+    -------
+    dict
+        Keys "tn", "fp", "fn" and "tp", read from the top-left 2x2 corner of
+        the confusion matrix.
+    """
+    y_pred = clf.predict(X)
+    # Without explicit labels the matrix only spans labels that occur in this
+    # fold, so a fold with a single class gives a 1x1 matrix and the indexing
+    # below raises IndexError. Pinning the labels to the training classes keeps
+    # the shape stable across folds.
+    labels = getattr(clf, "classes_", None)
+    cm = confusion_matrix(y, y_pred, labels=labels)
+    return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]}
+
+
+def aggregate_confusion_matrix(scores_dict: dict) -> pd.DataFrame:
+    """Sum per-fold confusion matrix entries and drop the timing rows.
+
+    ``scores_dict`` is converted to a dataframe, every column is summed, and
+    the rows for "fit_time" and "score_time" are removed from the result.
+    """
+    totals = aggregate_and_transpose(pd.DataFrame(scores_dict), statistics=["sum"])
+    is_timing_row = totals["test_metric"].isin(["fit_time", "score_time"])
+    return totals[~is_timing_row]
+
+
+def run_all_classification_models(
+    data_x: pd.DataFrame,
+    data_y: pd.DataFrame,
+    data_groups: pd.DataFrame,
+    cross_validator: BaseCrossValidator,
+) -> pd.DataFrame:
+    """Cross-validate a fixed set of classifiers and tabulate their test scores.
+
+    Every model is cross-validated twice with the same settings: once for the
+    scalar metrics and once with ``confusion_matrix_scorer``. A progress
+    message is printed after each model.
+
+    Parameters
+    ----------
+    data_x, data_y, data_groups
+        Features, target and group labels, passed on to ``cross_validate``.
+    cross_validator : BaseCrossValidator
+        Splitter used for every model.
+
+    Returns
+    -------
+    pd.DataFrame
+        Columns "method", "test_metric", "max" and "mean". Besides the metric
+        rows, each model has confusion matrix rows whose summed counts are
+        stored in the "mean" column.
+
+    Raises
+    ------
+    ValueError
+        If ``data_y`` contains a single unique value.
+    """
+    data_y_value_counts = data_y.value_counts()
+    if len(data_y_value_counts) == 1:
+        raise ValueError("There is only one unique value in data_y.")
+    if len(data_y_value_counts) == 2:
+        metrics = ["accuracy", "average_precision", "recall", "f1"]
+    else:
+        metrics = ["accuracy", "precision_micro", "recall_micro", "f1_micro"]
+
+    test_metrics = ["test_" + metric for metric in metrics]
+
+    scores = pd.DataFrame(columns=["method", "test_metric", "max", "mean"])
+
+    # (method label, progress message, estimator factory, extra cross_validate
+    # keyword arguments). Only the dummy classifier re-raises fitting errors.
+    models = [
+        (
+            "dummy_classifier",
+            "Dummy",
+            lambda: DummyClassifier(strategy="most_frequent"),
+            {"error_score": "raise"},
+        ),
+        (
+            "logistic_regression",
+            "Logistic regression",
+            linear_model.LogisticRegression,
+            {},
+        ),
+        ("SVC", "Support Vector Machine", svm.SVC, {}),
+        ("gaussian_naive_bayes", "Gaussian Naive Bayes", naive_bayes.GaussianNB, {}),
+        (
+            "stochastic_gradient_descent_classifier",
+            "Stochastic Gradient Descent",
+            linear_model.SGDClassifier,
+            {},
+        ),
+        (
+            "random_forest_classifier",
+            "Random Forest",
+            ensemble.RandomForestClassifier,
+            {},
+        ),
+        ("XGBoost_classifier", "XGBoost", XGBClassifier, {}),
+    ]
+
+    for method, message, make_model, extra_cv_kwargs in models:
+        classifier = make_model()
+        cv_kwargs = dict(
+            X=data_x,
+            y=data_y,
+            groups=data_groups,
+            cv=cross_validator,
+            n_jobs=-1,
+            **extra_cv_kwargs,
+        )
+        metric_scores = cross_validate(classifier, scoring=metrics, **cv_kwargs)
+        confusion_scores = cross_validate(
+            classifier, scoring=confusion_matrix_scorer, **cv_kwargs
+        )
+        print(message)
+
+        scores_df = pd.DataFrame(metric_scores)[test_metrics]
+        scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
+        scores_df = pd.concat(
+            [
+                scores_df,
+                aggregate_confusion_matrix(confusion_scores).rename(
+                    columns={"sum": "mean"}
+                    # Note: the column is misleadingly renamed to get concise output.
+                ),
+            ]
+        )
+        scores_df["method"] = method
+        scores = pd.concat([scores, scores_df])
+
+    return scores
--- a/machine_learning/labels.py
+++ b/machine_learning/labels.py
@ -0,0 +1,135 @@
+import datetime
+import warnings
+from pathlib import Path
+from typing import Collection
+
+import pandas as pd
+from pyprojroot import here
+
+import participants.query_db
+from features import esm
+from machine_learning import QUESTIONNAIRE_IDS, QUESTIONNAIRE_IDS_RENAME
+from machine_learning.helper import read_csv_with_settings, to_csv_with_settings
+
+WARNING_PARTICIPANTS_LABEL = (
+    "Before aggregating labels, please set participants label using self.set_participants_label() "
+    "to be used as a filename prefix when exporting data. "
+    "The filename will be of the form: %participants_label_%grouping_variable_%data_type.csv"
+)
+
+
+class Labels:
+    """Retrieve ESM questionnaire answers and aggregate them into labels.
+
+    Typical use: ``set_labels()`` to query and clean the data, then
+    ``aggregate_labels()`` to compute (or read from a cached CSV) the mean
+    answer per participant, grouping variable and questionnaire.
+    """
+
+    def __init__(
+        self,
+        grouping_variable: str,
+        labels: dict,
+        participants_usernames: Collection = None,
+    ) -> None:
+        """Store the configuration and prepare empty result containers.
+
+        Parameters
+        ----------
+        grouping_variable : str
+            Name of the column used for grouping besides the participant.
+        labels : dict
+            Its keys are the names of the requested questionnaires.
+        participants_usernames : Collection, optional
+            Usernames to query. When omitted, usernames are fetched with
+            ``participants.query_db.get_usernames`` and the participants label
+            is set to "all".
+        """
+        self.grouping_variable_name = grouping_variable
+        # Kept as a list so it can be concatenated with other column lists.
+        self.grouping_variable = [grouping_variable]
+
+        self.questionnaires = labels.keys()
+
+        self.participants_label: str = ""
+        if participants_usernames is None:
+            participants_usernames = participants.query_db.get_usernames(
+                collection_start=datetime.date.fromisoformat("2020-08-01")
+            )
+            self.participants_label = "all"
+        self.participants_usernames = participants_usernames
+
+        self.df_esm = pd.DataFrame()
+        self.df_esm_preprocessed = pd.DataFrame()
+        self.df_esm_interest = pd.DataFrame()
+        self.df_esm_clean = pd.DataFrame()
+
+        self.df_esm_means = pd.DataFrame()
+
+        self.folder: Path = Path()
+        self.filename_prefix = ""
+        # Issues a UserWarning when usernames were passed explicitly, because the
+        # participants label is still empty at this point.
+        self.construct_export_path()
+        print("Labels initialized.")
+
+    def set_labels(self) -> None:
+        """Query the ESM data, preprocess it, select questionnaires and clean it."""
+        print("Querying database ...")
+        self.df_esm = esm.get_esm_data(self.participants_usernames)
+        print("Got ESM data from the DB.")
+        self.df_esm_preprocessed = esm.preprocess_esm(self.df_esm)
+        print("ESM data preprocessed.")
+        if "PANAS" in self.questionnaires:
+            # Keep only the rows of the two PANAS questionnaire ids (PA and NA).
+            self.df_esm_interest = self.df_esm_preprocessed[
+                (
+                    self.df_esm_preprocessed["questionnaire_id"]
+                    == QUESTIONNAIRE_IDS.get("PANAS").get("PA")
+                )
+                | (
+                    self.df_esm_preprocessed["questionnaire_id"]
+                    == QUESTIONNAIRE_IDS.get("PANAS").get("NA")
+                )
+            ]
+        # NOTE(review): when "PANAS" was not requested, df_esm_interest is still
+        # the empty dataframe from __init__ at this point.
+        self.df_esm_clean = esm.clean_up_esm(self.df_esm_interest)
+        print("ESM data cleaned.")
+
+    def get_labels(self, questionnaire: str) -> pd.DataFrame:
+        """Return the cleaned ESM data; only "PANAS" is supported.
+
+        Raises
+        ------
+        KeyError
+            For any other questionnaire name.
+        """
+        if questionnaire == "PANAS":
+            return self.df_esm_clean
+        else:
+            raise KeyError("This questionnaire has not been implemented as a label.")
+
+    def aggregate_labels(self, cached=True) -> None:
+        """Compute mean answers per participant and grouping variable.
+
+        With ``cached=True`` the result is read from the exported CSV file if
+        it exists. Otherwise, or when the file is missing, the means are
+        calculated from ``df_esm_clean`` and exported to that file.
+
+        Raises
+        ------
+        ValueError
+            If no participants label has been set.
+        """
+        print("Aggregating labels ...")
+        if not self.participants_label:
+            raise ValueError(WARNING_PARTICIPANTS_LABEL)
+
+        try:
+            if not cached:  # Do not use the file, even if it exists.
+                raise FileNotFoundError
+            self.df_esm_means = read_csv_with_settings(
+                self.folder,
+                self.filename_prefix,
+                data_type="_".join(self.questionnaires),
+                grouping_variable=self.grouping_variable,
+            )
+            print("Read labels from the file.")
+        except FileNotFoundError:
+            # We need to recalculate the features in this case.
+            # Long format: one mean per participant, questionnaire and group.
+            self.df_esm_means = (
+                self.df_esm_clean.groupby(
+                    ["participant_id", "questionnaire_id"] + self.grouping_variable
+                )
+                .esm_user_answer_numeric.agg("mean")
+                .reset_index()
+                .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
+            )
+            # Wide format: one column per questionnaire id, renamed via
+            # QUESTIONNAIRE_IDS_RENAME, indexed by participant and group.
+            self.df_esm_means = (
+                self.df_esm_means.pivot(
+                    index=["participant_id"] + self.grouping_variable,
+                    columns="questionnaire_id",
+                    values="esm_numeric_mean",
+                )
+                .reset_index(col_level=1)
+                .rename(columns=QUESTIONNAIRE_IDS_RENAME)
+                .set_index(["participant_id"] + self.grouping_variable)
+            )
+            print("Labels aggregated.")
+            to_csv_with_settings(
+                self.df_esm_means,
+                self.folder,
+                self.filename_prefix,
+                data_type="_".join(self.questionnaires),
+            )
+
+    def get_aggregated_labels(self) -> pd.DataFrame:
+        """Return the aggregated labels (empty until aggregate_labels is run)."""
+        return self.df_esm_means
+
+    def construct_export_path(self) -> None:
+        """Set the export folder and filename prefix; warn if the label is empty."""
+        if not self.participants_label:
+            warnings.warn(WARNING_PARTICIPANTS_LABEL, UserWarning)
+        self.folder = here("machine_learning/intermediate_results/labels", warn=True)
+        self.filename_prefix = (
+            self.participants_label + "_" + self.grouping_variable_name
+        )
+
+    def set_participants_label(self, label: str) -> None:
+        """Store the participants label and rebuild the export path from it."""
+        self.participants_label = label
+        self.construct_export_path()
--- a/machine_learning/model.py
+++ b/machine_learning/model.py
@ -0,0 +1,47 @@
+from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
+
+
+class ModelValidation:
+    """Cross-validate a single model on aligned features and labels.
+
+    Usage: create the object, assign an estimator to ``.model``, call
+    ``set_cv_method()`` and then ``cross_validate()``.
+    """
+
+    def __init__(self, X, y, group_variable=None, cv_name="loso"):
+        """Align features and labels on their common index.
+
+        Parameters
+        ----------
+        X : pd.DataFrame
+            Features.
+        y : pd.DataFrame
+            Labels; only the column "NA" is used.
+        group_variable
+            Index level whose values define the cross-validation groups.
+        cv_name : str
+            Name of the validation scheme; only "loso" is implemented.
+        """
+        self.model = None
+        self.cv = None
+
+        idx_common = X.index.intersection(y.index)
+        self.y = y.loc[idx_common, "NA"]
+        # TODO Handle the case of multiple labels.
+        self.X = X.loc[idx_common]
+        self.groups = self.y.index.get_level_values(group_variable)
+
+        self.cv_name = cv_name
+        print("ModelValidation initialized.")
+
+    def set_cv_method(self):
+        """Create the cross validator that ``cv_name`` asks for.
+
+        For an unrecognised ``cv_name`` nothing is set and this is reported,
+        instead of claiming that a validation method was set.
+        """
+        if self.cv_name == "loso":
+            self.cv = LeaveOneGroupOut()
+            self.cv.get_n_splits(X=self.X, y=self.y, groups=self.groups)
+            print("Validation method set.")
+        else:
+            print(
+                f"Unknown validation method {self.cv_name!r}; "
+                "no cross validator was set."
+            )
+
+    def cross_validate(self):
+        """Return the R^2 score of every cross-validation split.
+
+        Raises
+        ------
+        TypeError
+            If no model or no cross validator has been set.
+        ValueError
+            If X or y contain missing values.
+        """
+        print("Running cross validation ...")
+        if self.model is None:
+            raise TypeError(
+                "Please, specify a machine learning model first, by setting the .model attribute. "
+                "E.g. self.model = sklearn.linear_model.LinearRegression()"
+            )
+        if self.cv is None:
+            raise TypeError(
+                "Please, specify a cross validation method first, by using set_cv_method() first."
+            )
+        if self.X.isna().any().any() or self.y.isna().any().any():
+            raise ValueError(
+                "NaNs were found in either X or y. Please, check your data before continuing."
+            )
+        return cross_val_score(
+            estimator=self.model,
+            X=self.X,
+            y=self.y,
+            groups=self.groups,
+            cv=self.cv,
+            n_jobs=-1,
+            scoring="r2",
+        )
--- a/machine_learning/pipeline.py
+++ b/machine_learning/pipeline.py
@ -1,125 +1,32 @@
-import datetime
+import numpy as np
+import yaml
+from sklearn import linear_model

-import pandas as pd
-from sklearn.model_selection import cross_val_score
+from machine_learning.features_sensor import SensorFeatures
+from machine_learning.labels import Labels
+from machine_learning.model import ModelValidation

-import participants.query_db
-from features import esm, helper, proximity
-from machine_learning import QUESTIONNAIRE_IDS, QUESTIONNAIRE_IDS_RENAME
+if __name__ == "__main__":
+    with open("./config/prox_comm_PANAS_features.yaml", "r") as file:
+        sensor_features_params = yaml.safe_load(file)
+    sensor_features = SensorFeatures(**sensor_features_params)
+    sensor_features.set_sensor_data()
+    sensor_features.calculate_features()

+    with open("./config/prox_comm_PANAS_labels.yaml", "r") as file:
+        labels_params = yaml.safe_load(file)
+    labels = Labels(**labels_params)
+    labels.set_labels()
+    labels.aggregate_labels()

-class MachineLearningPipeline:
-    def __init__(
-        self,
-        labels_questionnaire,
-        labels_scale,
-        data_types,
-        participants_usernames=None,
-        feature_names=None,
-        grouping_variable=None,
-    ):
-        if participants_usernames is None:
-            participants_usernames = participants.query_db.get_usernames(
-                collection_start=datetime.date.fromisoformat("2020-08-01")
-            )
-        self.participants_usernames = participants_usernames
-        self.labels_questionnaire = labels_questionnaire
-        self.data_types = data_types
-
-        if feature_names is None:
-            self.feature_names = []
-        self.df_features = pd.DataFrame()
-        self.labels_scale = labels_scale
-        self.df_labels = pd.DataFrame()
-        self.grouping_variable = grouping_variable
-        self.df_groups = pd.DataFrame()
-
-        self.model = None
-        self.validation_method = None
-
-        self.df_esm = pd.DataFrame()
-        self.df_esm_preprocessed = pd.DataFrame()
-        self.df_esm_interest = pd.DataFrame()
-        self.df_esm_clean = pd.DataFrame()
-
-        self.df_proximity = pd.DataFrame()
-
-        self.df_full_data_daily_means = pd.DataFrame()
-        self.df_esm_daily_means = pd.DataFrame()
-        self.df_proximity_daily_counts = pd.DataFrame()
-
-    def get_labels(self):
-        self.df_esm = esm.get_esm_data(self.participants_usernames)
-        self.df_esm_preprocessed = esm.preprocess_esm(self.df_esm)
-        if self.labels_questionnaire == "PANAS":
-            self.df_esm_interest = self.df_esm_preprocessed[
-                (
-                    self.df_esm_preprocessed["questionnaire_id"]
-                    == QUESTIONNAIRE_IDS.get("PANAS").get("PA")
-                )
-                | (
-                    self.df_esm_preprocessed["questionnaire_id"]
-                    == QUESTIONNAIRE_IDS.get("PANAS").get("NA")
-                )
-            ]
-        self.df_esm_clean = esm.clean_up_esm(self.df_esm_interest)
-
-    def get_sensor_data(self):
-        if "proximity" in self.data_types:
-            self.df_proximity = proximity.get_proximity_data(
-                self.participants_usernames
-            )
-            self.df_proximity = helper.get_date_from_timestamp(self.df_proximity)
-            self.df_proximity = proximity.recode_proximity(self.df_proximity)
-
-    def aggregate_daily(self):
-        self.df_esm_daily_means = (
-            self.df_esm_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
-            .esm_user_answer_numeric.agg("mean")
-            .reset_index()
-            .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
-        )
-        self.df_esm_daily_means = (
-            self.df_esm_daily_means.pivot(
-                index=["participant_id", "date_lj"],
-                columns="questionnaire_id",
-                values="esm_numeric_mean",
-            )
-            .reset_index(col_level=1)
-            .rename(columns=QUESTIONNAIRE_IDS_RENAME)
-            .set_index(["participant_id", "date_lj"])
-        )
-        self.df_full_data_daily_means = self.df_esm_daily_means.copy()
-        if "proximity" in self.data_types:
-            self.df_proximity_daily_counts = proximity.count_proximity(
-                self.df_proximity, ["participant_id", "date_lj"]
-            )
-            self.df_full_data_daily_means = self.df_full_data_daily_means.join(
-                self.df_proximity_daily_counts
-            )
-
-    def assign_columns(self):
-        self.df_features = self.df_full_data_daily_means[self.feature_names]
-        self.df_labels = self.df_full_data_daily_means[self.labels_scale]
-        if self.grouping_variable:
-            self.df_groups = self.df_full_data_daily_means[self.grouping_variable]
-        else:
-            self.df_groups = None
-
-    def validate_model(self):
-        if self.model is None:
-            raise AttributeError(
-                "Please, specify a machine learning model first, by setting the .model attribute."
-            )
-        if self.validation_method is None:
-            raise AttributeError(
-                "Please, specify a cross validation method first, by setting the .validation_method attribute."
-            )
-        cross_val_score(
-            estimator=self.model,
-            X=self.df_features,
-            y=self.df_labels,
-            groups=self.df_groups,
-            cv=self.validation_method,
-            n_jobs=-1,
+    model_validation = ModelValidation(
+        sensor_features.get_features("all", "all"),
+        labels.get_aggregated_labels(),
+        group_variable="participant_id",
+        cv_name="loso",
    )
+    model_validation.model = linear_model.LinearRegression()
+    model_validation.set_cv_method()
+    model_loso_r2 = model_validation.cross_validate()
+    print(model_loso_r2)
+    print(np.mean(model_loso_r2))
--- a/machine_learning/preprocessing.py
+++ b/machine_learning/preprocessing.py
@ -0,0 +1,133 @@
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+class Preprocessing:
+    """This class presents Preprocessing methods which can be used in context of an individual CV iteration or, simply, on whole data. 
+       It's blind to the test data - e.g, it imputes the test data with train data mean. 
+       This means, it somehow needs an access to the information about data split. In context 
+    """
+    
+
+    def __init__(self, train_X, train_y, test_X, test_y):
+        self.train_X = train_X
+        self.train_y = train_y
+        self.test_X = test_X
+        self.test_y = test_y
+
+
+    def one_hot_encoder(self, categorical_features, numerical_features, mode):
+        """
+        This code is an implementation of one-hot encoding. It takes in two data sets, 
+        one with categorical features and one with numerical features and a mode parameter. 
+        First it uses the fillna() function to fill in any missing values present in the 
+        categorical data set with the mode value. Then it uses the apply () method to 
+        convert each column of the data set into a category data type which is then 
+        transformed using the pd.get_dummies() function. Finally it concatenates the 
+        numerical data set and the transformed categorical data set using pd.concat() and 
+        returns it.
+
+        Args:
+            categorical_features (DataFrame): DataFrame including only categorical columns.
+            numerical_features (_type_): DataFrame including only numerical columns.
+            mode (int): Mode of the column with which DataFrame is filled.
+
+        Returns:
+            DataFrame: Hot-One Encoded DataFrame.
+        """
+        # Fill train set with mode
+        categorical_features = categorical_features.fillna(mode)
+
+        # one-hot encoding
+        categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+        if not categorical_features.empty:
+            categorical_features = pd.get_dummies(categorical_features)
+
+        return pd.concat([numerical_features, categorical_features], axis=1), categorical_features.columns.tolist()
+
+
+    def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
+        """
+        This code is used to transform categorical data into numerical representations. 
+        It first identifies the categorical columns, then copies them and saves them as 
+        a new dataset. The missing data is filled with the mode (most frequent value in 
+        the respective column). This new dataset is then subjected to one-hot encoding, 
+        which is a process of transforming categorical data into machine interpretable 
+        numerical form by converting categories into multiple binary outcome variables. 
+        These encoded values are then concatenated to the numerical features prior to 
+        being returned as the final dataset.
+
+        Args:
+            categorical_columns (list, optional): List of categorical columns in the dataset. 
+                Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"].
+        
+        """
+        categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
+
+        # For train set
+        train_X_categorical_features = self.train_X[categorical_columns].copy()
+        train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
+        mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
+        
+        self.train_X, train_cat_col_names = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
+        encoded_categorical_features = [col for col in self.train_X.columns if col.startswith(tuple(categorical_columns))]
+        
+        # For test set
+        test_X_categorical_features = self.test_X[categorical_columns].copy()
+        test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
+        
+        self.test_X, test_cat_col_names = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
+
+        # Create categorical columns that were not found in test set and fill them with 0        
+        missing_cols = [col for col in train_cat_col_names if col not in test_cat_col_names]
+        self.test_X[missing_cols] = 0
+        
+        # Sort column names alphabetically        
+        self.train_X = self.train_X.reindex(sorted(self.train_X.columns), axis=1)
+        self.test_X = self.test_X.reindex(sorted(self.test_X.columns), axis=1)
+        
+
+    def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
+        
+        # TODO: TESTING
+        
+        if groupby:
+            # Interval numerical features # TODO: How can we get and assign appropriate groupby means and assign them to correct columns?
+            
+            # VVVVV ......  IN PROGRES ...... VVVVV
+            means = self.train_X[interval_feature_list].groupby(groupby_feature).mean() 
+            self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + other_feature_list)]] = \
+                self.train_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
+                
+            self.test_X[self.test_X.loc[:, ~self.test_X.columns.isin([groupby_feature] + other_feature_list)]] = \
+                self.test_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
+                
+            # Other features
+            self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + interval_feature_list)]] = \
+                self.train_X[other_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.median()))
+            
+        else:
+            # Interval numerical features
+            means = self.train_X[interval_feature_list].mean()
+            self.train_X[interval_feature_list].fillna(means, inplace=True)
+            self.test_X[interval_feature_list].fillna(means, inplace=True)
+                    
+            # Other features
+            medians = self.train_X[other_feature_list].median()
+            self.train_X[other_feature_list].fillna(medians, inplace=True)
+            self.test_X[other_feature_list].fillna(medians, inplace=True)
+            
+            
+    def get_train_test_sets(self):
+        """Train and test sets getter
+
+        Returns:
+            tuple of Pandas DataFrames: Gets train test sets in traditional sklearn format.
+        """
+        return self.train_X, self.train_y, self.test_X, self.test_y
+        
+        
+
--- a/participants/prepare_usernames_file.py
+++ b/participants/prepare_usernames_file.py
@ -0,0 +1,69 @@
+import datetime
+import os
+import sys
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import pandas as pd
+from features.timezone import get_timezone_data
+from pyprojroot import here
+
+import participants.query_db
+
+participants_inactive_usernames = participants.query_db.get_usernames(
+    tester=False,  # True participants are wanted.
+    active=False,  # They have all finished their participation.
+    collection_start=datetime.date.fromisoformat(
+        "2020-08-01"
+    ),  # This is the timeframe of the main study.
+    last_upload=datetime.date.fromisoformat("2021-09-01"),
+)
+
+participants_overview_si = pd.read_csv(
+    snakemake.params["baseline_folder"] + "Participants_overview_Slovenia.csv", sep=";"
+)
+participants_overview_be = pd.read_csv(
+    snakemake.params["baseline_folder"]+ "Participants_overview_Belgium.csv", sep=";"
+)
+
+participants_true_si = participants_overview_si[
+    participants_overview_si["Wristband_SerialNo"] != "DECLINED"
+]
+participants_true_be = participants_overview_be[
+    participants_overview_be["SmartphoneBrand+Generation"].str.slice(0, 3) != "Not"
+]
+
+# Concatenate participants from both countries.
+participants_usernames_empatica = pd.concat(
+    [participants_true_be, participants_true_si]
+)
+# Filter only the participants from the main study (queried from the database).
+participants_usernames_empatica = participants_usernames_empatica[
+    participants_usernames_empatica["Username"].isin(participants_inactive_usernames)
+]
+# Rename and select columns.
+participants_usernames_empatica = participants_usernames_empatica.rename(
+    columns={"Username": "label", "Wristband_SerialNo": "empatica_id"}
+)[["label", "empatica_id"]]
+# Adapt for csv export.
+participants_usernames_empatica["empatica_id"] = participants_usernames_empatica[
+    "empatica_id"
+].str.replace(",", ";")
+
+participants_usernames_empatica.to_csv(
+    snakemake.output["usernames_file"],
+    header=True,
+    index=False,
+    line_terminator="\n",
+)
+
+timezone_df = get_timezone_data(participants_inactive_usernames)
+
+timezone_df.to_csv(
+    snakemake.output["timezone_file"],
+    header=True,
+    index=False,
+    line_terminator="\n",
+)
--- a/presentation/ApplicationCategories.R
+++ b/presentation/ApplicationCategories.R
@ -0,0 +1,147 @@
+library(conflicted)
+library(yaml)
+library(RPostgreSQL)
+library(tidyverse)
+conflicts_prefer(
+  dplyr::filter,
+  dplyr::lag
+)
+library(magrittr)
+
+# Read the database password from the RAPIDS credentials file.
+credentials <- yaml.load_file("../rapids/credentials.yaml")
+pw <- credentials$PSQL_STRAW$password
+
+# Create the PostgreSQL driver object.
+drv <- RPostgres::Postgres()
+
+# Create a connection to the Postgres database.
+# Note that "con" is used for every database access below and is closed
+# at the end of the script.
+con <- RPostgres::dbConnect(drv,
+  dbname = "staw",
+  host = "eol.ijs.si", port = 5432,
+  user = "staw_db", password = pw
+)
+
+rm(pw, credentials) # Remove the password from the workspace.
+
+# Check that the app_categories table exists.
+dbExistsTable(con, "app_categories")
+
+# Pull the whole table into a local data frame.
+df_app_categories <- tbl(con, "app_categories") %>%
+  collect()
+
+# Quick look at the data and at the frequency of each Play Store genre.
+head(df_app_categories)
+table(df_app_categories$play_store_genre)
+
+# Count the HTTP responses recorded for apps without a Play Store genre.
+df_app_categories %>%
+  filter(play_store_genre == "not_found") %>%
+  group_by(play_store_response) %>%
+  count()
+# All "not_found" have an HTTP status of 404.
+
+# Count how often each "not_found" package name occurs, most frequent first.
+df_app_categories %>%
+  filter(play_store_genre == "not_found") %>%
+  group_by(package_name) %>%
+  count() %>%
+  arrange(desc(n))
+# All "not_found" apps are unique.
+
+# Exclude phone manufacturers, custom ROM names and similar.
+manufacturers <- c(
+  "samsung",
+  "oneplus",
+  "huawei",
+  "xiaomi",
+  "lge",
+  "motorola",
+  "miui",
+  "lenovo",
+  "oppo",
+  "mediatek"
+)
+custom_rom <- c("coloros", "lineageos", "myos", "cyanogenmod", "foundation.e")
+other <- c("android", "wssyncmldm")
+
+# A single regular expression that matches any of the names above.
+# The names are used as regex fragments, so "." in "foundation.e" matches any
+# character.
+grep_pattern <- paste(c(manufacturers, custom_rom, other), collapse = "|")
+
+# TRUE for rows whose package name matches the pattern (case-sensitive here,
+# unlike the lower-cased matching in the manual classification below).
+rows_os_manufacturer <- grepl(grep_pattern, df_app_categories$package_name)
+
+# Explore what remains after excluding above.
+df_app_categories[!rows_os_manufacturer, ] %>%
+  filter(play_store_genre == "not_found")
+
+# Also check the relationship between is_system_app and System category.
+# This counts, in the database, system apps whose genre is not "System".
+tbl(con, "applications") %>% 
+  filter(is_system_app, play_store_genre != "System") %>% 
+  count()
+# They are perfectly correlated.
+
+# Manually classify apps
+# Only the "not_found" rows are replaced. case_when() uses the first matching
+# condition, so the order of the rules below matters; rows that match no rule
+# keep "not_found" through .default.
+df_app_categories[df_app_categories$play_store_genre == "not_found",] <- 
+  df_app_categories %>% 
+  filter(play_store_genre == "not_found") %>% 
+  mutate(
+    play_store_genre =
+      case_when(
+        str_detect(str_to_lower(package_name), grep_pattern) ~ "System",
+        str_detect(str_to_lower(package_name), "straw") ~ "STRAW",
+        str_detect(str_to_lower(package_name), "chromium") ~ "Communication", # Same as chrome.
+        str_detect(str_to_lower(package_name), "skype") ~ "Communication", # Skype Lite not classified.
+        str_detect(str_to_lower(package_name), "imsservice") ~ "Communication", # IP Multimedia Subsystem
+        str_detect(str_to_lower(package_name), paste(c("covid", "empatica"), collapse = "|")) ~ "Medical",
+        str_detect(str_to_lower(package_name), paste(c("libri", "tachiyomi"), collapse = "|")) ~ "Books & Reference",
+        str_detect(str_to_lower(package_name), paste(c("bricks", "chess"), collapse = "|")) ~ "Casual",
+        str_detect(str_to_lower(package_name), "weather") ~ "Weather",
+        str_detect(str_to_lower(package_name), "excel") ~ "Productivity",
+        str_detect(str_to_lower(package_name), paste(c("qr", "barcode", "archimedes", "mixplorer", "winrar", "filemanager", "shot", "faceunlock", "signin", "milink"), collapse = "|")) ~ "Tools",
+        str_detect(str_to_lower(package_name), "stupeflix") ~ "Photography",
+        str_detect(str_to_lower(package_name), "anyme") ~ "Entertainment",
+        str_detect(str_to_lower(package_name), "vanced") ~ "Video Players & Editors",
+        str_detect(str_to_lower(package_name), paste(c("music", "radio", "dolby"), collapse = "|")) ~ "Music & Audio",
+        str_detect(str_to_lower(package_name), paste(c("tensorflow", "object_detection"), collapse = "|")) ~ "Education",
+        .default = play_store_genre
+      )
+  )
+
+# Explore what remains after classifying above.
+df_app_categories %>%
+  filter(play_store_genre == "not_found")
+
+# After this, 13 applications remain, which I will classify as "Other".
+
+# Correct malformed genre labels and file the remaining 'not_found' apps
+# under "Other". Afterwards, drop the plain-text package name and expose the
+# package hash under the name package_name instead.
+df_app_categories <- df_app_categories %>%
+  mutate(
+    play_store_genre = case_when(
+      play_store_genre %in% c("Education,Education", "EducationEducation") ~ "Education",
+      play_store_genre == "not_found" ~ "Other",
+      .default = play_store_genre
+    )
+  ) %>%
+  select(-package_name) %>%
+  rename(
+    genre = play_store_genre,
+    package_name = package_hash
+  )
+
+# Frequency of each genre after the corrections.
+table(df_app_categories$genre)
+
+# Save the number of applications per genre, most frequent genre first.
+genre_counts <- df_app_categories %>%
+  group_by(genre) %>%
+  count() %>%
+  arrange(desc(n))
+write_csv(genre_counts, "play_store_categories_count.csv")
+
+# Save the catalogue that maps each (hashed) package name to its genre.
+df_app_categories %>%
+  select(package_name, genre) %>%
+  write_csv(file = "play_store_application_genre_catalogue.csv")
+
+dbDisconnect(con)
--- a/presentation/StressfulEvents.Rmd
+++ b/presentation/StressfulEvents.Rmd
@ -0,0 +1,103 @@
+---
+title: "Stressful event detection"
+output: html_notebook
+---
+
+```{r chunk_options, include=FALSE}
+knitr::opts_chunk$set(
+  comment = "#>", echo = FALSE, fig.width = 6
+)
+```
+
+```{r libraries, include=FALSE}
+library(knitr)
+library(kableExtra)
+library(stringr)
+library(RColorBrewer)
+library(magrittr)
+library(tidyverse)
+```
+
+```{r fig_setup, include=FALSE}
+accent <- RColorBrewer::brewer.pal(7, "Accent")
+```
+
+
+```{r read_data, include=FALSE}
+# "podatki" is Slovenian for "data".
+# NOTE(review): absolute, machine-specific path; the notebook only runs where
+# this file exists.
+podatki <- read_csv("E:/STRAWresults/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
+# Convert the participant ID to a factor.
+podatki %<>% mutate(pid = as_factor(pid))
+```
+
+# Event descriptions
+
+Participants were asked "Was there a particular event that created tension in you?" with the following options:
+
+- 0 - No	
+- 1 - Yes, slightly	
+- 2 - Yes, moderately	
+- 3 - Yes, considerably	
+- 4 - Yes, extremely
+
+If they answered anything but "No", they were also asked about the event's perceived threat (e.g. "Did this event make you feel anxious?") and challenge (e.g. "How eager are you to tackle this event?"). 
+We only consider general "stressfulness" in this presentation.
+
+Most of the time, nothing stressful happened:
+ 
+```{r target_table}
+# Frequency table of the target values (stressfulness ratings).
+kable(table(podatki$target), col.names = c("stressfulness", "frequency")) %>% 
+  kable_styling(full_width = FALSE)
+```
+
+Most participants had somewhere between 0 and 10 stressful events.
+
+```{r target_distribution}
+# Histogram of the number of rows with target > 0 per participant.
+podatki %>% 
+  group_by(pid) %>% 
+  summarise(no_of_events = sum(target > 0)) %>% 
+  ggplot(aes(no_of_events)) +
+  geom_histogram(binwidth = 1, fill = accent[1]) +
+  coord_cartesian(expand = FALSE) +
+  labs(x = "Number of events per participant") +
+  theme_classic()
+```
+
+When a stressful event occurred, participants mostly perceived it as slightly to moderately stressful on average.
+
+```{r mean_stressfulness_distribution}
+# Histogram of the per-participant mean target, using only rows with target > 0.
+podatki %>% 
+  filter(target > 0) %>% 
+  group_by(pid) %>% 
+  summarise(mean_stressfulness = mean(target)) %>% 
+  ggplot(aes(mean_stressfulness)) +
+  geom_histogram(binwidth = 0.1, fill = accent[1]) +
+  coord_cartesian(expand = FALSE) +
+  labs(x = "Mean stressfulness per participant") +
+  theme_classic()
+```
+
+# Problem description
+
+We are trying to predict whether a stressful event occurred, i.e. stressfulness > 0, or not (stressfulness == 0).
+First, we set up a leave-one-subject-out validation and use the original distribution of the class variable.
+
+For this, the majority classifier has a mean accuracy of 0.85 (and median 0.90), while the F1-score, precision and recall are all 0.
+
+We also have the option to validate the results differently, such as with "half-loso", i.e. leaving half of the subject's data in the training set and only using the other half for testing, or with k-fold cross-validation.
+Additionally, we can undersample the majority class to balance the dataset.
+
+# Results
+## Leave one subject out, original distribution
+
+```{r event_detection}
+# Read the scores; col_types "ffdd" parses two factor and two double columns.
+scores <- read_csv("event_stressful_detection_loso.csv", col_types = "ffdd")
+# Keep the mean only (drop max), spread the metrics into columns and strip the
+# "test_" prefix from the column names.
+scores_wide <- scores %>% 
+  select(!max) %>% 
+  pivot_wider(names_from = metric, 
+              names_sep = "_",
+              values_from = mean) %>% 
+  rename_all(~str_replace(.,"^test_",""))
+# Render the table with the fourth column shown white on black.
+kable(scores_wide, digits = 2) %>% 
+  column_spec(4, color = 'white', background = 'black') %>% 
+  kable_styling(full_width = TRUE)
+```
+
--- a/presentation/classification.py
+++ b/presentation/classification.py
@ -0,0 +1,127 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.dummy import DummyClassifier
+from sklearn.impute import SimpleImputer
+
+import xgboost as xg
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+from pathlib import Path
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.labels
+import machine_learning.model
+from machine_learning.helper import run_all_classification_models
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## Set script's parameters
+#
+
+# %%
+# Cross-validation method (could be regarded as a hyperparameter).
+# Options listed by the author: logo, halflogo, 5kfold.
+cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+# NOTE(review): n_sl is not used anywhere in this script.
+n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %% jupyter={"source_hidden": true}
+# NOTE(review): absolute, machine-specific path.
+filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
+model_input = pd.read_csv(filename)
+
+# %% jupyter={"source_hidden": true}
+# Move the segment descriptors into the index so that they are not used as features.
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input.set_index(index_columns, inplace=True)
+model_input['target'].value_counts()
+
+# %% jupyter={"source_hidden": true}
+bins = [-10, -1, 1, 10] # bins for z-scored targets
+# bins = [0, 1, 4] # bins for stressfulness (1-4) target
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'medium', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
+model_input['target'].value_counts(), edges
+model_input = model_input[model_input['target'] != "medium"]
+model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
+
+model_input['target'].value_counts()
+
+if cv_method_str == 'halflogo':
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
+else:
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+
+
+# %% jupyter={"source_hidden": true}
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+categorical_features = data_x[categorical_feature_colnames].copy()
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+train_x = pd.concat([numerical_features, categorical_features], axis=1)
+
+# %% jupyter={"source_hidden": true}
+cv_method = None # Defaults to 5 k-folds in cross_validate method
+if cv_method_str == 'logo' or cv_method_str == 'half_logo':
+    cv_method = LeaveOneGroupOut()
+    cv_method.get_n_splits(
+        train_x,
+        data_y,
+        groups=data_groups,
+    )
+
+# %% jupyter={"source_hidden": true}
+# Missing values are replaced with the median of the column.
+imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+
+# %%
+# NOTE(review): the imputer is fitted on all rows before cross-validation, so the
+# medians also include rows that later end up in the test folds.
+final_scores = run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
+
+# %%
+# Index the scores by (method, metric) and save them in the working directory.
+final_scores.index.name = "metric"
+final_scores = final_scores.set_index(["method", final_scores.index])
+final_scores.to_csv("event_stressfulness_lmh_lh_scores.csv")
--- a/presentation/event_stressfulness.py
+++ b/presentation/event_stressfulness.py
@ -0,0 +1,60 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: Python 3.10.8 ('straw2analysis')
+#     language: python
+#     name: python3
+# ---
+
+# %%
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+import yaml
+from pyprojroot import here
+from sklearn import linear_model, svm, kernel_ridge, gaussian_process
+from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut, cross_val_score, cross_validate
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.impute import SimpleImputer
+from sklearn.dummy import DummyRegressor
+import xgboost as xg
+
+from pathlib import Path
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.features_sensor
+import machine_learning.labels
+import machine_learning.model
+import machine_learning.helper
+
+
+
+# %% tags=["active-ipynb"]
+# filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
+# filename = Path('C:/Users/Primoz/VSCodeProjects/straw2analysis/data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv')
+
+# %%
+# NOTE(review): `filename` is only assigned in the "active-ipynb" cell above, which
+# is commented out in this .py representation; running this file as a plain script
+# raises NameError here.
+final_scores = machine_learning.helper.run_all_regression_models(filename)
+
+# %%
+# Index the scores by (method, metric).
+final_scores.index.name = "metric"
+final_scores = final_scores.set_index(["method", final_scores.index])
+
+# %%
+# Save the scores in the working directory.
+final_scores.to_csv("event_stressfulness_scores.csv")
--- a/presentation/play_store_application_genre_catalogue.csv
+++ b/presentation/play_store_application_genre_catalogue.csv
--- a/presentation/play_store_categories_count.csv
+++ b/presentation/play_store_categories_count.csv
@ -0,0 +1,45 @@
+genre,n
+System,261
+Tools,96
+Productivity,71
+Health & Fitness,60
+Finance,54
+Communication,39
+Music & Audio,39
+Shopping,38
+Lifestyle,33
+Education,28
+News & Magazines,24
+Maps & Navigation,23
+Entertainment,21
+Business,18
+Travel & Local,18
+Books & Reference,16
+Social,16
+Weather,16
+Food & Drink,14
+Sports,14
+Other,13
+Photography,13
+Puzzle,13
+Video Players & Editors,12
+Card,9
+Casual,9
+Personalization,8
+Medical,7
+Board,5
+Strategy,4
+House & Home,3
+Trivia,3
+Word,3
+Adventure,2
+Art & Design,2
+Auto & Vehicles,2
+Dating,2
+Role Playing,2
+STRAW,2
+Simulation,2
+"Board,Brain Games",1
+"Entertainment,Music & Video",1
+Parenting,1
+Racing,1
--- a/presentation/plots/d18NArfr_PCA.pdf
+++ b/presentation/plots/d18NArfr_PCA.pdf
--- a/presentation/plots/d18NArfr_hist.pdf
+++ b/presentation/plots/d18NArfr_hist.pdf
--- a/presentation/plots/d18NArfr_relplot.pdf
+++ b/presentation/plots/d18NArfr_relplot.pdf
--- a/presentation/plots/d18demandBayRidge_PCA.pdf
+++ b/presentation/plots/d18demandBayRidge_PCA.pdf
--- a/presentation/plots/d18demandBayRidge_relplot.pdf
+++ b/presentation/plots/d18demandBayRidge_relplot.pdf
--- a/presentation/plots/d18demandBayridge_hist.pdf
+++ b/presentation/plots/d18demandBayridge_hist.pdf
--- a/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
+++ b/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
--- a/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
+++ b/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
--- a/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
+++ b/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
--- a/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
+++ b/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
--- a/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
+++ b/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
--- a/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
+++ b/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
--- a/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
+++ b/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
--- a/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
+++ b/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
--- a/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
+++ b/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
--- a/presentation/presentation.Rproj
+++ b/presentation/presentation.Rproj
@ -0,0 +1,17 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+
+SpellingDictionary: en_GB
--- a/presentation/prox_comm_PANAS_nb.py
+++ b/presentation/prox_comm_PANAS_nb.py
@ -0,0 +1,131 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+# %matplotlib inline
+import yaml
+from sklearn import linear_model
+from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
+import os
+import importlib
+import matplotlib.pyplot as plt
+import sys
+import numpy as np
+import seaborn as sns
+import pandas as pd
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+# %%
+from machine_learning import pipeline, features_sensor, labels, model
+
+# %%
+importlib.reload(labels)
+
+# %%
+# Build the sensor features from the parameters in the YAML configuration.
+# NOTE(review): the path is relative to the working directory - confirm where the
+# notebook is expected to be run from.
+with open("./config/prox_comm_PANAS_features.yaml", "r") as file:
+    sensor_features_params = yaml.safe_load(file)
+sensor_features = features_sensor.SensorFeatures(**sensor_features_params)
+#sensor_features.set_sensor_data()
+sensor_features.calculate_features(cached=True)
+
+# %%
+all_features = sensor_features.get_features("all","all")
+
+# %%
+# Build the labels in the same way, from their own YAML configuration.
+with open("./config/prox_comm_PANAS_labels.yaml", "r") as file:
+    labels_params = yaml.safe_load(file)
+labels_current = labels.Labels(**labels_params)
+#labels_current.set_labels()
+labels_current.aggregate_labels(cached=True)
+
+# %%
+# Validate a linear regression, grouping by participant_id with cv_name "loso".
+model_validation = model.ModelValidation(
+    sensor_features.get_features("all", "all"),
+    labels_current.get_aggregated_labels(),
+    group_variable="participant_id",
+    cv_name="loso",
+)
+model_validation.model = linear_model.LinearRegression()
+model_validation.set_cv_method()
+
+# %%
+model_loso_r2 = model_validation.cross_validate()
+
+# %%
+# Scores of the individual splits and their mean.
+print(model_loso_r2)
+print(np.mean(model_loso_r2))
+
+# %%
+# Splits with a positive score.
+model_loso_r2[model_loso_r2 > 0]
+
+# %%
+logo = LeaveOneGroupOut()
+
+# %%
+try_X = model_validation.X.reset_index().drop(["participant_id","date_lj"], axis=1)
+try_y = model_validation.y.reset_index().drop(["participant_id","date_lj"], axis=1)
+
+# %%
+model_loso_mean_absolute_error = -1 * cross_val_score(
+estimator=model_validation.model,
+X=try_X,
+y=try_y,
+groups=model_validation.groups,
+cv=logo.split(X=try_X, y=try_y, groups=model_validation.groups), 
+scoring='neg_mean_absolute_error'
+)
+
+# %%
+model_loso_mean_absolute_error
+
+# %%
+np.median(model_loso_mean_absolute_error)
+
+# %%
+# Fit the model on all rows ...
+model_validation.model.fit(try_X, try_y)
+
+# %%
+# ... and predict the same rows, i.e. these are in-sample predictions.
+Y_predicted = model_validation.model.predict(try_X)
+
+# %%
+# Put true and predicted values side by side and reshape to long format: one row
+# per (index, value) with value being "true" or "predicted".
+# NOTE(review): "NA" is presumably the PANAS negative affect label - confirm.
+try_y.rename(columns={"NA": "NA_true"}, inplace=True)
+try_y["NA_predicted"] = Y_predicted
+NA_long = pd.wide_to_long(
+    try_y.reset_index(),
+    i="index",
+    j="value",
+    stubnames="NA",
+    sep="_",
+    suffix=".+",
+)
+
+# %%
+# Overlaid histograms of the true and predicted values, saved as a PDF.
+g1 = sns.displot(NA_long, x="NA", hue="value", binwidth=0.1, height=5, aspect=1.5)
+sns.move_legend(g1, "upper left", bbox_to_anchor=(.55, .45))
+g1.set_axis_labels("Daily mean", "Day count")
+
+# display() is not imported in this file; it is available in IPython/Jupyter sessions.
+display(g1)
+g1.savefig("prox_comm_PANAS_predictions.pdf")
+
+# %%
+from sklearn.metrics import mean_absolute_error
+
+# %%
+# In-sample mean absolute error, to compare with the cross-validated errors below.
+mean_absolute_error(try_y["NA_true"], try_y["NA_predicted"])
+
+# %%
+model_loso_mean_absolute_error
--- a/presentation/results_presentation.py
+++ b/presentation/results_presentation.py
@ -0,0 +1,163 @@
+# %%
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+import yaml
+from pyprojroot import here
+from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
+from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate, cross_val_predict
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.impute import SimpleImputer
+from sklearn.dummy import DummyRegressor
+from sklearn.decomposition import PCA
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.helper
+
+# %%
+segment = "intradaily_30_min"
+target = "JCQ_job_demand"
+csv_name = "./data/" + segment + "_all_targets/input_" + target + "_mean.csv"
+#csv_name = "./data/daily_18_hours_all_targets/input_JCQ_job_demand_mean.csv"
+
+# %%
+data_x, data_y, data_groups = machine_learning.helper.prepare_model_input(csv_name)
+
+# %%
+data_y.head()
+
+# %%
+scores = machine_learning.helper.run_all_models(csv_name)
+
+
+# %% jupyter={"source_hidden": true}
+logo = LeaveOneGroupOut()
+logo.get_n_splits(
+    data_x,
+    data_y,
+    groups=data_groups,
+)
+
+# %% [markdown]
+# ### Baseline: Dummy Regression (mean)
+dummy_regr = DummyRegressor(strategy="mean")
+
+# %% jupyter={"source_hidden": true}
+lin_reg_scores = cross_validate(
+    dummy_regr,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.median(lin_reg_scores['test_r2']))
+
+##################
+# %%
+chosen_model = "Random Forest"
+rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
+rfr_score = cross_validate(
+    rfr,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(rfr_score['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(rfr_score['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(rfr_score['test_neg_root_mean_squared_error']))
+print("R2", np.median(rfr_score['test_r2']))
+
+# %%
+y_predicted = cross_val_predict(rfr, data_x, data_y, groups=data_groups, cv=logo)
+#########################
+# %%
+chosen_model = "Bayesian Ridge"
+bayesian_ridge_reg = linear_model.BayesianRidge()
+bayesian_ridge_reg_score = cross_validate(
+    bayesian_ridge_reg,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
+print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
+
+# %%
+y_predicted = cross_val_predict(bayesian_ridge_reg, data_x, data_y, groups=data_groups, cv=logo)
+
+# %%
+data_y = pd.DataFrame(pd.concat([data_y, data_groups], axis=1))
+data_y.rename(columns={"target": "y_true"}, inplace=True)
+data_y["y_predicted"] = y_predicted
+
+# %%
+data_y.head()
+
+# %%
+g1 = sns.relplot(data=data_y, x="y_true", y="y_predicted")
+#g1.set_axis_labels("true", "predicted")
+#g1.map(plt.axhline, y=0, color=".7", dashes=(2, 1), zorder=0)
+#g1.map(plt.axline, xy1=(0,0), slope=1)
+g1.set(title=",".join([segment, target, chosen_model]))
+display(g1)
+g1.savefig("_".join([segment, target, chosen_model, "_relplot.pdf"]))
+
+# %%
+data_y_long = pd.wide_to_long(
+    data_y.reset_index(),
+    i=["local_segment", "pid"],
+    j="value",
+    stubnames="y",
+    sep="_",
+    suffix=".+",
+)
+
+# %%
+data_y_long.head()
+# %%
+g2 = sns.displot(data_y_long, x="y", hue="value", binwidth=0.1, height=5, aspect=1.5)
+sns.move_legend(g2, "upper left", bbox_to_anchor=(.55, .45))
+g2.set(title=",".join([segment, target, chosen_model]))
+g2.savefig("_".join([segment, target, chosen_model, "hist.pdf"]))
+
+# %%
+pca = PCA(n_components=2)
+pca.fit(data_x)
+print(pca.explained_variance_ratio_)
+
+# %%
+data_x_pca = pca.fit_transform(data_x)
+data_pca = pd.DataFrame(pd.concat([data_y.reset_index()["y_true"], pd.DataFrame(data_x_pca, columns = {"pca_0", "pca_1"})], axis=1))
+
+# %%
+data_pca
+# %%
+
+g3 = sns.relplot(data = data_pca, x = "pca_0", y = "pca_1", hue = "y_true", palette = sns.color_palette("Spectral", as_cmap=True))
+g3.set(title=",".join([segment, target, chosen_model]) + "\n variance explained = " + str(round(sum(pca.explained_variance_ratio_), 2)))
+g3.savefig("_".join([segment, target, chosen_model, "_PCA.pdf"]))
+
+# %%
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,7 @@
+[tool.isort]
+profile = "black"
+py_version = 311
+skip_gitignore = true  # native TOML boolean instead of the string "true"
+
+[tool.black]
+target-version = ["py311"]
--- a/1
+++ b/1
@ -0,0 +1 @@
+Subproject commit 059774bda10545a83ab282f59eb7a329fef9ee4c
--- a/setup.py
+++ b/setup.py
@ -1,8 +1,7 @@
 import os

-import sqlalchemy.engine.url
 from dotenv import load_dotenv
-from sqlalchemy import create_engine
+from sqlalchemy import URL, create_engine
 from sqlalchemy.orm import sessionmaker

 load_dotenv()
@ -11,7 +10,7 @@ testing: bool = False

 db_password = os.getenv("DB_PASSWORD")

-db_uri = sqlalchemy.engine.url.URL(
+db_uri = URL.create(
    drivername="postgresql+psycopg2",
    username="staw_db",
    password=db_password,
--- a/statistical_analysis/adherence.py
+++ b/statistical_analysis/adherence.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.11.4
+#       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -14,25 +14,7 @@
 # ---

 # %%
-# %matplotlib inline
-import datetime
-import os
-import sys
-
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-import statsmodels.api as sm
-import statsmodels.formula.api as smf
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
-import participants.query_db
-from features.esm import *
-
-# %%
-SAVE_FIGS = True
+SAVE_FIGS = False
 FIG_HEIGHT = 5
 FIG_ASPECT = 1.7
 FIG_COLOUR = "#28827C"
@ -96,13 +78,41 @@ df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocesse
 # Sessions are now classified according to the type of a session (a true questionnaire or simple single questions) and users response.

 # %%
-df_session_counts_time
+df_session_counts_time["session_response_cat"] = df_session_counts_time[
+    "session_response"
+].astype("category")  # categorical copy; "session_response" itself is left as is
+df_session_counts_time["session_response_cat"] = df_session_counts_time[
+    "session_response_cat"
+].cat.remove_categories(  # values of a removed category become NaN
+    ["during_work_first", "ema_unanswered", "evening_first", "morning", "morning_first"]
+)
+df_session_counts_time["session_response_cat"] = df_session_counts_time[
+    "session_response_cat"
+].cat.add_categories("interrupted")  # the label must exist before it can be assigned
+df_session_counts_time.loc[
+    df_session_counts_time["session_response_cat"].isna(), "session_response_cat"
+] = "interrupted"  # every NaN row: the removed categories plus any already-missing value
+# df_session_counts_time["session_response_cat"] = df_session_counts_time["session_response_cat"].cat.rename_categories({
+#    "ema_unanswered": "interrupted",
+#    "morning_first": "interrupted",
+#    "evening_first": "interrupted",
+#    "morning": "interrupted",
+#    "during_work_first": "interrupted"})
+
+# %%
+df_session_counts_time.session_response_cat

 # %%
 tbl_session_outcomes = df_session_counts_time.reset_index()[
-    "session_response"
+    "session_response_cat"
 ].value_counts()

+# %%
+tbl_session_outcomes_relative = tbl_session_outcomes / len(df_session_counts_time)
+
+# %%
+print(tbl_session_outcomes_relative.to_latex(escape=True))
+
 # %%
 print("All sessions:", len(df_session_counts_time))
 print("-------------------------------------")
--- a/statistical_analysis/scale_reliability.rmd
+++ b/statistical_analysis/scale_reliability.rmd
@ -0,0 +1,60 @@
+---
+title: "Reliability of SAM threat and challenge and COPE"
+output: html_notebook
+---
+
+
+```{r libraries, message=FALSE, warning=FALSE, include=FALSE, cache=FALSE}
+library(conflicted)
+library(here)
+library(tidyverse)
+library(magrittr)
+library(lavaan)
+library(kableExtra)
+
+conflicts_prefer(
+    readr::col_factor,
+    purrr::discard,
+    dplyr::filter,
+    dplyr::lag,
+    purrr::set_names,
+    tidyr::extract,
+    kableExtra::group_rows
+)
+```
+
+```{r style, include=FALSE, cache=FALSE}
+styler::style_file(
+    here("statistical_analysis", "scale_reliability.rmd"), # same case as the file name on disk
+    scope = "tokens",
+    indent_by = 4L
+)
+```
+
+The data were preprocessed and cleaned using the [expl_esm_labels.py](../exploration/expl_esm_labels.py) script and are read from CSV files here.
+
+```{r read_data}
+COL_TYPES <- cols(
+    .default = col_double(),
+    participant_id = col_factor(),
+    username = col_factor(),
+    device_id = col_factor(),
+    esm_trigger = col_factor(),
+    esm_instructions = col_factor(),
+    double_esm_user_answer_timestamp = col_double(),
+    datetime_lj = col_datetime(format = ""),
+    date_lj = col_date(format = ""),
+    time = col_factor(),
+    esm_user_answer = col_factor()
+)
+df_SAM <- read_csv(here("data", "raw", "df_esm_SAM_threat_challenge.csv"), col_types = COL_TYPES)
+df_COPE <- read_csv(here("data", "raw", "df_esm_COPE.csv"), col_types = COL_TYPES)
+```
+
+Demonstrate factor analysis for a single participant.
+
+```{r}
+df_COPE %>%
+	group_by(question_id, questionnaire_id) %>%
+	count()
+```
--- a/straw2analysis.Rproj
+++ b/straw2analysis.Rproj
@ -0,0 +1,20 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: No
+NumSpacesForTab: 4
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: XeLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+PythonType: conda
+PythonVersion: 3.11.3
+PythonPath: E:/ProgramData/mambaforge/envs/straw2analysis/python.exe
--- a/test/test_communication.py
+++ b/test/test_communication.py
@ -88,6 +88,5 @@ class CallsFeatures(unittest.TestCase):
        self.features_call_sms = calls_sms_features(self.calls, self.sms)
        self.assertIsInstance(self.features_call_sms, pd.DataFrame)
        self.assertCountEqual(
-            self.features_call_sms.columns.to_list(),
-            FEATURES_CALLS + FEATURES_SMS + FEATURES_CONTACT,
+            self.features_call_sms.columns.to_list(), FEATURES_CALLS_SMS_ALL
        )
--- a/test/test_esm.py
+++ b/test/test_esm.py
@ -1,6 +1,7 @@
 import unittest

 from pandas.testing import assert_series_equal
+from pyprojroot import here

 from features.esm import *
 from features.esm_JCQ import *
@ -9,7 +10,7 @@ from features.esm_JCQ import *
 class EsmFeatures(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
-        cls.esm = pd.read_csv("../data/example_esm.csv", sep=";")
+        cls.esm = pd.read_csv(here("data/example_esm.csv"), sep=";")
        cls.esm["esm_json"] = cls.esm["esm_json"].apply(eval)
        cls.esm_processed = preprocess_esm(cls.esm)
        cls.esm_clean = clean_up_esm(cls.esm_processed)
--- a/test/test_features_sensor.py
+++ b/test/test_features_sensor.py
@ -0,0 +1,27 @@
+import unittest
+
+import yaml
+from pyprojroot import here
+
+from machine_learning.features_sensor import *
+
+
+class SensorFeaturesTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        with open(here("machine_learning/config/minimal_features.yaml"), "r") as file:
+            cls.sensor_features_params = yaml.safe_load(file)
+
+    def test_yaml(self):
+        with open(here("machine_learning/config/minimal_features.yaml"), "r") as file:
+            sensor_features_params = yaml.safe_load(file)
+        self.assertIsInstance(sensor_features_params, dict)
+        self.assertIsInstance(sensor_features_params.get("grouping_variable"), str)
+        self.assertIsInstance(sensor_features_params.get("features"), dict)
+        self.assertIsInstance(
+            sensor_features_params.get("participants_usernames"), list
+        )
+
+    def test_participants_label(self):
+        sensor_features = SensorFeatures(**self.sensor_features_params)
+        self.assertRaises(ValueError, sensor_features.calculate_features)
--- a/test/test_proximity.py
+++ b/test/test_proximity.py
@ -1,5 +1,7 @@
 import unittest

+from pyprojroot import here
+
 from features.proximity import *


@ -10,7 +12,7 @@ class ProximityFeatures(unittest.TestCase):

    @classmethod
    def setUpClass(cls) -> None:
-        cls.df_proximity = pd.read_csv("../data/example_proximity.csv")
+        cls.df_proximity = pd.read_csv(here("data/example_proximity.csv"))
        cls.df_proximity["participant_id"] = 99

    def test_recode_proximity(self):
				`@ -0,0 +1 @@`
				`Subproject commit 059774bda10545a83ab282f59eb7a329fef9ee4c`