43 changed files with 235 additions and 9421 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,7 +5,3 @@ __pycache__/
 /exploration/*.ipynb
 /config/*.ipynb
 /statistical_analysis/*.ipynb
-/machine_learning/intermediate_results/
-/data/features/
-/data/baseline/
-/data/*input*.csv
--- a/.gitmodules
+++ b/.gitmodules
@ -1,4 +0,0 @@
-[submodule "rapids"]
-	path = rapids
-	url = https://repo.ijs.si/junoslukan/rapids.git
-	branch = master
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -4,17 +4,4 @@
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
-  <component name="RMarkdownSettings">
-    <option name="renderProfiles">
-      <map>
-        <entry key="file://$PROJECT_DIR$/rapids/src/visualization/merge_heatmap_sensors_per_minute_per_time_segment.Rmd">
-          <value>
-            <RMarkdownRenderProfile>
-              <option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/rapids/src/visualization" />
-            </RMarkdownRenderProfile>
-          </value>
-        </entry>
-      </map>
-    </option>
-  </component>
 </project>
--- a/.idea/snakemake-settings.xml
+++ b/.idea/snakemake-settings.xml
@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="SmkProjectSettings" sdk="Python 3.10 (snakemake)" enabled="true" />
-</project>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -2,6 +2,5 @@
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
-    <mapping directory="$PROJECT_DIR$/rapids" vcs="Git" />
  </component>
 </project>
--- a/README.md
+++ b/README.md
@ -27,135 +27,9 @@ To install:
   ipython kernel install --user --name=straw2analysis
   ```

-2. Provide a file called `.env` to be used by `python-dotenv` which should be placed in the top folder of the application 
+2. Provide an `.env` file to be used by `python-dotenv`, which should be placed in the top folder of the application
   and should have the form:
   
   ```
   DB_PASSWORD=database-password
   ```
-   
-# RAPIDS
-
-To install RAPIDS, follow the [instructions on their webpage](https://www.rapids.science/1.6/setup/installation/). 
-
-Here, I include additional information related to the installation and specific to the STRAW2analysis project.
-The installation was tested on Windows using Ubuntu 20.04 on Windows Subsystem for Linux ([WSL2](https://docs.microsoft.com/en-us/windows/wsl/install)).
-
-## Custom configuration
-### Credentials
-
-As mentioned under [Database in RAPIDS documentation](https://www.rapids.science/1.6/snippets/database/), a `credentials.yaml` file is needed to connect to a database.
-It should contain:
-
-```yaml
-PSQL_STRAW:
-  database: staw
-  host: 212.235.208.113
-  password: password
-  port: 5432
-  user: staw_db
-```
-
-where `password` needs to be specified as well.
-
-## Possible installation issues
-### Missing dependencies for RPostgres
-
-When installing the `RPostgres` R package (used to connect to the PostgreSQL database), the following error might occur:
-
-```text
------------------------- ANTICONF ERROR ---------------------------
-Configuration failed because libpq was not found. Try installing:
-   * deb: libpq-dev (Debian, Ubuntu, etc)
-   * rpm: postgresql-devel (Fedora, EPEL)
-   * rpm: postgreql8-devel, psstgresql92-devel, postgresql93-devel, or postgresql94-devel (Amazon Linux)
-   * csw: postgresql_dev (Solaris)
-   * brew: libpq (OSX)
-If libpq is already installed, check that either:
-  (i)  'pkg-config' is in your PATH AND PKG_CONFIG_PATH contains a libpq.pc file; or
-  (ii) 'pg_config' is in your PATH.
-If neither can detect , you can set INCLUDE_DIR
-and LIB_DIR manually via:
-  R CMD INSTALL --configure-vars='INCLUDE_DIR=... LIB_DIR=...'
--------------------------[ ERROR MESSAGE ]----------------------------
-  <stdin>:1:10: fatal error: libpq-fe.h: No such file or directory
-compilation terminated.
-```
-
-The library requires `libpq` for compiling from source, so install accordingly.
-
-### Timezone environment variable for tidyverse (relevant for WSL2)
-
-One of the R packages, `tidyverse`, might need access to the `TZ` environment variable during the installation.
-On Ubuntu 20.04 on WSL2 this triggers the following error:
-
-```text
-> install.packages('tidyverse')
-
-ERROR: configuration failed for package ‘xml2’
-System has not been booted with systemd as init system (PID 1). Can't operate.
-Failed to create bus connection: Host is down
-Warning in system("timedatectl", intern = TRUE) :
-  running command 'timedatectl' had status 1
-Error in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]) :
-  namespace ‘xml2’ 1.3.1 is already loaded, but >= 1.3.2 is required
-Calls: <Anonymous> ... namespaceImportFrom -> asNamespace -> loadNamespace
-Execution halted
-ERROR: lazy loading failed for package ‘tidyverse’
-```
-
-This happens because WSL2 does not use the `timedatectl` service, which provides this variable.
-
-```bash
-~$ timedatectl
-System has not been booted with systemd as init system (PID 1). Can't operate.
-Failed to create bus connection: Host is down
-```
-
-and later 
-
-```bash 
-Warning message:
-In system("timedatectl", intern = TRUE) :
-  running command 'timedatectl' had status 1
-Execution halted
-```
-
-This can be amended by setting the environment variable manually before attempting to install `tidyverse`:
-
-```bash
-export TZ='Europe/Ljubljana'
-```
-
-## Possible runtime issues
-### Unix end of line characters
-
-Upon running rapids, an error might occur:
-
-```bash
-/usr/bin/env: ‘python3\r’: No such file or directory
-```
-
-This is due to Windows style end of line characters. 
-To amend this, I added a `.gitattributes` file to force `git` to check out `rapids` using Unix EOL characters.
-If this still fails, `dos2unix` can be used to change them.
-
-### System has not been booted with systemd as init system (PID 1)
-
-See [the installation issue above](#timezone-environment-variable-for-tidyverse-relevant-for-wsl2).
-
-## Update RAPIDS
-
-To update RAPIDS, first pull and merge [origin]( https://github.com/carissalow/rapids), such as with:
-
-```commandline
-git fetch --progress "origin" refs/heads/master
-git merge --no-ff origin/master
-```
-
-Next, update the conda and R virtual environment.
-
-```bash
-R -e 'renv::restore(repos = c(CRAN = "https://packagemanager.rstudio.com/all/__linux__/focal/latest"))'
-```
-
--- a/config/environment.yml
+++ b/config/environment.yml
@ -12,11 +12,9 @@ dependencies:
  - mypy
  - nodejs
  - pandas
-  - psycopg2 >= 2.9.1
+  - psycopg2
  - python-dotenv
  - pytz
-  - pyprojroot
-  - pyyaml
  - seaborn
  - scikit-learn
  - sqlalchemy
--- a/config/models.py
+++ b/config/models.py
@ -166,43 +166,12 @@ class Application(Base, AWAREsensor):


 class Barometer(Base, AWAREsensor):
-    """
-    Contains the barometer sensor data.
-
-    Attributes
-    ----------
-    double_values_0: float
-        The ambient air pressure in mbar (hPa)
-    accuracy: int
-        Sensor’s accuracy level, either 1, 2, or 3 (see [SensorManager](https://developer.android.com/reference/android/hardware/SensorManager.html#SENSOR_STATUS_ACCURACY_HIGH))
-    """
-
    double_values_0 = Column(Float, nullable=False)
    accuracy = Column(SmallInteger, nullable=True)
    label = Column(String, nullable=True)


 class BarometerSensor(Base, AWAREsensor):
-    """
-    Contains the barometer sensor capabilities.
-
-    Attributes
-    ----------
-    double_sensor_maximum_range: float
-        Maximum sensor value possible
-    double_sensor_minimum_delay: float
-        Minimum sampling delay in microseconds
-    sensor_name: str
-    double_sensor_power_ma: float
-        Sensor’s power drain in mA
-    double_sensor_resolution: float
-        Sensor’s resolution in sensor’s units
-    sensor_type: str
-    sensor_vendor: str
-        Sensor’s manufacturer
-    sensor_version: str
-    """
-
    __tablename__ = "barometer_sensor"
    # Since this table is not really important,
    # I will leave all columns as nullable. (nullable=True by default.)
@ -288,19 +257,6 @@ class Imperfection(Base):


 class LightSensor(Base, AWAREsensor):
-    """
-    Contains the light sensor data.
-    Note: Even though this table is named light_sensor, it actually contains what AWARE calls light data
-        (rather than the data about the sensor's capabilities). Cf. Barometer(Sensor) and Temperature(Sensor).
-
-    Attributes
-    ----------
-    double_light_lux: float
-        The ambient luminance in lux units
-    accuracy: int
-        Sensor’s accuracy level, either 1, 2, or 3 (see [SensorManager](https://developer.android.com/reference/android/hardware/SensorManager.html#SENSOR_STATUS_ACCURACY_HIGH))
-    """
-
    __tablename__ = "light_sensor"
    double_light_lux = Column(Float, nullable=False)
    accuracy = Column(Integer, nullable=True)
@ -420,43 +376,12 @@ class SMS(Base, AWAREsensor):


 class Temperature(Base, AWAREsensor):
-    """
-    Contains the temperature sensor data.
-
-    Attributes
-    ----------
-    temperature_celsius: float
-        Measured temperature in °C
-    accuracy: int
-        Sensor’s accuracy level, either 1, 2, or 3 (see [SensorManager](https://developer.android.com/reference/android/hardware/SensorManager.html#SENSOR_STATUS_ACCURACY_HIGH))
-    """
-
    temperature_celsius = Column(Float, nullable=False)
    accuracy = Column(SmallInteger, nullable=True)
    label = Column(String, nullable=True)


 class TemperatureSensor(Base, AWAREsensor):
-    """
-    Contains the temperature sensor capabilities.
-
-    Attributes
-    ----------
-    double_sensor_maximum_range: float
-        Maximum sensor value possible
-    double_sensor_minimum_delay: float
-        Minimum sampling delay in microseconds
-    sensor_name: str
-    double_sensor_power_ma: float
-        Sensor’s power drain in mA
-    double_sensor_resolution: float
-        Sensor’s resolution in sensor’s units
-    sensor_type: str
-    sensor_vendor: str
-        Sensor’s manufacturer
-    sensor_version: str
-    """
-
    # I left all of these nullable,
    # as we haven't seen any data from this sensor anyway.
    __tablename__ = "temperature_sensor"
--- a/data/app_categories.csv
+++ b/data/app_categories.csv
--- a/data/input_PANAS_NA.csv
+++ b/data/input_PANAS_NA.csv
--- a/data/z_input_PANAS_NA.csv
+++ b/data/z_input_PANAS_NA.csv
--- a/exploration/debug_heatmap.py
+++ b/exploration/debug_heatmap.py
@ -1,323 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.13.0
-#   kernelspec:
-#     display_name: straw2analysis
-#     language: python
-#     name: straw2analysis
-# ---
-
-# %%
-import os, sys
-import importlib
-import pandas as pd
-import numpy as np
-
-# import plotly.graph_objects as go
-from importlib import util
-from pathlib import Path
-import yaml
-
-# %%
-phone_data_yield = pd.read_csv(
-    "../rapids/data/interim/p011/phone_yielded_timestamps_with_datetime.csv",
-    parse_dates=["local_date_time"],
-)
-time_segments_labels = pd.read_csv(
-    "../rapids/data/interim/time_segments/p011_time_segments_labels.csv"
-)
-
-# %%
-phone_data_yield["assigned_segments"] = phone_data_yield[
-    "assigned_segments"
-].str.replace(r"_RR\d+SS#", "#")
-time_segments_labels["label"] = time_segments_labels["label"].str.replace(
-    r"_RR\d+SS$", ""
-)
-
-
-# %% tags=[]
-def filter_data_by_segment(data, time_segment):
-    data.dropna(subset=["assigned_segments"], inplace=True)
-    if data.shape[0] == 0:  # data is empty
-        data["local_segment"] = data["timestamps_segment"] = None
-        return data
-
-    datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
-    timestamps_regex = "[0-9]{13}"
-    segment_regex = "\[({}#{},{};{},{})\]".format(
-        time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
-    )
-    data["local_segment"] = data["assigned_segments"].str.extract(
-        segment_regex, expand=True
-    )
-    data = data.drop(columns=["assigned_segments"])
-    data = data.dropna(subset=["local_segment"])
-    if (
-        data.shape[0] == 0
-    ):  # there are no rows belonging to time_segment after droping na
-        data["timestamps_segment"] = None
-    else:
-        data[["local_segment", "timestamps_segment"]] = data["local_segment"].str.split(
-            pat=";", n=1, expand=True
-        )
-
-    # chunk episodes
-    if (
-        (not data.empty)
-        and ("start_timestamp" in data.columns)
-        and ("end_timestamp" in data.columns)
-    ):
-        data = chunk_episodes(data)
-
-    return data
-
-
-# %% tags=[]
-time_segment = "daily"
-phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
-
-# %%
-phone_data_yield.tail()
-
-# %%
-phone_data_yield_per_segment.tail()
-
-
-# %%
-def getDataForPlot(phone_data_yield_per_segment):
-    # calculate the length (in minute) of per segment instance
-    phone_data_yield_per_segment["length"] = (
-        phone_data_yield_per_segment["timestamps_segment"]
-        .str.split(",")
-        .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
-    )
-    # calculate the number of sensors logged at least one row of data per minute.
-    phone_data_yield_per_segment = (
-        phone_data_yield_per_segment.groupby(
-            ["local_segment", "length", "local_date", "local_hour", "local_minute"]
-        )[["sensor", "local_date_time"]]
-        .max()
-        .reset_index()
-    )
-    # extract local start datetime of the segment from "local_segment" column
-    phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
-        phone_data_yield_per_segment["local_segment"].apply(
-            lambda x: x.split("#")[1].split(",")[0]
-        )
-    )
-    # calculate the number of minutes after local start datetime of the segment
-    phone_data_yield_per_segment["minutes_after_segment_start"] = (
-        (
-            phone_data_yield_per_segment["local_date_time"]
-            - phone_data_yield_per_segment["local_segment_start_datetimes"]
-        )
-        / pd.Timedelta(minutes=1)
-    ).astype("int")
-
-    # impute missing rows with 0
-    columns_for_full_index = phone_data_yield_per_segment[
-        ["local_segment_start_datetimes", "length"]
-    ].drop_duplicates(keep="first")
-    columns_for_full_index = columns_for_full_index.apply(
-        lambda row: [
-            [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
-        ],
-        axis=1,
-    )
-    full_index = []
-    for columns in columns_for_full_index:
-        full_index = full_index + columns
-    full_index = pd.MultiIndex.from_tuples(
-        full_index,
-        names=("local_segment_start_datetimes", "minutes_after_segment_start"),
-    )
-    phone_data_yield_per_segment = (
-        phone_data_yield_per_segment.set_index(
-            ["local_segment_start_datetimes", "minutes_after_segment_start"]
-        )
-        .reindex(full_index)
-        .reset_index()
-        .fillna(0)
-    )
-
-    # transpose the dataframe per local start datetime of the segment and discard the useless index layer
-    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
-        "local_segment_start_datetimes"
-    )[["minutes_after_segment_start", "sensor"]].apply(
-        lambda x: x.set_index("minutes_after_segment_start").transpose()
-    )
-    phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
-        "local_segment_start_datetimes"
-    )
-    return phone_data_yield_per_segment
-
-
-# %%
-data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
-
-# %%
-# calculate the length (in minute) of per segment instance
-phone_data_yield_per_segment["length"] = (
-    phone_data_yield_per_segment["timestamps_segment"]
-    .str.split(",")
-    .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
-)
-
-# %%
-phone_data_yield_per_segment.tail()
-
-# %%
-# calculate the number of sensors logged at least one row of data per minute.
-phone_data_yield_per_segment = (
-    phone_data_yield_per_segment.groupby(
-        ["local_segment", "length", "local_date", "local_hour", "local_minute"]
-    )[["sensor", "local_date_time"]]
-    .max()
-    .reset_index()
-)
-
-# %%
-# extract local start datetime of the segment from "local_segment" column
-phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
-    phone_data_yield_per_segment["local_segment"].apply(
-        lambda x: x.split("#")[1].split(",")[0]
-    )
-)
-
-# %%
-# calculate the number of minutes after local start datetime of the segment
-phone_data_yield_per_segment["minutes_after_segment_start"] = (
-    (
-        phone_data_yield_per_segment["local_date_time"]
-        - phone_data_yield_per_segment["local_segment_start_datetimes"]
-    )
-    / pd.Timedelta(minutes=1)
-).astype("int")
-
-# %%
-columns_for_full_index = phone_data_yield_per_segment[
-    ["local_segment_start_datetimes", "length"]
-].drop_duplicates(keep="first")
-columns_for_full_index = columns_for_full_index.apply(
-    lambda row: [
-        [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
-    ],
-    axis=1,
-)
-
-# %%
-full_index = []
-for columns in columns_for_full_index:
-    full_index = full_index + columns
-full_index = pd.MultiIndex.from_tuples(
-    full_index, names=("local_segment_start_datetimes", "minutes_after_segment_start")
-)
-
-# %%
-phone_data_yield_per_segment.tail()
-
-# %% [markdown]
-# # A workaround
-
-# %%
-phone_data_yield_per_segment["local_segment_start_datetimes", "minutes_after_segment_start"] = phone_data_yield_per_segment[
-    ["local_segment_start_datetimes", "minutes_after_segment_start"]
-].drop_duplicates(keep="first")
-
-# %%
-phone_data_yield_per_segment.set_index(
-    ["local_segment_start_datetimes", "minutes_after_segment_start"],
-    verify_integrity=True,
-).reindex(full_index)
-
-# %%
-phone_data_yield_per_segment.head()
-
-
-# %% [markdown]
-# # Retry
-
-# %%
-def getDataForPlot(phone_data_yield_per_segment):
-    # calculate the length (in minute) of per segment instance
-    phone_data_yield_per_segment["length"] = (
-        phone_data_yield_per_segment["timestamps_segment"]
-        .str.split(",")
-        .apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
-    )
-    # calculate the number of sensors logged at least one row of data per minute.
-    phone_data_yield_per_segment = (
-        phone_data_yield_per_segment.groupby(
-            ["local_segment", "length", "local_date", "local_hour", "local_minute"]
-        )[["sensor", "local_date_time"]]
-        .max()
-        .reset_index()
-    )
-    # extract local start datetime of the segment from "local_segment" column
-    phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
-        phone_data_yield_per_segment["local_segment"].apply(
-            lambda x: x.split("#")[1].split(",")[0]
-        )
-    )
-    # calculate the number of minutes after local start datetime of the segment
-    phone_data_yield_per_segment["minutes_after_segment_start"] = (
-        (
-            phone_data_yield_per_segment["local_date_time"]
-            - phone_data_yield_per_segment["local_segment_start_datetimes"]
-        )
-        / pd.Timedelta(minutes=1)
-    ).astype("int")
-
-    # impute missing rows with 0
-    columns_for_full_index = phone_data_yield_per_segment[
-        ["local_segment_start_datetimes", "length"]
-    ].drop_duplicates(keep="first")
-    columns_for_full_index = columns_for_full_index.apply(
-        lambda row: [
-            [row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
-        ],
-        axis=1,
-    )
-    full_index = []
-    for columns in columns_for_full_index:
-        full_index = full_index + columns
-    full_index = pd.MultiIndex.from_tuples(
-        full_index,
-        names=("local_segment_start_datetimes", "minutes_after_segment_start"),
-    )
-    phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(subset=["local_segment_start_datetimes", "minutes_after_segment_start"],keep="first")
-    phone_data_yield_per_segment = (
-        phone_data_yield_per_segment.set_index(
-            ["local_segment_start_datetimes", "minutes_after_segment_start"]
-        )
-        .reindex(full_index)
-        .reset_index()
-        .fillna(0)
-    )
-
-    # transpose the dataframe per local start datetime of the segment and discard the useless index layer
-    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
-        "local_segment_start_datetimes"
-    )[["minutes_after_segment_start", "sensor"]].apply(
-        lambda x: x.set_index("minutes_after_segment_start").transpose()
-    )
-    phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
-        "local_segment_start_datetimes"
-    )
-    return phone_data_yield_per_segment
-
-
-# %%
-phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
-
-# %%
-data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
-
-# %%
--- a/exploration/ex_all_feat_ml_pipeline.py
+++ b/exploration/ex_all_feat_ml_pipeline.py
@ -1,473 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.13.0
-#   kernelspec:
-#     display_name: straw2analysis
-#     language: python
-#     name: straw2analysis
-# ---
-
-# %% jupyter={"source_hidden": true}
-# %matplotlib inline
-import datetime
-import importlib
-import os
-import sys
-
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-import yaml
-from pyprojroot import here
-from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
-from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
-from sklearn.metrics import mean_squared_error, r2_score
-from sklearn.impute import SimpleImputer
-from xgboost import XGBRegressor
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
-
-import machine_learning.features_sensor
-import machine_learning.labels
-import machine_learning.model
-
-# %% [markdown]
-# # RAPIDS models
-
-# %% [markdown]
-# ## PANAS negative affect
-
-# %% jupyter={"source_hidden": true}
-# model_input = pd.read_csv("../data/input_PANAS_NA.csv") # Nestandardizirani podatki
-model_input = pd.read_csv("../data/z_input_PANAS_NA.csv") # Standardizirani podatki
-
-# %% [markdown]
-# ### NaNs before dropping cols and rows
-
-# %% jupyter={"source_hidden": true}
-sns.set(rc={"figure.figsize":(16, 8)})
-sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
-
-# %% jupyter={"source_hidden": true}
-nan_cols = list(model_input.loc[:, model_input.isna().all()].columns)
-nan_cols
-
-# %% jupyter={"source_hidden": true}
-model_input.dropna(axis=1, how="all", inplace=True)
-model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
-
-# %% [markdown]
-# ### NaNs after dropping NaN cols and rows where target is NaN
-
-# %% jupyter={"source_hidden": true}
-sns.set(rc={"figure.figsize":(16, 8)})
-sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
-
-# %% jupyter={"source_hidden": true}
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-#if "pid" in model_input.columns:
-#    index_columns.append("pid")
-model_input.set_index(index_columns, inplace=True)
-
-data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
-
-# %% jupyter={"source_hidden": true}
-categorical_feature_colnames = ["gender", "startlanguage"]
-
-# %% jupyter={"source_hidden": true}
-categorical_features = data_x[categorical_feature_colnames].copy()
-
-# %% jupyter={"source_hidden": true}
-mode_categorical_features = categorical_features.mode().iloc[0]
-
-# %% jupyter={"source_hidden": true}
-# fillna with mode
-categorical_features = categorical_features.fillna(mode_categorical_features)
-
-# %% jupyter={"source_hidden": true}
-# one-hot encoding
-categorical_features = categorical_features.apply(lambda col: col.astype("category"))
-if not categorical_features.empty:
-    categorical_features = pd.get_dummies(categorical_features)
-
-# %% jupyter={"source_hidden": true}
-numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x = pd.concat([numerical_features, categorical_features], axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x.dtypes
-
-# %% jupyter={"source_hidden": true}
-logo = LeaveOneGroupOut()
-logo.get_n_splits(
-    train_x,
-    data_y,
-    groups=data_groups,
-)
-
-# %% jupyter={"source_hidden": true}
-sum(data_y.isna())
-
-# %% [markdown]
-# ### Linear Regression
-
-# %% jupyter={"source_hidden": true}
-lin_reg_rapids = linear_model.LinearRegression()
-
-# %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
-
-# %% jupyter={"source_hidden": true}
-lin_reg_scores = cross_val_score(
-    lin_reg_rapids,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring='r2'
-)
-lin_reg_scores
-np.median(lin_reg_scores)
-
-# %% [markdown]
-# ### Ridge regression
-
-# %% jupyter={"source_hidden": true}
-ridge_reg = linear_model.Ridge(alpha=.5)
-
-# %% tags=[] jupyter={"source_hidden": true}
-ridge_reg_scores = cross_val_score(
-    ridge_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(ridge_reg_scores)
-
-# %% [markdown]
-# ### Lasso
-
-# %% jupyter={"source_hidden": true}
-lasso_reg = linear_model.Lasso(alpha=0.1)
-
-# %% jupyter={"source_hidden": true}
-lasso_reg_score = cross_val_score(
-    lasso_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(lasso_reg_score)
-
-# %% [markdown]
-# ### Bayesian Ridge
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg = linear_model.BayesianRidge()
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg_score = cross_val_score(
-    bayesian_ridge_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(bayesian_ridge_reg_score)
-
-# %% [markdown]
-# ### RANSAC (outlier robust regression)
-
-# %% jupyter={"source_hidden": true}
-ransac_reg = linear_model.RANSACRegressor()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-    ransac_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-    )
-)
-
-# %% [markdown]
-# ### Support vector regression
-
-# %% jupyter={"source_hidden": true}
-svr = svm.SVR()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-    svr,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-    )
-)
-
-# %% [markdown]
-# ### Kernel Ridge regression
-
-# %% jupyter={"source_hidden": true}
-kridge = kernel_ridge.KernelRidge()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-        kridge,
-        X=imputer.fit_transform(train_x),
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-)
-# %% [markdown]
-# ### Gaussian Process Regression
-
-# %% jupyter={"source_hidden": true}
-gpr = gaussian_process.GaussianProcessRegressor()
-
-# %% jupyter={"source_hidden": true}
-
-np.median(
-    cross_val_score(
-        gpr,
-        X=imputer.fit_transform(train_x),
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-)
-# %%
-def insert_row(df, row):
-    return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
-
-# %%
-def run_all_models(input_csv):
-    # Prepare data
-    model_input = pd.read_csv(input_csv)
-    model_input.dropna(axis=1, how="all", inplace=True)
-    model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
-
-    index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-    model_input.set_index(index_columns, inplace=True)
-
-    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
-
-    categorical_feature_colnames = ["gender", "startlanguage"]
-    categorical_features = data_x[categorical_feature_colnames].copy()
-    mode_categorical_features = categorical_features.mode().iloc[0]
-    # fillna with mode
-    categorical_features = categorical_features.fillna(mode_categorical_features)
-    # one-hot encoding
-    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
-    if not categorical_features.empty:
-        categorical_features = pd.get_dummies(categorical_features)
-
-    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
-
-    train_x = pd.concat([numerical_features, categorical_features], axis=1)
-    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
-    train_x_imputed = imputer.fit_transform(train_x)
-
-    # Prepare cross validation
-    logo = LeaveOneGroupOut()
-    logo.get_n_splits(
-        train_x,
-        data_y,
-        groups=data_groups,
-    )
-    scores = pd.DataFrame(columns=["method", "median", "max"])
-
-    # Validate models
-    lin_reg_rapids = linear_model.LinearRegression()
-    lin_reg_scores = cross_val_score(
-        lin_reg_rapids,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring='r2'
-    )
-    print("Linear regression:")
-    print(np.median(lin_reg_scores))
-    scores = insert_row(scores, ["Linear regression",np.median(lin_reg_scores),np.max(lin_reg_scores)])
-
-    ridge_reg = linear_model.Ridge(alpha=.5)
-    ridge_reg_scores = cross_val_score(
-        ridge_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Ridge regression")
-    print(np.median(ridge_reg_scores))
-    scores = insert_row(scores, ["Ridge regression",np.median(ridge_reg_scores),np.max(ridge_reg_scores)])
-
-    lasso_reg = linear_model.Lasso(alpha=0.1)
-    lasso_reg_score = cross_val_score(
-        lasso_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Lasso regression")
-    print(np.median(lasso_reg_score))
-    scores = insert_row(scores, ["Lasso regression",np.median(lasso_reg_score),np.max(lasso_reg_score)])
-
-    bayesian_ridge_reg = linear_model.BayesianRidge()
-    bayesian_ridge_reg_score = cross_val_score(
-        bayesian_ridge_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Bayesian Ridge")
-    print(np.median(bayesian_ridge_reg_score))
-    scores = insert_row(scores, ["Bayesian Ridge",np.median(bayesian_ridge_reg_score),np.max(bayesian_ridge_reg_score)])
-
-    ransac_reg = linear_model.RANSACRegressor()
-    ransac_reg_score = cross_val_score(
-        ransac_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("RANSAC (outlier robust regression)")
-    print(np.median(ransac_reg_score))
-    scores = insert_row(scores, ["RANSAC",np.median(ransac_reg_score),np.max(ransac_reg_score)])
-
-    svr = svm.SVR()
-    svr_score = cross_val_score(
-        svr,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Support vector regression")
-    print(np.median(svr_score))
-    scores = insert_row(scores, ["Support vector regression",np.median(svr_score),np.max(svr_score)])
-
-    kridge = kernel_ridge.KernelRidge()
-    kridge_score = cross_val_score(
-        kridge,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Kernel Ridge regression")
-    print(np.median(kridge_score))
-    scores = insert_row(scores, ["Kernel Ridge regression",np.median(kridge_score),np.max(kridge_score)])
-
-    gpr = gaussian_process.GaussianProcessRegressor()
-    gpr_score = cross_val_score(
-        gpr,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Gaussian Process Regression")
-    print(np.median(gpr_score))
-    scores = insert_row(scores, ["Gaussian Process Regression",np.median(gpr_score),np.max(gpr_score)])
-
-    rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
-    rfr_score = cross_val_score(
-        rfr,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Random Forest Regression")
-    print(np.median(rfr_score))
-    scores = insert_row(scores, ["Random Forest Regression",np.median(rfr_score),np.max(rfr_score)])
-
-    xgb = XGBRegressor()
-    xgb_score = cross_val_score(
-        xgb,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("XGBoost Regressor")
-    print(np.median(xgb_score))
-    scores = insert_row(scores, ["XGBoost Regressor",np.median(xgb_score),np.max(xgb_score)])
-
-    ada = ensemble.AdaBoostRegressor()
-    ada_score = cross_val_score(
-        ada,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("ADA Boost Regressor")
-    print(np.median(ada_score))
-    scores = insert_row(scores, ["ADA Boost Regressor",np.median(ada_score),np.max(ada_score)])
-
-    return scores
-
-
-
-
--- a/exploration/ex_ml_pipeline.py
+++ b/exploration/ex_ml_pipeline.py
--- a/exploration/expl_app_categories.py
+++ b/exploration/expl_app_categories.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.11.4
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -74,29 +74,3 @@ rows_os_manufacturer = df_category_not_found["package_name"].str.contains(
 # %%
 with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_category_not_found.loc[~rows_os_manufacturer])
-
-# %% [markdown]
-# # Export categories
-
-# %% [markdown]
-# Rename all of "not_found" to "system" or "other".
-
-# %%
-df_app_categories_to_export = df_app_categories.copy()
-rows_os_manufacturer_full = (df_app_categories_to_export["package_name"].str.contains(
-    "|".join(manufacturers + custom_rom + other), case=False
-)) & (df_app_categories_to_export["play_store_genre"] == "not_found")
-df_app_categories_to_export.loc[rows_os_manufacturer_full, "play_store_genre"] = "System"
-
-# %%
-rows_not_found = (df_app_categories_to_export["play_store_genre"] == "not_found")
-df_app_categories_to_export.loc[rows_not_found, "play_store_genre"] = "Other"
-
-# %%
-df_app_categories_to_export["play_store_genre"].value_counts()
-
-# %%
-df_app_categories_to_export.rename(columns={"play_store_genre": "genre"},inplace=True)
-df_app_categories_to_export.to_csv("../data/app_categories.csv", columns=["package_hash","genre"],index=False)
-
-# %%
--- a/exploration/expl_esm_labels.py
+++ b/exploration/expl_esm_labels.py
@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.11.2
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -17,7 +17,6 @@
 # %%
 import os
 import sys
-import datetime

 import seaborn as sns

@ -27,7 +26,6 @@ if nb_dir not in sys.path:
 import participants.query_db
 from features.esm import *
 from features.esm_JCQ import *
-from features.esm_SAM import *

 # %%
 participants_inactive_usernames = participants.query_db.get_usernames(
@ -101,12 +99,6 @@ df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
 # %% [markdown]
 # # Stress appraisal measure

-# %%
-df_SAM_all = extract_stressful_events(df_esm_inactive)
-
-# %%
-df_SAM_all.head()
-
 # %%
 df_esm_SAM = df_esm_preprocessed[
    (df_esm_preprocessed["questionnaire_id"] >= 87)
--- a/exploration/expl_ambient.py
+++ b/exploration/expl_ambient.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.11.4
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -21,6 +21,7 @@ import sys

 import seaborn as sns
 from pytz import timezone
+from tabulate import tabulate

 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
@ -31,18 +32,18 @@ import participants.query_db
 TZ_LJ = timezone("Europe/Ljubljana")

 # %%
-from features.ambient import *
+from features.light import *

 # %% [markdown]
-# # Light
+# # Basic characteristics

 # %%
-df_light_nokia = get_ambient_data(["nokia_0000003"], "light")
+df_light_nokia = get_light_data(["nokia_0000003"])
 print(df_light_nokia)

 # %%
 participants_inactive_usernames = participants.query_db.get_usernames()
-df_light_inactive = get_ambient_data(participants_inactive_usernames, "light")
+df_light_inactive = get_light_data(participants_inactive_usernames)

 # %%
 df_light_inactive.accuracy.value_counts()
@ -102,7 +103,7 @@ df_light_nokia.loc[df_light_nokia["double_light_lux"] == 0, ["datetime_lj"]]
 # Zeroes are present during the day. It does happens when the sensor is physically blocked.

 # %% [markdown]
-# ## Differences between participants
+# # Differences between participants

 # %%
 df_light_participants = (
@ -165,74 +166,3 @@ sns.displot(data=df_light_participants, x="std_rel", binwidth=0.005)
 # Relative variability is homogeneous.
 #
 # This means that light data needs to be standardized. Min/max standardization would probably fit best.
-
-# %% [markdown]
-# # Barometer
-
-# %% [markdown]
-# ## Barometer sensor
-
-# %%
-df_barometer_sensor_samsung = get_ambient_data(["samsung_0000002"], "barometer_sensor")
-df_barometer_sensor_samsung.shape
-
-# %% [markdown]
-# Even though we have many values for this sensor, they are all repeated as seen below.
-
-# %%
-barometer_sensor_cols = df_barometer_sensor_samsung.columns.to_list()
-barometer_sensor_cols.remove("id")
-barometer_sensor_cols.remove("_id")
-barometer_sensor_cols.remove("timestamp")
-barometer_sensor_cols.remove("device_id")
-print(df_barometer_sensor_samsung.drop_duplicates(subset=barometer_sensor_cols))
-
-# %% [markdown]
-# ## Barometer data
-
-# %%
-df_barometer_samsung = get_ambient_data(["samsung_0000002"], "barometer")
-print(df_barometer_samsung)
-
-# %%
-df_barometer_inactive = get_ambient_data(participants_inactive_usernames, "barometer")
-
-# %%
-df_barometer_inactive.accuracy.value_counts()
-
-# %%
-df_barometer_inactive.participant_id.nunique()
-
-# %%
-df_barometer_inactive.double_values_0.describe()
-
-# %% [markdown]
-# From [Wikipedia](https://en.wikipedia.org/wiki/Atmospheric_pressure#Mean_sea-level_pressure):
-#
-# > The lowest measurable sea-level pressure is found at the centers of tropical cyclones and tornadoes, with a record low of 870 mbar (87 kPa; 26 inHg).
-
-# %%
-df_barometer_inactive[df_barometer_inactive["double_values_0"] < 870]
-
-# %%
-sns.displot(
-    data=df_barometer_inactive[df_barometer_inactive["double_values_0"] > 870],
-    x="double_values_0",
-    binwidth=10,
-    height=8,
-)
-
-# %% [markdown]
-# # Temperature data
-
-# %% [markdown]
-# ## Temperature sensor
-
-# %% [markdown]
-# This table is empty.
-
-# %% [markdown]
-# ## Temperature data
-
-# %% [markdown]
-# This table is empty.
--- a/exploration/expl_proximity.py
+++ b/exploration/expl_proximity.py
@ -16,7 +16,6 @@
 # %%
 # %matplotlib inline
 import datetime
-import importlib
 import os
 import sys

@ -33,16 +32,13 @@ import participants.query_db
 TZ_LJ = timezone("Europe/Ljubljana")

 # %%
-from features import helper, proximity
-
-# %%
-importlib.reload(proximity)
+from features.proximity import *

 # %% [markdown]
 # # Basic characteristics

 # %%
-df_proximity_nokia = proximity.get_proximity_data(["nokia_0000003"])
+df_proximity_nokia = get_proximity_data(["nokia_0000003"])
 print(df_proximity_nokia)

 # %%
@ -57,7 +53,7 @@ df_proximity_nokia.double_proximity.value_counts()

 # %%
 participants_inactive_usernames = participants.query_db.get_usernames()
-df_proximity_inactive = proximity.get_proximity_data(participants_inactive_usernames)
+df_proximity_inactive = get_proximity_data(participants_inactive_usernames)

 # %%
 df_proximity_inactive.double_proximity.describe()
@ -114,13 +110,3 @@ df_proximity_combinations[
    (df_proximity_combinations[5.0] != 0)
    & (df_proximity_combinations[5.00030517578125] != 0)
 ]
-
-# %% [markdown]
-# # Features
-
-# %%
-df_proximity_inactive = helper.get_date_from_timestamp(df_proximity_inactive)
-
-# %%
-df_proximity_features = proximity.count_proximity(df_proximity_inactive, ["date_lj"])
-display(df_proximity_features)
--- a/features/ambient.py
+++ b/features/ambient.py
@ -1,91 +0,0 @@
-from collections.abc import Collection
-
-import pandas as pd
-
-from config.models import (
-    Barometer,
-    BarometerSensor,
-    LightSensor,
-    Participant,
-    Temperature,
-    TemperatureSensor,
-)
-from setup import db_engine, session
-
-MINIMUM_PRESSURE_MB = 870
-# The lowest measurable sea-level pressure is found at the centers of tropical cyclones and tornadoes,
-# with a record low of 870 mbar (87 kPa; 26 inHg).
-
-
-def get_ambient_data(usernames: Collection, sensor=None) -> pd.DataFrame:
-    """
-    Read the data from any of the ambient sensor tables and return it in a dataframe.
-
-    Parameters
-    ----------
-    usernames: Collection
-        A list of usernames to put into the WHERE condition.
-    sensor: str
-        One of: barometer, barometer_sensor, light, temperature, temperature_sensor.
-        Here, the _sensor tables describe the phone sensors, such as their range, dela, resolution, vendor etc.,
-        whereas barometer, light, and temperature describe the measured characteristics of the environment.
-
-    Returns
-    -------
-    df_ambient: pd.DataFrame
-        A dataframe of ambient sensor data.
-    """
-    if sensor == "barometer":
-        query_ambient = session.query(Barometer, Participant.username).filter(
-            Participant.id == Barometer.participant_id
-        )
-    elif sensor == "barometer_sensor":
-        query_ambient = session.query(BarometerSensor, Participant.username).filter(
-            Participant.id == BarometerSensor.participant_id
-        )
-    elif sensor == "light":
-        query_ambient = session.query(LightSensor, Participant.username).filter(
-            Participant.id == LightSensor.participant_id
-        )
-    # Note that LightSensor and its light_sensor table are incorrectly named.
-    # In this table, we actually find light data, i.e. double_light_lux, the ambient luminance in lux,
-    # and NOT light sensor data (its range, dela, resolution, vendor etc.) as the name suggests.
-    # We do not have light sensor data saved in the database.
-    elif sensor == "temperature":
-        query_ambient = session.query(Temperature, Participant.username).filter(
-            Participant.id == Temperature.participant_id
-        )
-    elif sensor == "temperature_sensor":
-        query_ambient = session.query(TemperatureSensor, Participant.username).filter(
-            Participant.id == TemperatureSensor.participant_id
-        )
-    else:
-        raise KeyError(
-            "Specify one of the ambient sensors: "
-            "barometer, barometer_sensor, light, temperature, or temperature_sensor."
-        )
-
-    query_ambient = query_ambient.filter(Participant.username.in_(usernames))
-    with db_engine.connect() as connection:
-        df_ambient = pd.read_sql(query_ambient.statement, connection)
-    return df_ambient
-
-
-def clean_pressure(df_ambient: pd.DataFrame) -> pd.DataFrame:
-    """
-    Simply removes values lower than MINIMUM_PRESSURE_MB (lowest measured pressure).
-
-    Parameters
-    ----------
-    df_ambient: pd.DataFrame
-        A dataframe of barometer data, which includes measured pressure in double_values_0.
-
-    Returns
-    -------
-    df_ambient: pd.DataFrame
-        The same dataframe with rows with low values of pressure removed.
-    """
-    if "double_values_0" not in df_ambient:
-        raise KeyError("The DF does not seem to hold barometer data.")
-    df_ambient = df_ambient[df_ambient["double_values_0"] > MINIMUM_PRESSURE_MB]
-    return df_ambient
--- a/features/communication.py
+++ b/features/communication.py
@ -8,21 +8,14 @@ from setup import db_engine, session
 call_types = {1: "incoming", 2: "outgoing", 3: "missed"}
 sms_types = {1: "received", 2: "sent"}

-FILL_NA_CALLS = {
-    "no_calls_all": 0,
-    "no_" + call_types.get(1): 0,
-    "no_" + call_types.get(2): 0,
-    "no_" + call_types.get(3): 0,
-    "duration_total_" + call_types.get(1): 0,
-    "duration_total_" + call_types.get(2): 0,
-    "duration_max_" + call_types.get(1): 0,
-    "duration_max_" + call_types.get(2): 0,
-    "no_" + call_types.get(1) + "_ratio": 1 / 3,  # Three different types
-    "no_" + call_types.get(2) + "_ratio": 1 / 3,
-    "no_contacts_calls": 0,
-}
-
-FEATURES_CALLS = list(FILL_NA_CALLS.keys())
+FEATURES_CALLS = (
+    ["no_calls_all"]
+    + ["no_" + call_type for call_type in call_types.values()]
+    + ["duration_total_" + call_types.get(1), "duration_total_" + call_types.get(2)]
+    + ["duration_max_" + call_types.get(1), "duration_max_" + call_types.get(2)]
+    + ["no_" + call_types.get(1) + "_ratio", "no_" + call_types.get(2) + "_ratio"]
+    + ["no_contacts_calls"]
+)

 # FEATURES_CALLS =
 # ["no_calls_all",
@ -30,26 +23,21 @@ FEATURES_CALLS = list(FILL_NA_CALLS.keys())
 # "duration_total_incoming", "duration_total_outgoing",
 # "duration_max_incoming", "duration_max_outgoing",
 # "no_incoming_ratio", "no_outgoing_ratio",
-# "no_contacts_calls"]
-
-FILL_NA_SMS = {
-    "no_sms_all": 0,
-    "no_" + sms_types.get(1): 0,
-    "no_" + sms_types.get(2): 0,
-    "no_" + sms_types.get(1) + "_ratio": 1 / 2,  # Two different types
-    "no_" + sms_types.get(2) + "_ratio": 1 / 2,
-    "no_contacts_sms": 0,
-}
-
-FEATURES_SMS = list(FILL_NA_SMS.keys())
+# "no_contacts_calls"]

+FEATURES_SMS = (
+    ["no_sms_all"]
+    + ["no_" + sms_type for sms_type in sms_types.values()]
+    + ["no_" + sms_types.get(1) + "_ratio", "no_" + sms_types.get(2) + "_ratio"]
+    + ["no_contacts_sms"]
+)
 # FEATURES_SMS =
 # ["no_sms_all",
 #  "no_received", "no_sent",
 #  "no_received_ratio", "no_sent_ratio",
-#  "no_contacts_sms"]
+#  "no_contacts_sms"]

-FEATURES_CALLS_SMS_PROP = [
+FEATURES_CONTACT = [
    "proportion_calls_all",
    "proportion_calls_incoming",
    "proportion_calls_outgoing",
@ -57,15 +45,6 @@ FEATURES_CALLS_SMS_PROP = [
    "proportion_calls_missed_sms_received",
 ]

-FILL_NA_CALLS_SMS_PROP = {
-    key: 1 / 2 for key in FEATURES_CALLS_SMS_PROP
-}  # All of the form of a / (a + b).
-
-FEATURES_CALLS_SMS_ALL = FEATURES_CALLS + FEATURES_SMS + FEATURES_CALLS_SMS_PROP
-
-FILL_NA_CALLS_SMS_ALL = FILL_NA_CALLS | FILL_NA_SMS | FILL_NA_CALLS_SMS_PROP
-# As per PEP-584 a union for dicts was implemented in Python 3.9.0.
-

 def get_call_data(usernames: Collection) -> pd.DataFrame:
    """
--- a/features/light.py
+++ b/features/light.py
@ -0,0 +1,30 @@
+from collections.abc import Collection
+
+import pandas as pd
+
+from config.models import LightSensor, Participant
+from setup import db_engine, session
+
+
+def get_light_data(usernames: Collection) -> pd.DataFrame:
+    """
+    Read the data from the light sensor table and return it in a dataframe.
+
+    Parameters
+    ----------
+    usernames: Collection
+        A list of usernames to put into the WHERE condition.
+
+    Returns
+    -------
+    df_light: pd.DataFrame
+        A dataframe of light data.
+    """
+    query_light = (
+        session.query(LightSensor, Participant.username)
+        .filter(Participant.id == LightSensor.participant_id)
+        .filter(Participant.username.in_(usernames))
+    )
+    with db_engine.connect() as connection:
+        df_light = pd.read_sql(query_light.statement, connection)
+    return df_light
--- a/features/proximity.py
+++ b/features/proximity.py
@ -5,12 +5,7 @@ import pandas as pd
 from config.models import Participant, Proximity
 from setup import db_engine, session

-FILL_NA_PROXIMITY = {
-    "freq_prox_near": 0,
-    "prop_prox_near": 1 / 2,  # Of the form of a / (a + b).
-}
-
-FEATURES_PROXIMITY = list(FILL_NA_PROXIMITY.keys())
+FEATURES_PROXIMITY = ["freq_prox_near", "prop_prox_near"]


 def get_proximity_data(usernames: Collection) -> pd.DataFrame:
@ -83,11 +78,11 @@ def count_proximity(
        A dataframe with the count of "near" proximity values and their relative count.
    """
    if group_by is None:
-        group_by = []
+        group_by = ["participant_id"]
    if "bool_prox_near" not in df_proximity:
        df_proximity = recode_proximity(df_proximity)
    df_proximity["bool_prox_far"] = ~df_proximity["bool_prox_near"]
-    df_proximity_features = df_proximity.groupby(["participant_id"] + group_by).sum()[
+    df_proximity_features = df_proximity.groupby(group_by).sum()[
        ["bool_prox_near", "bool_prox_far"]
    ]
    df_proximity_features = df_proximity_features.assign(
--- a/features/timezone.py
+++ b/features/timezone.py
@ -1,30 +0,0 @@
-from collections.abc import Collection
-
-import pandas as pd
-
-from config.models import Participant, Timezone
-from setup import db_engine, session
-
-
-def get_timezone_data(usernames: Collection) -> pd.DataFrame:
-    """
-    Read the data from the proximity sensor table and return it in a dataframe.
-
-    Parameters
-    ----------
-    usernames: Collection
-        A list of usernames to put into the WHERE condition.
-
-    Returns
-    -------
-    df_proximity: pd.DataFrame
-        A dataframe of proximity data.
-    """
-    query_timezone = (
-        session.query(Timezone, Participant.username)
-        .filter(Participant.id == Timezone.participant_id)
-        .filter(Participant.username.in_(usernames))
-    )
-    with db_engine.connect() as connection:
-        df_timezone = pd.read_sql(query_timezone.statement, connection)
-    return df_timezone
--- a/images/dag_calls_nokia_example.svg
+++ b/images/dag_calls_nokia_example.svg
@ -1,205 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
- "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
-<!-- Generated by graphviz version 2.43.0 (0)
- -->
-<!-- Title: snakemake_dag Pages: 1 -->
-<svg width="548pt" height="625pt"
- viewBox="0.00 0.00 548.00 625.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 621)">
-<title>snakemake_dag</title>
-<polygon fill="white" stroke="transparent" points="-4,4 -4,-621 544,-621 544,4 -4,4"/>
-<!-- 0 -->
-<g id="node1" class="node">
-<title>0</title>
-<path fill="none" stroke="#565bd8" stroke-width="2" d="M202,-36C202,-36 172,-36 172,-36 166,-36 160,-30 160,-24 160,-24 160,-12 160,-12 160,-6 166,0 172,0 172,0 202,0 202,0 208,0 214,-6 214,-12 214,-12 214,-24 214,-24 214,-30 208,-36 202,-36"/>
-<text text-anchor="middle" x="187" y="-15.5" font-family="sans" font-size="10.00">all</text>
-</g>
-<!-- 1 -->
-<g id="node2" class="node">
-<title>1</title>
-<path fill="none" stroke="#56d8a9" stroke-width="2" d="M100,-617C100,-617 12,-617 12,-617 6,-617 0,-611 0,-605 0,-605 0,-588 0,-588 0,-582 6,-576 12,-576 12,-576 100,-576 100,-576 106,-576 112,-582 112,-588 112,-588 112,-605 112,-605 112,-611 106,-617 100,-617"/>
-<text text-anchor="middle" x="56" y="-605" font-family="sans" font-size="10.00">pull_phone_data</text>
-<text text-anchor="middle" x="56" y="-594" font-family="sans" font-size="10.00">pid: nokia_0000003</text>
-<text text-anchor="middle" x="56" y="-583" font-family="sans" font-size="10.00">sensor: calls</text>
-</g>
-<!-- 1&#45;&gt;0 -->
-<g id="edge1" class="edge">
-<title>1&#45;&gt;0</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M47.83,-575.78C37.21,-548.32 20,-496.76 20,-451 20,-451 20,-451 20,-161 20,-114.96 38.83,-102.85 73,-72 95.21,-51.94 126.33,-38.17 150.45,-29.7"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="151.61,-33 159.97,-26.5 149.38,-26.37 151.61,-33"/>
-</g>
-<!-- 2 -->
-<g id="node3" class="node">
-<title>2</title>
-<path fill="none" stroke="#56d863" stroke-width="2" d="M124,-540C124,-540 60,-540 60,-540 54,-540 48,-534 48,-528 48,-528 48,-516 48,-516 48,-510 54,-504 60,-504 60,-504 124,-504 124,-504 130,-504 136,-510 136,-516 136,-516 136,-528 136,-528 136,-534 130,-540 124,-540"/>
-<text text-anchor="middle" x="92" y="-519.5" font-family="sans" font-size="10.00">calls_episodes</text>
-</g>
-<!-- 1&#45;&gt;2 -->
-<g id="edge9" class="edge">
-<title>1&#45;&gt;2</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M65.84,-575.69C69.87,-567.56 74.6,-558.03 78.92,-549.33"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="82.09,-550.83 83.4,-540.32 75.82,-547.72 82.09,-550.83"/>
-</g>
-<!-- 2&#45;&gt;0 -->
-<g id="edge2" class="edge">
-<title>2&#45;&gt;0</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M85.12,-503.83C75.18,-477.44 58,-425.14 58,-379 58,-379 58,-379 58,-161 58,-105.34 112.96,-61.84 151.14,-38.34"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="153.16,-41.21 159.96,-33.08 149.58,-35.2 153.16,-41.21"/>
-</g>
-<!-- 3 -->
-<g id="node4" class="node">
-<title>3</title>
-<path fill="none" stroke="#d8a456" stroke-width="2" d="M187.5,-468C187.5,-468 98.5,-468 98.5,-468 92.5,-468 86.5,-462 86.5,-456 86.5,-456 86.5,-444 86.5,-444 86.5,-438 92.5,-432 98.5,-432 98.5,-432 187.5,-432 187.5,-432 193.5,-432 199.5,-438 199.5,-444 199.5,-444 199.5,-456 199.5,-456 199.5,-462 193.5,-468 187.5,-468"/>
-<text text-anchor="middle" x="143" y="-453" font-family="sans" font-size="10.00">resample_episodes</text>
-<text text-anchor="middle" x="143" y="-442" font-family="sans" font-size="10.00">sensor: phone_calls</text>
-</g>
-<!-- 2&#45;&gt;3 -->
-<g id="edge10" class="edge">
-<title>2&#45;&gt;3</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M104.61,-503.7C110.6,-495.47 117.88,-485.48 124.48,-476.42"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="127.48,-478.25 130.54,-468.1 121.82,-474.13 127.48,-478.25"/>
-</g>
-<!-- 3&#45;&gt;0 -->
-<g id="edge3" class="edge">
-<title>3&#45;&gt;0</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M140.43,-432C136.64,-405.4 130,-352.3 130,-307 130,-307 130,-307 130,-161 130,-117.8 153,-72.19 169.78,-44.66"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="172.83,-46.37 175.19,-36.04 166.91,-42.65 172.83,-46.37"/>
-</g>
-<!-- 4 -->
-<g id="node5" class="node">
-<title>4</title>
-<path fill="none" stroke="#56d8d8" stroke-width="2" d="M357.5,-396C357.5,-396 194.5,-396 194.5,-396 188.5,-396 182.5,-390 182.5,-384 182.5,-384 182.5,-372 182.5,-372 182.5,-366 188.5,-360 194.5,-360 194.5,-360 357.5,-360 357.5,-360 363.5,-360 369.5,-366 369.5,-372 369.5,-372 369.5,-384 369.5,-384 369.5,-390 363.5,-396 357.5,-396"/>
-<text text-anchor="middle" x="276" y="-375.5" font-family="sans" font-size="10.00">resample_episodes_with_datetime</text>
-</g>
-<!-- 3&#45;&gt;4 -->
-<g id="edge11" class="edge">
-<title>3&#45;&gt;4</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M175.54,-431.88C193.25,-422.55 215.35,-410.92 234.32,-400.94"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="236.12,-403.94 243.34,-396.19 232.86,-397.75 236.12,-403.94"/>
-</g>
-<!-- 4&#45;&gt;0 -->
-<g id="edge4" class="edge">
-<title>4&#45;&gt;0</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M250.68,-359.83C218.76,-335.92 168,-289.36 168,-235 168,-235 168,-235 168,-161 168,-120.86 175.55,-74.9 181.13,-46.4"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="184.61,-46.88 183.16,-36.39 177.74,-45.5 184.61,-46.88"/>
-</g>
-<!-- 8 -->
-<g id="node9" class="node">
-<title>8</title>
-<path fill="none" stroke="#68d856" stroke-width="2" d="M353.5,-324C353.5,-324 248.5,-324 248.5,-324 242.5,-324 236.5,-318 236.5,-312 236.5,-312 236.5,-300 236.5,-300 236.5,-294 242.5,-288 248.5,-288 248.5,-288 353.5,-288 353.5,-288 359.5,-288 365.5,-294 365.5,-300 365.5,-300 365.5,-312 365.5,-312 365.5,-318 359.5,-324 353.5,-324"/>
-<text text-anchor="middle" x="301" y="-309" font-family="sans" font-size="10.00">phone_calls_r_features</text>
-<text text-anchor="middle" x="301" y="-298" font-family="sans" font-size="10.00">provider_key: rapids</text>
-</g>
-<!-- 4&#45;&gt;8 -->
-<g id="edge15" class="edge">
-<title>4&#45;&gt;8</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M282.18,-359.7C285,-351.81 288.39,-342.3 291.52,-333.55"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="294.82,-334.7 294.89,-324.1 288.23,-332.34 294.82,-334.7"/>
-</g>
-<!-- 5 -->
-<g id="node6" class="node">
-<title>5</title>
-<path fill="none" stroke="#afd856" stroke-width="2" d="M475.5,-468C475.5,-468 364.5,-468 364.5,-468 358.5,-468 352.5,-462 352.5,-456 352.5,-456 352.5,-444 352.5,-444 352.5,-438 358.5,-432 364.5,-432 364.5,-432 475.5,-432 475.5,-432 481.5,-432 487.5,-438 487.5,-444 487.5,-444 487.5,-456 487.5,-456 487.5,-462 481.5,-468 475.5,-468"/>
-<text text-anchor="middle" x="420" y="-453" font-family="sans" font-size="10.00">process_time_segments</text>
-<text text-anchor="middle" x="420" y="-442" font-family="sans" font-size="10.00">pid: nokia_0000003</text>
-</g>
-<!-- 5&#45;&gt;4 -->
-<g id="edge12" class="edge">
-<title>5&#45;&gt;4</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M384.77,-431.88C365.42,-422.47 341.23,-410.71 320.57,-400.67"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="321.89,-397.41 311.36,-396.19 318.83,-403.71 321.89,-397.41"/>
-</g>
-<!-- 5&#45;&gt;8 -->
-<g id="edge16" class="edge">
-<title>5&#45;&gt;8</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M415.13,-431.72C409.07,-412.57 397.25,-381.55 379,-360 369.03,-348.23 355.82,-337.94 343.12,-329.64"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="344.79,-326.55 334.45,-324.21 341.08,-332.49 344.79,-326.55"/>
-</g>
-<!-- 6 -->
-<g id="node7" class="node">
-<title>6</title>
-<path fill="none" stroke="#d86656" stroke-width="2" stroke-dasharray="5,2" d="M322.5,-468C322.5,-468 229.5,-468 229.5,-468 223.5,-468 217.5,-462 217.5,-456 217.5,-456 217.5,-444 217.5,-444 217.5,-438 223.5,-432 229.5,-432 229.5,-432 322.5,-432 322.5,-432 328.5,-432 334.5,-438 334.5,-444 334.5,-444 334.5,-456 334.5,-456 334.5,-462 328.5,-468 322.5,-468"/>
-<text text-anchor="middle" x="276" y="-447.5" font-family="sans" font-size="10.00">prepare_tzcodes_file</text>
-</g>
-<!-- 6&#45;&gt;4 -->
-<g id="edge13" class="edge">
-<title>6&#45;&gt;4</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M276,-431.7C276,-423.98 276,-414.71 276,-406.11"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="279.5,-406.1 276,-396.1 272.5,-406.1 279.5,-406.1"/>
-</g>
-<!-- 7 -->
-<g id="node8" class="node">
-<title>7</title>
-<path fill="none" stroke="#56d86b" stroke-width="2" stroke-dasharray="5,2" d="M370,-540C370,-540 182,-540 182,-540 176,-540 170,-534 170,-528 170,-528 170,-516 170,-516 170,-510 176,-504 182,-504 182,-504 370,-504 370,-504 376,-504 382,-510 382,-516 382,-516 382,-528 382,-528 382,-534 376,-540 370,-540"/>
-<text text-anchor="middle" x="276" y="-519.5" font-family="sans" font-size="10.00">query_usernames_device_empatica_ids</text>
-</g>
-<!-- 7&#45;&gt;6 -->
-<g id="edge14" class="edge">
-<title>7&#45;&gt;6</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M276,-503.7C276,-495.98 276,-486.71 276,-478.11"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="279.5,-478.1 276,-468.1 272.5,-478.1 279.5,-478.1"/>
-</g>
-<!-- 8&#45;&gt;0 -->
-<g id="edge5" class="edge">
-<title>8&#45;&gt;0</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M264.63,-287.8C250.06,-279.08 234.51,-267.11 225,-252 184.07,-186.97 182.71,-92.23 184.91,-46.17"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="188.41,-46.26 185.49,-36.07 181.42,-45.85 188.41,-46.26"/>
-</g>
-<!-- 9 -->
-<g id="node10" class="node">
-<title>9</title>
-<path fill="none" stroke="#d87556" stroke-width="2" d="M382,-252C382,-252 246,-252 246,-252 240,-252 234,-246 234,-240 234,-240 234,-228 234,-228 234,-222 240,-216 246,-216 246,-216 382,-216 382,-216 388,-216 394,-222 394,-228 394,-228 394,-240 394,-240 394,-246 388,-252 382,-252"/>
-<text text-anchor="middle" x="314" y="-237" font-family="sans" font-size="10.00">join_features_from_providers</text>
-<text text-anchor="middle" x="314" y="-226" font-family="sans" font-size="10.00">sensor_key: phone_calls</text>
-</g>
-<!-- 8&#45;&gt;9 -->
-<g id="edge17" class="edge">
-<title>8&#45;&gt;9</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M304.21,-287.7C305.65,-279.98 307.37,-270.71 308.96,-262.11"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="312.44,-262.58 310.82,-252.1 305.56,-261.3 312.44,-262.58"/>
-</g>
-<!-- 9&#45;&gt;0 -->
-<g id="edge6" class="edge">
-<title>9&#45;&gt;0</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M294.15,-215.87C283.81,-206.16 271.58,-193.31 263,-180 235.01,-136.57 243.3,-118.11 220,-72 215.36,-62.81 209.61,-53.14 204.23,-44.62"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="207.17,-42.72 198.81,-36.21 201.29,-46.51 207.17,-42.72"/>
-</g>
-<!-- 10 -->
-<g id="node11" class="node">
-<title>10</title>
-<path fill="none" stroke="#56d8d0" stroke-width="2" d="M526,-180C526,-180 284,-180 284,-180 278,-180 272,-174 272,-168 272,-168 272,-156 272,-156 272,-150 278,-144 284,-144 284,-144 526,-144 526,-144 532,-144 538,-150 538,-156 538,-156 538,-168 538,-168 538,-174 532,-180 526,-180"/>
-<text text-anchor="middle" x="405" y="-159.5" font-family="sans" font-size="10.00">merge_sensor_features_for_individual_participants</text>
-</g>
-<!-- 9&#45;&gt;10 -->
-<g id="edge18" class="edge">
-<title>9&#45;&gt;10</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M336.49,-215.7C347.96,-206.88 362.06,-196.03 374.48,-186.47"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="376.97,-188.98 382.76,-180.1 372.7,-183.43 376.97,-188.98"/>
-</g>
-<!-- 10&#45;&gt;0 -->
-<g id="edge7" class="edge">
-<title>10&#45;&gt;0</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M366.3,-143.85C346.21,-134.31 321.62,-121.63 301,-108 280.21,-94.25 277.55,-87.46 258,-72 245.35,-62 231.16,-51.3 218.81,-42.16"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="220.72,-39.22 210.59,-36.1 216.57,-44.85 220.72,-39.22"/>
-</g>
-<!-- 11 -->
-<g id="node12" class="node">
-<title>11</title>
-<path fill="none" stroke="#56d892" stroke-width="2" d="M528,-108C528,-108 322,-108 322,-108 316,-108 310,-102 310,-96 310,-96 310,-84 310,-84 310,-78 316,-72 322,-72 322,-72 528,-72 528,-72 534,-72 540,-78 540,-84 540,-84 540,-96 540,-96 540,-102 534,-108 528,-108"/>
-<text text-anchor="middle" x="425" y="-87.5" font-family="sans" font-size="10.00">merge_sensor_features_for_all_participants</text>
-</g>
-<!-- 10&#45;&gt;11 -->
-<g id="edge19" class="edge">
-<title>10&#45;&gt;11</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M409.94,-143.7C412.17,-135.9 414.85,-126.51 417.33,-117.83"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="420.73,-118.68 420.11,-108.1 414,-116.76 420.73,-118.68"/>
-</g>
-<!-- 11&#45;&gt;0 -->
-<g id="edge8" class="edge">
-<title>11&#45;&gt;0</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M367.08,-71.97C322.5,-58.85 262.21,-41.12 223.96,-29.87"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="224.84,-26.48 214.26,-27.02 222.87,-33.2 224.84,-26.48"/>
-</g>
-</g>
-</svg>
--- a/images/dag_full_nokia_example.svg
+++ b/images/dag_full_nokia_example.svg
--- a/images/dag_participants_files.svg
+++ b/images/dag_participants_files.svg
@ -1,68 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
- "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
-<!-- Generated by graphviz version 2.43.0 (0)
- -->
-<!-- Title: snakemake_dag Pages: 1 -->
-<svg width="414pt" height="396pt"
- viewBox="0.00 0.00 414.00 396.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 392)">
-<title>snakemake_dag</title>
-<polygon fill="white" stroke="transparent" points="-4,4 -4,-392 410,-392 410,4 -4,4"/>
-<!-- 0 -->
-<g id="node1" class="node">
-<title>0</title>
-<text text-anchor="start" x="81" y="-71.6" font-family="sans" font-weight="bold" font-size="18.00">create_participants_files</text>
-<text text-anchor="start" x="81" y="-47.8" font-family="sans" font-size="10.00"> </text>
-<text text-anchor="start" x="85" y="-47.8" font-family="sans" font-weight="bold" font-size="14.00">↪ input</text>
-<text text-anchor="start" x="143" y="-47.8" font-family="sans" font-size="10.00"> </text>
-<text text-anchor="start" x="81" y="-28" font-family="monospace" font-size="10.00">data/external/example_participants.csv</text>
-<text text-anchor="start" x="319" y="-10" font-family="sans" font-size="10.00"> &#160;</text>
-<polygon fill="#acd957" stroke="#acd957" points="75,-62 75,-62 333,-62 333,-62 75,-62"/>
-<polygon fill="#acd957" stroke="#acd957" points="75,-22 75,-22 333,-22 333,-22 75,-22"/>
-<polygon fill="none" stroke="#acd957" stroke-width="2" points="74.5,-1 74.5,-91 331.5,-91 331.5,-1 74.5,-1"/>
-</g>
-<!-- 1 -->
-<g id="node2" class="node">
-<title>1</title>
-<text text-anchor="start" x="77" y="-221.6" font-family="sans" font-weight="bold" font-size="18.00">prepare_participants_csv</text>
-<text text-anchor="start" x="77" y="-197.8" font-family="sans" font-size="10.00"> </text>
-<text text-anchor="start" x="81" y="-197.8" font-family="sans" font-weight="bold" font-size="14.00">↪ input</text>
-<text text-anchor="start" x="139" y="-197.8" font-family="sans" font-size="10.00"> </text>
-<text text-anchor="start" x="77" y="-178" font-family="monospace" font-size="10.00">data/external/example_usernames.csv</text>
-<text text-anchor="start" x="251" y="-157.8" font-family="sans" font-size="10.00"> </text>
-<text text-anchor="start" x="255" y="-157.8" font-family="sans" font-weight="bold" font-size="14.00">output →</text>
-<text text-anchor="start" x="325" y="-157.8" font-family="sans" font-size="10.00"> </text>
-<text text-anchor="start" x="77" y="-138" font-family="monospace" font-size="10.00">data/external/example_participants.csv</text>
-<polygon fill="#57d99e" stroke="#57d99e" points="71,-212 71,-212 336,-212 336,-212 71,-212"/>
-<polygon fill="#57d99e" stroke="#57d99e" points="71,-172 71,-172 336,-172 336,-172 71,-172"/>
-<polygon fill="none" stroke="#57d99e" stroke-width="2" points="71,-129 71,-241 335,-241 335,-129 71,-129"/>
-</g>
-<!-- 1&#45;&gt;0 -->
-<g id="edge1" class="edge">
-<title>1&#45;&gt;0</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M203,-127.88C203,-119.48 203,-110.81 203,-102.42"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="206.5,-102.36 203,-92.36 199.5,-102.36 206.5,-102.36"/>
-</g>
-<!-- 2 -->
-<g id="node3" class="node">
-<title>2</title>
-<text text-anchor="start" x="7" y="-367.6" font-family="sans" font-weight="bold" font-size="18.00">query_usernames_device_empatica_ids</text>
-<text text-anchor="start" x="7" y="-346" font-family="sans" font-size="10.00"> &#160;</text>
-<text text-anchor="start" x="321" y="-325.8" font-family="sans" font-size="10.00"> </text>
-<text text-anchor="start" x="325" y="-325.8" font-family="sans" font-weight="bold" font-size="14.00">output →</text>
-<text text-anchor="start" x="395" y="-325.8" font-family="sans" font-size="10.00"> </text>
-<text text-anchor="start" x="7" y="-306" font-family="monospace" font-size="10.00">data/external/example_usernames.csv</text>
-<text text-anchor="start" x="7" y="-288" font-family="monospace" font-size="10.00">data/external/timezone.csv</text>
-<polygon fill="#86d957" stroke="#86d957" points="1,-358 1,-358 406,-358 406,-358 1,-358"/>
-<polygon fill="#86d957" stroke="#86d957" points="1,-340 1,-340 406,-340 406,-340 1,-340"/>
-<polygon fill="none" stroke="#86d957" stroke-width="2" points="1,-279 1,-387 405,-387 405,-279 1,-279"/>
-</g>
-<!-- 2&#45;&gt;1 -->
-<g id="edge2" class="edge">
-<title>2&#45;&gt;1</title>
-<path fill="none" stroke="grey" stroke-width="2" d="M203,-277.63C203,-269.45 203,-260.93 203,-252.53"/>
-<polygon fill="grey" stroke="grey" stroke-width="2" points="206.5,-252.36 203,-242.36 199.5,-252.36 206.5,-252.36"/>
-</g>
-</g>
-</svg>
--- a/machine_learning/config/minimal_features.yaml
+++ b/machine_learning/config/minimal_features.yaml
@ -1,5 +0,0 @@
-grouping_variable: date_lj
-features:
-  proximity:
-    all
-participants_usernames: [nokia_0000003]
--- a/machine_learning/config/minimal_labels.yaml
+++ b/machine_learning/config/minimal_labels.yaml
@ -1,6 +0,0 @@
-grouping_variable: date_lj
-labels:
-  PANAS:
-    - PA
-    - NA
-participants_usernames: [nokia_0000003]
--- a/machine_learning/config/prox_comm_PANAS_features.yaml
+++ b/machine_learning/config/prox_comm_PANAS_features.yaml
@ -1,6 +0,0 @@
-grouping_variable: date_lj
-features:
-  proximity:
-    all
-  communication:
-    all
--- a/machine_learning/config/prox_comm_PANAS_labels.yaml
+++ b/machine_learning/config/prox_comm_PANAS_labels.yaml
@ -1,5 +0,0 @@
-grouping_variable: date_lj
-labels:
-  PANAS:
-    - PA
-    - NA
--- a/machine_learning/features_sensor.py
+++ b/machine_learning/features_sensor.py
@ -1,231 +0,0 @@
-import datetime
-import warnings
-from pathlib import Path
-from typing import Collection
-
-import pandas as pd
-from pyprojroot import here
-
-import participants.query_db
-from features import communication, helper, proximity
-from machine_learning.helper import (
-    read_csv_with_settings,
-    safe_outer_merge_on_index,
-    to_csv_with_settings,
-)
-
-WARNING_PARTICIPANTS_LABEL = (
-    "Before calculating features, please set participants label using self.set_participants_label() "
-    "to be used as a filename prefix when exporting data. "
-    "The filename will be of the form: %participants_label_%grouping_variable_%data_type.csv"
-)
-
-
-class SensorFeatures:
-    """
-    A class to represent all sensor (AWARE) features.
-
-    Attributes
-    ----------
-    grouping_variable: str
-        The name of the variable by which to group (segment) data, e.g. date_lj.
-    features: dict
-        A dictionary of sensors (data types) and features to calculate.
-        See config/minimal_features.yaml for an example.
-    participants_usernames: Collection
-        A list of usernames for which to calculate features.
-        If None, use all participants.
-
-    Methods
-    -------
-    set_sensor_data():
-        Query the database for data types defined by self.features.
-    get_sensor_data(data_type): pd.DataFrame
-        Returns the dataframe of sensor data for specified data_type.
-    calculate_features():
-        Calls appropriate functions from features/ and joins them in a single dataframe, df_features_all.
-    get_features(data_type, feature_names): pd.DataFrame
-        Returns the dataframe of specified features for selected sensor.
-
-    construct_export_path():
-        Construct a path for exporting the features as csv files.
-    set_participants_label(label):
-        Sets a label for the usernames subset. This is used to distinguish feature exports.
-    """
-
-    def __init__(
-        self,
-        grouping_variable: str,
-        features: dict,
-        participants_usernames: Collection = None,
-    ) -> None:
-        """
-        Specifies the grouping variable and usernames for which to calculate features.
-        Sets other (implicit) attributes used in other methods.
-        If participants_usernames=None, this queries the usernames which belong to the main part of the study,
-            i.e. from 2020-08-01 on.
-
-        Parameters
-        ----------
-        grouping_variable: str
-            The name of the variable by which to group (segment) data, e.g. date_lj.
-        features: dict
-            A dictionary of sensors (data types) and features to calculate.
-            See config/minimal_features.yaml for an example.
-        participants_usernames: Collection
-            A list of usernames for which to calculate features.
-            If None, use all participants.
-
-        Returns
-        -------
-        None
-        """
-        self.grouping_variable_name = grouping_variable
-        self.grouping_variable = [grouping_variable]
-
-        self.data_types = features.keys()
-
-        self.participants_label: str = ""
-        if participants_usernames is None:
-            participants_usernames = participants.query_db.get_usernames(
-                collection_start=datetime.date.fromisoformat("2020-08-01")
-            )
-            self.participants_label = "all"
-        self.participants_usernames = participants_usernames
-
-        self.df_features_all = pd.DataFrame()
-
-        self.df_proximity = pd.DataFrame()
-        self.df_proximity_counts = pd.DataFrame()
-
-        self.df_calls = pd.DataFrame()
-        self.df_sms = pd.DataFrame()
-        self.df_calls_sms = pd.DataFrame()
-
-        self.folder: Path = Path()
-        self.filename_prefix = ""
-        self.construct_export_path()
-        print("SensorFeatures initialized.")
-
-    def set_sensor_data(self) -> None:
-        print("Querying database ...")
-        if "proximity" in self.data_types:
-            self.df_proximity = proximity.get_proximity_data(
-                self.participants_usernames
-            )
-            print("Got proximity data from the DB.")
-            self.df_proximity = helper.get_date_from_timestamp(self.df_proximity)
-            self.df_proximity = proximity.recode_proximity(self.df_proximity)
-        if "communication" in self.data_types:
-            self.df_calls = communication.get_call_data(self.participants_usernames)
-            self.df_calls = helper.get_date_from_timestamp(self.df_calls)
-            print("Got calls data from the DB.")
-
-            self.df_sms = communication.get_sms_data(self.participants_usernames)
-            self.df_sms = helper.get_date_from_timestamp(self.df_sms)
-            print("Got sms data from the DB.")
-
-    def get_sensor_data(self, data_type: str) -> pd.DataFrame:
-        if data_type == "proximity":
-            return self.df_proximity
-        elif data_type == "communication":
-            return self.df_calls_sms
-        else:
-            raise KeyError("This data type has not been implemented.")
-
-    def calculate_features(self, cached=True) -> None:
-        print("Calculating features ...")
-        if not self.participants_label:
-            raise ValueError(WARNING_PARTICIPANTS_LABEL)
-        self.df_features_all = pd.DataFrame()
-
-        if "proximity" in self.data_types:
-            try:
-                if not cached:  # Do not use the file, even if it exists.
-                    raise FileNotFoundError
-                self.df_proximity_counts = read_csv_with_settings(
-                    self.folder,
-                    self.filename_prefix,
-                    data_type="prox",
-                    grouping_variable=self.grouping_variable,
-                )
-                print("Read proximity features from the file.")
-            except FileNotFoundError:
-                # We need to recalculate the features in this case.
-                self.df_proximity_counts = proximity.count_proximity(
-                    self.df_proximity, self.grouping_variable
-                )
-                print("Calculated proximity features.")
-                to_csv_with_settings(
-                    self.df_proximity_counts,
-                    self.folder,
-                    self.filename_prefix,
-                    data_type="prox",
-                )
-            finally:
-                self.df_features_all = safe_outer_merge_on_index(
-                    self.df_features_all, self.df_proximity_counts
-                )
-
-        if "communication" in self.data_types:
-            try:
-                if not cached:  # Do not use the file, even if it exists.
-                    raise FileNotFoundError
-                self.df_calls_sms = read_csv_with_settings(
-                    self.folder,
-                    self.filename_prefix,
-                    data_type="comm",
-                    grouping_variable=self.grouping_variable,
-                )
-                print("Read communication features from the file.")
-            except FileNotFoundError:
-                # We need to recalculate the features in this case.
-                self.df_calls_sms = communication.calls_sms_features(
-                    df_calls=self.df_calls,
-                    df_sms=self.df_sms,
-                    group_by=self.grouping_variable,
-                )
-                print("Calculated communication features.")
-                to_csv_with_settings(
-                    self.df_calls_sms,
-                    self.folder,
-                    self.filename_prefix,
-                    data_type="comm",
-                )
-            finally:
-                self.df_features_all = safe_outer_merge_on_index(
-                    self.df_features_all, self.df_calls_sms
-                )
-
-        self.df_features_all.fillna(
-            value=proximity.FILL_NA_PROXIMITY, inplace=True, downcast="infer",
-        )
-        self.df_features_all.fillna(
-            value=communication.FILL_NA_CALLS_SMS_ALL, inplace=True, downcast="infer",
-        )
-
-    def get_features(self, data_type, feature_names) -> pd.DataFrame:
-        if data_type == "proximity":
-            if feature_names == "all":
-                feature_names = proximity.FEATURES_PROXIMITY
-            return self.df_proximity_counts[feature_names]
-        elif data_type == "communication":
-            if feature_names == "all":
-                feature_names = communication.FEATURES_CALLS_SMS_ALL
-            return self.df_calls_sms[feature_names]
-        elif data_type == "all":
-            return self.df_features_all
-        else:
-            raise KeyError("This data type has not been implemented.")
-
-    def construct_export_path(self) -> None:
-        if not self.participants_label:
-            warnings.warn(WARNING_PARTICIPANTS_LABEL, UserWarning)
-        self.folder = here("machine_learning/intermediate_results/features", warn=True)
-        self.filename_prefix = (
-            self.participants_label + "_" + self.grouping_variable_name
-        )
-
-    def set_participants_label(self, label: str) -> None:
-        self.participants_label = label
-        self.construct_export_path()
--- a/machine_learning/helper.py
+++ b/machine_learning/helper.py
@ -1,57 +0,0 @@
-from pathlib import Path
-
-import pandas as pd
-
-
-def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
-    if left.empty:
-        return right
-    elif right.empty:
-        return left
-    else:
-        return pd.merge(
-            left,
-            right,
-            how="outer",
-            left_index=True,
-            right_index=True,
-            validate="one_to_one",
-        )
-
-
-def to_csv_with_settings(
-    df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str
-) -> None:
-    full_path = construct_full_path(folder, filename_prefix, data_type)
-    df.to_csv(
-        path_or_buf=full_path,
-        sep=",",
-        na_rep="NA",
-        header=True,
-        index=True,
-        encoding="utf-8",
-    )
-    print("Exported the dataframe to " + str(full_path))
-
-
-def read_csv_with_settings(
-    folder: Path, filename_prefix: str, data_type: str, grouping_variable: list
-) -> pd.DataFrame:
-    full_path = construct_full_path(folder, filename_prefix, data_type)
-    return pd.read_csv(
-        filepath_or_buffer=full_path,
-        sep=",",
-        header=0,
-        na_values="NA",
-        encoding="utf-8",
-        index_col=(["participant_id"] + grouping_variable),
-        parse_dates=True,
-        infer_datetime_format=True,
-        cache_dates=True,
-    )
-
-
-def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path:
-    export_filename = filename_prefix + "_" + data_type + ".csv"
-    full_path = folder / export_filename
-    return full_path
--- a/machine_learning/labels.py
+++ b/machine_learning/labels.py
@ -1,135 +0,0 @@
-import datetime
-import warnings
-from pathlib import Path
-from typing import Collection
-
-import pandas as pd
-from pyprojroot import here
-
-import participants.query_db
-from features import esm
-from machine_learning import QUESTIONNAIRE_IDS, QUESTIONNAIRE_IDS_RENAME
-from machine_learning.helper import read_csv_with_settings, to_csv_with_settings
-
-WARNING_PARTICIPANTS_LABEL = (
-    "Before aggregating labels, please set participants label using self.set_participants_label() "
-    "to be used as a filename prefix when exporting data. "
-    "The filename will be of the form: %participants_label_%grouping_variable_%data_type.csv"
-)
-
-
-class Labels:
-    def __init__(
-        self,
-        grouping_variable: str,
-        labels: dict,
-        participants_usernames: Collection = None,
-    ) -> None:
-        self.grouping_variable_name = grouping_variable
-        self.grouping_variable = [grouping_variable]
-
-        self.questionnaires = labels.keys()
-
-        self.participants_label: str = ""
-        if participants_usernames is None:
-            participants_usernames = participants.query_db.get_usernames(
-                collection_start=datetime.date.fromisoformat("2020-08-01")
-            )
-            self.participants_label = "all"
-        self.participants_usernames = participants_usernames
-
-        self.df_esm = pd.DataFrame()
-        self.df_esm_preprocessed = pd.DataFrame()
-        self.df_esm_interest = pd.DataFrame()
-        self.df_esm_clean = pd.DataFrame()
-
-        self.df_esm_means = pd.DataFrame()
-
-        self.folder: Path = Path()
-        self.filename_prefix = ""
-        self.construct_export_path()
-        print("Labels initialized.")
-
-    def set_labels(self) -> None:
-        print("Querying database ...")
-        self.df_esm = esm.get_esm_data(self.participants_usernames)
-        print("Got ESM data from the DB.")
-        self.df_esm_preprocessed = esm.preprocess_esm(self.df_esm)
-        print("ESM data preprocessed.")
-        if "PANAS" in self.questionnaires:
-            self.df_esm_interest = self.df_esm_preprocessed[
-                (
-                    self.df_esm_preprocessed["questionnaire_id"]
-                    == QUESTIONNAIRE_IDS.get("PANAS").get("PA")
-                )
-                | (
-                    self.df_esm_preprocessed["questionnaire_id"]
-                    == QUESTIONNAIRE_IDS.get("PANAS").get("NA")
-                )
-            ]
-        self.df_esm_clean = esm.clean_up_esm(self.df_esm_interest)
-        print("ESM data cleaned.")
-
-    def get_labels(self, questionnaire: str) -> pd.DataFrame:
-        if questionnaire == "PANAS":
-            return self.df_esm_clean
-        else:
-            raise KeyError("This questionnaire has not been implemented as a label.")
-
-    def aggregate_labels(self, cached=True) -> None:
-        print("Aggregating labels ...")
-        if not self.participants_label:
-            raise ValueError(WARNING_PARTICIPANTS_LABEL)
-
-        try:
-            if not cached:  # Do not use the file, even if it exists.
-                raise FileNotFoundError
-            self.df_esm_means = read_csv_with_settings(
-                self.folder,
-                self.filename_prefix,
-                data_type="_".join(self.questionnaires),
-                grouping_variable=self.grouping_variable,
-            )
-            print("Read labels from the file.")
-        except FileNotFoundError:
-            # We need to recalculate the features in this case.
-            self.df_esm_means = (
-                self.df_esm_clean.groupby(
-                    ["participant_id", "questionnaire_id"] + self.grouping_variable
-                )
-                .esm_user_answer_numeric.agg("mean")
-                .reset_index()
-                .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
-            )
-            self.df_esm_means = (
-                self.df_esm_means.pivot(
-                    index=["participant_id"] + self.grouping_variable,
-                    columns="questionnaire_id",
-                    values="esm_numeric_mean",
-                )
-                .reset_index(col_level=1)
-                .rename(columns=QUESTIONNAIRE_IDS_RENAME)
-                .set_index(["participant_id"] + self.grouping_variable)
-            )
-            print("Labels aggregated.")
-            to_csv_with_settings(
-                self.df_esm_means,
-                self.folder,
-                self.filename_prefix,
-                data_type="_".join(self.questionnaires),
-            )
-
-    def get_aggregated_labels(self) -> pd.DataFrame:
-        return self.df_esm_means
-
-    def construct_export_path(self) -> None:
-        if not self.participants_label:
-            warnings.warn(WARNING_PARTICIPANTS_LABEL, UserWarning)
-        self.folder = here("machine_learning/intermediate_results/labels", warn=True)
-        self.filename_prefix = (
-            self.participants_label + "_" + self.grouping_variable_name
-        )
-
-    def set_participants_label(self, label: str) -> None:
-        self.participants_label = label
-        self.construct_export_path()
--- a/machine_learning/model.py
+++ b/machine_learning/model.py
@ -1,47 +0,0 @@
-from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
-
-
-class ModelValidation:
-    def __init__(self, X, y, group_variable=None, cv_name="loso"):
-        self.model = None
-        self.cv = None
-
-        idx_common = X.index.intersection(y.index)
-        self.y = y.loc[idx_common, "NA"]
-        # TODO Handle the case of multiple labels.
-        self.X = X.loc[idx_common]
-        self.groups = self.y.index.get_level_values(group_variable)
-
-        self.cv_name = cv_name
-        print("ModelValidation initialized.")
-
-    def set_cv_method(self):
-        if self.cv_name == "loso":
-            self.cv = LeaveOneGroupOut()
-            self.cv.get_n_splits(X=self.X, y=self.y, groups=self.groups)
-        print("Validation method set.")
-
-    def cross_validate(self):
-        print("Running cross validation ...")
-        if self.model is None:
-            raise TypeError(
-                "Please, specify a machine learning model first, by setting the .model attribute. "
-                "E.g. self.model = sklearn.linear_model.LinearRegression()"
-            )
-        if self.cv is None:
-            raise TypeError(
-                "Please, specify a cross validation method first, by using set_cv_method() first."
-            )
-        if self.X.isna().any().any() or self.y.isna().any().any():
-            raise ValueError(
-                "NaNs were found in either X or y. Please, check your data before continuing."
-            )
-        return cross_val_score(
-            estimator=self.model,
-            X=self.X,
-            y=self.y,
-            groups=self.groups,
-            cv=self.cv,
-            n_jobs=-1,
-            scoring="r2",
-        )
--- a/machine_learning/pipeline.py
+++ b/machine_learning/pipeline.py
@ -1,32 +1,125 @@
-import numpy as np
-import yaml
-from sklearn import linear_model
+import datetime

-from machine_learning.features_sensor import SensorFeatures
-from machine_learning.labels import Labels
-from machine_learning.model import ModelValidation
+import pandas as pd
+from sklearn.model_selection import cross_val_score

-if __name__ == "__main__":
-    with open("./config/prox_comm_PANAS_features.yaml", "r") as file:
-        sensor_features_params = yaml.safe_load(file)
-    sensor_features = SensorFeatures(**sensor_features_params)
-    sensor_features.set_sensor_data()
-    sensor_features.calculate_features()
+import participants.query_db
+from features import esm, helper, proximity
+from machine_learning import QUESTIONNAIRE_IDS, QUESTIONNAIRE_IDS_RENAME

-    with open("./config/prox_comm_PANAS_labels.yaml", "r") as file:
-        labels_params = yaml.safe_load(file)
-    labels = Labels(**labels_params)
-    labels.set_labels()
-    labels.aggregate_labels()

-    model_validation = ModelValidation(
-        sensor_features.get_features("all", "all"),
-        labels.get_aggregated_labels(),
-        group_variable="participant_id",
-        cv_name="loso",
+class MachineLearningPipeline:
+    def __init__(
+        self,
+        labels_questionnaire,
+        labels_scale,
+        data_types,
+        participants_usernames=None,
+        feature_names=None,
+        grouping_variable=None,
+    ):
+        if participants_usernames is None:
+            participants_usernames = participants.query_db.get_usernames(
+                collection_start=datetime.date.fromisoformat("2020-08-01")
+            )
+        self.participants_usernames = participants_usernames
+        self.labels_questionnaire = labels_questionnaire
+        self.data_types = data_types
+
+        if feature_names is None:
+            self.feature_names = []
+        self.df_features = pd.DataFrame()
+        self.labels_scale = labels_scale
+        self.df_labels = pd.DataFrame()
+        self.grouping_variable = grouping_variable
+        self.df_groups = pd.DataFrame()
+
+        self.model = None
+        self.validation_method = None
+
+        self.df_esm = pd.DataFrame()
+        self.df_esm_preprocessed = pd.DataFrame()
+        self.df_esm_interest = pd.DataFrame()
+        self.df_esm_clean = pd.DataFrame()
+
+        self.df_proximity = pd.DataFrame()
+
+        self.df_full_data_daily_means = pd.DataFrame()
+        self.df_esm_daily_means = pd.DataFrame()
+        self.df_proximity_daily_counts = pd.DataFrame()
+
+    def get_labels(self):
+        self.df_esm = esm.get_esm_data(self.participants_usernames)
+        self.df_esm_preprocessed = esm.preprocess_esm(self.df_esm)
+        if self.labels_questionnaire == "PANAS":
+            self.df_esm_interest = self.df_esm_preprocessed[
+                (
+                    self.df_esm_preprocessed["questionnaire_id"]
+                    == QUESTIONNAIRE_IDS.get("PANAS").get("PA")
+                )
+                | (
+                    self.df_esm_preprocessed["questionnaire_id"]
+                    == QUESTIONNAIRE_IDS.get("PANAS").get("NA")
+                )
+            ]
+        self.df_esm_clean = esm.clean_up_esm(self.df_esm_interest)
+
+    def get_sensor_data(self):
+        if "proximity" in self.data_types:
+            self.df_proximity = proximity.get_proximity_data(
+                self.participants_usernames
+            )
+            self.df_proximity = helper.get_date_from_timestamp(self.df_proximity)
+            self.df_proximity = proximity.recode_proximity(self.df_proximity)
+
+    def aggregate_daily(self):
+        self.df_esm_daily_means = (
+            self.df_esm_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
+            .esm_user_answer_numeric.agg("mean")
+            .reset_index()
+            .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
+        )
+        self.df_esm_daily_means = (
+            self.df_esm_daily_means.pivot(
+                index=["participant_id", "date_lj"],
+                columns="questionnaire_id",
+                values="esm_numeric_mean",
+            )
+            .reset_index(col_level=1)
+            .rename(columns=QUESTIONNAIRE_IDS_RENAME)
+            .set_index(["participant_id", "date_lj"])
+        )
+        self.df_full_data_daily_means = self.df_esm_daily_means.copy()
+        if "proximity" in self.data_types:
+            self.df_proximity_daily_counts = proximity.count_proximity(
+                self.df_proximity, ["participant_id", "date_lj"]
+            )
+            self.df_full_data_daily_means = self.df_full_data_daily_means.join(
+                self.df_proximity_daily_counts
+            )
+
+    def assign_columns(self):
+        self.df_features = self.df_full_data_daily_means[self.feature_names]
+        self.df_labels = self.df_full_data_daily_means[self.labels_scale]
+        if self.grouping_variable:
+            self.df_groups = self.df_full_data_daily_means[self.grouping_variable]
+        else:
+            self.df_groups = None
+
+    def validate_model(self):
+        if self.model is None:
+            raise AttributeError(
+                "Please, specify a machine learning model first, by setting the .model attribute."
+            )
+        if self.validation_method is None:
+            raise AttributeError(
+                "Please, specify a cross validation method first, by setting the .validation_method attribute."
+            )
+        cross_val_score(
+            estimator=self.model,
+            X=self.df_features,
+            y=self.df_labels,
+            groups=self.df_groups,
+            cv=self.validation_method,
+            n_jobs=-1,
        )
-    model_validation.model = linear_model.LinearRegression()
-    model_validation.set_cv_method()
-    model_loso_r2 = model_validation.cross_validate()
-    print(model_loso_r2)
-    print(np.mean(model_loso_r2))
--- a/machine_learning/prox_comm_PANAS_nb.ipynb
+++ b/machine_learning/prox_comm_PANAS_nb.ipynb
--- a/participants/prepare_usernames_file.py
+++ b/participants/prepare_usernames_file.py
@ -1,69 +0,0 @@
-import datetime
-import os
-import sys
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
-
-import pandas as pd
-from features.timezone import get_timezone_data
-from pyprojroot import here
-
-import participants.query_db
-
-participants_inactive_usernames = participants.query_db.get_usernames(
-    tester=False,  # True participants are wanted.
-    active=False,  # They have all finished their participation.
-    collection_start=datetime.date.fromisoformat(
-        "2020-08-01"
-    ),  # This is the timeframe of the main study.
-    last_upload=datetime.date.fromisoformat("2021-09-01"),
-)
-
-participants_overview_si = pd.read_csv(
-    snakemake.params["baseline_folder"] + "Participants_overview_Slovenia.csv", sep=";"
-)
-participants_overview_be = pd.read_csv(
-    snakemake.params["baseline_folder"]+ "Participants_overview_Belgium.csv", sep=";"
-)
-
-participants_true_si = participants_overview_si[
-    participants_overview_si["Wristband_SerialNo"] != "DECLINED"
-]
-participants_true_be = participants_overview_be[
-    participants_overview_be["SmartphoneBrand+Generation"].str.slice(0, 3) != "Not"
-]
-
-# Concatenate participants from both countries.
-participants_usernames_empatica = pd.concat(
-    [participants_true_be, participants_true_si]
-)
-# Filter only the participants from the main study (queried from the database).
-participants_usernames_empatica = participants_usernames_empatica[
-    participants_usernames_empatica["Username"].isin(participants_inactive_usernames)
-]
-# Rename and select columns.
-participants_usernames_empatica = participants_usernames_empatica.rename(
-    columns={"Username": "label", "Wristband_SerialNo": "empatica_id"}
-)[["label", "empatica_id"]]
-# Adapt for csv export.
-participants_usernames_empatica["empatica_id"] = participants_usernames_empatica[
-    "empatica_id"
-].str.replace(",", ";")
-
-participants_usernames_empatica.to_csv(
-    snakemake.output["usernames_file"],
-    header=True,
-    index=False,
-    line_terminator="\n",
-)
-
-timezone_df = get_timezone_data(participants_inactive_usernames)
-
-timezone_df.to_csv(
-    snakemake.output["timezone_file"],
-    header=True,
-    index=False,
-    line_terminator="\n",
-)
--- a/1
+++ b/1
@ -1 +0,0 @@
-Subproject commit f78aa3e7b3567423b44045766b230cd60d557cb0
--- a/statistical_analysis/adherence.py
+++ b/statistical_analysis/adherence.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.11.4
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -14,7 +14,25 @@
 # ---

 # %%
-SAVE_FIGS = False
+# %matplotlib inline
+import datetime
+import os
+import sys
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+import statsmodels.api as sm
+import statsmodels.formula.api as smf
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+import participants.query_db
+from features.esm import *
+
+# %%
+SAVE_FIGS = True
 FIG_HEIGHT = 5
 FIG_ASPECT = 1.7
 FIG_COLOUR = "#28827C"
@ -78,41 +96,13 @@ df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocesse
 # Sessions are now classified according to the type of a session (a true questionnaire or simple single questions) and users response.

 # %%
-df_session_counts_time["session_response_cat"] = df_session_counts_time[
-    "session_response"
-].astype("category")
-df_session_counts_time["session_response_cat"] = df_session_counts_time[
-    "session_response_cat"
-].cat.remove_categories(
-    ["during_work_first", "ema_unanswered", "evening_first", "morning", "morning_first"]
-)
-df_session_counts_time["session_response_cat"] = df_session_counts_time[
-    "session_response_cat"
-].cat.add_categories("interrupted")
-df_session_counts_time.loc[
-    df_session_counts_time["session_response_cat"].isna(), "session_response_cat"
-] = "interrupted"
-# df_session_counts_time["session_response_cat"] = df_session_counts_time["session_response_cat"].cat.rename_categories({
-#    "ema_unanswered": "interrupted",
-#    "morning_first": "interrupted",
-#    "evening_first": "interrupted",
-#    "morning": "interrupted",
-#    "during_work_first": "interrupted"})
-
-# %%
-df_session_counts_time.session_response_cat
+df_session_counts_time

 # %%
 tbl_session_outcomes = df_session_counts_time.reset_index()[
-    "session_response_cat"
+    "session_response"
 ].value_counts()

-# %%
-tbl_session_outcomes_relative = tbl_session_outcomes / len(df_session_counts_time)
-
-# %%
-print(tbl_session_outcomes_relative.to_latex(escape=True))
-
 # %%
 print("All sessions:", len(df_session_counts_time))
 print("-------------------------------------")
--- a/test/test_communication.py
+++ b/test/test_communication.py
@ -88,5 +88,6 @@ class CallsFeatures(unittest.TestCase):
        self.features_call_sms = calls_sms_features(self.calls, self.sms)
        self.assertIsInstance(self.features_call_sms, pd.DataFrame)
        self.assertCountEqual(
-            self.features_call_sms.columns.to_list(), FEATURES_CALLS_SMS_ALL
+            self.features_call_sms.columns.to_list(),
+            FEATURES_CALLS + FEATURES_SMS + FEATURES_CONTACT,
        )
--- a/test/test_esm.py
+++ b/test/test_esm.py
@ -1,7 +1,6 @@
 import unittest

 from pandas.testing import assert_series_equal
-from pyprojroot import here

 from features.esm import *
 from features.esm_JCQ import *
@ -10,7 +9,7 @@ from features.esm_JCQ import *
 class EsmFeatures(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
-        cls.esm = pd.read_csv(here("data/example_esm.csv"), sep=";")
+        cls.esm = pd.read_csv("../data/example_esm.csv", sep=";")
        cls.esm["esm_json"] = cls.esm["esm_json"].apply(eval)
        cls.esm_processed = preprocess_esm(cls.esm)
        cls.esm_clean = clean_up_esm(cls.esm_processed)
--- a/test/test_features_sensor.py
+++ b/test/test_features_sensor.py
@ -1,27 +0,0 @@
-import unittest
-
-import yaml
-from pyprojroot import here
-
-from machine_learning.features_sensor import *
-
-
-class SensorFeaturesTest(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls) -> None:
-        with open(here("machine_learning/config/minimal_features.yaml"), "r") as file:
-            cls.sensor_features_params = yaml.safe_load(file)
-
-    def test_yaml(self):
-        with open(here("machine_learning/config/minimal_features.yaml"), "r") as file:
-            sensor_features_params = yaml.safe_load(file)
-        self.assertIsInstance(sensor_features_params, dict)
-        self.assertIsInstance(sensor_features_params.get("grouping_variable"), str)
-        self.assertIsInstance(sensor_features_params.get("features"), dict)
-        self.assertIsInstance(
-            sensor_features_params.get("participants_usernames"), list
-        )
-
-    def test_participants_label(self):
-        sensor_features = SensorFeatures(**self.sensor_features_params)
-        self.assertRaises(ValueError, sensor_features.calculate_features)
--- a/test/test_proximity.py
+++ b/test/test_proximity.py
@ -1,7 +1,5 @@
 import unittest

-from pyprojroot import here
-
 from features.proximity import *


@ -12,7 +10,7 @@ class ProximityFeatures(unittest.TestCase):

    @classmethod
    def setUpClass(cls) -> None:
-        cls.df_proximity = pd.read_csv(here("data/example_proximity.csv"))
+        cls.df_proximity = pd.read_csv("../data/example_proximity.csv")
        cls.df_proximity["participant_id"] = 99

    def test_recode_proximity(self):
				`@ -1 +0,0 @@`
				`Subproject commit f78aa3e7b3567423b44045766b230cd60d557cb0`