37 changed files with 1339 additions and 3388 deletions
--- a/.flake8
+++ b/.flake8
@ -1,9 +0,0 @@
-[flake8]
-max-line-length = 88
-extend-ignore =
-    E203,
-    # E501 line too long for docstrings
-    D501
-per-file-ignores =
-    exploration/*.py:E501
-docstring-convention = numpy
--- a/.gitignore
+++ b/.gitignore
@ -12,15 +12,12 @@ __pycache__/
 /data/*input*.csv
 /data/daily*
 /data/intradaily*
-/data/raw
 /data/stressfulness_event*
 /data/30min*
 /presentation/*scores.csv
 /presentation/Results.ods
-/presentation/results/
 .Rproj.user
 .Rhistory
 /presentation/*.nb.html
 presentation/event_stressful_detection_half_loso.csv
 presentation/event_stressful_detection_loso.csv
-/statistical_analysis/scale_reliability.nb.html
--- a/.idea/codeStyles/Project.xml
+++ b/.idea/codeStyles/Project.xml
@ -1,6 +0,0 @@
-<component name="ProjectCodeStyleConfiguration">
-  <code_scheme name="Project" version="173">
-    <option name="RIGHT_MARGIN" value="150" />
-    <option name="SOFT_MARGINS" value="88" />
-  </code_scheme>
-</component>
--- a/.idea/codeStyles/codeStyleConfig.xml
+++ b/.idea/codeStyles/codeStyleConfig.xml
@ -1,5 +0,0 @@
-<component name="ProjectCodeStyleConfiguration">
-  <state>
-    <option name="USE_PER_PROJECT_SETTINGS" value="true" />
-  </state>
-</component>
--- a/.idea/dictionaries/junos.xml
+++ b/.idea/dictionaries/junos.xml
@ -1,3 +0,0 @@
-<component name="ProjectDictionaryState">
-  <dictionary name="junos" />
-</component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,9 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="straw2analysis" project-jdk-type="Python SDK" />
-  <component name="PyCharmDSProjectLayout">
-    <option name="id" value="JupyterRightHiddenStructureLayout" />
-  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (straw2analysis)" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
@ -17,15 +14,7 @@
            </RMarkdownRenderProfile>
          </value>
        </entry>
-        <entry key="file://$PROJECT_DIR$/statistical_analysis/scale_reliability.rmd">
-          <value>
-            <RMarkdownRenderProfile>
-              <option name="lastOutput" value="$PROJECT_DIR$/statistical_analysis/scale_reliability.nb.html" />
-              <option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/statistical_analysis" />
-            </RMarkdownRenderProfile>
-          </value>
-        </entry>
      </map>
    </option>
  </component>
-</project>
+</project>
--- a/.idea/rGraphicsSettings.xml
+++ b/.idea/rGraphicsSettings.xml
@ -1,9 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="RGraphicsSettings">
-    <option name="height" value="600" />
-    <option name="resolution" value="75" />
-    <option name="version" value="2" />
-    <option name="width" value="960" />
-  </component>
-</project>
--- a/.idea/rMarkdownGraphicsSettings.xml
+++ b/.idea/rMarkdownGraphicsSettings.xml
@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="RMarkdownGraphicsSettings">
-    <option name="globalResolution" value="75" />
-    <option name="version" value="2" />
-  </component>
-</project>
--- a/.idea/rSettings.xml
+++ b/.idea/rSettings.xml
@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="RSettings">
-    <option name="interpreterPath" value="C:\Program Files\R\R-4.3.1\bin\R.exe" />
-  </component>
-</project>
--- a/.idea/straw2analysis.iml
+++ b/.idea/straw2analysis.iml
@ -5,7 +5,7 @@
      <excludeFolder url="file://$MODULE_DIR$/config/.ipynb_checkpoints" />
      <excludeFolder url="file://$MODULE_DIR$/exploration/.ipynb_checkpoints" />
    </content>
-    <orderEntry type="jdk" jdkName="straw2analysis" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.9 (straw2analysis)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -3,5 +3,6 @@
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/rapids" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/rapids/calculatingfeatures" vcs="Git" />
  </component>
 </project>
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,30 +0,0 @@
-repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
-    hooks:
-      - id: check-yaml
-      - id: end-of-file-fixer
-      - id: trailing-whitespace
-  - repo: https://github.com/pycqa/isort
-    rev: 5.12.0
-    hooks:
-      - id: isort
-        name: isort (python)
-  - repo: https://github.com/psf/black
-    rev: 23.3.0
-    hooks:
-      - id: black
-        language_version: python3
-  - repo: https://github.com/pycqa/flake8
-    rev: 6.0.0
-    hooks:
-      - id: flake8
-#  - repo: https://github.com/mwouts/jupytext
-#    rev: v1.14.7
-#    hooks:
-#      - id: jupytext
-#        args: [ --from, "py:percent", --to, "ipynb" ]
-#        additional_dependencies:
-#          - isort==5.12.0 # Matches hook
-#          - black==23.3.0
-#          - flake8==6.0.0
--- a/config/environment.yml
+++ b/config/environment.yml
@ -1,12 +1,12 @@
 name: straw2analysis
 channels:
+  - defaults
  - conda-forge
 dependencies:
-  - python=3.11
+  - python=3.9
  - black
  - isort
  - flake8
-  - flake8-docstrings
  - imbalanced-learn=0.10.0
  - jupyterlab
  - jupytext
@ -15,7 +15,6 @@ dependencies:
  - nodejs
  - pandas
  - psycopg2 >= 2.9.1
-  - pre-commit
  - python-dotenv
  - pytz
  - pyprojroot
@ -24,5 +23,4 @@ dependencies:
  - scikit-learn
  - sqlalchemy
  - statsmodels
-  - tabulate
-  - xgboost
+  - tabulate
--- a/exploration/debug_heatmap.py
+++ b/exploration/debug_heatmap.py
@ -14,9 +14,15 @@
 # ---

 # %%
+import os, sys
+import importlib
 import pandas as pd
+import numpy as np

-from rapids.src.features.utils.utils import chunk_episodes
+# import plotly.graph_objects as go
+from importlib import util
+from pathlib import Path
+import yaml

 # %%
 phone_data_yield = pd.read_csv(
@ -30,29 +36,23 @@ time_segments_labels = pd.read_csv(
 # %%
 phone_data_yield["assigned_segments"] = phone_data_yield[
    "assigned_segments"
-].str.replace(r"_RR\d+SS#", "#", regex=True)
+].str.replace(r"_RR\d+SS#", "#")
 time_segments_labels["label"] = time_segments_labels["label"].str.replace(
-    r"_RR\d+SS$", "", regex=True
+    r"_RR\d+SS$", ""
 )


 # %% tags=[]
-def filter_data_by_segment(data, time_segment_current):
+def filter_data_by_segment(data, time_segment):
    data.dropna(subset=["assigned_segments"], inplace=True)
    if data.shape[0] == 0:  # data is empty
        data["local_segment"] = data["timestamps_segment"] = None
        return data

-    datetime_regex = (
-        r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
-    )
-    timestamps_regex = r"[0-9]{13}"
-    segment_regex = r"\[({}#{},{};{},{})\]".format(
-        time_segment_current,
-        datetime_regex,
-        datetime_regex,
-        timestamps_regex,
-        timestamps_regex,
+    datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
+    timestamps_regex = "[0-9]{13}"
+    segment_regex = "\[({}#{},{};{},{})\]".format(
+        time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
    )
    data["local_segment"] = data["assigned_segments"].str.extract(
        segment_regex, expand=True
@ -147,17 +147,14 @@ def getDataForPlot(phone_data_yield_per_segment):
        .fillna(0)
    )

-    # transpose the dataframe per local start datetime of the segment
-    # and discard the useless index layer
+    # transpose the dataframe per local start datetime of the segment and discard the useless index layer
    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
        "local_segment_start_datetimes"
    )[["minutes_after_segment_start", "sensor"]].apply(
        lambda x: x.set_index("minutes_after_segment_start").transpose()
    )
-    phone_data_yield_per_segment.index = (
-        phone_data_yield_per_segment.index.get_level_values(
-            "local_segment_start_datetimes"
-        )
+    phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
+        "local_segment_start_datetimes"
    )
    return phone_data_yield_per_segment

@ -230,13 +227,9 @@ phone_data_yield_per_segment.tail()
 # # A workaround

 # %%
-phone_data_yield_per_segment[
-    "local_segment_start_datetimes", "minutes_after_segment_start"
-] = phone_data_yield_per_segment[
+phone_data_yield_per_segment["local_segment_start_datetimes", "minutes_after_segment_start"] = phone_data_yield_per_segment[
    ["local_segment_start_datetimes", "minutes_after_segment_start"]
-].drop_duplicates(
-    keep="first"
-)
+].drop_duplicates(keep="first")

 # %%
 phone_data_yield_per_segment.set_index(
@ -251,9 +244,8 @@ phone_data_yield_per_segment.head()
 # %% [markdown]
 # # Retry

-
 # %%
-def get_data_for_plot(phone_data_yield_per_segment):
+def getDataForPlot(phone_data_yield_per_segment):
    # calculate the length (in minute) of per segment instance
    phone_data_yield_per_segment["length"] = (
        phone_data_yield_per_segment["timestamps_segment"]
@ -300,10 +292,7 @@ def get_data_for_plot(phone_data_yield_per_segment):
        full_index,
        names=("local_segment_start_datetimes", "minutes_after_segment_start"),
    )
-    phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(
-        subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
-        keep="first",
-    )
+    phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(subset=["local_segment_start_datetimes", "minutes_after_segment_start"],keep="first")
    phone_data_yield_per_segment = (
        phone_data_yield_per_segment.set_index(
            ["local_segment_start_datetimes", "minutes_after_segment_start"]
@ -313,17 +302,14 @@ def get_data_for_plot(phone_data_yield_per_segment):
        .fillna(0)
    )

-    # transpose the dataframe per local start datetime of the segment
-    # and discard the useless index layer
+    # transpose the dataframe per local start datetime of the segment and discard the useless index layer
    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
        "local_segment_start_datetimes"
    )[["minutes_after_segment_start", "sensor"]].apply(
        lambda x: x.set_index("minutes_after_segment_start").transpose()
    )
-    phone_data_yield_per_segment.index = (
-        phone_data_yield_per_segment.index.get_level_values(
-            "local_segment_start_datetimes"
-        )
+    phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
+        "local_segment_start_datetimes"
    )
    return phone_data_yield_per_segment

@ -332,6 +318,6 @@ def get_data_for_plot(phone_data_yield_per_segment):
 phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)

 # %%
-data_for_plot_per_segment = get_data_for_plot(phone_data_yield_per_segment)
+data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)

 # %%
--- a/exploration/expl_esm_labels.py
+++ b/exploration/expl_esm_labels.py
@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.14.5
+#       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -15,33 +15,19 @@
 # ---

 # %%
+import os
+import sys
 import datetime

 import seaborn as sns

+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
 import participants.query_db
-from features.esm import (
-    QUESTIONNAIRE_IDS,
-    clean_up_esm,
-    get_esm_data,
-    increment_answers,
-    preprocess_esm,
-    reassign_question_ids,
-)
-from features.esm_COPE import DICT_COPE_QUESTION_IDS
-from features.esm_JCQ import reverse_jcq_demand_control_scoring
-from features.esm_SAM import DICT_SAM_QUESTION_IDS, extract_stressful_events
-
-# import os
-# import sys
-# nb_dir = os.path.split(os.getcwd())[0]
-# if nb_dir not in sys.path:
-#     sys.path.append(nb_dir)
-
-
-# %%
-save_figs = False
-export_data = True
+from features.esm import *
+from features.esm_JCQ import *
+from features.esm_SAM import *

 # %%
 participants_inactive_usernames = participants.query_db.get_usernames(
@ -57,14 +43,8 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)

 # %%
 df_esm_PANAS = df_esm_preprocessed[
-    (
-        df_esm_preprocessed["questionnaire_id"]
-        == QUESTIONNAIRE_IDS["PANAS_positive_affect"]
-    )
-    | (
-        df_esm_preprocessed["questionnaire_id"]
-        == QUESTIONNAIRE_IDS["PANAS_negative_affect"]
-    )
+    (df_esm_preprocessed["questionnaire_id"] == 8)
+    | (df_esm_preprocessed["questionnaire_id"] == 9)
 ]
 df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)

@ -85,47 +65,35 @@ df_esm_PANAS_daily_means = (
 # %%
 df_esm_PANAS_summary_participant = (
    df_esm_PANAS_daily_means.groupby(["participant_id", "questionnaire_id"])
-    .esm_numeric_mean.agg(["mean", "median", "std"])
+    .agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
+df_esm_PANAS_summary_participant.columns = df_esm_PANAS_summary_participant.columns.get_level_values(
+    1
+)
 df_esm_PANAS_summary_participant[
-    "PANAS subscale"
+    "PANAS_subscale"
 ] = df_esm_PANAS_daily_means.questionnaire_id.astype("category").cat.rename_categories(
-    {8.0: "positive affect", 9.0: "negative affect"}
+    {8.0: "PA", 9.0: "NA"}
 )

 # %%
-df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["mean"]
-
-# %%
-df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["std"]
-
-# %%
-df_esm_PANAS_summary_participant.query("std == 0")
-
-# %%
-fig1 = sns.displot(
-    data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS subscale", binwidth=0.2
+sns.displot(
+    data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS_subscale", binwidth=0.2
 )
-fig1.set_axis_labels(x_var="participant mean", y_var="frequency")
-if save_figs:
-    fig1.figure.savefig("PANAS_mean_participant.pdf", dpi=300)

 # %%
 sns.displot(
    data=df_esm_PANAS_summary_participant,
    x="median",
-    hue="PANAS subscale",
+    hue="PANAS_subscale",
    binwidth=0.2,
 )

 # %%
-fig2 = sns.displot(
-    data=df_esm_PANAS_summary_participant, x="std", hue="PANAS subscale", binwidth=0.05
+sns.displot(
+    data=df_esm_PANAS_summary_participant, x="std", hue="PANAS_subscale", binwidth=0.05
 )
-fig2.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
-if save_figs:
-    fig2.figure.savefig("PANAS_std_participant.pdf", dpi=300)

 # %%
 df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
@ -141,14 +109,8 @@ df_SAM_all.head()

 # %%
 df_esm_SAM = df_esm_preprocessed[
-    (
-        df_esm_preprocessed["questionnaire_id"]
-        >= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
-    )
-    & (
-        df_esm_preprocessed["questionnaire_id"]
-        <= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
-    )
+    (df_esm_preprocessed["questionnaire_id"] >= 87)
+    & (df_esm_preprocessed["questionnaire_id"] <= 93)
 ]
 df_esm_SAM_clean = clean_up_esm(df_esm_SAM)

@ -156,10 +118,9 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
 # ## Stressful events

 # %%
-df_esm_SAM_event = df_esm_SAM_clean[
-    df_esm_SAM_clean["questionnaire_id"]
-    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
-].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0))
+df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign(
+    stressful_event=lambda x: (x.esm_user_answer_numeric > 0)
+)

 # %%
 df_esm_SAM_daily_events = (
@ -170,22 +131,20 @@ df_esm_SAM_daily_events = (
 )

 # %% [markdown]
-# Calculate the daily mean of YES (1) or NO (0) answers to the question about stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
+# Calculate the daily mean of YES (1) or NO (0) answers to the question about a stressful event. This is then the daily ratio of EMA sessions that included a stressful event.

 # %%
 df_esm_SAM_event_summary_participant = (
    df_esm_SAM_daily_events.groupby(["participant_id"])
-    .SAM_event_ratio.agg(["mean", "median", "std"])
+    .agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
+df_esm_SAM_event_summary_participant.columns = df_esm_SAM_event_summary_participant.columns.get_level_values(
+    1
+)

 # %%
-fig6 = sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
-fig6.set_axis_labels(
-    x_var="participant proportion of stressful events", y_var="frequency"
-)
-if save_figs:
-    fig6.figure.savefig("SAM_events_mean_participant.pdf", dpi=300)
+sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)

 # %%
 sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
@ -196,12 +155,7 @@ sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
 # %% [markdown]
 # * Example of threat: "Did this event make you feel anxious?"
 # * Example of challenge: "How eager are you to tackle this event?"
-# * Possible answers:
-#   0 - Not at all,
-#   1 - Slightly,
-#   2 - Moderately,
-#   3 - Considerably,
-#   4 - Extremely
+# * Possible answers: 0 - Not at all, 1 - Slightly, 2 - Moderately, 3 - Considerably, 4 - Extremely

 # %%
 df_esm_SAM_daily = (
@ -213,45 +167,27 @@ df_esm_SAM_daily = (

 # %%
 df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
-    (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
-    | (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
+    (df_esm_SAM_daily["questionnaire_id"] == 88)
+    | (df_esm_SAM_daily["questionnaire_id"] == 89)
 ]

 # %%
 df_esm_SAM_summary_participant = (
    df_esm_SAM_daily.groupby(["participant_id", "questionnaire_id"])
-    .esm_numeric_mean.agg(["mean", "median", "std"])
+    .agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
-
-# %%
-df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[
-    df_esm_SAM_summary_participant["questionnaire_id"]
-    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
-]
-df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"]
-
-# %%
-df_esm_SAM_event_stressfulness_summary_participant.describe()["std"]
-
-# %%
-sns.displot(
-    data=df_esm_SAM_event_stressfulness_summary_participant, x="mean", binwidth=0.2
+df_esm_SAM_summary_participant.columns = df_esm_SAM_summary_participant.columns.get_level_values(
+    1
 )

 # %%
 df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
-    (
-        df_esm_SAM_summary_participant["questionnaire_id"]
-        == QUESTIONNAIRE_IDS["appraisal_threat"]
-    )
-    | (
-        df_esm_SAM_summary_participant["questionnaire_id"]
-        == QUESTIONNAIRE_IDS["appraisal_challenge"]
-    )
+    (df_esm_SAM_summary_participant["questionnaire_id"] == 88)
+    | (df_esm_SAM_summary_participant["questionnaire_id"] == 89)
 ]
 df_esm_SAM_threat_challenge_summary_participant[
-    "event subscale"
+    "event_subscale"
 ] = df_esm_SAM_threat_challenge_summary_participant.questionnaire_id.astype(
    "category"
 ).cat.rename_categories(
@ -262,84 +198,26 @@ df_esm_SAM_threat_challenge_summary_participant[
 sns.displot(
    data=df_esm_SAM_threat_challenge_summary_participant,
    x="mean",
-    hue="event subscale",
+    hue="event_subscale",
    binwidth=0.2,
 )

 # %%
-fig3 = sns.displot(
+sns.displot(
    data=df_esm_SAM_threat_challenge_summary_participant,
    x="std",
-    hue="event subscale",
+    hue="event_subscale",
    binwidth=0.1,
 )
-fig3.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
-if save_figs:
-    fig3.figure.savefig("SAM_std_participant.pdf", dpi=300)
-
-# %%
-df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
-    "mean"
-]
-
-# %%
-df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
-    "std"
-]
-
-# %%
-df_esm_SAM_clean.columns
-
-# %%
-df_esm_SAM_clean.esm_status.value_counts()
-
-# %%
-if export_data:
-    df_esm_SAM_fixed = reassign_question_ids(df_esm_SAM_clean, DICT_SAM_QUESTION_IDS)
-    df_esm_SAM_fixed = increment_answers(df_esm_SAM_fixed)
-    df_esm_SAM_for_export = df_esm_SAM_fixed[
-        [
-            "participant_id",
-            "username",
-            "device_id",
-            "_id",
-            "esm_trigger",
-            "esm_session",
-            "esm_notification_id",
-            "question_id",
-            "questionnaire_id",
-            "esm_instructions",
-            "double_esm_user_answer_timestamp",
-            "datetime_lj",
-            "date_lj",
-            "time",
-            "esm_user_answer",
-            "esm_user_answer_numeric",
-        ]
-    ]
-    df_esm_SAM_for_export.sort_values(
-        by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
-    )
-    print(df_esm_SAM_for_export.head())
-    df_esm_SAM_for_export.to_csv(
-        "../data/raw/df_esm_SAM_threat_challenge.csv", index=False
-    )

 # %% [markdown]
 # ## Stressfulness of period

 # %%
 df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
-    df_esm_SAM_summary_participant["questionnaire_id"]
-    == QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
+    df_esm_SAM_summary_participant["questionnaire_id"] == 93
 ]

-# %%
-df_esm_SAM_period_summary_participant.describe()["mean"]
-
-# %%
-df_esm_SAM_period_summary_participant.describe()["std"]
-
 # %%
 sns.displot(data=df_esm_SAM_period_summary_participant, x="mean", binwidth=0.2)

@ -351,8 +229,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1)

 # %%
 df_esm_JCQ_demand_control = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"])
-    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"])
+    (df_esm_preprocessed["questionnaire_id"] >= 10)
+    & (df_esm_preprocessed["questionnaire_id"] <= 11)
 ]
 df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)

@ -372,11 +250,14 @@ df_esm_JCQ_daily = (
 )
 df_esm_JCQ_summary_participant = (
    df_esm_JCQ_daily.groupby(["participant_id", "questionnaire_id"])
-    .esm_score_mean.agg(["mean", "median", "std"])
+    .agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
+df_esm_JCQ_summary_participant.columns = df_esm_JCQ_summary_participant.columns.get_level_values(
+    1
+)
 df_esm_JCQ_summary_participant[
-    "JCQ subscale"
+    "JCQ_subscale"
 ] = df_esm_JCQ_summary_participant.questionnaire_id.astype(
    "category"
 ).cat.rename_categories(
@ -384,71 +265,11 @@ df_esm_JCQ_summary_participant[
 )

 # %%
-df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["mean"]
-
-# %%
-df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["std"]
-
-# %%
-fig4 = sns.displot(
-    data=df_esm_JCQ_summary_participant,
-    x="mean",
-    hue="JCQ subscale",
-    binwidth=0.1,
+sns.displot(
+    data=df_esm_JCQ_summary_participant, x="mean", hue="JCQ_subscale", binwidth=0.1,
 )
-fig4.set_axis_labels(x_var="participant mean", y_var="frequency")
-if save_figs:
-    fig4.figure.savefig("JCQ_mean_participant.pdf", dpi=300)

 # %%
-fig5 = sns.displot(
-    data=df_esm_JCQ_summary_participant,
-    x="std",
-    hue="JCQ subscale",
-    binwidth=0.05,
+sns.displot(
+    data=df_esm_JCQ_summary_participant, x="std", hue="JCQ_subscale", binwidth=0.05,
 )
-fig6.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
-if save_figs:
-    fig5.figure.savefig("JCQ_std_participant.pdf", dpi=300)
-
-# %% [markdown]
-# # COPE Inventory
-
-# %%
-df_esm_COPE = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"])
-    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"])
-]
-
-# %%
-df_esm_COPE_clean = clean_up_esm(df_esm_COPE)
-df_esm_COPE_clean = increment_answers(df_esm_COPE_clean)
-df_esm_COPE_fixed = reassign_question_ids(df_esm_COPE_clean, DICT_COPE_QUESTION_IDS)
-
-# %%
-if export_data:
-    df_esm_COPE_for_export = df_esm_COPE_fixed[
-        [
-            "participant_id",
-            "username",
-            "device_id",
-            "_id",
-            "esm_trigger",
-            "esm_session",
-            "esm_notification_id",
-            "question_id",
-            "questionnaire_id",
-            "esm_instructions",
-            "double_esm_user_answer_timestamp",
-            "datetime_lj",
-            "date_lj",
-            "time",
-            "esm_user_answer",
-            "esm_user_answer_numeric",
-        ]
-    ]
-    df_esm_COPE_for_export.sort_values(
-        by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
-    )
-    print(df_esm_COPE_for_export.head())
-    df_esm_COPE_for_export.to_csv("../data/raw/df_esm_COPE.csv", index=False)
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@ -6,129 +6,457 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.14.5
+#       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---

-# %% jupyter={"outputs_hidden": false, "source_hidden": false}
-# from IPython.core.interactiveshell import InteractiveShell
-from pathlib import Path
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %matplotlib inline
+import os
+import sys

-# matplotlib inline
-# import os
-# import sys
+import numpy as np
+import matplotlib.pyplot as plt
 import pandas as pd

-from machine_learning.helper import (
-    impute_encode_categorical_features,
-    prepare_cross_validator,
-    prepare_sklearn_data_format,
-    run_all_classification_models,
-)
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
+from sklearn.dummy import DummyClassifier
+from sklearn.impute import SimpleImputer

-# InteractiveShell.ast_node_interactivity = "all"
+from lightgbm import LGBMClassifier
+import xgboost as xg
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+    
+import machine_learning.helper
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## Set script's parameters
 #
-# nb_dir = os.path.split(os.getcwd())[0]
-# if nb_dir not in sys.path:
-#     sys.path.append(nb_dir)

+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
+undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)

-# %%
-CV_METHOD = "logo"  # logo, half_logo, 5kfold
-# Cross-validation method (could be regarded as a hyperparameter)
-print("CV_METHOD: " + CV_METHOD)
-N_SL = 3  # Number of largest/smallest accuracies (of particular CV) outputs
-UNDERSAMPLING = False
-# (bool) If True this will train and test data on balanced dataset
-# (using undersampling method)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
+# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]

-# %% jupyter={"outputs_hidden": false, "source_hidden": false}
-PATH_BASE = Path("E:/STRAWresults/20230415")
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input.set_index(index_columns, inplace=True)
+model_input['target'].value_counts()

-SEGMENT_TYPE = "period"
-print("SEGMENT_TYPE: " + SEGMENT_TYPE)
-SEGMENT_LENGTH = "30_minutes_before"
-print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
-TARGET_VARIABLE = "JCQ_job_control"
-print("TARGET_VARIABLE: " + TARGET_VARIABLE)
-
-if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
-    TARGET_VARIABLE += "_"
-    TARGET_VARIABLE += SEGMENT_TYPE
-
-PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
-
-model_input = pd.read_csv(PATH_FULL)
-
-if SEGMENT_LENGTH == "daily":
-    DAY_LENGTH = "daily"  # or "working"
-    print(DAY_LENGTH)
-    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
-
-# %% jupyter={"outputs_hidden": false, "source_hidden": false}
-model_input["target"].value_counts()
-
-# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # bins = [-10, 0, 10] # bins for z-scored targets
-BINS = [-1, 0, 4]  # bins for stressfulness (0-4) target
-print("BINS: ", BINS)
-model_input["target"], edges = pd.cut(
-    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
-)  # ['low', 'medium', 'high']
-print(model_input["target"].value_counts())
-REMOVE_MEDIUM = True
-if ("medium" in model_input["target"]) and REMOVE_MEDIUM:
-    model_input = model_input[model_input["target"] != "medium"]
-    model_input["target"] = (
-        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
-    )
-else:
-    model_input["target"] = model_input["target"].map(
-        {"low": 0, "medium": 1, "high": 2}
-    )
-    print(model_input["target"].value_counts())
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
+model_input['target'].value_counts(), edges
+# model_input = model_input[model_input['target'] != "medium"]
+model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)

+model_input['target'].value_counts()

-# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # UnderSampling
-if UNDERSAMPLING:
-    no_stress = model_input[model_input["target"] == 0]
-    stress = model_input[model_input["target"] == 1]
-
+if undersampling:
+    no_stress = model_input[model_input['target'] == 0]
+    stress = model_input[model_input['target'] == 1]
+    
    no_stress = no_stress.sample(n=len(stress))
-    model_input = pd.concat([stress, no_stress], axis=0)
+    model_input = pd.concat([stress,no_stress], axis=0)
+
+#   model_input_new = pd.DataFrame(columns=model_input.columns)
+#   for pid in model_input["pid"].unique():
+#     stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
+#     no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
+#     if (len(stress) == 0):
+#       continue
+#     if (len(no_stress) == 0):
+#       continue
+#     model_input_new = pd.concat([model_input_new, stress], axis=0)
+    
+#     no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
+#     # In case there are more stress samples than no_stress, take all instances of no_stress.
+#     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
+#     model_input = model_input_new   
+#     model_input_new = pd.concat([model_input_new, no_stress], axis=0)


-# %% jupyter={"outputs_hidden": false, "source_hidden": false}
-model_input_encoded = impute_encode_categorical_features(model_input)
-# %%
-data_x, data_y, data_groups = prepare_sklearn_data_format(
-    model_input_encoded, CV_METHOD
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+if cv_method_str == 'half_logo':
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+    # Position within the participant's rows scaled to [1, 2) and rounded: 1 = first half, 2 = second half.
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+    # Drop every helper column, including pid_count, so no per-participant bookkeeping ends up among the features.
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
+else:
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+categorical_features = data_x[categorical_feature_colnames].copy()
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+train_x = pd.concat([numerical_features, categorical_features], axis=1)
+train_x.dtypes
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Fallback for any cv_method_str other than 'logo'/'half_logo' (e.g. '5kfold'): shuffled, stratified 5-fold
+if cv_method_str == 'logo' or cv_method_str == 'half_logo':
+    cv_method = LeaveOneGroupOut() # Splits by group; the groups themselves are passed to cross_validate via groups=data_groups
+    cv_method.get_n_splits( # NOTE(review): the return value is not assigned to anything
+        train_x,
+        data_y,
+        groups=data_groups,
+    )
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+
+# %% [markdown]
+# ### Baseline: Dummy Classifier (most frequent)
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+dummy_class = DummyClassifier(strategy="most_frequent")
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+dummy_classifier = cross_validate(
+    dummy_class,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
-cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
+print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
+print("Precision", np.mean(dummy_classifier['test_precision']))
+print("Recall", np.mean(dummy_classifier['test_recall']))
+print("F1", np.mean(dummy_classifier['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))

-# %%
-data_y.head()
+# %% [markdown] nteract={"transient": {"deleting": false}}
+# ### All models

+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 # %%
-data_y.tail()
-# %%
-data_y.shape
-# %%
-scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
-# %%
-PATH_OUTPUT = Path("..") / Path("presentation/results")
-path_output_full = PATH_OUTPUT / (
-    TARGET_VARIABLE
-    + "_"
-    + SEGMENT_LENGTH
-    + "_classification"
-    + str(BINS)
-    + "_"
-    + CV_METHOD
-    + ".csv"
+final_scores.index.name = "metric"
+final_scores = final_scores.set_index(["method", final_scores.index])
+final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
+
+# %% [markdown]
+# ### Logistic Regression
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+logistic_regression = linear_model.LogisticRegression()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+log_reg_scores = cross_validate(
+    logistic_regression,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
-scores.to_csv(path_output_full, index=False)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
+print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
+print("Precision", np.mean(log_reg_scores['test_precision']))
+print("Recall", np.mean(log_reg_scores['test_recall']))
+print("F1", np.mean(log_reg_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Support Vector Machine
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+svc = svm.SVC()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+svc_scores = cross_validate(
+    svc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
+print("Precision", np.mean(svc_scores['test_precision']))
+print("Recall", np.mean(svc_scores['test_recall']))
+print("F1", np.mean(svc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Gaussian Naive Bayes
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+gaussian_nb = naive_bayes.GaussianNB()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+gaussian_nb_scores = cross_validate(
+    gaussian_nb,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
+print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
+print("Precision", np.mean(gaussian_nb_scores['test_precision']))
+print("Recall", np.mean(gaussian_nb_scores['test_recall']))
+print("F1", np.mean(gaussian_nb_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Stochastic Gradient Descent Classifier
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+sgdc = linear_model.SGDClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+sgdc_scores = cross_validate(
+    sgdc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
+print("Precision", np.mean(sgdc_scores['test_precision']))
+print("Recall", np.mean(sgdc_scores['test_recall']))
+print("F1", np.mean(sgdc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### K-nearest neighbors
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+knn = neighbors.KNeighborsClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+knn_scores = cross_validate(
+    knn,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
+print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
+print("Precision", np.mean(knn_scores['test_precision']))
+print("Recall", np.mean(knn_scores['test_recall']))
+print("F1", np.mean(knn_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Decision Tree
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+dtree = tree.DecisionTreeClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+dtree_scores = cross_validate(
+    dtree,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
+print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
+print("Precision", np.mean(dtree_scores['test_precision']))
+print("Recall", np.mean(dtree_scores['test_recall']))
+print("F1", np.mean(dtree_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Random Forest Classifier
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+rfc = ensemble.RandomForestClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+rfc_scores = cross_validate(
+    rfc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1'), 
+    return_estimator=True
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
+print("Precision", np.mean(rfc_scores['test_precision']))
+print("Recall", np.mean(rfc_scores['test_recall']))
+print("F1", np.mean(rfc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Feature importance (RFC)
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Collect the feature importances of every fold's fitted forest and average them
+# once, so that each fold contributes equally to the reported mean importance.
+rfc_fold_importances = np.vstack(
+    [estimator.feature_importances_ for estimator in rfc_scores['estimator']]
+)
+rfc_es_fimp = pd.DataFrame(
+    rfc_fold_importances.mean(axis=0), index=list(train_x.columns), columns=['importance']
+)
+
+pd.set_option('display.max_rows', 100)
+print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
+
+rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
+
+rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
+
+train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
+
+# %% [markdown]
+# ### Gradient Boosting Classifier
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+gbc = ensemble.GradientBoostingClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+gbc_scores = cross_validate(
+    gbc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
+print("Precision", np.mean(gbc_scores['test_precision']))
+print("Recall", np.mean(gbc_scores['test_recall']))
+print("F1", np.mean(gbc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### LGBM Classifier
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+lgbm = LGBMClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+lgbm_scores = cross_validate(
+    lgbm,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
+print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
+print("Precision", np.mean(lgbm_scores['test_precision']))
+print("Recall", np.mean(lgbm_scores['test_recall']))
+print("F1", np.mean(lgbm_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### XGBoost Classifier
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+xgb_classifier = xg.sklearn.XGBClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+xgb_classifier_scores = cross_validate(
+    xgb_classifier,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
+print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
+print("Precision", np.mean(xgb_classifier_scores['test_precision']))
+print("Recall", np.mean(xgb_classifier_scores['test_recall']))
+print("F1", np.mean(xgb_classifier_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
+
--- a/exploration/ml_pipeline_classification_composite.py
+++ b/exploration/ml_pipeline_classification_composite.py
@ -1,177 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.14.5
-#   kernelspec:
-#     display_name: straw2analysis
-#     language: python
-#     name: straw2analysis
-# ---
-
-# %% jupyter={"outputs_hidden": false, "source_hidden": false}
-from pathlib import Path
-
-import pandas as pd
-import seaborn as sns
-from sklearn.decomposition import PCA
-
-from machine_learning.helper import (
-    impute_encode_categorical_features,
-    prepare_cross_validator,
-    prepare_sklearn_data_format,
-    run_all_classification_models,
-)
-
-# %%
-CV_METHOD = "logo"  # logo, half_logo, 5kfold
-# Cross-validation method (could be regarded as a hyperparameter)
-print("CV_METHOD: " + CV_METHOD)
-N_SL = 3  # Number of largest/smallest accuracies (of particular CV) outputs
-UNDERSAMPLING = False
-# (bool) If True this will train and test data on balanced dataset
-# (using undersampling method)
-
-# %% jupyter={"outputs_hidden": false, "source_hidden": false}
-PATH_BASE = Path("E:/STRAWresults/20230415")
-
-SEGMENT_TYPE = "period"
-print("SEGMENT_TYPE: " + SEGMENT_TYPE)
-SEGMENT_LENGTH = "30_minutes_before"
-print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
-
-PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"
-
-all_features_with_baseline = pd.read_csv(PATH_FULL)
-
-# %%
-TARGETS = [
-    "PANAS_negative_affect_mean",
-    "PANAS_positive_affect_mean",
-    "JCQ_job_demand_mean",
-    "JCQ_job_control_mean",
-    "appraisal_stressfulness_period_mean",
-]
-
-# %%
-all_features_cleaned = pd.DataFrame()
-for target in TARGETS:
-    PATH_FULL = (
-        PATH_BASE
-        / SEGMENT_LENGTH
-        / "features"
-        / ("all_sensor_features_cleaned_straw_py_(" + target + ").csv")
-    )
-    current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
-    if all_features_cleaned.empty:
-        all_features_cleaned = current_features
-    else:
-        all_features_cleaned = all_features_cleaned.join(
-            current_features[("phone_esm_straw_" + target)],
-            how="inner",
-            rsuffix="_" + target,
-        )
-    print(all_features_cleaned.shape)
-
-# %%
-pca = PCA(n_components=1)
-TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]
-pca.fit(all_features_cleaned[TARGETS_PREFIXED])
-print(pca.explained_variance_ratio_)
-
-# %%
-model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
-model_input["target"] = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])
-
-# %%
-sns.histplot(data=model_input, x="target")
-
-# %%
-model_input.target.quantile(0.6)
-
-# %% jupyter={"outputs_hidden": false, "source_hidden": false}
-# bins = [-10, 0, 10] # bins for z-scored targets
-BINS = [-10, 0, 10]  # bins for stressfulness (0-4) target
-print("BINS: ", BINS)
-model_input["target"], edges = pd.cut(
-    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
-)  # ['low', 'medium', 'high']
-print(model_input["target"].value_counts())
-REMOVE_MEDIUM = True
-if REMOVE_MEDIUM:
-    if "medium" in model_input["target"]:
-        model_input = model_input[model_input["target"] != "medium"]
-    model_input["target"] = (
-        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
-    )
-else:
-    model_input["target"] = model_input["target"].map(
-        {"low": 0, "medium": 1, "high": 2}
-    )
-    print(model_input["target"].value_counts())
-
-
-# %% jupyter={"outputs_hidden": false, "source_hidden": false}
-# UnderSampling
-if UNDERSAMPLING:
-    no_stress = model_input[model_input["target"] == 0]
-    stress = model_input[model_input["target"] == 1]
-
-    no_stress = no_stress.sample(n=len(stress))
-    model_input = pd.concat([stress, no_stress], axis=0)
-
-
-# %%
-TARGET_VARIABLE = "PANAS_negative_affect"
-print("TARGET_VARIABLE: " + TARGET_VARIABLE)
-
-PATH_FULL_HELP = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
-
-model_input_with_baseline = pd.read_csv(PATH_FULL_HELP, index_col="local_segment")
-
-# %%
-baseline_col_names = [
-    col for col in model_input_with_baseline.columns if col not in model_input.columns
-]
-print(baseline_col_names)
-
-# %%
-model_input = model_input.join(
-    model_input_with_baseline[baseline_col_names], how="left"
-)
-model_input.reset_index(inplace=True)
-
-# %%
-model_input_encoded = impute_encode_categorical_features(model_input)
-
-# %%
-data_x, data_y, data_groups = prepare_sklearn_data_format(
-    model_input_encoded, CV_METHOD
-)
-cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
-
-# %%
-data_y.head()
-
-# %%
-data_y.tail()
-# %%
-data_y.shape
-# %%
-scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
-# %%
-PATH_OUTPUT = Path("..") / Path("presentation/results")
-path_output_full = PATH_OUTPUT / (
-    "composite_"
-    + SEGMENT_LENGTH
-    + "_classification"
-    + str(BINS)
-    + "_"
-    + CV_METHOD
-    + ".csv"
-)
-scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.14.5
+#       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -14,85 +14,80 @@
 # ---

 # %% jupyter={"source_hidden": true}
-from pathlib import Path
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys

-import matplotlib.pyplot as plt
 import numpy as np
+import matplotlib.pyplot as plt
 import pandas as pd
-from sklearn.cluster import KMeans
-from sklearn.impute import SimpleImputer
-from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_validate
+import seaborn as sns
+from scipy import stats

+from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.impute import SimpleImputer
+
+from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
+import xgboost as xg 
+
+from sklearn.cluster import KMeans
+
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.labels
+import machine_learning.model
 from machine_learning.classification_models import ClassificationModels

-# %%
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
 # ## Set script's parameters
-N_CLUSTERS = 4  # Number of clusters (could be regarded as a hyperparameter)
-CV_METHOD = "logo"  # logo, halflogo, 5kfold
-# Cross-validation method (could be regarded as a hyperparameter)
-N_SL = 1  # Number of largest/smallest accuracies (of particular CV) outputs
-
-# %%
-PATH_BASE = Path("E:/STRAWresults/20230415")
-
-SEGMENT_TYPE = "period"
-print("SEGMENT_TYPE: " + SEGMENT_TYPE)
-SEGMENT_LENGTH = "30_minutes_before"
-print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
-TARGET_VARIABLE = "appraisal_stressfulness"
-print("TARGET_VARIABLE: " + TARGET_VARIABLE)
-
-if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
-    TARGET_VARIABLE += "_"
-    TARGET_VARIABLE += SEGMENT_TYPE
-
-PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
-
-model_input = pd.read_csv(PATH_FULL)
-
-if SEGMENT_LENGTH == "daily":
-    DAY_LENGTH = "daily"  # or "working"
-    print(DAY_LENGTH)
-    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
+n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
+cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs

 # %% jupyter={"source_hidden": true}
-index_columns = [
-    "local_segment",
-    "local_segment_label",
-    "local_segment_start_datetime",
-    "local_segment_end_datetime",
-]
+model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]

-CLUST_COL = "limesurvey_demand_control_ratio_quartile"
-print("CLUST_COL: " + CLUST_COL)
+clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance

-BINS = [-1, 0, 4]
-print("BINS: " + str(BINS))
+model_input.columns[list(model_input.columns).index('age'):-1]

-model_input[CLUST_COL].describe()
+lime_cols = [col for col in model_input if col.startswith('limesurvey')]
+lime_cols
+lime_col = 'limesurvey_demand_control_ratio_quartile'
+clust_col = lime_col

+model_input[clust_col].describe()

-# %%
-model_input["target"].value_counts()

 # %% jupyter={"source_hidden": true}
-# Filter-out outlier rows by clust_col
-# model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]

-uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
+# Filter-out outlier rows by clust_col 
+#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+
+uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
 uniq = uniq.dropna()
-plt.bar(uniq["pid"], uniq[CLUST_COL])
+plt.bar(uniq['pid'], uniq[clust_col])

 # %% jupyter={"source_hidden": true}
 # Get clusters by cluster col & and merge the clusters to main df
-km = KMeans(n_clusters=N_CLUSTERS).fit_predict(uniq.set_index("pid"))
+km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
 np.unique(km, return_counts=True)
-uniq["cluster"] = km
+uniq['cluster'] = km
+uniq

-model_input = model_input.merge(uniq[["pid", "cluster"]])
-
-# %%
-model_input[["cluster", "target"]].value_counts().sort_index()
+model_input = model_input.merge(uniq[['pid', 'cluster']])   

 # %% jupyter={"source_hidden": true}
 model_input.set_index(index_columns, inplace=True)
@ -103,56 +98,31 @@ cm = ClassificationModels()
 cmodels = cm.get_cmodels()

 # %% jupyter={"source_hidden": true}
-for k in range(N_CLUSTERS):
+for k in range(n_clusters):
    model_input_subset = model_input[model_input["cluster"] == k].copy()
-    model_input_subset.loc[:, "target"] = pd.cut(
-        model_input_subset.loc[:, "target"],
-        bins=BINS,
-        labels=["low", "high"],
-        right=True,
-    )  # ['low', 'medium', 'high']
-    model_input_subset["target"].value_counts()
-    # model_input_subset = model_input_subset[model_input_subset["target"] != "medium"]
-    model_input_subset["target"] = (
-        model_input_subset["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
-    )
+    bins = [-10, -1, 1, 10] # bins for z-scored targets
+    model_input_subset.loc[:, 'target'] = \
+        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']
+    model_input_subset['target'].value_counts()
+    model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
+    model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)

-    print(model_input_subset["target"].value_counts())
+    model_input_subset['target'].value_counts()
+    
+    if cv_method_str == 'half_logo':
+        model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
+        model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')

-    if CV_METHOD == "half_logo":
-        model_input_subset["pid_index"] = model_input_subset.groupby("pid").cumcount()
-        model_input_subset["pid_count"] = model_input_subset.groupby("pid")[
-            "pid"
-        ].transform("count")
+        model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
+        model_input_subset["pid_half"] = model_input_subset["pid"] + "_" +  model_input_subset["pid_index"].astype(int).astype(str)

-        model_input_subset["pid_index"] = (
-            model_input_subset["pid_index"] / model_input_subset["pid_count"] + 1
-        ).round()
-        model_input_subset["pid_half"] = (
-            model_input_subset["pid"]
-            + "_"
-            + model_input_subset["pid_index"].astype(int).astype(str)
-        )
-
-        data_x, data_y, data_groups = (
-            model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
-            model_input_subset["target"],
-            model_input_subset["pid_half"],
-        )
+        data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"]
    else:
-        data_x, data_y, data_groups = (
-            model_input_subset.drop(["target", "pid"], axis=1),
-            model_input_subset["target"],
-            model_input_subset["pid"],
-        )
+        data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]

    # Treat categorical features
    categorical_feature_colnames = ["gender", "startlanguage"]
-    additional_categorical_features = [
-        col
-        for col in data_x.columns
-        if "mostcommonactivity" in col or "homelabel" in col
-    ]
+    additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
    categorical_feature_colnames += additional_categorical_features

    categorical_features = data_x[categorical_feature_colnames].copy()
@ -162,9 +132,7 @@ for k in range(N_CLUSTERS):
    categorical_features = categorical_features.fillna(mode_categorical_features)

    # one-hot encoding
-    categorical_features = categorical_features.apply(
-        lambda col: col.astype("category")
-    )
+    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)

@ -172,10 +140,8 @@ for k in range(N_CLUSTERS):
    train_x = pd.concat([numerical_features, categorical_features], axis=1)

    # Establish cv method
-    cv_method = StratifiedKFold(
-        n_splits=5, shuffle=True
-    )  # Defaults to 5 k-folds in cross_validate method
-    if CV_METHOD == "logo" or CV_METHOD == "half_logo":
+    cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
+    if cv_method_str == 'logo' or cv_method_str == 'half_logo':
        cv_method = LeaveOneGroupOut()
        cv_method.get_n_splits(
            train_x,
@ -183,57 +149,36 @@ for k in range(N_CLUSTERS):
            groups=data_groups,
        )

-    imputer = SimpleImputer(missing_values=np.nan, strategy="median")
+    imputer = SimpleImputer(missing_values=np.nan, strategy='median')

    for model_title, model in cmodels.items():
+
        classifier = cross_validate(
-            model["model"],
+            model['model'],
            X=imputer.fit_transform(train_x),
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
-            error_score="raise",
-            scoring=("accuracy", "precision", "recall", "f1"),
+            error_score='raise',
+            scoring=('accuracy', 'precision', 'recall', 'f1')
        )
-
+        
        print("\n-------------------------------------\n")
        print("Current cluster:", k, end="\n")
        print("Current model:", model_title, end="\n")
-        print("Acc", np.mean(classifier["test_accuracy"]))
-        print("Precision", np.mean(classifier["test_precision"]))
-        print("Recall", np.mean(classifier["test_recall"]))
-        print("F1", np.mean(classifier["test_f1"]))
-        print(
-            f"Largest {N_SL} ACC:",
-            np.sort(-np.partition(-classifier["test_accuracy"], N_SL)[:N_SL])[::-1],
-        )
-        print(
-            f"Smallest {N_SL} ACC:",
-            np.sort(np.partition(classifier["test_accuracy"], N_SL)[:N_SL]),
-        )
-
-        cmodels[model_title]["metrics"][0] += np.mean(classifier["test_accuracy"])
-        cmodels[model_title]["metrics"][1] += np.mean(classifier["test_precision"])
-        cmodels[model_title]["metrics"][2] += np.mean(classifier["test_recall"])
-        cmodels[model_title]["metrics"][3] += np.mean(classifier["test_f1"])
+        print("Acc", np.mean(classifier['test_accuracy']))
+        print("Precision", np.mean(classifier['test_precision']))
+        print("Recall", np.mean(classifier['test_recall']))
+        print("F1", np.mean(classifier['test_f1']))
+        print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
+        print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl]))
+        
+        cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
+        cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
+        cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall'])
+        cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])

 # %% jupyter={"source_hidden": true}
 # Get overall results
-scores = cm.get_total_models_scores(n_clusters=N_CLUSTERS)
-
-# %%
-PATH_OUTPUT = Path("..") / Path("presentation/results")
-path_output_full = PATH_OUTPUT / (
-    TARGET_VARIABLE
-    + "_"
-    + SEGMENT_LENGTH
-    + "_classification_"
-    + CV_METHOD
-    + str(BINS)
-    + "_clust_"
-    + CLUST_COL
-    + str(N_CLUSTERS)
-    + ".csv"
-)
-scores.to_csv(path_output_full, index=False)
+cm.get_total_models_scores(n_clusters=n_clusters)
--- a/exploration/ml_pipeline_classification_with_clustering_2_class.py
+++ b/exploration/ml_pipeline_classification_with_clustering_2_class.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.14.5
+#       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -14,83 +14,92 @@
 # ---

 # %% jupyter={"source_hidden": true}
-from pathlib import Path
+# %matplotlib inline
+import os
+import sys

-import matplotlib.pyplot as plt
 import numpy as np
+import matplotlib.pyplot as plt
 import pandas as pd
 from scipy import stats
-from sklearn.cluster import KMeans
-from sklearn.impute import SimpleImputer
-from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+
 from sklearn.model_selection import train_test_split
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+
+from sklearn.cluster import KMeans
+
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)

 from machine_learning.classification_models import ClassificationModels
-from machine_learning.helper import impute_encode_categorical_features
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# # Useful method
+def treat_categorical_features(input_set):
+    categorical_feature_colnames = ["gender", "startlanguage"]
+    additional_categorical_features = [col for col in input_set.columns if "mostcommonactivity" in col or "homelabel" in col]
+    categorical_feature_colnames += additional_categorical_features
+        
+    categorical_features = input_set[categorical_feature_colnames].copy()
+    mode_categorical_features = categorical_features.mode().iloc[0]
+
+    # fillna with mode
+    categorical_features = categorical_features.fillna(mode_categorical_features)
+
+    # one-hot encoding
+    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+    if not categorical_features.empty:
+        categorical_features = pd.get_dummies(categorical_features)
+
+    numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
+    
+    return pd.concat([numerical_features, categorical_features], axis=1)

 # %% [markdown]
 # ## Set script's parameters
-#
-
-# %%
-n_clusters = 3  # Number of clusters (could be regarded as a hyperparameter)
-n_sl = 3  # Number of largest/smallest accuracies (of particular CV) outputs
-
-# %%
-PATH_BASE = Path("E:/STRAWresults/20230415")
-
-SEGMENT_TYPE = "period"
-print("SEGMENT_TYPE: " + SEGMENT_TYPE)
-SEGMENT_LENGTH = "30_minutes_before"
-print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
-TARGET_VARIABLE = "appraisal_stressfulness"
-print("TARGET_VARIABLE: " + TARGET_VARIABLE)
-
-if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
-    TARGET_VARIABLE += "_"
-    TARGET_VARIABLE += SEGMENT_TYPE
-
-PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
-
-model_input = pd.read_csv(PATH_FULL)
-
-if SEGMENT_LENGTH == "daily":
-    DAY_LENGTH = "daily"  # or "working"
-    print(DAY_LENGTH)
-    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
+n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
+n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs

 # %% jupyter={"source_hidden": true}
-CLUST_COL = "limesurvey_demand_control_ratio"
-print("CLUST_COL: " + CLUST_COL)
+model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]

-BINS = [-1, 0, 4]
-print("BINS: " + str(BINS))
+clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance

-index_columns = [
-    "local_segment",
-    "local_segment_label",
-    "local_segment_start_datetime",
-    "local_segment_end_datetime",
-]
+model_input.columns[list(model_input.columns).index('age'):-1]

-model_input[CLUST_COL].describe()
+lime_cols = [col for col in model_input if col.startswith('limesurvey')]
+lime_cols
+lime_col = 'limesurvey_demand_control_ratio'
+clust_col = lime_col
+
+model_input[clust_col].describe()


 # %% jupyter={"source_hidden": true}
-# Filter-out outlier rows by clust_col
-model_input = model_input[(np.abs(stats.zscore(model_input[CLUST_COL])) < 3)]

-uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
-plt.bar(uniq["pid"], uniq[CLUST_COL])
+# Filter-out outlier rows by clust_col 
+model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+
+uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+plt.bar(uniq['pid'], uniq[clust_col])

 # %% jupyter={"source_hidden": true}
 # Get clusters by cluster col & and merge the clusters to main df
-km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index("pid"))
+km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
 np.unique(km, return_counts=True)
-uniq["cluster"] = km
-print(uniq)
+uniq['cluster'] = km
+uniq

-model_input = model_input.merge(uniq[["pid", "cluster"]])
+model_input = model_input.merge(uniq[['pid', 'cluster']])   

 # %% jupyter={"source_hidden": true}
 model_input.set_index(index_columns, inplace=True)
@ -100,64 +109,50 @@ model_input.set_index(index_columns, inplace=True)
 cm = ClassificationModels()
 cmodels = cm.get_cmodels()

-# %%
-model_input["target"].value_counts()
-
 # %% jupyter={"source_hidden": true}
 for k in range(n_clusters):
    model_input_subset = model_input[model_input["cluster"] == k].copy()
-
+    
    # Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, seperated by z-score of 0.
-    # model_input_subset['numerical_target'] = model_input_subset['target']
-
-    model_input_subset.loc[:, "target"] = pd.cut(
-        model_input_subset.loc[:, "target"], bins=BINS, labels=[0, 1], right=True
-    )
-
-    # p15 = np.percentile(model_input_subset['numerical_target'], 15)
-    # p85 = np.percentile(model_input_subset['numerical_target'], 85)
-
+    model_input_subset['numerical_target'] = model_input_subset['target']
+    bins = [-10, 0, 10] # bins for z-scored targets
+    model_input_subset.loc[:, 'target'] = \
+        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
+        
+    p15 = np.percentile(model_input_subset['numerical_target'], 15)
+    p85 = np.percentile(model_input_subset['numerical_target'], 85)
+    
    # Treat categorical features
-    model_input_subset = impute_encode_categorical_features(model_input_subset)
-
+    model_input_subset = treat_categorical_features(model_input_subset)
+    
    # Split to train, validate, and test subsets
-    # train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
-    # test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
-    train_set, test_set = train_test_split(
-        model_input_subset,
-        test_size=0.3,
-        stratify=model_input_subset["pid"],
-        random_state=42,
-    )
-
-    print(train_set["target"].value_counts())
-    print(test_set["target"].value_counts())
+    train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
+    test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)

+    train_set['target'].value_counts()
+    test_set['target'].value_counts()
+    
    train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
-
-    validate_x, test_x, validate_y, test_y = train_test_split(
-        test_set.drop(["target", "pid"], axis=1),
-        test_set["target"],
-        test_size=0.50,
-        random_state=42,
-    )
-
+    
+    validate_x, test_x, validate_y, test_y = \
+        train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)
+    
    # Impute missing values
-    imputer = SimpleImputer(missing_values=np.nan, strategy="median")
+    imputer = SimpleImputer(missing_values=np.nan, strategy='median')

    train_x = imputer.fit_transform(train_x)
    validate_x = imputer.fit_transform(validate_x)
    test_x = imputer.fit_transform(test_x)

    for model_title, model in cmodels.items():
-        model["model"].fit(train_x, train_y)
-        y_pred = model["model"].predict(validate_x)
-
+        model['model'].fit(train_x, train_y)
+        y_pred = model['model'].predict(validate_x)
+        
        acc = accuracy_score(validate_y, y_pred)
        prec = precision_score(validate_y, y_pred)
        rec = recall_score(validate_y, y_pred)
        f1 = f1_score(validate_y, y_pred)
-
+        
        print("\n-------------------------------------\n")
        print("Current cluster:", k, end="\n")
        print("Current model:", model_title, end="\n")
@ -165,30 +160,12 @@ for k in range(n_clusters):
        print("Precision", prec)
        print("Recall", rec)
        print("F1", f1)
-
-        cmodels[model_title]["metrics"][0] += acc
-        cmodels[model_title]["metrics"][1] += prec
-        cmodels[model_title]["metrics"][2] += rec
-        cmodels[model_title]["metrics"][3] += f1
+        
+        cmodels[model_title]['metrics'][0] += acc
+        cmodels[model_title]['metrics'][1] += prec
+        cmodels[model_title]['metrics'][2] += rec
+        cmodels[model_title]['metrics'][3] += f1

 # %% jupyter={"source_hidden": true}
 # Get overall results
-scores = cm.get_total_models_scores(n_clusters=n_clusters)
-
-# %%
-print(scores)
-
-# %%
-PATH_OUTPUT = Path("..") / Path("presentation/results")
-path_output_full = PATH_OUTPUT / (
-    TARGET_VARIABLE
-    + "_"
-    + SEGMENT_LENGTH
-    + "_classification"
-    + str(BINS)
-    + "_CLUST_"
-    + CLUST_COL
-    + +str(n_clusters)
-    + ".csv"
-)
-scores.to_csv(path_output_full, index=False)
+cm.get_total_models_scores(n_clusters=n_clusters)
--- a/exploration/ml_pipeline_regression.py
+++ b/exploration/ml_pipeline_regression.py
@ -6,61 +6,350 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.14.5
+#       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---

-# %%
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
 import os
 import sys

+import numpy as np
+import matplotlib.pyplot as plt
 import pandas as pd
-
-from machine_learning.helper import (
-    impute_encode_categorical_features,
-    prepare_cross_validator,
-    prepare_sklearn_data_format,
-    run_all_regression_models,
-)
+import seaborn as sns
+import yaml
+from pyprojroot import here
+from sklearn import linear_model, svm, kernel_ridge, gaussian_process
+from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.impute import SimpleImputer
+from sklearn.dummy import DummyRegressor
+import xgboost as xg
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"

 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)

-# %%
-model_input = pd.read_csv(
-    "../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
+import machine_learning.features_sensor
+import machine_learning.labels
+import machine_learning.model
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## JCQ job demand
+
+# %% jupyter={"source_hidden": true}
+model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+
+# %% jupyter={"source_hidden": true}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+#if "pid" in model_input.columns:
+#    index_columns.append("pid")
+model_input.set_index(index_columns, inplace=True)
+
+cv_method = 'half_logo' # logo, half_logo, 5kfold
+if cv_method == 'logo':
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+else:
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
+
+# %% jupyter={"source_hidden": true}
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+# %% jupyter={"source_hidden": true}
+categorical_features = data_x[categorical_feature_colnames].copy()
+
+# %% jupyter={"source_hidden": true}
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# %% jupyter={"source_hidden": true}
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# %% jupyter={"source_hidden": true}
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+# %% jupyter={"source_hidden": true}
+numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+
+# %% jupyter={"source_hidden": true}
+train_x = pd.concat([numerical_features, categorical_features], axis=1)
+
+# %% jupyter={"source_hidden": true}
+train_x.dtypes
+
+# %% jupyter={"source_hidden": true}
+logo = LeaveOneGroupOut()
+logo.get_n_splits(
+    train_x,
+    data_y,
+    groups=data_groups,
 )

-# %%
-model_input = model_input[model_input["local_segment"].str.contains("daily")]
+# For any other cv_method, pass cv=None so that cross_validate falls back to its default 5-fold CV
+if cv_method != 'logo' and cv_method != 'half_logo':
+    logo = None

-# %%
-CV_METHOD = "logo"  # logo, half_logo, 5kfold
+# %% jupyter={"source_hidden": true}
+sum(data_y.isna())

-model_input_encoded = impute_encode_categorical_features(model_input)
-# %%
-data_x, data_y, data_groups = prepare_sklearn_data_format(
-    model_input_encoded, CV_METHOD
+# %% [markdown]
+# ### Baseline: Dummy Regression (mean)
+dummy_regr = DummyRegressor(strategy="mean")
+
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %% jupyter={"source_hidden": true}
+dummy_regressor = cross_validate(
+    dummy_regr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
-cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
-# %%
-data_y.head()
+print("Negative Mean Squared Error", np.median(dummy_regressor['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(dummy_regressor['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(dummy_regressor['test_neg_root_mean_squared_error']))
+print("R2", np.median(dummy_regressor['test_r2']))

-# %%
-data_y.tail()
+# %% [markdown]
+# ### Linear Regression

-# %%
-data_y.shape
+# %% jupyter={"source_hidden": true}
+lin_reg_rapids = linear_model.LinearRegression()
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

-# %%
-scores = run_all_regression_models(data_x, data_y, data_groups, cross_validator)
-
-# %%
-scores.to_csv(
-    "../presentation/JCQ_supervisor_support_regression_" + CV_METHOD + ".csv",
-    index=False,
+# %% jupyter={"source_hidden": true}
+lin_reg_scores = cross_validate(
+    lin_reg_rapids,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
+print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.median(lin_reg_scores['test_r2']))
+
+# %% [markdown]
+# ### XGBRegressor (squared error objective)
+# %% jupyter={"source_hidden": true}
+xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10)
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %% jupyter={"source_hidden": true}
+xgb_reg_scores = cross_validate(
+    xgb_r,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(xgb_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(xgb_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(xgb_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.median(xgb_reg_scores['test_r2']))
+
+# %% [markdown]
+# ### XGBRegressor Pseudo Huber Error Regression
+# %% jupyter={"source_hidden": true}
+xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10)
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %% jupyter={"source_hidden": true}
+xgb_psuedo_huber_reg_scores = cross_validate(
+    xgb_psuedo_huber_r,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.median(xgb_psuedo_huber_reg_scores['test_r2']))
+
+# %% [markdown]
+# ### Ridge regression
+
+# %% jupyter={"source_hidden": true}
+ridge_reg = linear_model.Ridge(alpha=.5)
+
+# %% tags=[] jupyter={"source_hidden": true}
+ridge_reg_scores = cross_validate(
+    ridge_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(ridge_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(ridge_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(ridge_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.median(ridge_reg_scores['test_r2']))
+
+# %% [markdown]
+# ### Lasso
+
+# %% jupyter={"source_hidden": true}
+lasso_reg = linear_model.Lasso(alpha=0.1)
+
+# %% jupyter={"source_hidden": true}
+lasso_reg_score = cross_validate(
+    lasso_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(lasso_reg_score['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(lasso_reg_score['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(lasso_reg_score['test_neg_root_mean_squared_error']))
+print("R2", np.median(lasso_reg_score['test_r2']))
+
+# %% [markdown]
+# ### Bayesian Ridge
+
+# %% jupyter={"source_hidden": true}
+bayesian_ridge_reg = linear_model.BayesianRidge()
+
+# %% jupyter={"source_hidden": true}
+bayesian_ridge_reg_score = cross_validate(
+    bayesian_ridge_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
+print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
+
+# %% [markdown]
+# ### RANSAC (outlier robust regression)
+
+# %% jupyter={"source_hidden": true}
+ransac_reg = linear_model.RANSACRegressor()
+
+# %% jupyter={"source_hidden": true}
+ransac_reg_scores = cross_validate(
+    ransac_reg,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(ransac_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(ransac_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(ransac_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.median(ransac_reg_scores['test_r2']))
+
+# %% [markdown]
+# ### Support vector regression
+
+# %% jupyter={"source_hidden": true}
+svr = svm.SVR()
+
+# %% jupyter={"source_hidden": true}
+svr_scores = cross_validate(
+    svr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(svr_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(svr_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(svr_scores['test_neg_root_mean_squared_error']))
+print("R2", np.median(svr_scores['test_r2']))
+
+# %% [markdown]
+# ### Kernel Ridge regression
+
+# %% jupyter={"source_hidden": true}
+kridge = kernel_ridge.KernelRidge()
+
+# %% jupyter={"source_hidden": true}
+kridge_scores = cross_validate(
+    kridge,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(kridge_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(kridge_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(kridge_scores['test_neg_root_mean_squared_error']))
+print("R2", np.median(kridge_scores['test_r2']))
+
+# %% [markdown]
+# ### Gaussian Process Regression
+
+# %% jupyter={"source_hidden": true}
+gpr = gaussian_process.GaussianProcessRegressor()
+
+# %% jupyter={"source_hidden": true}
+
+gpr_scores = cross_validate(
+    gpr,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(gpr_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(gpr_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(gpr_scores['test_neg_root_mean_squared_error']))
+print("R2", np.median(gpr_scores['test_r2']))
+
+# %%
--- a/exploration/test_JCQ_reversal.py
+++ b/exploration/test_JCQ_reversal.py
@ -1,217 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.14.5
-#   kernelspec:
-#     display_name: straw2analysis
-#     language: python
-#     name: straw2analysis
-# ---
-
-# %%
-import pandas as pd
-
-from features.esm_JCQ import DICT_JCQ_DEMAND_CONTROL_REVERSE
-
-# %%
-limesurvey_questions = pd.read_csv(
-    "E:/STRAWbaseline/survey637813+question_text.csv", header=None
-).T
-
-# %%
-limesurvey_questions
-
-# %%
-limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(
-    r"\.\s", expand=True, n=1
-)
-
-# %%
-limesurvey_questions
-
-# %%
-demand_reverse_lime_rows = (
-    limesurvey_questions["text"].str.startswith(" [Od mene se ne zahteva,")
-    | limesurvey_questions["text"].str.startswith(" [Imam dovolj časa, da končam")
-    | limesurvey_questions["text"].str.startswith(
-        " [Pri svojem delu se ne srečujem s konfliktnimi"
-    )
-)
-control_reverse_lime_rows = limesurvey_questions["text"].str.startswith(
-    " [Moje delo vključuje veliko ponavljajočega"
-) | limesurvey_questions["text"].str.startswith(
-    " [Pri svojem delu imam zelo malo svobode"
-)
-
-# %%
-demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows]
-demand_reverse_lime.loc[:, "qid"] = demand_reverse_lime["code"].str.extract(
-    r"\[(\d+)\]"
-)
-control_reverse_lime = limesurvey_questions[control_reverse_lime_rows]
-control_reverse_lime.loc[:, "qid"] = control_reverse_lime["code"].str.extract(
-    r"\[(\d+)\]"
-)
-
-# %%
-limesurvey_questions.loc[89, "text"]
-
-# %%
-limesurvey_questions[limesurvey_questions["code"].str.startswith("JobEisen")]
-
-# %%
-demand_reverse_lime
-
-# %%
-control_reverse_lime
-
-# %%
-participant_info = pd.read_csv(
-    "C:/Users/junos/Documents/FWO-ARRS/Analysis/straw2analysis/rapids/data/raw/p031/participant_baseline_raw.csv",
-    parse_dates=["date_of_birth"],
-)
-
-# %%
-participant_info_t = participant_info.T
-
-# %%
-rows_baseline = participant_info_t.index
-
-# %%
-rows_demand = rows_baseline.str.startswith("JobEisen") & ~rows_baseline.str.endswith(
-    "Time"
-)
-
-# %%
-rows_baseline[rows_demand]
-
-# %%
-limesurvey_control = (
-    participant_info_t[rows_demand]
-    .reset_index()
-    .rename(columns={"index": "question", 0: "score_original"})
-)
-
-# %%
-limesurvey_control
-
-# %%
-limesurvey_control["qid"] = (
-    limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
-)
-
-# %%
-limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
-
-# %%
-limesurvey_control["score"] = limesurvey_control["score_original"]
-
-# %%
-limesurvey_control["qid"][0]
-
-# %%
-rows_demand_reverse = limesurvey_control["qid"].isin(
-    DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
-)
-limesurvey_control.loc[rows_demand_reverse, "score"] = (
-    4 + 1 - limesurvey_control.loc[rows_demand_reverse, "score_original"]
-)
-
-# %%
-JCQ_DEMAND = "JobEisen"
-JCQ_CONTROL = "JobControle"
-dict_JCQ_demand_control_reverse = {
-    JCQ_DEMAND: {
-        3: " [Od mene se ne zahteva,",
-        4: " [Imam dovolj časa, da končam",
-        5: " [Pri svojem delu se ne srečujem s konfliktnimi",
-    },
-    JCQ_CONTROL: {
-        2: " |Moje delo vključuje veliko ponavljajočega",
-        6: " [Pri svojem delu imam zelo malo svobode",
-    },
-}
-
-# %%
-limesurvey_control
-
-# %%
-test = pd.DataFrame(
-    data={"question": "one", "score_original": 3, "score": 3, "qid": 10}, index=[0]
-)
-
-# %%
-pd.concat([test, limesurvey_control]).reset_index()
-
-# %%
-limesurvey_control["score"].sum()
-
-# %%
-rows_demand_reverse
-
-# %%
-dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
-
-# %%
-limesurvey_control
-
-# %%
-DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
-DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
-
-JCQ_NORMS = {
-    "F": {
-        0: DEMAND_CONTROL_RATIO_MIN,
-        1: 0.45,
-        2: 0.52,
-        3: 0.62,
-        4: DEMAND_CONTROL_RATIO_MAX,
-    },
-    "M": {
-        0: DEMAND_CONTROL_RATIO_MIN,
-        1: 0.41,
-        2: 0.48,
-        3: 0.56,
-        4: DEMAND_CONTROL_RATIO_MAX,
-    },
-}
-
-# %%
-JCQ_NORMS[participant_info.loc[0, "gender"]][0]
-
-# %%
-participant_info_t.index.str.startswith("JobControle")
-
-# %%
-columns_baseline = participant_info.columns
-
-# %%
-columns_demand = columns_baseline.str.startswith(
-    "JobControle"
-) & ~columns_baseline.str.endswith("Time")
-
-# %%
-columns_baseline[columns_demand]
-
-# %%
-participant_control = participant_info.loc[:, columns_demand]
-
-# %%
-participant_control["id"] = participant_control.index
-
-# %%
-participant_control
-
-# %%
-pd.wide_to_long(
-    participant_control,
-    stubnames="JobControle",
-    i="id",
-    j="qid",
-    sep="[",
-    suffix="(\\d+)]",
-)
--- a/features/esm.py
+++ b/features/esm.py
@ -20,47 +20,11 @@ ANSWER_DAY_OFF = "DayOff3421"
 ANSWER_SET_EVENING = "DayFinishedSetEvening"

 MAX_MORNING_LENGTH = 3
-# When the participant was not yet at work at the time of the first (morning) EMA,
+# When the participant was not yet at work at the time of the first (morning) EMA,
 # only three items were answered.
 # Two sleep related items and one indicating NOT starting work yet.
 # Daytime EMAs are all longer, in fact they always consist of at least 6 items.

-QUESTIONNAIRE_IDS = {
-    "sleep_quality": 1,
-    "PANAS_positive_affect": 8,
-    "PANAS_negative_affect": 9,
-    "JCQ_job_demand": 10,
-    "JCQ_job_control": 11,
-    "JCQ_supervisor_support": 12,
-    "JCQ_coworker_support": 13,
-    "PFITS_supervisor": 14,
-    "PFITS_coworkers": 15,
-    "UWES_vigor": 16,
-    "UWES_dedication": 17,
-    "UWES_absorption": 18,
-    "COPE_active": 19,
-    "COPE_support": 20,
-    "COPE_emotions": 21,
-    "balance_life_work": 22,
-    "balance_work_life": 23,
-    "recovery_experience_detachment": 24,
-    "recovery_experience_relaxation": 25,
-    "symptoms": 26,
-    "appraisal_stressfulness_event": 87,
-    "appraisal_threat": 88,
-    "appraisal_challenge": 89,
-    "appraisal_event_time": 90,
-    "appraisal_event_duration": 91,
-    "appraisal_event_work_related": 92,
-    "appraisal_stressfulness_period": 93,
-    "late_work": 94,
-    "work_hours": 95,
-    "left_work": 96,
-    "activities": 97,
-    "coffee_breaks": 98,
-    "at_work_yet": 99,
-}
-

 def get_esm_data(usernames: Collection) -> pd.DataFrame:
    """
@ -88,10 +52,8 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame:

 def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
-    Convert timestamps and expand JSON column.
-
    Convert timestamps into human-readable datetimes and dates
-        and expand the JSON column into several Pandas DF columns.
+    and expand the JSON column into several Pandas DF columns.

    Parameters
    ----------
@ -101,8 +63,7 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
-        A dataframe with added columns: datetime in Ljubljana timezone
-            and all fields from ESM_JSON column.
+        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
    """
    df_esm = helper.get_date_from_timestamp(df_esm)

@ -115,39 +76,31 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
 def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.
-
-    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
-        and SESSION_STATUS_COMPLETE
+    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE

    This is done in three steps.

    First, the esm_status is considered.
-    If any of the ESMs in a session has a status *other than* "answered",
-        then this session is taken as unfinished.
+    If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.

    Second, the sessions which do not represent full questionnaires are identified.
-    These are sessions where participants only marked they are finished with the day
-        or have not yet started working.
+    These are sessions where participants only marked they are finished with the day or have not yet started working.

    Third, the sessions with only one item are marked with their trigger.
-    We never offered questionnaires with single items,
-        so we can be sure these are unfinished.
+    We never offered questionnaires with single items, so we can be sure these are unfinished.

    Finally, all sessions that remain are marked as completed.
-    By going through different possibilities in expl_esm_adherence.ipynb,
-        this turned out to be a reasonable option.
+    By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data,
-            which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data, which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts: pd.Dataframe
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
-            with their statuses and the number of items.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
    """
    sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)

@ -202,22 +155,17 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat

 def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    Classify EMA sessions into morning, workday, or evening.
-
-    For each EMA session, determine the time of the first user answer
-        and its time type (morning, workday, or evening).
+    For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening).

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data,
-            which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data, which must include the session ID (esm_session).

    Returns
    -------
    df_session_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
-            with their time type and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
    """
    df_session_time = (
        df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
@ -231,17 +179,13 @@ def classify_sessions_by_completion_time(
    df_esm_preprocessed: pd.DataFrame,
 ) -> pd.DataFrame:
    """
-    Classify sessions and correct the time type.
-
-    The point of this function is to not only classify sessions
-        by using the previously defined functions.
+    The point of this function is not only to classify sessions by using the previously defined functions.
    It also serves to "correct" the time type of some EMA sessions.

    A morning questionnaire could seamlessly transition into a daytime questionnaire,
        if the participant was already at work.
    In this case, the "time" label changed mid-session.
-    Because of the way classify_sessions_by_time works,
-        this questionnaire was classified as "morning".
+    Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
    But for all intents and purposes, it can be treated as a "daytime" EMA.

    The way this scenario is differentiated from a true "morning" questionnaire,
@ -250,16 +194,13 @@ def classify_sessions_by_completion_time(
    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data,
-            which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data, which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
-            the number of items,
-            their time type (with some morning EMAs reclassified)
-            and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
+            their time type (with some morning EMAs reclassified) and timestamp of first answer.

    """
    df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
@ -278,8 +219,7 @@ def classify_sessions_by_completion_time(

 def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    Eliminate invalid ESM responses.
-
+    This function eliminates invalid ESM responses.
    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".

@ -316,100 +256,3 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
        )
    )
    return df_esm_clean
-
-
-def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1):
-    """
-    Increment answers to keep in line with original scoring.
-
-    We always used 0 for the lowest value of user answer.
-    Some scales originally used other scoring, such as starting from 1.
-    This restores original scoring so that the values are comparable to references.
-
-    Parameters
-    ----------
-    df_esm_clean: pd.DataFrame
-        A cleaned ESM dataframe, which must also include esm_user_answer_numeric.
-    increment_by:
-        A number to add to the user answer.
-
-    Returns
-    -------
-    df_esm_clean: pd.DataFrame
-        The same df with addition of a column 'esm_user_answer_numeric'.
-
-    """
-    try:
-        df_esm_clean = df_esm_clean.assign(
-            esm_user_score=lambda x: x.esm_user_answer_numeric + increment_by
-        )
-    except AttributeError as e:
-        print("Please, clean the dataframe first using features.esm.clean_up_esm.")
-        print(e)
-    return df_esm_clean
-
-
-def reassign_question_ids(
-    df_esm_cleaned: pd.DataFrame, question_ids_content: dict
-) -> pd.DataFrame:
-    """
-    Fix question IDs to match their actual content.
-
-    Unfortunately, when altering the protocol to adapt to COVID pandemic,
-    we did not retain original question IDs.
-    This means that for participants before 2021, they are different
-    from for the rest of them.
-    This function searches for question IDs by matching their strings.
-
-    Parameters
-    ----------
-    df_esm_cleaned: pd.DataFrame
-        A cleaned up dataframe, which must also include esm_user_answer_numeric.
-    question_ids_content: dict
-        A dictionary, linking question IDs with their content ("instructions").
-
-    Returns
-    -------
-    df_esm_fixed: pd.DataFrame
-        The same dataframe but with fixed question IDs.
-    """
-    df_esm_unique_questions = (
-        df_esm_cleaned.groupby("question_id")
-        .esm_instructions.value_counts()
-        .rename()
-        .reset_index()
-    )
-    # Tabulate all possible answers to each question (group by question ID).
-
-    # First, check that we anticipated all esm instructions.
-    for q_id in question_ids_content.keys():
-        # Look for all questions ("instructions") occurring in the dataframe.
-        actual_questions = df_esm_unique_questions.loc[
-            df_esm_unique_questions["question_id"] == q_id,
-            "esm_instructions",
-        ]
-        # These are all answers to a given question (by q_id).
-        questions_matches = actual_questions.str.startswith(
-            question_ids_content.get(q_id)
-        )
-        # See if they are expected, i.e. included in the dictionary.
-        if ~actual_questions.all():
-            print("One of the questions that occur in the data was undefined.")
-            print("This were the questions found in the data: ")
-            raise KeyError(actual_questions[~questions_matches])
-            # In case there is an unexpected answer, raise an exception.
-
-    # Next, replace question IDs.
-    df_esm_fixed = df_esm_cleaned.copy()
-    df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].apply(
-        lambda x: next(
-            (
-                key
-                for key, values in question_ids_content.items()
-                if x.startswith(values)
-            ),
-            None,
-        )
-    )
-
-    return df_esm_fixed
--- a/features/esm_COPE.py
+++ b/features/esm_COPE.py
@ -1,125 +0,0 @@
-COPE_ORIGINAL_MAX = 4
-COPE_ORIGINAL_MIN = 1
-
-DICT_COPE_QUESTION_IDS = {
-    164: (
-        "I took additional action to try to get rid of the problem",
-        "Ik deed extra mijn best om er iets aan te doen",
-        "Vložila sem dodaten napor, da bi rešila problem",
-        "Vložil sem dodaten napor, da bi rešil problem",
-    ),
-    165: (
-        "I concentrated my efforts on doing something about it",
-        "Ik probeerde de situatie te verbeteren",
-        "Svoje sile sem usmerila v reševanje nastale situacije",
-        "Svoje sile sem usmeril v reševanje nastale situacije",
-    ),
-    166: (
-        "I did what had to be done, one step at a time",
-        "Ik deed stap voor stap wat nodig was",
-        "Naredila sem, kar je bilo potrebno – korak za korakom",
-        "Naredil sem, kar je bilo potrebno – korak za korakom",
-    ),
-    167: (
-        "I took direct action to get around the problem",
-        "Ik handelde vlug om het probleem te verhelpen",
-        "Nekaj sem naredila, da sem zaobšla problem",
-        "Nekaj sem naredil, da sem zaobšel problem",
-    ),
-    168: (
-        "I tried to come up with a strategy about what to do",
-        "Ik probeerde te verzinnen wat ik er aan kon doen",
-        "Skušala sem najti ustrezen način za rešitev situacije",
-        "Skušal sem najti ustrezen način za rešitev situacije",
-    ),
-    169: (
-        "I made a plan of action",
-        "Ik maakte een plan",
-        "Naredila sem načrt za delovanje",
-        "Naredil sem načrt za delovanje",
-    ),
-    170: (
-        "I thought hard about what steps to take",
-        "Ik dacht hard na over wat ik moest doen",
-        "Dobro sem premislila, katere korake moram narediti, da rešim problem",
-        "Dobro sem premislil, katere korake moram narediti, da rešim problem",
-    ),
-    171: (
-        "I thought about how I might best handle the problem",
-        "lk dacht na over hoe ik het probleem het best kon aanpakken",
-        "Razmišljala sem, kaj bi bilo najbolje narediti s problemom",
-        "Razmišljal sem, kaj bi bilo najbolje narediti s problemom",
-    ),
-    172: (
-        "I asked people who have had similar experiences what they did",
-        "Ik vroeg aan mensen met dergelijke ervaringen hoe zij reageerden",
-        "Vprašala sem posameznike s podobnimi izkušnjami, kaj so storili",
-        "Vprašal sem posameznike s podobnimi izkušnjami, kaj so storili",
-    ),
-    173: (
-        "I tried to get advice from someone about what to do",
-        "lk vroeg advies aan iemand",
-        "Pri drugih sem poskušala dobiti nasvet, kaj naj storim",
-        "Pri drugih sem poskušal dobiti nasvet, kaj naj storim",
-    ),
-    174: (
-        "I talked to someone to find out more about the situation",
-        "Ik sprak met iemand om meer te weten te komen over de situatie",
-        "Z nekom sem se pogovorila, da bi izvedela še kaj o svojem problemu",
-        "Z nekom sem se pogovoril, da bi izvedel še kaj o svojem problemu",
-    ),
-    175: (
-        "I talked to someone who could do something concrete about the problem",
-        "Ik sprak met iemand die iets aan het probleem kon doen",
-        "Pogovorila sem se s kom, ki bi lahko naredil kaj konkretnega",
-        "Pogovoril sem se s kom, ki bi lahko naredil kaj konkretnega",
-    ),
-    176: (
-        "I talked to someone about how I felt",
-        "Ik sprak met iemand over hoe ik mij voelde",
-        "Z nekom sem se pogovorila o tem, kako sem se počutila",
-        "Z nekom sem se pogovoril o tem, kako sem se počutil",
-    ),
-    177: (
-        "I tried to get emotional support from friends or relatives",
-        "Ik zocht steun bij vrienden of familie",
-        "Skušala sem dobiti čustveno podporo prijateljev ali sorodnikov",
-        "Skušal sem dobiti čustveno podporo prijateljev ali sorodnikov",
-    ),
-    178: (
-        "I discussed my feelings with someone",
-        "lk besprak mijn gevoelens met iemand",
-        "O svojih občutkih sem se z nekom pogovorila",
-        "O svojih občutkih sem se z nekom pogovoril",
-    ),
-    179: (
-        "I got sympathy and understanding from someone",
-        "Ik vroeg medeleven en begrip van iemand",
-        "Poiskala sem naklonjenost in razumevanje drugih",
-        "Poiskal sem naklonjenost in razumevanje drugih",
-    ),
-    180: (
-        "I got upset and let my emotions out",
-        "Ik raakte van streek",
-        "Razburila sem se in to tudi pokazala",
-        "Razburil sem se in to tudi pokazal",
-    ),
-    181: (
-        "I let my feelings out",
-        "Ik toonde mijn gevoelens",
-        "Svojim čustvom sem dala prosto pot",
-        "Svojim čustvom sem dal prosto pot",
-    ),
-    182: (
-        "I felt a lot of emotional distress and I found myself expressing",
-        "lk liet duidelijk blijken hoe ellendig ik mij voelde",
-        "Doživljala sem veliko stresa in opažala, da sem čustva",
-        "Doživljal sem veliko stresa in opažal, da sem čustva",
-    ),
-    183: (
-        "I got upset, and I was really aware of it",
-        "Ik merkte dat ik erg van streek was",
-        "Razburila sem se in razmišljala samo o tem",
-        "Razburil sem se in razmišljal samo o tem",
-    ),
-}
--- a/features/esm_JCQ.py
+++ b/features/esm_JCQ.py
@ -1,11 +1,9 @@
 import pandas as pd

-from features.esm import increment_answers
-
 JCQ_ORIGINAL_MAX = 4
 JCQ_ORIGINAL_MIN = 1

-DICT_JCQ_DEMAND_CONTROL_REVERSE = {
+dict_JCQ_demand_control_reverse = {
    75: (
        "I was NOT asked",
        "Men legde mij geen overdreven",
@ -42,14 +40,10 @@ def reverse_jcq_demand_control_scoring(
    df_esm_jcq_demand_control: pd.DataFrame,
 ) -> pd.DataFrame:
    """
-    Reverse JCQ demand and control answers.
-
-    This function recodes answers in Job content questionnaire
-        by first incrementing them by 1, to be in line with original (1-4) scoring.
-    Then, some answers are reversed (i.e. 1 becomes 4 etc.),
-        because the questions are negatively phrased.
-    These answers are listed in DICT_JCQ_DEMAND_CONTROL_REVERSE
-        and identified by their question ID.
+    This function recodes answers in the Job Content Questionnaire by first incrementing them by 1,
+    to be in line with original (1-4) scoring.
+    Then, some answers are reversed (i.e. 1 becomes 4 etc.), because the questions are negatively phrased.
+    These answers are listed in dict_JCQ_demand_control_reverse and identified by their question ID.
    However, the existing data is checked against literal phrasing of these questions
        to protect against wrong numbering of questions (differing question IDs).

@ -61,8 +55,7 @@ def reverse_jcq_demand_control_scoring(
    Returns
    -------
    df_esm_jcq_demand_control: pd.DataFrame
-        The same dataframe with a column esm_user_score
-            containing answers recoded and reversed.
+        The same dataframe with a column esm_user_score containing answers recoded and reversed.
    """
    df_esm_jcq_demand_control_unique_answers = (
        df_esm_jcq_demand_control.groupby("question_id")
@ -71,7 +64,7 @@ def reverse_jcq_demand_control_scoring(
        .reset_index()
    )
    # Tabulate all possible answers to each question (group by question ID).
-    for q_id in DICT_JCQ_DEMAND_CONTROL_REVERSE.keys():
+    for q_id in dict_JCQ_demand_control_reverse.keys():
        # Look through all answers that need to be reversed.
        possible_answers = df_esm_jcq_demand_control_unique_answers.loc[
            df_esm_jcq_demand_control_unique_answers["question_id"] == q_id,
@ -79,7 +72,7 @@ def reverse_jcq_demand_control_scoring(
        ]
        # These are all answers to a given question (by q_id).
        answers_matches = possible_answers.str.startswith(
-            DICT_JCQ_DEMAND_CONTROL_REVERSE.get(q_id)
+            dict_JCQ_demand_control_reverse.get(q_id)
        )
        # See if they are expected, i.e. included in the dictionary.
        if ~answers_matches.all():
@ -89,16 +82,18 @@ def reverse_jcq_demand_control_scoring(
            # In case there is an unexpected answer, raise an exception.

    try:
-        df_esm_jcq_demand_control = increment_answers(df_esm_jcq_demand_control)
-        # Increment the original answer by 1 to keep in line
-        # with traditional scoring (from JCQ_ORIGINAL_MIN to JCQ_ORIGINAL_MAX).
+        df_esm_jcq_demand_control = df_esm_jcq_demand_control.assign(
+            esm_user_score=lambda x: x.esm_user_answer_numeric + 1
+        )
+        # Increment the original answer by 1
+        # to keep in line with traditional scoring (from JCQ_ORIGINAL_MIN to JCQ_ORIGINAL_MAX).
        df_esm_jcq_demand_control[
            df_esm_jcq_demand_control["question_id"].isin(
-                DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
+                dict_JCQ_demand_control_reverse.keys()
            )
        ] = df_esm_jcq_demand_control[
            df_esm_jcq_demand_control["question_id"].isin(
-                DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
+                dict_JCQ_demand_control_reverse.keys()
            )
        ].assign(
            esm_user_score=lambda x: JCQ_ORIGINAL_MAX
--- a/features/esm_SAM.py
+++ b/features/esm_SAM.py
@ -3,9 +3,6 @@ import pandas as pd

 import features.esm

-SAM_ORIGINAL_MAX = 5
-SAM_ORIGINAL_MIN = 1
-
 QUESTIONNAIRE_ID_SAM = {
    "event_stress": 87,
    "event_threat": 88,
@ -23,107 +20,10 @@ GROUP_QUESTIONNAIRES_BY = [
    "device_id",
    "esm_session",
 ]
-# Each questionnaire occurs only once within each esm_session on the same device
-# within the same participant.
-
-
-DICT_SAM_QUESTION_IDS = {
-    87: (
-        "Was there a particular event that created tension in you?",
-        "Was er een bepaalde gebeurtenis die spanning veroorzaakte?",
-        "Je prišlo do kakega dogodka, ki je v vas ustvaril napetost?",
-    ),
-    88: (
-        "Did this event make you feel anxious?",
-        "Voelde je je angstig door deze gebeurtenis?",
-        "Ste se zaradi tega dogodka počutili tesnob",
-    ),
-    89: (
-        "Will the outcome of this event be negative?",
-        "Zal de uitkomst van deze gebeurtenis negatief zijn?",
-        "Bo izid tega dogodka negativen?",
-    ),
-    90: (
-        "How threatening was this event?",
-        "Hoe bedreigend was deze gebeurtenis?",
-        "Kako grozeč je bil ta dogodek?",
-    ),
-    91: (
-        "Is this going to have a negative impact on you?",
-        "Zal dit een negatieve impact op je hebben?",
-        "Ali bo to negativno vplivalo na vas?",
-    ),
-    92: (
-        "Is this going to have a positive impact on you?",
-        "Zal dit een positief effect op je hebben?",
-        "Ali bo to pozitivno vplivalo na vas?",
-    ),
-    93: (
-        "How eager are you to tackle this event?",
-        "Hoe graag wil je deze gebeurtenis aanpakken?",
-        "Kako zagnani ste bili",
-    ),
-    94: (
-        "To what extent can you become a stronger person because of this event?",
-        "In welke mate kan je een sterkere persoon worden door deze gebeurtenis?",
-        "V kolikšni meri lahko zaradi tega dogodka postanete močnejša oseba?",
-    ),
-    95: (
-        "To what extent are you excited thinking about the outcome of this event?",
-        "In welke mate ben je enthousiast bij de gedachte",
-        "V kolikšni meri vas misel na izid tega dogodka navdušuje?",
-    ),
-    96: (
-        "At what time did this event occur?",
-        "Hoe laat vond deze gebeurtenis plaats?",
-        "Kdaj se je ta dogodek zgodil?",
-    ),
-    97: (
-        "How long did this event last?",
-        "Hoe lang duurde deze gebeurtenis?",
-        "Kako dolgo je trajal ta dogodek?",
-    ),
-    98: (
-        "Was/is this event work-related?",
-        "Was/is deze gebeurtenis werkgerelateerd?",
-        "Je (bil) ta dogodek povezan s službo?",
-        "Je bil ali je ta dogodek povezan s službo?",
-    ),
-    99: (
-        "Did this overall period create tension in you?",
-        "Heeft deze globale periode spanning veroorzaakt?",
-        "Je to obdobje kot celota v vas ustvarilo napetost?",
-        "Je to celo obdobje v vas ustvarilo napetost?",
-    ),
-    100: (
-        "To what extent do you perceive this overall period as stressful?",
-        "In welke mate ervaar je deze globale periode als stressvol?",
-        "V kolikšni meri ste to obdobje dojemali kot stresno?",
-        "V kolikšni meri ste celo to obdobje dojemali kot stresno?",
-    ),
-}
+# Each questionnaire occurs only once within each esm_session on the same device within the same participant.


 def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
-    """
-    Extract information about stressful events.
-
-    Participants were asked: "Was there a particular event that created tension in you?"
-    Then a subset of questions related to this event followed.
-    This function goes through the follow-up questions one by one
-        and preprocesses them, so that it adds new columns to the dataframe.
-
-    Parameters
-    ----------
-    df_esm: pd.DataFrame
-        A raw dataframe of all ESM data.
-
-    Returns
-    -------
-    df_esm_events: pd.DataFrame
-        A cleaned up df of Stress Appraisal Measure items with additional columns.
-
-    """
    # 0. Select only questions from Stress Appraisal Measure.
    df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
    df_esm_sam = df_esm_preprocessed[
@ -178,8 +78,7 @@ def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:

 def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
-    This function calculates challenge and threat
-        (two Stress Appraisal Measure subscales) means,
+    This function calculates challenge and threat (two Stress Appraisal Measure subscales) means,
        for each ESM session (within participants and devices).
    It creates a grouped dataframe with means in two columns.

@ -191,8 +90,7 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
    Returns
    -------
    df_esm_event_threat_challenge_mean_wide: pd.DataFrame
-        A dataframe of unique ESM sessions (by participants and devices)
-        with threat and challenge means.
+        A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
    """
    # Select only threat and challenge assessments for events
    df_esm_event_threat_challenge = df_esm_sam_clean[
@ -214,8 +112,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
        aggfunc="mean",
    )
    # Drop unnecessary column values.
-    df_esm_event_threat_challenge_mean_wide.columns = (
-        df_esm_event_threat_challenge_mean_wide.columns.get_level_values(1)
+    df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
+        1
    )
    df_esm_event_threat_challenge_mean_wide.columns.name = None
    df_esm_event_threat_challenge_mean_wide.rename(
@ -291,12 +189,10 @@ def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:

 def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
-    This function only serves to convert the string datetime answer
-        into a real datetime type.
-    Errors during this conversion are coerced, meaning that non-datetime answers
-        are assigned Not a Time (NaT).
-    NOTE: Since the only available non-datetime answer to this question was
-        "0 - I do not remember", the NaTs can be interpreted to mean this.
+    This function only serves to convert the string datetime answer into a real datetime type.
+    Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
+    NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
+        the NaTs can be interpreted to mean this.

    Parameters
    ----------
@ -312,13 +208,9 @@ def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
        df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
    ].assign(
        event_time=lambda x: pd.to_datetime(
-            x.esm_user_answer,
-            errors="coerce",
-            format="%Y-%m-%d %H:%M:%S %z",
-            exact=True,
+            x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
        )
    )
-    # Example answer: 2020-09-29 00:05:00 +0200
    return df_esm_event_time


@ -349,12 +241,9 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
        == QUESTIONNAIRE_ID_SAM.get("event_duration")
    ].assign(
        event_duration=lambda x: pd.to_datetime(
-            x.esm_user_answer.str.slice(start=0, stop=-6),
-            errors="coerce",
-            format="%Y-%m-%d %H:%M:%S",
+            x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
        ).dt.time
    )
-    # Example answer: 2020-09-29 00:05:00 +0200
    # TODO Explore the values recorded in event_duration and possibly fix mistakes.
    # For example, participants reported setting 23:50:00 instead of 00:50:00.

@ -362,7 +251,7 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    # we can determine whether:
    #   - this event is still going on ("1 - It is still going on")
    #   - the participant couldn't remember it's duration ("0 - I do not remember")
-    # Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm
+    # Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
    # but only the numeric types of questions and answers.
    # Since this was of "datetime" type, convert these specific answers here again.
    df_esm_event_duration["event_duration_info"] = np.nan
@ -375,5 +264,4 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    return df_esm_event_duration


-# TODO: How many questions about the stressfulness of the period were asked
-#  and how does this relate to events?
+# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
--- a/machine_learning/classification_models.py
+++ b/machine_learning/classification_models.py
@ -1,123 +1,71 @@
-import pandas as pd
-import xgboost as xg
-from lightgbm import LGBMClassifier
-from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
 from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
+from lightgbm import LGBMClassifier
+import xgboost as xg 

-
-class ClassificationModels:
+class ClassificationModels():
+    
    def __init__(self):
        self.cmodels = self.init_classification_models()
-
+        
    def get_cmodels(self):
        return self.cmodels

    def init_classification_models(self):
        cmodels = {
-            "dummy_classifier": {
-                "model": DummyClassifier(strategy="most_frequent"),
-                "metrics": [0, 0, 0, 0],
+            'dummy_classifier': {
+                'model': DummyClassifier(strategy="most_frequent"),
+                'metrics': [0, 0, 0, 0]
            },
-            "logistic_regression": {
-                "model": linear_model.LogisticRegression(max_iter=1000),
-                "metrics": [0, 0, 0, 0],
+            'logistic_regression': {
+                'model': linear_model.LogisticRegression(max_iter=1000),
+                'metrics': [0, 0, 0, 0]
            },
-            "support_vector_machine": {"model": svm.SVC(), "metrics": [0, 0, 0, 0]},
-            "gaussian_naive_bayes": {
-                "model": naive_bayes.GaussianNB(),
-                "metrics": [0, 0, 0, 0],
+            'support_vector_machine': {
+                'model': svm.SVC(),
+                'metrics': [0, 0, 0, 0]
            },
-            "stochastic_gradient_descent_classifier": {
-                "model": linear_model.SGDClassifier(),
-                "metrics": [0, 0, 0, 0],
+            'gaussian_naive_bayes': {
+                'model': naive_bayes.GaussianNB(),
+                'metrics': [0, 0, 0, 0]
            },
-            "knn": {"model": neighbors.KNeighborsClassifier(), "metrics": [0, 0, 0, 0]},
-            "decision_tree": {
-                "model": tree.DecisionTreeClassifier(),
-                "metrics": [0, 0, 0, 0],
+            'stochastic_gradient_descent_classifier': {
+                'model': linear_model.SGDClassifier(),
+                'metrics': [0, 0, 0, 0]
            },
-            "random_forest_classifier": {
-                "model": ensemble.RandomForestClassifier(),
-                "metrics": [0, 0, 0, 0],
+            'knn': {
+                'model': neighbors.KNeighborsClassifier(),
+                'metrics': [0, 0, 0, 0]
            },
-            "gradient_boosting_classifier": {
-                "model": ensemble.GradientBoostingClassifier(),
-                "metrics": [0, 0, 0, 0],
+            'decision_tree': {
+                'model': tree.DecisionTreeClassifier(),
+                'metrics': [0, 0, 0, 0]
            },
-            "lgbm_classifier": {"model": LGBMClassifier(), "metrics": [0, 0, 0, 0]},
-            "XGBoost_classifier": {
-                "model": xg.sklearn.XGBClassifier(),
-                "metrics": [0, 0, 0, 0],
+            'random_forest_classifier': {
+                'model': ensemble.RandomForestClassifier(),
+                'metrics': [0, 0, 0, 0]
            },
+            'gradient_boosting_classifier': {
+                'model': ensemble.GradientBoostingClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'lgbm_classifier': {
+                'model': LGBMClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'XGBoost_classifier': {
+                'model': xg.sklearn.XGBClassifier(),
+                'metrics': [0, 0, 0, 0]
+            }
        }
-
+        
        return cmodels
-
+    
    def get_total_models_scores(self, n_clusters=1):
-        scores = pd.DataFrame(columns=["method", "metric", "mean"])
        for model_title, model in self.cmodels.items():
-            scores_df = pd.DataFrame(columns=["method", "metric", "mean"])
            print("\n************************************\n")
            print("Current model:", model_title, end="\n")
-            print("Acc:", model["metrics"][0] / n_clusters)
-            scores_df = pd.concat(
-                [
-                    scores_df,
-                    pd.DataFrame(
-                        {
-                            "method": model_title,
-                            "metric": "test_accuracy",
-                            "mean": model["metrics"][0] / n_clusters,
-                        },
-                        index=[0],
-                    ),
-                ],
-                ignore_index=True,
-            )
-            print("Precision:", model["metrics"][1] / n_clusters)
-            scores_df = pd.concat(
-                [
-                    scores_df,
-                    pd.DataFrame(
-                        {
-                            "method": model_title,
-                            "metric": "test_precision",
-                            "mean": model["metrics"][1] / n_clusters,
-                        },
-                        index=[0],
-                    ),
-                ],
-                ignore_index=True,
-            )
-            print("Recall:", model["metrics"][2] / n_clusters)
-            scores_df = pd.concat(
-                [
-                    scores_df,
-                    pd.DataFrame(
-                        {
-                            "method": model_title,
-                            "metric": "test_recall",
-                            "mean": model["metrics"][2] / n_clusters,
-                        },
-                        index=[0],
-                    ),
-                ],
-                ignore_index=True,
-            )
-            print("F1:", model["metrics"][3] / n_clusters)
-            scores_df = pd.concat(
-                [
-                    scores_df,
-                    pd.DataFrame(
-                        {
-                            "method": model_title,
-                            "metric": "test_f1",
-                            "mean": model["metrics"][3] / n_clusters,
-                        },
-                        index=[0],
-                    ),
-                ],
-                ignore_index=True,
-            )
-            scores = pd.concat([scores, scores_df])
-        return scores
+            print("Acc:", model['metrics'][0]/n_clusters)
+            print("Precision:", model['metrics'][1]/n_clusters)
+            print("Recall:", model['metrics'][2]/n_clusters)
+            print("F1:", model['metrics'][3]/n_clusters)
--- a/machine_learning/helper.py
+++ b/machine_learning/helper.py
@ -1,24 +1,15 @@
 from pathlib import Path
+from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.impute import SimpleImputer
+from sklearn.dummy import DummyRegressor, DummyClassifier
+
+from xgboost import XGBRegressor, XGBClassifier
+import xgboost as xg

-import numpy as np
 import pandas as pd
-from sklearn import (
-    ensemble,
-    gaussian_process,
-    kernel_ridge,
-    linear_model,
-    naive_bayes,
-    svm,
-)
-from sklearn.dummy import DummyClassifier, DummyRegressor
-from sklearn.metrics import confusion_matrix
-from sklearn.model_selection import (
-    BaseCrossValidator,
-    LeaveOneGroupOut,
-    StratifiedKFold,
-    cross_validate,
-)
-from xgboost import XGBClassifier, XGBRegressor
+import numpy as np


 def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
@ -74,116 +65,52 @@ def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> P
    full_path = folder / export_filename
    return full_path

-
 def insert_row(df, row):
    return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)

+def prepare_regression_model_input(input_csv):

-def impute_encode_categorical_features(model_input: pd.DataFrame) -> pd.DataFrame:
-    categorical_feature_col_names = [
-        "gender",
-        "startlanguage",
-        "limesurvey_demand_control_ratio_quartile",
-    ]
-    additional_categorical_features = [
-        col
-        for col in model_input.columns
-        if "mostcommonactivity" in col or "homelabel" in col
-    ]
-    categorical_feature_col_names += additional_categorical_features
+    model_input = pd.read_csv(input_csv)

-    categorical_features = model_input[categorical_feature_col_names].copy()
+    index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+    model_input.set_index(index_columns, inplace=True)

+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+
+    categorical_feature_colnames = ["gender", "startlanguage", "limesurvey_demand_control_ratio_quartile"]
+    additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+    categorical_feature_colnames += additional_categorical_features
+    #TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
+    categorical_features = data_x[categorical_feature_colnames].copy()
    mode_categorical_features = categorical_features.mode().iloc[0]
    # fillna with mode
    categorical_features = categorical_features.fillna(mode_categorical_features)
    # one-hot encoding
-    categorical_features = categorical_features.apply(
-        lambda col: col.astype("category")
-    )
+    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)

-    numerical_features = model_input.drop(categorical_feature_col_names, axis=1)
+    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)

-    model_input = pd.concat([numerical_features, categorical_features], axis=1)
-    return model_input
+    train_x = pd.concat([numerical_features, categorical_features], axis=1)
+
+    return train_x, data_y, data_groups


-def prepare_sklearn_data_format(
-    model_input: pd.DataFrame, cv_method: str = "logo"
-) -> tuple:
-    index_columns = [
-        "local_segment",
-        "local_segment_label",
-        "local_segment_start_datetime",
-        "local_segment_end_datetime",
-    ]
-    model_input.set_index(index_columns, inplace=True)
+def run_all_regression_models(input_csv):
+    # Prepare data
+    data_x, data_y, data_groups = prepare_regression_model_input(input_csv)

-    if cv_method == "half_logo":
-        model_input["pid_index"] = model_input.groupby("pid").cumcount()
-        model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
-
-        model_input["pid_index"] = (
-            model_input["pid_index"] / model_input["pid_count"] + 1
-        ).round()
-        model_input["pid_half"] = (
-            model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
-        )
-
-        data_x, data_y, data_groups = (
-            model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
-            model_input["target"],
-            model_input["pid_half"],
-        )
-    else:
-        data_x, data_y, data_groups = (
-            model_input.drop(["target", "pid"], axis=1),
-            model_input["target"],
-            model_input["pid"],
-        )
-    return data_x, data_y, data_groups
-
-
-def prepare_cross_validator(
-    data_x: pd.DataFrame,
-    data_y: pd.DataFrame,
-    data_groups: pd.DataFrame,
-    cv_method: str = "logo",
-) -> BaseCrossValidator:
-    if cv_method == "logo" or cv_method == "half_logo":
-        cv = LeaveOneGroupOut()
-        cv.get_n_splits(
-            data_x,
-            data_y,
-            groups=data_groups,
-        )
-    else:
-        cv = StratifiedKFold(n_splits=5, shuffle=True)
-    return cv
-
-
-def aggregate_and_transpose(df: pd.DataFrame, statistics=None) -> pd.DataFrame:
-    if statistics is None:
-        statistics = ["max", "mean"]
-    return (
-        df.agg(statistics)
-        .transpose()
-        .reset_index()
-        .rename(columns={"index": "test_metric"})
+    # Prepare cross validation
+    logo = LeaveOneGroupOut()
+    logo.get_n_splits(
+        data_x,
+        data_y,
+        groups=data_groups,
    )
-
-
-def run_all_regression_models(
-    data_x: pd.DataFrame,
-    data_y: pd.DataFrame,
-    data_groups: pd.DataFrame,
-    cross_validator: BaseCrossValidator,
-) -> pd.DataFrame:
-    metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
+    metrics = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']
    test_metrics = ["test_" + metric for metric in metrics]
-    scores = pd.DataFrame(columns=["method", "test_metric", "max", "nanmedian"])
+    scores = pd.DataFrame(columns=["method", "max", "nanmedian"])

    # Validate models
    dummy_regr = DummyRegressor(strategy="mean")
@ -192,58 +119,53 @@ def run_all_regression_models(
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("Dummy model:")
-    print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
-
+    print("R^2: ", np.nanmedian(dummy_regr_scores['test_r2']))
+    
    scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "dummy"
    scores = pd.concat([scores, scores_df])
-    del dummy_regr
-    del dummy_regr_scores

-    lin_reg = linear_model.LinearRegression()
+    lin_reg_rapids = linear_model.LinearRegression()
    lin_reg_scores = cross_validate(
-        lin_reg,
+        lin_reg_rapids,
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("Linear regression:")
-    print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
+    print("R^2: ", np.nanmedian(lin_reg_scores['test_r2']))

    scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "linear_reg"
    scores = pd.concat([scores, scores_df])
-    del lin_reg
-    del lin_reg_scores

-    ridge_reg = linear_model.Ridge(alpha=0.5)
+    ridge_reg = linear_model.Ridge(alpha=.5)
    ridge_reg_scores = cross_validate(
        ridge_reg,
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("Ridge regression")

    scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "ridge_reg"
    scores = pd.concat([scores, scores_df])
-    del ridge_reg
-    del ridge_reg_scores
+

    lasso_reg = linear_model.Lasso(alpha=0.1)
    lasso_reg_score = cross_validate(
@ -251,18 +173,16 @@ def run_all_regression_models(
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("Lasso regression")

    scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "lasso_reg"
    scores = pd.concat([scores, scores_df])
-    del lasso_reg
-    del lasso_reg_score

    bayesian_ridge_reg = linear_model.BayesianRidge()
    bayesian_ridge_reg_score = cross_validate(
@ -270,18 +190,16 @@ def run_all_regression_models(
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("Bayesian Ridge")

    scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "bayesian_ridge"
    scores = pd.concat([scores, scores_df])
-    del bayesian_ridge_reg
-    del bayesian_ridge_reg_score

    ransac_reg = linear_model.RANSACRegressor()
    ransac_reg_score = cross_validate(
@ -289,18 +207,16 @@ def run_all_regression_models(
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("RANSAC (outlier robust regression)")

    scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "RANSAC"
    scores = pd.concat([scores, scores_df])
-    del ransac_reg
-    del ransac_reg_score

    svr = svm.SVR()
    svr_score = cross_validate(
@ -308,18 +224,16 @@ def run_all_regression_models(
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("Support vector regression")
-
+    
    scores_df = pd.DataFrame(svr_score)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "SVR"
    scores = pd.concat([scores, scores_df])
-    del svr
-    del svr_score

    kridge = kernel_ridge.KernelRidge()
    kridge_score = cross_validate(
@ -327,18 +241,16 @@ def run_all_regression_models(
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("Kernel Ridge regression")
-
+    
    scores_df = pd.DataFrame(kridge_score)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "kernel_ridge"
    scores = pd.concat([scores, scores_df])
-    del kridge
-    del kridge_score

    gpr = gaussian_process.GaussianProcessRegressor()
    gpr_score = cross_validate(
@ -346,18 +258,16 @@ def run_all_regression_models(
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("Gaussian Process Regression")

    scores_df = pd.DataFrame(gpr_score)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "gaussian_proc"
    scores = pd.concat([scores, scores_df])
-    del gpr
-    del gpr_score

    rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
    rfr_score = cross_validate(
@ -365,18 +275,16 @@ def run_all_regression_models(
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("Random Forest Regression")

    scores_df = pd.DataFrame(rfr_score)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "random_forest"
    scores = pd.concat([scores, scores_df])
-    del rfr
-    del rfr_score

    xgb = XGBRegressor()
    xgb_score = cross_validate(
@ -384,18 +292,16 @@ def run_all_regression_models(
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("XGBoost Regressor")

    scores_df = pd.DataFrame(xgb_score)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "XGBoost"
    scores = pd.concat([scores, scores_df])
-    del xgb
-    del xgb_score

    ada = ensemble.AdaBoostRegressor()
    ada_score = cross_validate(
@ -403,328 +309,151 @@ def run_all_regression_models(
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cross_validator,
+        cv=logo,
        n_jobs=-1,
-        scoring=metrics,
+        scoring=metrics
    )
    print("ADA Boost Regressor")

    scores_df = pd.DataFrame(ada_score)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
+    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
    scores_df["method"] = "ADA_boost"
    scores = pd.concat([scores, scores_df])
-    del ada
-    del ada_score

    return scores


-def confusion_matrix_scorer(clf, X, y):
-    y_pred = clf.predict(X)
-    cm = confusion_matrix(y, y_pred)
-    return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]}
-
-
-def aggregate_confusion_matrix(scores_dict: dict) -> pd.DataFrame:
-    scores_aggregated = aggregate_and_transpose(
-        pd.DataFrame(scores_dict), statistics=["sum"]
-    )
-    return scores_aggregated[
-        ~scores_aggregated.test_metric.isin(["fit_time", "score_time"])
-    ]
-
-
-def run_all_classification_models(
-    data_x: pd.DataFrame,
-    data_y: pd.DataFrame,
-    data_groups: pd.DataFrame,
-    cross_validator: BaseCrossValidator,
-):
-    data_y_value_counts = data_y.value_counts()
-    if len(data_y_value_counts) == 1:
-        raise (ValueError("There is only one unique value in data_y."))
-    if len(data_y_value_counts) == 2:
-        metrics = ["accuracy", "average_precision", "recall", "f1"]
-    else:
-        metrics = ["accuracy", "precision_micro", "recall_micro", "f1_micro"]
-
+def run_all_classification_models(data_x, data_y, data_groups, cv_method):
+    metrics = ['accuracy', 'average_precision', 'recall', 'f1']
    test_metrics = ["test_" + metric for metric in metrics]

-    scores = pd.DataFrame(columns=["method", "test_metric", "max", "mean"])
+    scores = pd.DataFrame(columns=["method", "max", "mean"])

    dummy_class = DummyClassifier(strategy="most_frequent")

    dummy_score = cross_validate(
-        dummy_class,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        error_score="raise",
-        scoring=metrics,
-    )
-    dummy_confusion_matrix = cross_validate(
-        dummy_class,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        error_score="raise",
-        scoring=confusion_matrix_scorer,
+    dummy_class,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=metrics
    )
    print("Dummy")

    scores_df = pd.DataFrame(dummy_score)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df = pd.concat(
-        [
-            scores_df,
-            aggregate_confusion_matrix(dummy_confusion_matrix).rename(
-                columns={"sum": "mean"}
-                # Note: the column is misleadingly renamed to get concise output.
-            ),
-        ]
-    )
-    scores_df["method"] = "dummy_classifier"
+    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df["method"] = "Dummy"
    scores = pd.concat([scores, scores_df])
-    del dummy_class
-    del dummy_score
-    del dummy_confusion_matrix

    logistic_regression = linear_model.LogisticRegression()

    log_reg_scores = cross_validate(
-        logistic_regression,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=metrics,
-    )
-    log_reg_confusion_matrix = cross_validate(
-        logistic_regression,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=confusion_matrix_scorer,
+    logistic_regression,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    scoring=metrics
    )
    print("Logistic regression")

    scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df = pd.concat(
-        [
-            scores_df,
-            aggregate_confusion_matrix(log_reg_confusion_matrix).rename(
-                columns={"sum": "mean"}
-                # Note: the column is misleadingly renamed to get concise output.
-            ),
-        ]
-    )
-    scores_df["method"] = "logistic_regression"
+    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df["method"] = "logistic_reg"
    scores = pd.concat([scores, scores_df])
-    del logistic_regression
-    del log_reg_scores
-    del log_reg_confusion_matrix

    svc = svm.SVC()

    svc_scores = cross_validate(
-        svc,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=metrics,
-    )
-    svc_confusion_matrix = cross_validate(
-        svc,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=confusion_matrix_scorer,
+    svc,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    scoring=metrics
    )
    print("Support Vector Machine")

    scores_df = pd.DataFrame(svc_scores)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df = pd.concat(
-        [
-            scores_df,
-            aggregate_confusion_matrix(svc_confusion_matrix).rename(
-                columns={"sum": "mean"}
-                # Note: the column is misleadingly renamed to get concise output.
-            ),
-        ]
-    )
-    scores_df["method"] = "SVC"
+    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df["method"] = "svc"
    scores = pd.concat([scores, scores_df])
-    del svc
-    del svc_scores
-    del svc_confusion_matrix

    gaussian_nb = naive_bayes.GaussianNB()
-
+    
    gaussian_nb_scores = cross_validate(
-        gaussian_nb,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=metrics,
-    )
-    gaussian_nb_confusion_matrix = cross_validate(
-        gaussian_nb,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=confusion_matrix_scorer,
+    gaussian_nb,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    scoring=metrics
    )
    print("Gaussian Naive Bayes")

    scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df = pd.concat(
-        [
-            scores_df,
-            aggregate_confusion_matrix(gaussian_nb_confusion_matrix).rename(
-                columns={"sum": "mean"}
-                # Note: the column is misleadingly renamed to get concise output.
-            ),
-        ]
-    )
+    scores_df = scores_df.agg(['max', 'mean']).transpose()
    scores_df["method"] = "gaussian_naive_bayes"
    scores = pd.concat([scores, scores_df])
-    del gaussian_nb
-    del gaussian_nb_scores
-    del gaussian_nb_confusion_matrix

    sgdc = linear_model.SGDClassifier()

    sgdc_scores = cross_validate(
-        sgdc,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=metrics,
-    )
-    sgdc_confusion_matrix = cross_validate(
-        sgdc,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=confusion_matrix_scorer,
+    sgdc,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    scoring=metrics
    )
    print("Stochastic Gradient Descent")

    scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df = pd.concat(
-        [
-            scores_df,
-            aggregate_confusion_matrix(sgdc_confusion_matrix).rename(
-                columns={"sum": "mean"}
-                # Note: the column is misleadingly renamed to get concise output.
-            ),
-        ]
-    )
-    scores_df["method"] = "stochastic_gradient_descent_classifier"
+    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df["method"] = "stochastic_gradient_descent"
    scores = pd.concat([scores, scores_df])
-    del sgdc
-    del sgdc_scores
-    del sgdc_confusion_matrix

    rfc = ensemble.RandomForestClassifier()

    rfc_scores = cross_validate(
-        rfc,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=metrics,
-    )
-    rfc_confusion_matrix = cross_validate(
-        rfc,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=confusion_matrix_scorer,
+    rfc,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    scoring=metrics
    )
    print("Random Forest")

    scores_df = pd.DataFrame(rfc_scores)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df = pd.concat(
-        [
-            scores_df,
-            aggregate_confusion_matrix(rfc_confusion_matrix).rename(
-                columns={"sum": "mean"}
-                # Note: the column is misleadingly renamed to get concise output.
-            ),
-        ]
-    )
-    scores_df["method"] = "random_forest_classifier"
+    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df["method"] = "random_forest"
    scores = pd.concat([scores, scores_df])
-    del rfc
-    del rfc_scores
-    del rfc_confusion_matrix

    xgb_classifier = XGBClassifier()

    xgb_scores = cross_validate(
-        xgb_classifier,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=metrics,
-    )
-    xgb_confusion_matrix = cross_validate(
-        xgb_classifier,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=cross_validator,
-        n_jobs=-1,
-        scoring=confusion_matrix_scorer,
+    xgb_classifier,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    scoring=metrics
    )
    print("XGBoost")

    scores_df = pd.DataFrame(xgb_scores)[test_metrics]
-    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df = pd.concat(
-        [
-            scores_df,
-            aggregate_confusion_matrix(xgb_confusion_matrix).rename(
-                columns={"sum": "mean"}
-                # Note: the column is misleadingly renamed to get concise output.
-            ),
-        ]
-    )
-    scores_df["method"] = "XGBoost_classifier"
+    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df["method"] = "xgboost"
    scores = pd.concat([scores, scores_df])
-    del xgb_classifier
-    del xgb_scores
-    del xgb_confusion_matrix

    return scores
--- a/presentation/ApplicationCategories.R
+++ b/presentation/ApplicationCategories.R
@ -34,114 +34,18 @@ df_app_categories <- tbl(con, "app_categories") %>%
 head(df_app_categories)
 table(df_app_categories$play_store_genre)

-df_app_categories %>%
-  filter(play_store_genre == "not_found") %>%
-  group_by(play_store_response) %>%
-  count()
-# All "not_found" have an HTTP status of 404.
-
-df_app_categories %>%
-  filter(play_store_genre == "not_found") %>%
-  group_by(package_name) %>%
-  count() %>%
-  arrange(desc(n))
-# All "not_found" apps are unique.
-
-# Exclude phone manufacturers, custom ROM names and similar.
-manufacturers <- c(
-  "samsung",
-  "oneplus",
-  "huawei",
-  "xiaomi",
-  "lge",
-  "motorola",
-  "miui",
-  "lenovo",
-  "oppo",
-  "mediatek"
-)
-custom_rom <- c("coloros", "lineageos", "myos", "cyanogenmod", "foundation.e")
-other <- c("android", "wssyncmldm")
-
-grep_pattern <- paste(c(manufacturers, custom_rom, other), collapse = "|")
-
-rows_os_manufacturer <- grepl(grep_pattern, df_app_categories$package_name)
-
-# Explore what remains after excluding above.
-df_app_categories[!rows_os_manufacturer, ] %>%
-  filter(play_store_genre == "not_found")
-
-# Also check the relationship between is_system_app and System category.
-tbl(con, "applications") %>% 
-  filter(is_system_app, play_store_genre != "System") %>% 
-  count()
-# They are perfectly correlated.
-
-# Manually classify apps
-df_app_categories[df_app_categories$play_store_genre == "not_found",] <- 
-  df_app_categories %>% 
-  filter(play_store_genre == "not_found") %>% 
-  mutate(
-    play_store_genre =
-      case_when(
-        str_detect(str_to_lower(package_name), grep_pattern) ~ "System",
-        str_detect(str_to_lower(package_name), "straw") ~ "STRAW",
-        str_detect(str_to_lower(package_name), "chromium") ~ "Communication", # Same as chrome.
-        str_detect(str_to_lower(package_name), "skype") ~ "Communication", # Skype Lite not classified.
-        str_detect(str_to_lower(package_name), "imsservice") ~ "Communication", # IP Multimedia Subsystem
-        str_detect(str_to_lower(package_name), paste(c("covid", "empatica"), collapse = "|")) ~ "Medical",
-        str_detect(str_to_lower(package_name), paste(c("libri", "tachiyomi"), collapse = "|")) ~ "Books & Reference",
-        str_detect(str_to_lower(package_name), paste(c("bricks", "chess"), collapse = "|")) ~ "Casual",
-        str_detect(str_to_lower(package_name), "weather") ~ "Weather",
-        str_detect(str_to_lower(package_name), "excel") ~ "Productivity",
-        str_detect(str_to_lower(package_name), paste(c("qr", "barcode", "archimedes", "mixplorer", "winrar", "filemanager", "shot", "faceunlock", "signin", "milink"), collapse = "|")) ~ "Tools",
-        str_detect(str_to_lower(package_name), "stupeflix") ~ "Photography",
-        str_detect(str_to_lower(package_name), "anyme") ~ "Entertainment",
-        str_detect(str_to_lower(package_name), "vanced") ~ "Video Players & Editors",
-        str_detect(str_to_lower(package_name), paste(c("music", "radio", "dolby"), collapse = "|")) ~ "Music & Audio",
-        str_detect(str_to_lower(package_name), paste(c("tensorflow", "object_detection"), collapse = "|")) ~ "Education",
-        .default = play_store_genre
-      )
-  )
-
-# Explore what remains after classifying above.
-df_app_categories %>%
-  filter(play_store_genre == "not_found")
-
-# After this, 13 applications remain, which I will classify as "Other".
-
 # Correct some mistakes
-# And classify 'not_found'
-df_app_categories %<>%
-  mutate(
-    play_store_genre = {
-      function(x) {
-        case_when(
-          x == "Education,Education" ~ "Education",
-          x == "EducationEducation" ~ "Education",
-          x == "not_found" ~ "Other",
-          .default = x
-        )
-      }
-    }(play_store_genre)
-  ) %>%
-  select(-package_name) %>%
-  rename(
-    genre = play_store_genre,
-    package_name = package_hash
-  )
-
-table(df_app_categories$genre)
-
-df_app_categories %>%
-  group_by(genre) %>%
-  count() %>%
-  arrange(desc(n)) %>%
-  write_csv("play_store_categories_count.csv")
-
-write_csv(
-  x = select(df_app_categories, c(package_name, genre)),
-  file = "play_store_application_genre_catalogue.csv"
+df_app_categories %<>% mutate(
+  play_store_genre = {
+    function(x) {
+      case_when(
+        x == "Education,Education" ~ "Education",
+        x == "EducationEducation" ~ "Education",
+        x == "not_found" ~ "System",
+        .default = x
+      )
+    }
+  }(play_store_genre)
 )

 dbDisconnect(con)
--- a/presentation/event_stressful_detection_5fold.csv
+++ b/presentation/event_stressful_detection_5fold.csv
@ -0,0 +1,29 @@
+method,metric,max,mean
+Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
+Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
+Dummy,test_recall,0.0,0.0
+Dummy,test_f1,0.0,0.0
+logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
+logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
+logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
+logistic_reg,test_f1,0.3909774436090226,0.318943511424051
+svc,test_accuracy,0.8557046979865772,0.8548446932649828
+svc,test_average_precision,0.44514416839823046,0.4068200938341621
+svc,test_recall,0.0,0.0
+svc,test_f1,0.0,0.0
+gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
+gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
+gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
+gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
+stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
+stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
+stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
+stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
+random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
+random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
+random_forest,test_recall,0.4069767441860465,0.35356856455493185
+random_forest,test_f1,0.5691056910569107,0.5078402513053142
+xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
+xgboost,test_average_precision,0.7366643049075349,0.698622165966308
+xgboost,test_recall,0.5287356321839081,0.44346431435445066
+xgboost,test_f1,0.638888888888889,0.5633957169928393
--- a/presentation/event_stressful_detection_logo.csv
+++ b/presentation/event_stressful_detection_logo.csv
@ -0,0 +1,29 @@
+method,metric,max,mean
+Dummy,test_accuracy,1.0,0.8524114578096439
+Dummy,test_average_precision,0.7,0.14758854219035614
+Dummy,test_recall,0.0,0.0
+Dummy,test_f1,0.0,0.0
+logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
+logistic_reg,test_average_precision,1.0,0.44605167668563583
+logistic_reg,test_recall,1.0,0.25353566685532386
+logistic_reg,test_f1,0.823529411764706,0.27951926390778625
+svc,test_accuracy,1.0,0.8524114578096439
+svc,test_average_precision,0.9612401707068228,0.44179454944271934
+svc,test_recall,0.0,0.0
+svc,test_f1,0.0,0.0
+gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
+gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
+gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
+gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
+stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
+stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
+stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
+stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
+random_forest,test_accuracy,1.0,0.8722158105763481
+random_forest,test_average_precision,1.0,0.49817066323226833
+random_forest,test_recall,1.0,0.18161263127840668
+random_forest,test_f1,1.0,0.2508096532365307
+xgboost,test_accuracy,1.0,0.8812627400277729
+xgboost,test_average_precision,1.0,0.5505695112208401
+xgboost,test_recall,1.0,0.2896161238315027
+xgboost,test_f1,0.9411764705882353,0.36887408735855665
--- a/presentation/play_store_application_genre_catalogue.csv
+++ b/presentation/play_store_application_genre_catalogue.csv
--- a/presentation/play_store_categories_count.csv
+++ b/presentation/play_store_categories_count.csv
@ -1,45 +0,0 @@
-genre,n
-System,261
-Tools,96
-Productivity,71
-Health & Fitness,60
-Finance,54
-Communication,39
-Music & Audio,39
-Shopping,38
-Lifestyle,33
-Education,28
-News & Magazines,24
-Maps & Navigation,23
-Entertainment,21
-Business,18
-Travel & Local,18
-Books & Reference,16
-Social,16
-Weather,16
-Food & Drink,14
-Sports,14
-Other,13
-Photography,13
-Puzzle,13
-Video Players & Editors,12
-Card,9
-Casual,9
-Personalization,8
-Medical,7
-Board,5
-Strategy,4
-House & Home,3
-Trivia,3
-Word,3
-Adventure,2
-Art & Design,2
-Auto & Vehicles,2
-Dating,2
-Role Playing,2
-STRAW,2
-Simulation,2
-"Board,Brain Games",1
-"Entertainment,Music & Video",1
-Parenting,1
-Racing,1
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,7 +0,0 @@
-[tool.isort]
-profile = "black"
-py_version = 311
-skip_gitignore = "true"
-
-[tool.black]
-target-version = ["py311"]
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 059774bda10545a83ab282f59eb7a329fef9ee4c
+Subproject commit 7b8538ce5152bb6e978cae37fcb7099941e95364
--- a/setup.py
+++ b/setup.py
@ -1,7 +1,8 @@
 import os

+import sqlalchemy.engine.url
 from dotenv import load_dotenv
-from sqlalchemy import URL, create_engine
+from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker

 load_dotenv()
@ -10,7 +11,7 @@ testing: bool = False

 db_password = os.getenv("DB_PASSWORD")

-db_uri = URL.create(
+db_uri = sqlalchemy.engine.url.URL(
    drivername="postgresql+psycopg2",
    username="staw_db",
    password=db_password,
--- a/statistical_analysis/scale_reliability.rmd
+++ b/statistical_analysis/scale_reliability.rmd
@ -1,60 +0,0 @@
---
-title: "Reliability of SAM threat and challenge and COPE"
-output: html_notebook
---
-
-
-```{r libraries, message=FALSE, warning=FALSE, include=FALSE, cache=FALSE}
-library(conflicted)
-library(here)
-library(tidyverse)
-library(magrittr)
-library(lavaan)
-library(kableExtra)
-
-conflicts_prefer(
-    readr::col_factor,
-    purrr::discard,
-    dplyr::filter,
-    dplyr::lag,
-    purrr::set_names,
-    tidyr::extract,
-    kableExtra::group_rows
-)
-```
-
-```{r style, include=FALSE, cache=FALSE}
-styler::style_file(
-    here("statistical_analysis", "scale_reliability.Rmd"),
-    scope = "tokens",
-    indent_by = 4L
-)
-```
-
-The data were preprocessed and cleaned using [expl_esm_labels.py](../exploration/expl_esm_labels.py) script and read as csv here.
-
-```{r read_data}
-COL_TYPES <- cols(
-    .default = col_double(),
-    participant_id = col_factor(),
-    username = col_factor(),
-    device_id = col_factor(),
-    esm_trigger = col_factor(),
-    esm_instructions = col_factor(),
-    double_esm_user_answer_timestamp = col_double(),
-    datetime_lj = col_datetime(format = ""),
-    date_lj = col_date(format = ""),
-    time = col_factor(),
-    esm_user_answer = col_factor()
-)
-df_SAM <- read_csv(here("data", "raw", "df_esm_SAM_threat_challenge.csv"), col_types = COL_TYPES)
-df_COPE <- read_csv(here("data", "raw", "df_esm_COPE.csv"), col_types = COL_TYPES)
-```
-
-Demonstrate factor analysis for a single participant.
-
-```{r}
-df_COPE %>%
-	group_by(question_id, questionnaire_id) %>%
-	count()
-```
--- a/straw2analysis.Rproj
+++ b/straw2analysis.Rproj
@ -1,20 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-UseSpacesForTab: No
-NumSpacesForTab: 4
-Encoding: UTF-8
-
-RnwWeave: Sweave
-LaTeX: XeLaTeX
-
-AutoAppendNewline: Yes
-StripTrailingWhitespace: Yes
-
-PythonType: conda
-PythonVersion: 3.11.3
-PythonPath: E:/ProgramData/mambaforge/envs/straw2analysis/python.exe