Fix questions that were slightly different in the morning.

Use refactored methods.
Fix trailing whitespace.
2023-07-03 21:29:09 +02:00 · 2023-07-03 21:18:15 +02:00 · 2023-07-03 21:17:40 +02:00 · 2023-07-03 21:13:50 +02:00 · 2023-07-03 21:01:15 +02:00 · 2023-07-03 20:52:08 +02:00
36 changed files with 2379 additions and 1514 deletions
--- a/.flake8
+++ b/.flake8
@ -0,0 +1,9 @@
+[flake8]
+max-line-length = 88
+extend-ignore =
+    E203,
+    # E501 line too long for docstrings
+    D501
+per-file-ignores =
+    exploration/*.py:E501
+docstring-convention = numpy
--- a/.gitignore
+++ b/.gitignore
@ -12,12 +12,15 @@ __pycache__/
 /data/*input*.csv
 /data/daily*
 /data/intradaily*
+/data/raw
 /data/stressfulness_event*
 /data/30min*
 /presentation/*scores.csv
 /presentation/Results.ods
+/presentation/results/
 .Rproj.user
 .Rhistory
 /presentation/*.nb.html
 presentation/event_stressful_detection_half_loso.csv
 presentation/event_stressful_detection_loso.csv
+/statistical_analysis/scale_reliability.nb.html
--- a/.idea/codeStyles/Project.xml
+++ b/.idea/codeStyles/Project.xml
@ -0,0 +1,6 @@
+<component name="ProjectCodeStyleConfiguration">
+  <code_scheme name="Project" version="173">
+    <option name="RIGHT_MARGIN" value="150" />
+    <option name="SOFT_MARGINS" value="88" />
+  </code_scheme>
+</component>
--- a/.idea/codeStyles/codeStyleConfig.xml
+++ b/.idea/codeStyles/codeStyleConfig.xml
@ -0,0 +1,5 @@
+<component name="ProjectCodeStyleConfiguration">
+  <state>
+    <option name="USE_PER_PROJECT_SETTINGS" value="true" />
+  </state>
+</component>
--- a/.idea/dictionaries/junos.xml
+++ b/.idea/dictionaries/junos.xml
@ -0,0 +1,3 @@
+<component name="ProjectDictionaryState">
+  <dictionary name="junos" />
+</component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -17,7 +17,15 @@
            </RMarkdownRenderProfile>
          </value>
        </entry>
+        <entry key="file://$PROJECT_DIR$/statistical_analysis/scale_reliability.rmd">
+          <value>
+            <RMarkdownRenderProfile>
+              <option name="lastOutput" value="$PROJECT_DIR$/statistical_analysis/scale_reliability.nb.html" />
+              <option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/statistical_analysis" />
+            </RMarkdownRenderProfile>
+          </value>
+        </entry>
      </map>
    </option>
  </component>
-</project>
+</project>
--- a/.idea/rGraphicsSettings.xml
+++ b/.idea/rGraphicsSettings.xml
@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="RGraphicsSettings">
+    <option name="height" value="600" />
+    <option name="resolution" value="75" />
+    <option name="version" value="2" />
+    <option name="width" value="960" />
+  </component>
+</project>
--- a/.idea/rMarkdownGraphicsSettings.xml
+++ b/.idea/rMarkdownGraphicsSettings.xml
@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="RMarkdownGraphicsSettings">
+    <option name="globalResolution" value="75" />
+    <option name="version" value="2" />
+  </component>
+</project>
--- a/.idea/rSettings.xml
+++ b/.idea/rSettings.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="RSettings">
+    <option name="interpreterPath" value="C:\Program Files\R\R-4.3.1\bin\R.exe" />
+  </component>
+</project>
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,30 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: isort (python)
+  - repo: https://github.com/psf/black
+    rev: 23.3.0
+    hooks:
+      - id: black
+        language_version: python3
+  - repo: https://github.com/pycqa/flake8
+    rev: 6.0.0
+    hooks:
+      - id: flake8
+#  - repo: https://github.com/mwouts/jupytext
+#    rev: v1.14.7
+#    hooks:
+#      - id: jupytext
+#        args: [ --from, "py:percent", --to, "ipynb" ]
+#        additional_dependencies:
+#          - isort==5.12.0 # Matches hook
+#          - black==23.3.0
+#          - flake8==6.0.0
--- a/config/environment.yml
+++ b/config/environment.yml
@ -6,6 +6,7 @@ dependencies:
  - black
  - isort
  - flake8
+  - flake8-docstrings
  - imbalanced-learn=0.10.0
  - jupyterlab
  - jupytext
@ -14,6 +15,7 @@ dependencies:
  - nodejs
  - pandas
  - psycopg2 >= 2.9.1
+  - pre-commit
  - python-dotenv
  - pytz
  - pyprojroot
@ -23,4 +25,4 @@ dependencies:
  - sqlalchemy
  - statsmodels
  - tabulate
-  - xgboost
+  - xgboost
--- a/exploration/debug_heatmap.py
+++ b/exploration/debug_heatmap.py
@ -14,15 +14,9 @@
 # ---

 # %%
-import os, sys
-import importlib
 import pandas as pd
-import numpy as np

-# import plotly.graph_objects as go
-from importlib import util
-from pathlib import Path
-import yaml
+from rapids.src.features.utils.utils import chunk_episodes

 # %%
 phone_data_yield = pd.read_csv(
@ -36,23 +30,29 @@ time_segments_labels = pd.read_csv(
 # %%
 phone_data_yield["assigned_segments"] = phone_data_yield[
    "assigned_segments"
-].str.replace(r"_RR\d+SS#", "#")
+].str.replace(r"_RR\d+SS#", "#", regex=True)
 time_segments_labels["label"] = time_segments_labels["label"].str.replace(
-    r"_RR\d+SS$", ""
+    r"_RR\d+SS$", "", regex=True
 )


 # %% tags=[]
-def filter_data_by_segment(data, time_segment):
+def filter_data_by_segment(data, time_segment_current):
    data.dropna(subset=["assigned_segments"], inplace=True)
    if data.shape[0] == 0:  # data is empty
        data["local_segment"] = data["timestamps_segment"] = None
        return data

-    datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
-    timestamps_regex = "[0-9]{13}"
-    segment_regex = "\[({}#{},{};{},{})\]".format(
-        time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
+    datetime_regex = (
+        r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
+    )
+    timestamps_regex = r"[0-9]{13}"
+    segment_regex = r"\[({}#{},{};{},{})\]".format(
+        time_segment_current,
+        datetime_regex,
+        datetime_regex,
+        timestamps_regex,
+        timestamps_regex,
    )
    data["local_segment"] = data["assigned_segments"].str.extract(
        segment_regex, expand=True
@ -147,14 +147,17 @@ def getDataForPlot(phone_data_yield_per_segment):
        .fillna(0)
    )

-    # transpose the dataframe per local start datetime of the segment and discard the useless index layer
+    # transpose the dataframe per local start datetime of the segment
+    # and discard the useless index layer
    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
        "local_segment_start_datetimes"
    )[["minutes_after_segment_start", "sensor"]].apply(
        lambda x: x.set_index("minutes_after_segment_start").transpose()
    )
-    phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
-        "local_segment_start_datetimes"
+    phone_data_yield_per_segment.index = (
+        phone_data_yield_per_segment.index.get_level_values(
+            "local_segment_start_datetimes"
+        )
    )
    return phone_data_yield_per_segment

@ -227,9 +230,13 @@ phone_data_yield_per_segment.tail()
 # # A workaround

 # %%
-phone_data_yield_per_segment["local_segment_start_datetimes", "minutes_after_segment_start"] = phone_data_yield_per_segment[
+phone_data_yield_per_segment[
+    "local_segment_start_datetimes", "minutes_after_segment_start"
+] = phone_data_yield_per_segment[
    ["local_segment_start_datetimes", "minutes_after_segment_start"]
-].drop_duplicates(keep="first")
+].drop_duplicates(
+    keep="first"
+)

 # %%
 phone_data_yield_per_segment.set_index(
@ -244,8 +251,9 @@ phone_data_yield_per_segment.head()
 # %% [markdown]
 # # Retry

+
 # %%
-def getDataForPlot(phone_data_yield_per_segment):
+def get_data_for_plot(phone_data_yield_per_segment):
    # calculate the length (in minute) of per segment instance
    phone_data_yield_per_segment["length"] = (
        phone_data_yield_per_segment["timestamps_segment"]
@ -292,7 +300,10 @@ def getDataForPlot(phone_data_yield_per_segment):
        full_index,
        names=("local_segment_start_datetimes", "minutes_after_segment_start"),
    )
-    phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(subset=["local_segment_start_datetimes", "minutes_after_segment_start"],keep="first")
+    phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(
+        subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
+        keep="first",
+    )
    phone_data_yield_per_segment = (
        phone_data_yield_per_segment.set_index(
            ["local_segment_start_datetimes", "minutes_after_segment_start"]
@ -302,14 +313,17 @@ def getDataForPlot(phone_data_yield_per_segment):
        .fillna(0)
    )

-    # transpose the dataframe per local start datetime of the segment and discard the useless index layer
+    # transpose the dataframe per local start datetime of the segment
+    # and discard the useless index layer
    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
        "local_segment_start_datetimes"
    )[["minutes_after_segment_start", "sensor"]].apply(
        lambda x: x.set_index("minutes_after_segment_start").transpose()
    )
-    phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
-        "local_segment_start_datetimes"
+    phone_data_yield_per_segment.index = (
+        phone_data_yield_per_segment.index.get_level_values(
+            "local_segment_start_datetimes"
+        )
    )
    return phone_data_yield_per_segment

@ -318,6 +332,6 @@ def getDataForPlot(phone_data_yield_per_segment):
 phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)

 # %%
-data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
+data_for_plot_per_segment = get_data_for_plot(phone_data_yield_per_segment)

 # %%
--- a/exploration/expl_esm_labels.py
+++ b/exploration/expl_esm_labels.py
@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -15,19 +15,33 @@
 # ---

 # %%
-import os
-import sys
 import datetime

 import seaborn as sns

-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
 import participants.query_db
-from features.esm import *
-from features.esm_JCQ import *
-from features.esm_SAM import *
+from features.esm import (
+    QUESTIONNAIRE_IDS,
+    clean_up_esm,
+    get_esm_data,
+    increment_answers,
+    preprocess_esm,
+    reassign_question_ids,
+)
+from features.esm_COPE import DICT_COPE_QUESTION_IDS
+from features.esm_JCQ import reverse_jcq_demand_control_scoring
+from features.esm_SAM import DICT_SAM_QUESTION_IDS, extract_stressful_events
+
+# import os
+# import sys
+# nb_dir = os.path.split(os.getcwd())[0]
+# if nb_dir not in sys.path:
+#     sys.path.append(nb_dir)
+
+
+# %%
+save_figs = False
+export_data = True

 # %%
 participants_inactive_usernames = participants.query_db.get_usernames(
@ -43,8 +57,14 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)

 # %%
 df_esm_PANAS = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] == 8)
-    | (df_esm_preprocessed["questionnaire_id"] == 9)
+    (
+        df_esm_preprocessed["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["PANAS_positive_affect"]
+    )
+    | (
+        df_esm_preprocessed["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["PANAS_negative_affect"]
+    )
 ]
 df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)

@ -65,35 +85,47 @@ df_esm_PANAS_daily_means = (
 # %%
 df_esm_PANAS_summary_participant = (
    df_esm_PANAS_daily_means.groupby(["participant_id", "questionnaire_id"])
-    .agg(["mean", "median", "std"])
+    .esm_numeric_mean.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
-df_esm_PANAS_summary_participant.columns = df_esm_PANAS_summary_participant.columns.get_level_values(
-    1
-)
 df_esm_PANAS_summary_participant[
-    "PANAS_subscale"
+    "PANAS subscale"
 ] = df_esm_PANAS_daily_means.questionnaire_id.astype("category").cat.rename_categories(
-    {8.0: "PA", 9.0: "NA"}
+    {8.0: "positive affect", 9.0: "negative affect"}
 )

 # %%
-sns.displot(
-    data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS_subscale", binwidth=0.2
+df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["mean"]
+
+# %%
+df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["std"]
+
+# %%
+df_esm_PANAS_summary_participant.query("std == 0")
+
+# %%
+fig1 = sns.displot(
+    data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS subscale", binwidth=0.2
 )
+fig1.set_axis_labels(x_var="participant mean", y_var="frequency")
+if save_figs:
+    fig1.figure.savefig("PANAS_mean_participant.pdf", dpi=300)

 # %%
 sns.displot(
    data=df_esm_PANAS_summary_participant,
    x="median",
-    hue="PANAS_subscale",
+    hue="PANAS subscale",
    binwidth=0.2,
 )

 # %%
-sns.displot(
-    data=df_esm_PANAS_summary_participant, x="std", hue="PANAS_subscale", binwidth=0.05
+fig2 = sns.displot(
+    data=df_esm_PANAS_summary_participant, x="std", hue="PANAS subscale", binwidth=0.05
 )
+fig2.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
+if save_figs:
+    fig2.figure.savefig("PANAS_std_participant.pdf", dpi=300)

 # %%
 df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
@ -109,8 +141,14 @@ df_SAM_all.head()

 # %%
 df_esm_SAM = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 87)
-    & (df_esm_preprocessed["questionnaire_id"] <= 93)
+    (
+        df_esm_preprocessed["questionnaire_id"]
+        >= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
+    )
+    & (
+        df_esm_preprocessed["questionnaire_id"]
+        <= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
+    )
 ]
 df_esm_SAM_clean = clean_up_esm(df_esm_SAM)

@ -118,9 +156,10 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
 # ## Stressful events

 # %%
-df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign(
-    stressful_event=lambda x: (x.esm_user_answer_numeric > 0)
-)
+df_esm_SAM_event = df_esm_SAM_clean[
+    df_esm_SAM_clean["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
+].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0))

 # %%
 df_esm_SAM_daily_events = (
@ -131,20 +170,22 @@ df_esm_SAM_daily_events = (
 )

 # %% [markdown]
-# Calculate the daily mean of YES (1) or NO (0) answers to the question about a stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
+# Calculate the daily mean of YES (1) or NO (0) answers to the question about stressful events. This is then the daily ratio of EMA sessions that included a stressful event.

 # %%
 df_esm_SAM_event_summary_participant = (
    df_esm_SAM_daily_events.groupby(["participant_id"])
-    .agg(["mean", "median", "std"])
+    .SAM_event_ratio.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
-df_esm_SAM_event_summary_participant.columns = df_esm_SAM_event_summary_participant.columns.get_level_values(
-    1
-)

 # %%
-sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
+fig6 = sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
+fig6.set_axis_labels(
+    x_var="participant proportion of stressful events", y_var="frequency"
+)
+if save_figs:
+    fig6.figure.savefig("SAM_events_mean_participant.pdf", dpi=300)

 # %%
 sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
@ -155,7 +196,12 @@ sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
 # %% [markdown]
 # * Example of threat: "Did this event make you feel anxious?"
 # * Example of challenge: "How eager are you to tackle this event?"
-# * Possible answers: 0 - Not at all, 1 - Slightly, 2 - Moderately, 3 - Considerably, 4 - Extremely
+# * Possible answers:
+#   0 - Not at all,
+#   1 - Slightly,
+#   2 - Moderately,
+#   3 - Considerably,
+#   4 - Extremely

 # %%
 df_esm_SAM_daily = (
@ -167,27 +213,45 @@ df_esm_SAM_daily = (

 # %%
 df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
-    (df_esm_SAM_daily["questionnaire_id"] == 88)
-    | (df_esm_SAM_daily["questionnaire_id"] == 89)
+    (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
+    | (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
 ]

 # %%
 df_esm_SAM_summary_participant = (
    df_esm_SAM_daily.groupby(["participant_id", "questionnaire_id"])
-    .agg(["mean", "median", "std"])
+    .esm_numeric_mean.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
-df_esm_SAM_summary_participant.columns = df_esm_SAM_summary_participant.columns.get_level_values(
-    1
+
+# %%
+df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[
+    df_esm_SAM_summary_participant["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
+]
+df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"]
+
+# %%
+df_esm_SAM_event_stressfulness_summary_participant.describe()["std"]
+
+# %%
+sns.displot(
+    data=df_esm_SAM_event_stressfulness_summary_participant, x="mean", binwidth=0.2
 )

 # %%
 df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
-    (df_esm_SAM_summary_participant["questionnaire_id"] == 88)
-    | (df_esm_SAM_summary_participant["questionnaire_id"] == 89)
+    (
+        df_esm_SAM_summary_participant["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["appraisal_threat"]
+    )
+    | (
+        df_esm_SAM_summary_participant["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["appraisal_challenge"]
+    )
 ]
 df_esm_SAM_threat_challenge_summary_participant[
-    "event_subscale"
+    "event subscale"
 ] = df_esm_SAM_threat_challenge_summary_participant.questionnaire_id.astype(
    "category"
 ).cat.rename_categories(
@ -198,26 +262,84 @@ df_esm_SAM_threat_challenge_summary_participant[
 sns.displot(
    data=df_esm_SAM_threat_challenge_summary_participant,
    x="mean",
-    hue="event_subscale",
+    hue="event subscale",
    binwidth=0.2,
 )

 # %%
-sns.displot(
+fig3 = sns.displot(
    data=df_esm_SAM_threat_challenge_summary_participant,
    x="std",
-    hue="event_subscale",
+    hue="event subscale",
    binwidth=0.1,
 )
+fig3.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
+if save_figs:
+    fig3.figure.savefig("SAM_std_participant.pdf", dpi=300)
+
+# %%
+df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
+    "mean"
+]
+
+# %%
+df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
+    "std"
+]
+
+# %%
+df_esm_SAM_clean.columns
+
+# %%
+df_esm_SAM_clean.esm_status.value_counts()
+
+# %%
+if export_data:
+    df_esm_SAM_fixed = reassign_question_ids(df_esm_SAM_clean, DICT_SAM_QUESTION_IDS)
+    df_esm_SAM_fixed = increment_answers(df_esm_SAM_fixed)
+    df_esm_SAM_for_export = df_esm_SAM_fixed[
+        [
+            "participant_id",
+            "username",
+            "device_id",
+            "_id",
+            "esm_trigger",
+            "esm_session",
+            "esm_notification_id",
+            "question_id",
+            "questionnaire_id",
+            "esm_instructions",
+            "double_esm_user_answer_timestamp",
+            "datetime_lj",
+            "date_lj",
+            "time",
+            "esm_user_answer",
+            "esm_user_answer_numeric",
+        ]
+    ]
+    df_esm_SAM_for_export.sort_values(
+        by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
+    )
+    print(df_esm_SAM_for_export.head())
+    df_esm_SAM_for_export.to_csv(
+        "../data/raw/df_esm_SAM_threat_challenge.csv", index=False
+    )

 # %% [markdown]
 # ## Stressfulness of period

 # %%
 df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
-    df_esm_SAM_summary_participant["questionnaire_id"] == 93
+    df_esm_SAM_summary_participant["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
 ]

+# %%
+df_esm_SAM_period_summary_participant.describe()["mean"]
+
+# %%
+df_esm_SAM_period_summary_participant.describe()["std"]
+
 # %%
 sns.displot(data=df_esm_SAM_period_summary_participant, x="mean", binwidth=0.2)

@ -229,8 +351,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1)

 # %%
 df_esm_JCQ_demand_control = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 10)
-    & (df_esm_preprocessed["questionnaire_id"] <= 11)
+    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"])
+    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"])
 ]
 df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)

@ -250,14 +372,11 @@ df_esm_JCQ_daily = (
 )
 df_esm_JCQ_summary_participant = (
    df_esm_JCQ_daily.groupby(["participant_id", "questionnaire_id"])
-    .agg(["mean", "median", "std"])
+    .esm_score_mean.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
-df_esm_JCQ_summary_participant.columns = df_esm_JCQ_summary_participant.columns.get_level_values(
-    1
-)
 df_esm_JCQ_summary_participant[
-    "JCQ_subscale"
+    "JCQ subscale"
 ] = df_esm_JCQ_summary_participant.questionnaire_id.astype(
    "category"
 ).cat.rename_categories(
@ -265,11 +384,71 @@ df_esm_JCQ_summary_participant[
 )

 # %%
-sns.displot(
-    data=df_esm_JCQ_summary_participant, x="mean", hue="JCQ_subscale", binwidth=0.1,
-)
+df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["mean"]

 # %%
-sns.displot(
-    data=df_esm_JCQ_summary_participant, x="std", hue="JCQ_subscale", binwidth=0.05,
+df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["std"]
+
+# %%
+fig4 = sns.displot(
+    data=df_esm_JCQ_summary_participant,
+    x="mean",
+    hue="JCQ subscale",
+    binwidth=0.1,
 )
+fig4.set_axis_labels(x_var="participant mean", y_var="frequency")
+if save_figs:
+    fig4.figure.savefig("JCQ_mean_participant.pdf", dpi=300)
+
+# %%
+fig5 = sns.displot(
+    data=df_esm_JCQ_summary_participant,
+    x="std",
+    hue="JCQ subscale",
+    binwidth=0.05,
+)
+fig6.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
+if save_figs:
+    fig5.figure.savefig("JCQ_std_participant.pdf", dpi=300)
+
+# %% [markdown]
+# # COPE Inventory
+
+# %%
+df_esm_COPE = df_esm_preprocessed[
+    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"])
+    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"])
+]
+
+# %%
+df_esm_COPE_clean = clean_up_esm(df_esm_COPE)
+df_esm_COPE_clean = increment_answers(df_esm_COPE_clean)
+df_esm_COPE_fixed = reassign_question_ids(df_esm_COPE_clean, DICT_COPE_QUESTION_IDS)
+
+# %%
+if export_data:
+    df_esm_COPE_for_export = df_esm_COPE_fixed[
+        [
+            "participant_id",
+            "username",
+            "device_id",
+            "_id",
+            "esm_trigger",
+            "esm_session",
+            "esm_notification_id",
+            "question_id",
+            "questionnaire_id",
+            "esm_instructions",
+            "double_esm_user_answer_timestamp",
+            "datetime_lj",
+            "date_lj",
+            "time",
+            "esm_user_answer",
+            "esm_user_answer_numeric",
+        ]
+    ]
+    df_esm_COPE_for_export.sort_values(
+        by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
+    )
+    print(df_esm_COPE_for_export.head())
+    df_esm_COPE_for_export.to_csv("../data/raw/df_esm_COPE.csv", index=False)
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@ -20,30 +20,74 @@ import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd

+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import recall_score, f1_score
+
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)

 from machine_learning.cross_validation import CrossValidation
 from machine_learning.preprocessing import Preprocessing
+from machine_learning.feature_selection import FeatureSelection

 # %% 
 df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 df.set_index(index_columns, inplace=True)

+# Create binary target 
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
+
+
+nan_cols = df.columns[df.isna().any()].tolist()
+df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
+
 cv = CrossValidation(data=df, cv_method="logo")

 categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
 interval_feature_list, other_feature_list = [], []

-print(df.columns.tolist())
-
+# %%
 for split in cv.get_splits():
    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
    pre = Preprocessing(train_X, train_y, test_X, test_y)
    pre.one_hot_encode_train_and_test_sets(categorical_columns)
    train_X, train_y, test_X, test_y = pre.get_train_test_sets()
+    
+    
+    print(train_X.shape, test_X.shape)
+    # Predict before feature selection
+    rfc = RandomForestClassifier(n_estimators=10)
+    rfc.fit(train_X, train_y)
+    predictions = rfc.predict(test_X)
+    
+    print("Recall:", recall_score(test_y, predictions))
+    print("F1:", f1_score(test_y, predictions))
+    
+    # Feature selection on train set
+    train_groups, test_groups = cv.get_groups_sets(split)
+
+    fs = FeatureSelection(train_X, train_y, train_groups) 
+    selected_features = fs.select_features(n_min=20, n_max=29, k=40,
+                                           ml_type="classification_bin", 
+                                           metric="recall", n_tolerance=20)
+    
+    train_X = train_X[selected_features]
+    test_X = test_X[selected_features]
+    
+    print(selected_features)
+    print(len(selected_features))
+    
+    # Predict after feature selection    
+    rfc = RandomForestClassifier(n_estimators=500)
+    rfc.fit(train_X, train_y)
+    predictions = rfc.predict(test_X)
+    
+    print("Recall:", recall_score(test_y, predictions))
+    print("F1:", f1_score(test_y, predictions))
+    
    break

 # %%
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@ -6,457 +6,129 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---

-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-# %matplotlib inline
-import os
-import sys
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# from IPython.core.interactiveshell import InteractiveShell
+from pathlib import Path

-import numpy as np
-import matplotlib.pyplot as plt
+# matplotlib inline
+# import os
+# import sys
 import pandas as pd

-from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
-from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
-from sklearn.dummy import DummyClassifier
-from sklearn.impute import SimpleImputer
+from machine_learning.helper import (
+    impute_encode_categorical_features,
+    prepare_cross_validator,
+    prepare_sklearn_data_format,
+    run_all_classification_models,
+)

-from lightgbm import LGBMClassifier
-import xgboost as xg
-from IPython.core.interactiveshell import InteractiveShell
-InteractiveShell.ast_node_interactivity = "all"
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
-    
-import machine_learning.helper
-
-# %% [markdown]
-# # RAPIDS models
-
-# %% [markdown]
-# ## Set script's parameters
+# InteractiveShell.ast_node_interactivity = "all"
 #
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
-cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
-n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
-undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
-# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-model_input.set_index(index_columns, inplace=True)
-model_input['target'].value_counts()
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-# bins = [-10, 0, 10] # bins for z-scored targets
-bins = [-1, 0, 4] # bins for stressfulness (0-4) target
-model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
-model_input['target'].value_counts(), edges
-# model_input = model_input[model_input['target'] != "medium"]
-model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
-
-model_input['target'].value_counts()
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-# UnderSampling
-if undersampling:
-    no_stress = model_input[model_input['target'] == 0]
-    stress = model_input[model_input['target'] == 1]
-    
-    no_stress = no_stress.sample(n=len(stress))
-    model_input = pd.concat([stress,no_stress], axis=0)
-
-#   model_input_new = pd.DataFrame(columns=model_input.columns)
-#   for pid in model_input["pid"].unique():
-#     stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
-#     no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
-#     if (len(stress) == 0):
-#       continue
-#     if (len(no_stress) == 0):
-#       continue
-#     model_input_new = pd.concat([model_input_new, stress], axis=0)
-    
-#     no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
-#     # In case there are more stress samples than no_stress, take all instances of no_stress.
-#     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
-#     model_input = model_input_new   
-#     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
+# nb_dir = os.path.split(os.getcwd())[0]
+# if nb_dir not in sys.path:
+#     sys.path.append(nb_dir)


-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-if cv_method_str == 'half_logo':
-    model_input['pid_index'] = model_input.groupby('pid').cumcount()
-    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
-
-    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
-    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
-
-    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
-else:
-    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
-
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-categorical_feature_colnames = ["gender", "startlanguage"]
-additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
-categorical_feature_colnames += additional_categorical_features
-
-categorical_features = data_x[categorical_feature_colnames].copy()
-mode_categorical_features = categorical_features.mode().iloc[0]
-
-# fillna with mode
-categorical_features = categorical_features.fillna(mode_categorical_features)
-
-# one-hot encoding
-categorical_features = categorical_features.apply(lambda col: col.astype("category"))
-if not categorical_features.empty:
-    categorical_features = pd.get_dummies(categorical_features)
-
-numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
-train_x = pd.concat([numerical_features, categorical_features], axis=1)
-train_x.dtypes
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
-if cv_method_str == 'logo' or cv_method_str == 'half_logo':
-    cv_method = LeaveOneGroupOut()
-    cv_method.get_n_splits(
-        train_x,
-        data_y,
-        groups=data_groups,
-    )
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-imputer = SimpleImputer(missing_values=np.nan, strategy='median')
-
-# %% [markdown]
-# ### Baseline: Dummy Classifier (most frequent)
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
-dummy_class = DummyClassifier(strategy="most_frequent")
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-dummy_classifier = cross_validate(
-    dummy_class,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=cv_method,
-    n_jobs=-1,
-    error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
-)
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
-print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
-print("Precision", np.mean(dummy_classifier['test_precision']))
-print("Recall", np.mean(dummy_classifier['test_recall']))
-print("F1", np.mean(dummy_classifier['test_f1']))
-print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
-print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
-
-# %% [markdown] nteract={"transient": {"deleting": false}}
-# ### All models
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
-final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 # %%
-final_scores.index.name = "metric"
-final_scores = final_scores.set_index(["method", final_scores.index])
-final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
+CV_METHOD = "logo"  # logo, half_logo, 5kfold
+# Cross-validation method (could be regarded as a hyperparameter)
+print("CV_METHOD: " + CV_METHOD)
+N_SL = 3  # Number of largest/smallest accuracies (of particular CV) outputs
+UNDERSAMPLING = False
+# (bool) If True this will train and test data on balanced dataset
+# (using undersampling method)

-# %% [markdown]
-# ### Logistic Regression
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+PATH_BASE = Path("E:/STRAWresults/20230415")

-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-logistic_regression = linear_model.LogisticRegression()
+SEGMENT_TYPE = "period"
+print("SEGMENT_TYPE: " + SEGMENT_TYPE)
+SEGMENT_LENGTH = "30_minutes_before"
+print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
+TARGET_VARIABLE = "JCQ_job_control"
+print("TARGET_VARIABLE: " + TARGET_VARIABLE)

-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-log_reg_scores = cross_validate(
-    logistic_regression,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=cv_method,
-    n_jobs=-1,
-    scoring=('accuracy', 'precision', 'recall', 'f1')
+if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
+    TARGET_VARIABLE += "_"
+    TARGET_VARIABLE += SEGMENT_TYPE
+
+PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
+
+model_input = pd.read_csv(PATH_FULL)
+
+if SEGMENT_LENGTH == "daily":
+    DAY_LENGTH = "daily"  # or "working"
+    print(DAY_LENGTH)
+    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+model_input["target"].value_counts()
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# bins = [-10, 0, 10] # bins for z-scored targets
+BINS = [-1, 0, 4]  # bins for stressfulness (0-4) target
+print("BINS: ", BINS)
+model_input["target"], edges = pd.cut(
+    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
+)  # ['low', 'medium', 'high']
+print(model_input["target"].value_counts())
+REMOVE_MEDIUM = True
+if ("medium" in model_input["target"]) and REMOVE_MEDIUM:
+    model_input = model_input[model_input["target"] != "medium"]
+    model_input["target"] = (
+        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
+    )
+else:
+    model_input["target"] = model_input["target"].map(
+        {"low": 0, "medium": 1, "high": 2}
+    )
+    print(model_input["target"].value_counts())
+
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# UnderSampling
+if UNDERSAMPLING:
+    no_stress = model_input[model_input["target"] == 0]
+    stress = model_input[model_input["target"] == 1]
+
+    no_stress = no_stress.sample(n=len(stress))
+    model_input = pd.concat([stress, no_stress], axis=0)
+
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+model_input_encoded = impute_encode_categorical_features(model_input)
+# %%
+data_x, data_y, data_groups = prepare_sklearn_data_format(
+    model_input_encoded, CV_METHOD
 )
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
-print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
-print("Precision", np.mean(log_reg_scores['test_precision']))
-print("Recall", np.mean(log_reg_scores['test_recall']))
-print("F1", np.mean(log_reg_scores['test_f1']))
-print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
-print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
+cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)

-# %% [markdown]
-# ### Support Vector Machine
+# %%
+data_y.head()

-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-svc = svm.SVC()
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-svc_scores = cross_validate(
-    svc,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=cv_method,
-    n_jobs=-1,
-    scoring=('accuracy', 'precision', 'recall', 'f1')
+# %%
+data_y.tail()
+# %%
+data_y.shape
+# %%
+scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
+# %%
+PATH_OUTPUT = Path("..") / Path("presentation/results")
+path_output_full = PATH_OUTPUT / (
+    TARGET_VARIABLE
+    + "_"
+    + SEGMENT_LENGTH
+    + "_classification"
+    + str(BINS)
+    + "_"
+    + CV_METHOD
+    + ".csv"
 )
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
-print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
-print("Precision", np.mean(svc_scores['test_precision']))
-print("Recall", np.mean(svc_scores['test_recall']))
-print("F1", np.mean(svc_scores['test_f1']))
-print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
-print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
-
-# %% [markdown]
-# ### Gaussian Naive Bayes
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-gaussian_nb = naive_bayes.GaussianNB()
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-gaussian_nb_scores = cross_validate(
-    gaussian_nb,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=cv_method,
-    n_jobs=-1,
-    error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
-)
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
-print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
-print("Precision", np.mean(gaussian_nb_scores['test_precision']))
-print("Recall", np.mean(gaussian_nb_scores['test_recall']))
-print("F1", np.mean(gaussian_nb_scores['test_f1']))
-print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
-print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
-
-# %% [markdown]
-# ### Stochastic Gradient Descent Classifier
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-sgdc = linear_model.SGDClassifier()
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-sgdc_scores = cross_validate(
-    sgdc,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=cv_method,
-    n_jobs=-1,
-    error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
-)
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
-print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
-print("Precision", np.mean(sgdc_scores['test_precision']))
-print("Recall", np.mean(sgdc_scores['test_recall']))
-print("F1", np.mean(sgdc_scores['test_f1']))
-print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
-print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n_sl)[:n_sl]))
-
-# %% [markdown]
-# ### K-nearest neighbors
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-knn = neighbors.KNeighborsClassifier()
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-knn_scores = cross_validate(
-    knn,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=cv_method,
-    n_jobs=-1,
-    error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
-)
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
-print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
-print("Precision", np.mean(knn_scores['test_precision']))
-print("Recall", np.mean(knn_scores['test_recall']))
-print("F1", np.mean(knn_scores['test_f1']))
-print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
-print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n_sl)[:n_sl]))
-
-# %% [markdown]
-# ### Decision Tree
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-dtree = tree.DecisionTreeClassifier()
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-dtree_scores = cross_validate(
-    dtree,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=cv_method,
-    n_jobs=-1,
-    error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
-)
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
-print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
-print("Precision", np.mean(dtree_scores['test_precision']))
-print("Recall", np.mean(dtree_scores['test_recall']))
-print("F1", np.mean(dtree_scores['test_f1']))
-print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
-print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n_sl)[:n_sl]))
-
-# %% [markdown]
-# ### Random Forest Classifier
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-rfc = ensemble.RandomForestClassifier()
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-rfc_scores = cross_validate(
-    rfc,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=cv_method,
-    n_jobs=-1,
-    error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1'), 
-    return_estimator=True
-)
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
-print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
-print("Precision", np.mean(rfc_scores['test_precision']))
-print("Recall", np.mean(rfc_scores['test_recall']))
-print("F1", np.mean(rfc_scores['test_f1']))
-print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
-print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
-
-# %% [markdown]
-# ### Feature importance (RFC)
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns))
-for idx, estimator in enumerate(rfc_scores['estimator']):
-    feature_importances = pd.DataFrame(estimator.feature_importances_,
-                                       index = list(train_x.columns),
-                                        columns=['importance'])
-    # print("\nFeatures sorted by their score for estimator {}:".format(idx))
-    # print(feature_importances.sort_values('importance', ascending=False).head(10))                                    
-    rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean()
-
-pd.set_option('display.max_rows', 100)
-print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
-
-rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
-
-rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
-
-train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
-
-# %% [markdown]
-# ### Gradient Boosting Classifier
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-gbc = ensemble.GradientBoostingClassifier()
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-gbc_scores = cross_validate(
-    gbc,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=cv_method,
-    n_jobs=-1,
-    error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
-)
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
-print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
-print("Precision", np.mean(gbc_scores['test_precision']))
-print("Recall", np.mean(gbc_scores['test_recall']))
-print("F1", np.mean(gbc_scores['test_f1']))
-print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
-print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n_sl)[:n_sl]))
-
-# %% [markdown]
-# ### LGBM Classifier
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-lgbm = LGBMClassifier()
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-lgbm_scores = cross_validate(
-    lgbm,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=cv_method,
-    n_jobs=-1,
-    error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
-)
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
-print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
-print("Precision", np.mean(lgbm_scores['test_precision']))
-print("Recall", np.mean(lgbm_scores['test_recall']))
-print("F1", np.mean(lgbm_scores['test_f1']))
-print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
-print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n_sl)[:n_sl]))
-
-# %% [markdown]
-# ### XGBoost Classifier
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-xgb_classifier = xg.sklearn.XGBClassifier()
-
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-xgb_classifier_scores = cross_validate(
-    xgb_classifier,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=cv_method,
-    n_jobs=-1,
-    error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
-)
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
-print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
-print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
-print("Precision", np.mean(xgb_classifier_scores['test_precision']))
-print("Recall", np.mean(xgb_classifier_scores['test_recall']))
-print("F1", np.mean(xgb_classifier_scores['test_f1']))
-print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
-print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
-
+scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_classification_composite.py
+++ b/exploration/ml_pipeline_classification_composite.py
@ -0,0 +1,177 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+from pathlib import Path
+
+import pandas as pd
+import seaborn as sns
+from sklearn.decomposition import PCA
+
+from machine_learning.helper import (
+    impute_encode_categorical_features,
+    prepare_cross_validator,
+    prepare_sklearn_data_format,
+    run_all_classification_models,
+)
+
+# %%
+CV_METHOD = "logo"  # logo, half_logo, 5kfold
+# Cross-validation method (could be regarded as a hyperparameter)
+print("CV_METHOD: " + CV_METHOD)
+N_SL = 3  # Number of largest/smallest accuracies (of particular CV) outputs
+UNDERSAMPLING = False
+# (bool) If True this will train and test data on balanced dataset
+# (using undersampling method)
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+PATH_BASE = Path("E:/STRAWresults/20230415")
+
+SEGMENT_TYPE = "period"
+print("SEGMENT_TYPE: " + SEGMENT_TYPE)
+SEGMENT_LENGTH = "30_minutes_before"
+print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
+
+PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"
+
+all_features_with_baseline = pd.read_csv(PATH_FULL)
+
+# %%
+TARGETS = [
+    "PANAS_negative_affect_mean",
+    "PANAS_positive_affect_mean",
+    "JCQ_job_demand_mean",
+    "JCQ_job_control_mean",
+    "appraisal_stressfulness_period_mean",
+]
+
+# %%
+all_features_cleaned = pd.DataFrame()
+for target in TARGETS:
+    PATH_FULL = (
+        PATH_BASE
+        / SEGMENT_LENGTH
+        / "features"
+        / ("all_sensor_features_cleaned_straw_py_(" + target + ").csv")
+    )
+    current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
+    if all_features_cleaned.empty:
+        all_features_cleaned = current_features
+    else:
+        all_features_cleaned = all_features_cleaned.join(
+            current_features[("phone_esm_straw_" + target)],
+            how="inner",
+            rsuffix="_" + target,
+        )
+    print(all_features_cleaned.shape)
+
+# %%
+pca = PCA(n_components=1)
+TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]
+pca.fit(all_features_cleaned[TARGETS_PREFIXED])
+print(pca.explained_variance_ratio_)
+
+# %%
+model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
+model_input["target"] = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])
+
+# %%
+sns.histplot(data=model_input, x="target")
+
+# %%
+model_input.target.quantile(0.6)
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# bins = [-10, 0, 10] # bins for z-scored targets
+BINS = [-10, 0, 10]  # bins for stressfulness (0-4) target
+print("BINS: ", BINS)
+model_input["target"], edges = pd.cut(
+    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
+)  # ['low', 'medium', 'high']
+print(model_input["target"].value_counts())
+REMOVE_MEDIUM = True
+if REMOVE_MEDIUM:
+    if "medium" in model_input["target"]:
+        model_input = model_input[model_input["target"] != "medium"]
+    model_input["target"] = (
+        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
+    )
+else:
+    model_input["target"] = model_input["target"].map(
+        {"low": 0, "medium": 1, "high": 2}
+    )
+    print(model_input["target"].value_counts())
+
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# UnderSampling
+if UNDERSAMPLING:
+    no_stress = model_input[model_input["target"] == 0]
+    stress = model_input[model_input["target"] == 1]
+
+    no_stress = no_stress.sample(n=len(stress))
+    model_input = pd.concat([stress, no_stress], axis=0)
+
+
+# %%
+TARGET_VARIABLE = "PANAS_negative_affect"
+print("TARGET_VARIABLE: " + TARGET_VARIABLE)
+
+PATH_FULL_HELP = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
+
+model_input_with_baseline = pd.read_csv(PATH_FULL_HELP, index_col="local_segment")
+
+# %%
+baseline_col_names = [
+    col for col in model_input_with_baseline.columns if col not in model_input.columns
+]
+print(baseline_col_names)
+
+# %%
+model_input = model_input.join(
+    model_input_with_baseline[baseline_col_names], how="left"
+)
+model_input.reset_index(inplace=True)
+
+# %%
+model_input_encoded = impute_encode_categorical_features(model_input)
+
+# %%
+data_x, data_y, data_groups = prepare_sklearn_data_format(
+    model_input_encoded, CV_METHOD
+)
+cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
+
+# %%
+data_y.head()
+
+# %%
+data_y.tail()
+# %%
+data_y.shape
+# %%
+scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
+# %%
+PATH_OUTPUT = Path("..") / Path("presentation/results")
+path_output_full = PATH_OUTPUT / (
+    "composite_"
+    + SEGMENT_LENGTH
+    + "_classification"
+    + str(BINS)
+    + "_"
+    + CV_METHOD
+    + ".csv"
+)
+scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -14,80 +14,85 @@
 # ---

 # %% jupyter={"source_hidden": true}
-# %matplotlib inline
-import datetime
-import importlib
-import os
-import sys
+from pathlib import Path

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
-import seaborn as sns
-from scipy import stats
-
-from sklearn.model_selection import LeaveOneGroupOut, cross_validate
-from sklearn.impute import SimpleImputer
-
-from sklearn.dummy import DummyClassifier
-from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
-import xgboost as xg 
-
 from sklearn.cluster import KMeans
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_validate

-from IPython.core.interactiveshell import InteractiveShell
-InteractiveShell.ast_node_interactivity = "all"
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
-
-import machine_learning.labels
-import machine_learning.model
 from machine_learning.classification_models import ClassificationModels

-# %% [markdown]
-# # RAPIDS models
-
-# %% [markdown]
+# %%
 # ## Set script's parameters
-n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
-cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
-n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+N_CLUSTERS = 4  # Number of clusters (could be regarded as a hyperparameter)
+CV_METHOD = "logo"  # logo, halflogo, 5kfold
+# Cross-validation method (could be regarded as a hyperparameter)
+N_SL = 1  # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %%
+PATH_BASE = Path("E:/STRAWresults/20230415")
+
+SEGMENT_TYPE = "period"
+print("SEGMENT_TYPE: " + SEGMENT_TYPE)
+SEGMENT_LENGTH = "30_minutes_before"
+print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
+TARGET_VARIABLE = "appraisal_stressfulness"
+print("TARGET_VARIABLE: " + TARGET_VARIABLE)
+
+if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
+    TARGET_VARIABLE += "_"
+    TARGET_VARIABLE += SEGMENT_TYPE
+
+PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
+
+model_input = pd.read_csv(PATH_FULL)
+
+if SEGMENT_LENGTH == "daily":
+    DAY_LENGTH = "daily"  # or "working"
+    print(DAY_LENGTH)
+    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]

 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+index_columns = [
+    "local_segment",
+    "local_segment_label",
+    "local_segment_start_datetime",
+    "local_segment_end_datetime",
+]

-clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
+CLUST_COL = "limesurvey_demand_control_ratio_quartile"
+print("CLUST_COL: " + CLUST_COL)

-model_input.columns[list(model_input.columns).index('age'):-1]
+BINS = [-1, 0, 4]
+print("BINS: " + str(BINS))

-lime_cols = [col for col in model_input if col.startswith('limesurvey')]
-lime_cols
-lime_col = 'limesurvey_demand_control_ratio_quartile'
-clust_col = lime_col
+model_input[CLUST_COL].describe()

-model_input[clust_col].describe()

+# %%
+model_input["target"].value_counts()

 # %% jupyter={"source_hidden": true}
+# Filter-out outlier rows by clust_col
+# model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]

-# Filter-out outlier rows by clust_col 
-#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
-
-uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
 uniq = uniq.dropna()
-plt.bar(uniq['pid'], uniq[clust_col])
+plt.bar(uniq["pid"], uniq[CLUST_COL])

 # %% jupyter={"source_hidden": true}
 # Get clusters by cluster col & and merge the clusters to main df
-km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
+km = KMeans(n_clusters=N_CLUSTERS).fit_predict(uniq.set_index("pid"))
 np.unique(km, return_counts=True)
-uniq['cluster'] = km
-uniq
+uniq["cluster"] = km

-model_input = model_input.merge(uniq[['pid', 'cluster']])   
+model_input = model_input.merge(uniq[["pid", "cluster"]])
+
+# %%
+model_input[["cluster", "target"]].value_counts().sort_index()

 # %% jupyter={"source_hidden": true}
 model_input.set_index(index_columns, inplace=True)
@ -98,31 +103,56 @@ cm = ClassificationModels()
 cmodels = cm.get_cmodels()

 # %% jupyter={"source_hidden": true}
-for k in range(n_clusters):
+for k in range(N_CLUSTERS):
    model_input_subset = model_input[model_input["cluster"] == k].copy()
-    bins = [-10, -1, 1, 10] # bins for z-scored targets
-    model_input_subset.loc[:, 'target'] = \
-        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']
-    model_input_subset['target'].value_counts()
-    model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
-    model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
+    model_input_subset.loc[:, "target"] = pd.cut(
+        model_input_subset.loc[:, "target"],
+        bins=BINS,
+        labels=["low", "high"],
+        right=True,
+    )  # ['low', 'medium', 'high']
+    model_input_subset["target"].value_counts()
+    # model_input_subset = model_input_subset[model_input_subset["target"] != "medium"]
+    model_input_subset["target"] = (
+        model_input_subset["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
+    )

-    model_input_subset['target'].value_counts()
-    
-    if cv_method_str == 'half_logo':
-        model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
-        model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
+    print(model_input_subset["target"].value_counts())

-        model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
-        model_input_subset["pid_half"] = model_input_subset["pid"] + "_" +  model_input_subset["pid_index"].astype(int).astype(str)
+    if CV_METHOD == "half_logo":
+        model_input_subset["pid_index"] = model_input_subset.groupby("pid").cumcount()
+        model_input_subset["pid_count"] = model_input_subset.groupby("pid")[
+            "pid"
+        ].transform("count")

-        data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"]
+        model_input_subset["pid_index"] = (
+            model_input_subset["pid_index"] / model_input_subset["pid_count"] + 1
+        ).round()
+        model_input_subset["pid_half"] = (
+            model_input_subset["pid"]
+            + "_"
+            + model_input_subset["pid_index"].astype(int).astype(str)
+        )
+
+        data_x, data_y, data_groups = (
+            model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
+            model_input_subset["target"],
+            model_input_subset["pid_half"],
+        )
    else:
-        data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]
+        data_x, data_y, data_groups = (
+            model_input_subset.drop(["target", "pid"], axis=1),
+            model_input_subset["target"],
+            model_input_subset["pid"],
+        )

    # Treat categorical features
    categorical_feature_colnames = ["gender", "startlanguage"]
-    additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+    additional_categorical_features = [
+        col
+        for col in data_x.columns
+        if "mostcommonactivity" in col or "homelabel" in col
+    ]
    categorical_feature_colnames += additional_categorical_features

    categorical_features = data_x[categorical_feature_colnames].copy()
@ -132,7 +162,9 @@ for k in range(n_clusters):
    categorical_features = categorical_features.fillna(mode_categorical_features)

    # one-hot encoding
-    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+    categorical_features = categorical_features.apply(
+        lambda col: col.astype("category")
+    )
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)

@ -140,8 +172,10 @@ for k in range(n_clusters):
    train_x = pd.concat([numerical_features, categorical_features], axis=1)

    # Establish cv method
-    cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
-    if cv_method_str == 'logo' or cv_method_str == 'half_logo':
+    cv_method = StratifiedKFold(
+        n_splits=5, shuffle=True
+    )  # Defaults to 5 k-folds in cross_validate method
+    if CV_METHOD == "logo" or CV_METHOD == "half_logo":
        cv_method = LeaveOneGroupOut()
        cv_method.get_n_splits(
            train_x,
@ -149,36 +183,57 @@ for k in range(n_clusters):
            groups=data_groups,
        )

-    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+    imputer = SimpleImputer(missing_values=np.nan, strategy="median")

    for model_title, model in cmodels.items():
-
        classifier = cross_validate(
-            model['model'],
+            model["model"],
            X=imputer.fit_transform(train_x),
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
-            error_score='raise',
-            scoring=('accuracy', 'precision', 'recall', 'f1')
+            error_score="raise",
+            scoring=("accuracy", "precision", "recall", "f1"),
        )
-        
+
        print("\n-------------------------------------\n")
        print("Current cluster:", k, end="\n")
        print("Current model:", model_title, end="\n")
-        print("Acc", np.mean(classifier['test_accuracy']))
-        print("Precision", np.mean(classifier['test_precision']))
-        print("Recall", np.mean(classifier['test_recall']))
-        print("F1", np.mean(classifier['test_f1']))
-        print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
-        print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl]))
-        
-        cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
-        cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
-        cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall'])
-        cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])
+        print("Acc", np.mean(classifier["test_accuracy"]))
+        print("Precision", np.mean(classifier["test_precision"]))
+        print("Recall", np.mean(classifier["test_recall"]))
+        print("F1", np.mean(classifier["test_f1"]))
+        print(
+            f"Largest {N_SL} ACC:",
+            np.sort(-np.partition(-classifier["test_accuracy"], N_SL)[:N_SL])[::-1],
+        )
+        print(
+            f"Smallest {N_SL} ACC:",
+            np.sort(np.partition(classifier["test_accuracy"], N_SL)[:N_SL]),
+        )
+
+        cmodels[model_title]["metrics"][0] += np.mean(classifier["test_accuracy"])
+        cmodels[model_title]["metrics"][1] += np.mean(classifier["test_precision"])
+        cmodels[model_title]["metrics"][2] += np.mean(classifier["test_recall"])
+        cmodels[model_title]["metrics"][3] += np.mean(classifier["test_f1"])

 # %% jupyter={"source_hidden": true}
 # Get overall results
-cm.get_total_models_scores(n_clusters=n_clusters)
+scores = cm.get_total_models_scores(n_clusters=N_CLUSTERS)
+
+# %%
+PATH_OUTPUT = Path("..") / Path("presentation/results")
+path_output_full = PATH_OUTPUT / (
+    TARGET_VARIABLE
+    + "_"
+    + SEGMENT_LENGTH
+    + "_classification_"
+    + CV_METHOD
+    + str(BINS)
+    + "_clust_"
+    + CLUST_COL
+    + str(N_CLUSTERS)
+    + ".csv"
+)
+scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_classification_with_clustering_2_class.py
+++ b/exploration/ml_pipeline_classification_with_clustering_2_class.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -14,92 +14,83 @@
 # ---

 # %% jupyter={"source_hidden": true}
-# %matplotlib inline
-import os
-import sys
+from pathlib import Path

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
 from scipy import stats
-
-from sklearn.model_selection import train_test_split
-from sklearn.impute import SimpleImputer
-from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
-
 from sklearn.cluster import KMeans
-
-from IPython.core.interactiveshell import InteractiveShell
-InteractiveShell.ast_node_interactivity = "all"
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+from sklearn.model_selection import train_test_split

 from machine_learning.classification_models import ClassificationModels
-
-# %% [markdown]
-# # RAPIDS models
-
-# %% [markdown]
-# # Useful method
-def treat_categorical_features(input_set):
-    categorical_feature_colnames = ["gender", "startlanguage"]
-    additional_categorical_features = [col for col in input_set.columns if "mostcommonactivity" in col or "homelabel" in col]
-    categorical_feature_colnames += additional_categorical_features
-        
-    categorical_features = input_set[categorical_feature_colnames].copy()
-    mode_categorical_features = categorical_features.mode().iloc[0]
-
-    # fillna with mode
-    categorical_features = categorical_features.fillna(mode_categorical_features)
-
-    # one-hot encoding
-    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
-    if not categorical_features.empty:
-        categorical_features = pd.get_dummies(categorical_features)
-
-    numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
-    
-    return pd.concat([numerical_features, categorical_features], axis=1)
+from machine_learning.helper import impute_encode_categorical_features

 # %% [markdown]
 # ## Set script's parameters
-n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
-n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
+#
+
+# %%
+n_clusters = 3  # Number of clusters (could be regarded as a hyperparameter)
+n_sl = 3  # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %%
+PATH_BASE = Path("E:/STRAWresults/20230415")
+
+SEGMENT_TYPE = "period"
+print("SEGMENT_TYPE: " + SEGMENT_TYPE)
+SEGMENT_LENGTH = "30_minutes_before"
+print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
+TARGET_VARIABLE = "appraisal_stressfulness"
+print("TARGET_VARIABLE: " + TARGET_VARIABLE)
+
+if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
+    TARGET_VARIABLE += "_"
+    TARGET_VARIABLE += SEGMENT_TYPE
+
+PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
+
+model_input = pd.read_csv(PATH_FULL)
+
+if SEGMENT_LENGTH == "daily":
+    DAY_LENGTH = "daily"  # or "working"
+    print(DAY_LENGTH)
+    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]

 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+CLUST_COL = "limesurvey_demand_control_ratio"
+print("CLUST_COL: " + CLUST_COL)

-clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
+BINS = [-1, 0, 4]
+print("BINS: " + str(BINS))

-model_input.columns[list(model_input.columns).index('age'):-1]
+index_columns = [
+    "local_segment",
+    "local_segment_label",
+    "local_segment_start_datetime",
+    "local_segment_end_datetime",
+]

-lime_cols = [col for col in model_input if col.startswith('limesurvey')]
-lime_cols
-lime_col = 'limesurvey_demand_control_ratio'
-clust_col = lime_col
-
-model_input[clust_col].describe()
+model_input[CLUST_COL].describe()


 # %% jupyter={"source_hidden": true}
+# Filter-out outlier rows by clust_col
+model_input = model_input[(np.abs(stats.zscore(model_input[CLUST_COL])) < 3)]

-# Filter-out outlier rows by clust_col 
-model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
-
-uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
-plt.bar(uniq['pid'], uniq[clust_col])
+uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
+plt.bar(uniq["pid"], uniq[CLUST_COL])

 # %% jupyter={"source_hidden": true}
 # Get clusters by cluster col & and merge the clusters to main df
-km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
+km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index("pid"))
 np.unique(km, return_counts=True)
-uniq['cluster'] = km
-uniq
+uniq["cluster"] = km
+print(uniq)

-model_input = model_input.merge(uniq[['pid', 'cluster']])   
+model_input = model_input.merge(uniq[["pid", "cluster"]])

 # %% jupyter={"source_hidden": true}
 model_input.set_index(index_columns, inplace=True)
@ -109,50 +100,64 @@ model_input.set_index(index_columns, inplace=True)
 cm = ClassificationModels()
 cmodels = cm.get_cmodels()

+# %%
+model_input["target"].value_counts()
+
 # %% jupyter={"source_hidden": true}
 for k in range(n_clusters):
    model_input_subset = model_input[model_input["cluster"] == k].copy()
-    
-    # Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, seperated by z-score of 0.
-    model_input_subset['numerical_target'] = model_input_subset['target']
-    bins = [-10, 0, 10] # bins for z-scored targets
-    model_input_subset.loc[:, 'target'] = \
-        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
-        
-    p15 = np.percentile(model_input_subset['numerical_target'], 15)
-    p85 = np.percentile(model_input_subset['numerical_target'], 85)
-    
-    # Treat categorical features
-    model_input_subset = treat_categorical_features(model_input_subset)
-    
-    # Split to train, validate, and test subsets
-    train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
-    test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)

-    train_set['target'].value_counts()
-    test_set['target'].value_counts()
-    
+    # Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, seperated by z-score of 0.
+    # model_input_subset['numerical_target'] = model_input_subset['target']
+
+    model_input_subset.loc[:, "target"] = pd.cut(
+        model_input_subset.loc[:, "target"], bins=BINS, labels=[0, 1], right=True
+    )
+
+    # p15 = np.percentile(model_input_subset['numerical_target'], 15)
+    # p85 = np.percentile(model_input_subset['numerical_target'], 85)
+
+    # Treat categorical features
+    model_input_subset = impute_encode_categorical_features(model_input_subset)
+
+    # Split to train, validate, and test subsets
+    # train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
+    # test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
+    train_set, test_set = train_test_split(
+        model_input_subset,
+        test_size=0.3,
+        stratify=model_input_subset["pid"],
+        random_state=42,
+    )
+
+    print(train_set["target"].value_counts())
+    print(test_set["target"].value_counts())
+
    train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
-    
-    validate_x, test_x, validate_y, test_y = \
-        train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)
-    
+
+    validate_x, test_x, validate_y, test_y = train_test_split(
+        test_set.drop(["target", "pid"], axis=1),
+        test_set["target"],
+        test_size=0.50,
+        random_state=42,
+    )
+
    # Impute missing values
-    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+    imputer = SimpleImputer(missing_values=np.nan, strategy="median")

    train_x = imputer.fit_transform(train_x)
    validate_x = imputer.fit_transform(validate_x)
    test_x = imputer.fit_transform(test_x)

    for model_title, model in cmodels.items():
-        model['model'].fit(train_x, train_y)
-        y_pred = model['model'].predict(validate_x)
-        
+        model["model"].fit(train_x, train_y)
+        y_pred = model["model"].predict(validate_x)
+
        acc = accuracy_score(validate_y, y_pred)
        prec = precision_score(validate_y, y_pred)
        rec = recall_score(validate_y, y_pred)
        f1 = f1_score(validate_y, y_pred)
-        
+
        print("\n-------------------------------------\n")
        print("Current cluster:", k, end="\n")
        print("Current model:", model_title, end="\n")
@ -160,12 +165,30 @@ for k in range(n_clusters):
        print("Precision", prec)
        print("Recall", rec)
        print("F1", f1)
-        
-        cmodels[model_title]['metrics'][0] += acc
-        cmodels[model_title]['metrics'][1] += prec
-        cmodels[model_title]['metrics'][2] += rec
-        cmodels[model_title]['metrics'][3] += f1
+
+        cmodels[model_title]["metrics"][0] += acc
+        cmodels[model_title]["metrics"][1] += prec
+        cmodels[model_title]["metrics"][2] += rec
+        cmodels[model_title]["metrics"][3] += f1

 # %% jupyter={"source_hidden": true}
 # Get overall results
-cm.get_total_models_scores(n_clusters=n_clusters)
+scores = cm.get_total_models_scores(n_clusters=n_clusters)
+
+# %%
+print(scores)
+
+# %%
+PATH_OUTPUT = Path("..") / Path("presentation/results")
+path_output_full = PATH_OUTPUT / (
+    TARGET_VARIABLE
+    + "_"
+    + SEGMENT_LENGTH
+    + "_classification"
+    + str(BINS)
+    + "_CLUST_"
+    + CLUST_COL
+    + +str(n_clusters)
+    + ".csv"
+)
+scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_regression.py
+++ b/exploration/ml_pipeline_regression.py
@ -6,445 +6,61 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---

-# %% jupyter={"source_hidden": true}
-# %matplotlib inline
+# %%
 import os
 import sys

-import numpy as np
 import pandas as pd
-import xgboost as xg
-from machine_learning.helper import prepare_regression_model_input
-from sklearn import gaussian_process, kernel_ridge, linear_model, svm
-from sklearn.dummy import DummyRegressor
-from sklearn.impute import SimpleImputer
-from sklearn.model_selection import LeaveOneGroupOut, cross_validate

-# from IPython.core.interactiveshell import InteractiveShell
-# InteractiveShell.ast_node_interactivity = "all"
+from machine_learning.helper import (
+    impute_encode_categorical_features,
+    prepare_cross_validator,
+    prepare_sklearn_data_format,
+    run_all_regression_models,
+)

 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)

-# %% jupyter={"source_hidden": true}
+# %%
 model_input = pd.read_csv(
    "../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
 )

-# %% jupyter={"source_hidden": true}
-cv_method = "half_logo"  # logo, half_logo, 5kfold
-
-train_x, data_y, data_groups = prepare_regression_model_input(model_input, cv_method)
-# %% jupyter={"source_hidden": true}
-logo = LeaveOneGroupOut()
-logo.get_n_splits(
-    train_x,
-    data_y,
-    groups=data_groups,
-)
-
-# Defaults to 5 k folds in cross_validate method
-if cv_method != "logo" and cv_method != "half_logo":
-    logo = None
-
-# %% jupyter={"source_hidden": true}
-sum(data_y.isna())
-
-# %% [markdown]
-# ### Baseline: Dummy Regression (mean)
-dummy_regr = DummyRegressor(strategy="mean")
-
-# %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
-
-# %% jupyter={"source_hidden": true}
-dummy_regressor = cross_validate(
-    dummy_regr,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=(
-        "r2",
-        "neg_mean_squared_error",
-        "neg_mean_absolute_error",
-        "neg_root_mean_squared_error",
-    ),
-)
-print(
-    "Negative Mean Squared Error",
-    np.median(dummy_regressor["test_neg_mean_squared_error"]),
-)
-print(
-    "Negative Mean Absolute Error",
-    np.median(dummy_regressor["test_neg_mean_absolute_error"]),
-)
-print(
-    "Negative Root Mean Squared Error",
-    np.median(dummy_regressor["test_neg_root_mean_squared_error"]),
-)
-print("R2", np.median(dummy_regressor["test_r2"]))
-
-# %% [markdown]
-# ### Linear Regression
-
-# %% jupyter={"source_hidden": true}
-lin_reg_rapids = linear_model.LinearRegression()
-# %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
-
-# %% jupyter={"source_hidden": true}
-lin_reg_scores = cross_validate(
-    lin_reg_rapids,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=(
-        "r2",
-        "neg_mean_squared_error",
-        "neg_mean_absolute_error",
-        "neg_root_mean_squared_error",
-    ),
-)
-print(
-    "Negative Mean Squared Error",
-    np.median(lin_reg_scores["test_neg_mean_squared_error"]),
-)
-print(
-    "Negative Mean Absolute Error",
-    np.median(lin_reg_scores["test_neg_mean_absolute_error"]),
-)
-print(
-    "Negative Root Mean Squared Error",
-    np.median(lin_reg_scores["test_neg_root_mean_squared_error"]),
-)
-print("R2", np.median(lin_reg_scores["test_r2"]))
-
-# %% [markdown]
-# ### XGBRegressor Linear Regression
-# %% jupyter={"source_hidden": true}
-xgb_r = xg.XGBRegressor(objective="reg:squarederror", n_estimators=10)
-# %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
-
-# %% jupyter={"source_hidden": true}
-xgb_reg_scores = cross_validate(
-    xgb_r,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=(
-        "r2",
-        "neg_mean_squared_error",
-        "neg_mean_absolute_error",
-        "neg_root_mean_squared_error",
-    ),
-)
-print(
-    "Negative Mean Squared Error",
-    np.median(xgb_reg_scores["test_neg_mean_squared_error"]),
-)
-print(
-    "Negative Mean Absolute Error",
-    np.median(xgb_reg_scores["test_neg_mean_absolute_error"]),
-)
-print(
-    "Negative Root Mean Squared Error",
-    np.median(xgb_reg_scores["test_neg_root_mean_squared_error"]),
-)
-print("R2", np.median(xgb_reg_scores["test_r2"]))
-
-# %% [markdown]
-# ### XGBRegressor Pseudo Huber Error Regression
-# %% jupyter={"source_hidden": true}
-xgb_psuedo_huber_r = xg.XGBRegressor(objective="reg:pseudohubererror", n_estimators=10)
-# %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
-
-# %% jupyter={"source_hidden": true}
-xgb_psuedo_huber_reg_scores = cross_validate(
-    xgb_psuedo_huber_r,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=(
-        "r2",
-        "neg_mean_squared_error",
-        "neg_mean_absolute_error",
-        "neg_root_mean_squared_error",
-    ),
-)
-print(
-    "Negative Mean Squared Error",
-    np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_squared_error"]),
-)
-print(
-    "Negative Mean Absolute Error",
-    np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_absolute_error"]),
-)
-print(
-    "Negative Root Mean Squared Error",
-    np.median(xgb_psuedo_huber_reg_scores["test_neg_root_mean_squared_error"]),
-)
-print("R2", np.median(xgb_psuedo_huber_reg_scores["test_r2"]))
-
-# %% [markdown]
-# ### Ridge regression
-
-# %% jupyter={"source_hidden": true}
-ridge_reg = linear_model.Ridge(alpha=0.5)
-
-# %% tags=[] jupyter={"source_hidden": true}
-ridge_reg_scores = cross_validate(
-    ridge_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=(
-        "r2",
-        "neg_mean_squared_error",
-        "neg_mean_absolute_error",
-        "neg_root_mean_squared_error",
-    ),
-)
-print(
-    "Negative Mean Squared Error",
-    np.median(ridge_reg_scores["test_neg_mean_squared_error"]),
-)
-print(
-    "Negative Mean Absolute Error",
-    np.median(ridge_reg_scores["test_neg_mean_absolute_error"]),
-)
-print(
-    "Negative Root Mean Squared Error",
-    np.median(ridge_reg_scores["test_neg_root_mean_squared_error"]),
-)
-print("R2", np.median(ridge_reg_scores["test_r2"]))
-
-# %% [markdown]
-# ### Lasso
-
-# %% jupyter={"source_hidden": true}
-lasso_reg = linear_model.Lasso(alpha=0.1)
-
-# %% jupyter={"source_hidden": true}
-lasso_reg_score = cross_validate(
-    lasso_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=(
-        "r2",
-        "neg_mean_squared_error",
-        "neg_mean_absolute_error",
-        "neg_root_mean_squared_error",
-    ),
-)
-print(
-    "Negative Mean Squared Error",
-    np.median(lasso_reg_score["test_neg_mean_squared_error"]),
-)
-print(
-    "Negative Mean Absolute Error",
-    np.median(lasso_reg_score["test_neg_mean_absolute_error"]),
-)
-print(
-    "Negative Root Mean Squared Error",
-    np.median(lasso_reg_score["test_neg_root_mean_squared_error"]),
-)
-print("R2", np.median(lasso_reg_score["test_r2"]))
-
-# %% [markdown]
-# ### Bayesian Ridge
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg = linear_model.BayesianRidge()
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg_score = cross_validate(
-    bayesian_ridge_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=(
-        "r2",
-        "neg_mean_squared_error",
-        "neg_mean_absolute_error",
-        "neg_root_mean_squared_error",
-    ),
-)
-print(
-    "Negative Mean Squared Error",
-    np.median(bayesian_ridge_reg_score["test_neg_mean_squared_error"]),
-)
-print(
-    "Negative Mean Absolute Error",
-    np.median(bayesian_ridge_reg_score["test_neg_mean_absolute_error"]),
-)
-print(
-    "Negative Root Mean Squared Error",
-    np.median(bayesian_ridge_reg_score["test_neg_root_mean_squared_error"]),
-)
-print("R2", np.median(bayesian_ridge_reg_score["test_r2"]))
-
-# %% [markdown]
-# ### RANSAC (outlier robust regression)
-
-# %% jupyter={"source_hidden": true}
-ransac_reg = linear_model.RANSACRegressor()
-
-# %% jupyter={"source_hidden": true}
-ransac_reg_scores = cross_validate(
-    ransac_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=(
-        "r2",
-        "neg_mean_squared_error",
-        "neg_mean_absolute_error",
-        "neg_root_mean_squared_error",
-    ),
-)
-print(
-    "Negative Mean Squared Error",
-    np.median(ransac_reg_scores["test_neg_mean_squared_error"]),
-)
-print(
-    "Negative Mean Absolute Error",
-    np.median(ransac_reg_scores["test_neg_mean_absolute_error"]),
-)
-print(
-    "Negative Root Mean Squared Error",
-    np.median(ransac_reg_scores["test_neg_root_mean_squared_error"]),
-)
-print("R2", np.median(ransac_reg_scores["test_r2"]))
-
-# %% [markdown]
-# ### Support vector regression
-
-# %% jupyter={"source_hidden": true}
-svr = svm.SVR()
-
-# %% jupyter={"source_hidden": true}
-svr_scores = cross_validate(
-    svr,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=(
-        "r2",
-        "neg_mean_squared_error",
-        "neg_mean_absolute_error",
-        "neg_root_mean_squared_error",
-    ),
-)
-print(
-    "Negative Mean Squared Error", np.median(svr_scores["test_neg_mean_squared_error"])
-)
-print(
-    "Negative Mean Absolute Error",
-    np.median(svr_scores["test_neg_mean_absolute_error"]),
-)
-print(
-    "Negative Root Mean Squared Error",
-    np.median(svr_scores["test_neg_root_mean_squared_error"]),
-)
-print("R2", np.median(svr_scores["test_r2"]))
-
-# %% [markdown]
-# ### Kernel Ridge regression
-
-# %% jupyter={"source_hidden": true}
-kridge = kernel_ridge.KernelRidge()
-
-# %% jupyter={"source_hidden": true}
-kridge_scores = cross_validate(
-    kridge,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=(
-        "r2",
-        "neg_mean_squared_error",
-        "neg_mean_absolute_error",
-        "neg_root_mean_squared_error",
-    ),
-)
-print(
-    "Negative Mean Squared Error",
-    np.median(kridge_scores["test_neg_mean_squared_error"]),
-)
-print(
-    "Negative Mean Absolute Error",
-    np.median(kridge_scores["test_neg_mean_absolute_error"]),
-)
-print(
-    "Negative Root Mean Squared Error",
-    np.median(kridge_scores["test_neg_root_mean_squared_error"]),
-)
-print("R2", np.median(kridge_scores["test_r2"]))
-
-# %% [markdown]
-# ### Gaussian Process Regression
-
-# %% jupyter={"source_hidden": true}
-gpr = gaussian_process.GaussianProcessRegressor()
-
-# %% jupyter={"source_hidden": true}
-
-gpr_scores = cross_validate(
-    gpr,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=(
-        "r2",
-        "neg_mean_squared_error",
-        "neg_mean_absolute_error",
-        "neg_root_mean_squared_error",
-    ),
-)
-print(
-    "Negative Mean Squared Error", np.median(gpr_scores["test_neg_mean_squared_error"])
-)
-print(
-    "Negative Mean Absolute Error",
-    np.median(gpr_scores["test_neg_mean_absolute_error"]),
-)
-print(
-    "Negative Root Mean Squared Error",
-    np.median(gpr_scores["test_neg_root_mean_squared_error"]),
-)
-print("R2", np.median(gpr_scores["test_r2"]))
+# %%
+model_input = model_input[model_input["local_segment"].str.contains("daily")]

 # %%
+CV_METHOD = "logo"  # logo, half_logo, 5kfold
+
+model_input_encoded = impute_encode_categorical_features(model_input)
+# %%
+data_x, data_y, data_groups = prepare_sklearn_data_format(
+    model_input_encoded, CV_METHOD
+)
+cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
+# %%
+data_y.head()
+
+# %%
+data_y.tail()
+
+# %%
+data_y.shape
+
+# %%
+scores = run_all_regression_models(data_x, data_y, data_groups, cross_validator)
+
+# %%
+scores.to_csv(
+    "../presentation/JCQ_supervisor_support_regression_" + CV_METHOD + ".csv",
+    index=False,
+)
--- a/exploration/test_JCQ_reversal.py
+++ b/exploration/test_JCQ_reversal.py
@ -0,0 +1,217 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+import pandas as pd
+
+from features.esm_JCQ import DICT_JCQ_DEMAND_CONTROL_REVERSE
+
+# %%
+limesurvey_questions = pd.read_csv(
+    "E:/STRAWbaseline/survey637813+question_text.csv", header=None
+).T
+
+# %%
+limesurvey_questions
+
+# %%
+limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(
+    r"\.\s", expand=True, n=1
+)
+
+# %%
+limesurvey_questions
+
+# %%
+demand_reverse_lime_rows = (
+    limesurvey_questions["text"].str.startswith(" [Od mene se ne zahteva,")
+    | limesurvey_questions["text"].str.startswith(" [Imam dovolj časa, da končam")
+    | limesurvey_questions["text"].str.startswith(
+        " [Pri svojem delu se ne srečujem s konfliktnimi"
+    )
+)
+control_reverse_lime_rows = limesurvey_questions["text"].str.startswith(
+    " [Moje delo vključuje veliko ponavljajočega"
+) | limesurvey_questions["text"].str.startswith(
+    " [Pri svojem delu imam zelo malo svobode"
+)
+
+# %%
+demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows]
+demand_reverse_lime.loc[:, "qid"] = demand_reverse_lime["code"].str.extract(
+    r"\[(\d+)\]"
+)
+control_reverse_lime = limesurvey_questions[control_reverse_lime_rows]
+control_reverse_lime.loc[:, "qid"] = control_reverse_lime["code"].str.extract(
+    r"\[(\d+)\]"
+)
+
+# %%
+limesurvey_questions.loc[89, "text"]
+
+# %%
+limesurvey_questions[limesurvey_questions["code"].str.startswith("JobEisen")]
+
+# %%
+demand_reverse_lime
+
+# %%
+control_reverse_lime
+
+# %%
+participant_info = pd.read_csv(
+    "C:/Users/junos/Documents/FWO-ARRS/Analysis/straw2analysis/rapids/data/raw/p031/participant_baseline_raw.csv",
+    parse_dates=["date_of_birth"],
+)
+
+# %%
+participant_info_t = participant_info.T
+
+# %%
+rows_baseline = participant_info_t.index
+
+# %%
+rows_demand = rows_baseline.str.startswith("JobEisen") & ~rows_baseline.str.endswith(
+    "Time"
+)
+
+# %%
+rows_baseline[rows_demand]
+
+# %%
+limesurvey_control = (
+    participant_info_t[rows_demand]
+    .reset_index()
+    .rename(columns={"index": "question", 0: "score_original"})
+)
+
+# %%
+limesurvey_control
+
+# %%
+limesurvey_control["qid"] = (
+    limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
+)
+
+# %%
+limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
+
+# %%
+limesurvey_control["score"] = limesurvey_control["score_original"]
+
+# %%
+limesurvey_control["qid"][0]
+
+# %%
+rows_demand_reverse = limesurvey_control["qid"].isin(
+    DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
+)
+limesurvey_control.loc[rows_demand_reverse, "score"] = (
+    4 + 1 - limesurvey_control.loc[rows_demand_reverse, "score_original"]
+)
+
+# %%
+JCQ_DEMAND = "JobEisen"
+JCQ_CONTROL = "JobControle"
+dict_JCQ_demand_control_reverse = {
+    JCQ_DEMAND: {
+        3: " [Od mene se ne zahteva,",
+        4: " [Imam dovolj časa, da končam",
+        5: " [Pri svojem delu se ne srečujem s konfliktnimi",
+    },
+    JCQ_CONTROL: {
+        2: " |Moje delo vključuje veliko ponavljajočega",
+        6: " [Pri svojem delu imam zelo malo svobode",
+    },
+}
+
+# %%
+limesurvey_control
+
+# %%
+test = pd.DataFrame(
+    data={"question": "one", "score_original": 3, "score": 3, "qid": 10}, index=[0]
+)
+
+# %%
+pd.concat([test, limesurvey_control]).reset_index()
+
+# %%
+limesurvey_control["score"].sum()
+
+# %%
+rows_demand_reverse
+
+# %%
+dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
+
+# %%
+limesurvey_control
+
+# %%
+DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
+DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
+
+JCQ_NORMS = {
+    "F": {
+        0: DEMAND_CONTROL_RATIO_MIN,
+        1: 0.45,
+        2: 0.52,
+        3: 0.62,
+        4: DEMAND_CONTROL_RATIO_MAX,
+    },
+    "M": {
+        0: DEMAND_CONTROL_RATIO_MIN,
+        1: 0.41,
+        2: 0.48,
+        3: 0.56,
+        4: DEMAND_CONTROL_RATIO_MAX,
+    },
+}
+
+# %%
+JCQ_NORMS[participant_info.loc[0, "gender"]][0]
+
+# %%
+participant_info_t.index.str.startswith("JobControle")
+
+# %%
+columns_baseline = participant_info.columns
+
+# %%
+columns_demand = columns_baseline.str.startswith(
+    "JobControle"
+) & ~columns_baseline.str.endswith("Time")
+
+# %%
+columns_baseline[columns_demand]
+
+# %%
+participant_control = participant_info.loc[:, columns_demand]
+
+# %%
+participant_control["id"] = participant_control.index
+
+# %%
+participant_control
+
+# %%
+pd.wide_to_long(
+    participant_control,
+    stubnames="JobControle",
+    i="id",
+    j="qid",
+    sep="[",
+    suffix="(\\d+)]",
+)
--- a/features/esm.py
+++ b/features/esm.py
@ -20,11 +20,47 @@ ANSWER_DAY_OFF = "DayOff3421"
 ANSWER_SET_EVENING = "DayFinishedSetEvening"

 MAX_MORNING_LENGTH = 3
-# When the participants was not yet at work at the time of the first (morning) EMA,
+# When the participant was not yet at work at the time of the first (morning) EMA,
 # only three items were answered.
 # Two sleep related items and one indicating NOT starting work yet.
 # Daytime EMAs are all longer, in fact they always consist of at least 6 items.

+QUESTIONNAIRE_IDS = {
+    "sleep_quality": 1,
+    "PANAS_positive_affect": 8,
+    "PANAS_negative_affect": 9,
+    "JCQ_job_demand": 10,
+    "JCQ_job_control": 11,
+    "JCQ_supervisor_support": 12,
+    "JCQ_coworker_support": 13,
+    "PFITS_supervisor": 14,
+    "PFITS_coworkers": 15,
+    "UWES_vigor": 16,
+    "UWES_dedication": 17,
+    "UWES_absorption": 18,
+    "COPE_active": 19,
+    "COPE_support": 20,
+    "COPE_emotions": 21,
+    "balance_life_work": 22,
+    "balance_work_life": 23,
+    "recovery_experience_detachment": 24,
+    "recovery_experience_relaxation": 25,
+    "symptoms": 26,
+    "appraisal_stressfulness_event": 87,
+    "appraisal_threat": 88,
+    "appraisal_challenge": 89,
+    "appraisal_event_time": 90,
+    "appraisal_event_duration": 91,
+    "appraisal_event_work_related": 92,
+    "appraisal_stressfulness_period": 93,
+    "late_work": 94,
+    "work_hours": 95,
+    "left_work": 96,
+    "activities": 97,
+    "coffee_breaks": 98,
+    "at_work_yet": 99,
+}
+

 def get_esm_data(usernames: Collection) -> pd.DataFrame:
    """
@ -52,8 +88,10 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame:

 def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
+    Convert timestamps and expand JSON column.
+
    Convert timestamps into human-readable datetimes and dates
-    and expand the JSON column into several Pandas DF columns.
+        and expand the JSON column into several Pandas DF columns.

    Parameters
    ----------
@ -63,7 +101,8 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
-        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
+        A dataframe with added columns: datetime in Ljubljana timezone
+            and all fields from ESM_JSON column.
    """
    df_esm = helper.get_date_from_timestamp(df_esm)

@ -76,31 +115,39 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
 def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.
-    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE
+
+    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
+        and SESSION_STATUS_COMPLETE

    This is done in three steps.

    First, the esm_status is considered.
-    If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
+    If any of the ESMs in a session has a status *other than* "answered",
+        then this session is taken as unfinished.

    Second, the sessions which do not represent full questionnaires are identified.
-    These are sessions where participants only marked they are finished with the day or have not yet started working.
+    These are sessions where participants only marked they are finished with the day
+        or have not yet started working.

    Third, the sessions with only one item are marked with their trigger.
-    We never offered questionnaires with single items, so we can be sure these are unfinished.
+    We never offered questionnaires with single items,
+        so we can be sure these are unfinished.

    Finally, all sessions that remain are marked as completed.
-    By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.
+    By going through different possibilities in expl_esm_adherence.ipynb,
+        this turned out to be a reasonable option.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts: pd.Dataframe
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
+            with their statuses and the number of items.
    """
    sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)

@ -155,17 +202,22 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat

 def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening.)
+    Classify EMA sessions into morning, workday, or evening.
+
+    For each EMA session, determine the time of the first user answer
+        and its time type (morning, workday, or evening).

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).

    Returns
    -------
    df_session_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
+            with their time type and timestamp of first answer.
    """
    df_session_time = (
        df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
@ -179,13 +231,17 @@ def classify_sessions_by_completion_time(
    df_esm_preprocessed: pd.DataFrame,
 ) -> pd.DataFrame:
    """
-    The point of this function is to not only classify sessions by using the previously defined functions.
+    Classify sessions and correct the time type.
+
+    The point of this function is to not only classify sessions
+        by using the previously defined functions.
    It also serves to "correct" the time type of some EMA sessions.

    A morning questionnaire could seamlessly transition into a daytime questionnaire,
        if the participant was already at work.
    In this case, the "time" label changed mid-session.
-    Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
+    Because of the way classify_sessions_by_time works,
+        this questionnaire was classified as "morning".
    But for all intents and purposes, it can be treated as a "daytime" EMA.

    The way this scenario is differentiated from a true "morning" questionnaire,
@ -194,13 +250,16 @@ def classify_sessions_by_completion_time(
    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
-            their time type (with some morning EMAs reclassified) and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
+            the number of items,
+            their time type (with some morning EMAs reclassified)
+            and timestamp of first answer.

    """
    df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
@ -219,7 +278,8 @@ def classify_sessions_by_completion_time(

 def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    This function eliminates invalid ESM responses.
+    Eliminate invalid ESM responses.
+
    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".

@ -256,3 +316,100 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
        )
    )
    return df_esm_clean
+
+
+def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1):
+    """
+    Increment answers to keep in line with original scoring.
+
+    We always used 0 for the lowest value of user answer.
+    Some scales originally used other scoring, such as starting from 1.
+    This restores original scoring so that the values are comparable to references.
+
+    Parameters
+    ----------
+    df_esm_clean: pd.DataFrame
+        A cleaned ESM dataframe, which must also include esm_user_answer_numeric.
+    increment_by:
+        A number to add to the user answer.
+
+    Returns
+    -------
+    df_esm_clean: pd.DataFrame
+        The same df with addition of a column 'esm_user_answer_numeric'.
+
+    """
+    try:
+        df_esm_clean = df_esm_clean.assign(
+            esm_user_score=lambda x: x.esm_user_answer_numeric + increment_by
+        )
+    except AttributeError as e:
+        print("Please, clean the dataframe first using features.esm.clean_up_esm.")
+        print(e)
+    return df_esm_clean
+
+
+def reassign_question_ids(
+    df_esm_cleaned: pd.DataFrame, question_ids_content: dict
+) -> pd.DataFrame:
+    """
+    Fix question IDs to match their actual content.
+
+    Unfortunately, when altering the protocol to adapt to COVID pandemic,
+    we did not retain original question IDs.
+    This means that for participants before 2021, they are different
+    from for the rest of them.
+    This function searches for question IDs by matching their strings.
+
+    Parameters
+    ----------
+    df_esm_cleaned: pd.DataFrame
+        A cleaned up dataframe, which must also include esm_user_answer_numeric.
+    question_ids_content: dict
+        A dictionary, linking question IDs with their content ("instructions").
+
+    Returns
+    -------
+    df_esm_fixed: pd.DataFrame
+        The same dataframe but with fixed question IDs.
+    """
+    df_esm_unique_questions = (
+        df_esm_cleaned.groupby("question_id")
+        .esm_instructions.value_counts()
+        .rename()
+        .reset_index()
+    )
+    # Tabulate all possible answers to each question (group by question ID).
+
+    # First, check that we anticipated all esm instructions.
+    for q_id in question_ids_content.keys():
+        # Look for all questions ("instructions") occurring in the dataframe.
+        actual_questions = df_esm_unique_questions.loc[
+            df_esm_unique_questions["question_id"] == q_id,
+            "esm_instructions",
+        ]
+        # These are all answers to a given question (by q_id).
+        questions_matches = actual_questions.str.startswith(
+            question_ids_content.get(q_id)
+        )
+        # See if they are expected, i.e. included in the dictionary.
+        if ~actual_questions.all():
+            print("One of the questions that occur in the data was undefined.")
+            print("This were the questions found in the data: ")
+            raise KeyError(actual_questions[~questions_matches])
+            # In case there is an unexpected answer, raise an exception.
+
+    # Next, replace question IDs.
+    df_esm_fixed = df_esm_cleaned.copy()
+    df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].apply(
+        lambda x: next(
+            (
+                key
+                for key, values in question_ids_content.items()
+                if x.startswith(values)
+            ),
+            None,
+        )
+    )
+
+    return df_esm_fixed
--- a/features/esm_COPE.py
+++ b/features/esm_COPE.py
@ -0,0 +1,125 @@
+COPE_ORIGINAL_MAX = 4
+COPE_ORIGINAL_MIN = 1
+
+DICT_COPE_QUESTION_IDS = {
+    164: (
+        "I took additional action to try to get rid of the problem",
+        "Ik deed extra mijn best om er iets aan te doen",
+        "Vložila sem dodaten napor, da bi rešila problem",
+        "Vložil sem dodaten napor, da bi rešil problem",
+    ),
+    165: (
+        "I concentrated my efforts on doing something about it",
+        "Ik probeerde de situatie te verbeteren",
+        "Svoje sile sem usmerila v reševanje nastale situacije",
+        "Svoje sile sem usmeril v reševanje nastale situacije",
+    ),
+    166: (
+        "I did what had to be done, one step at a time",
+        "Ik deed stap voor stap wat nodig was",
+        "Naredila sem, kar je bilo potrebno – korak za korakom",
+        "Naredil sem, kar je bilo potrebno – korak za korakom",
+    ),
+    167: (
+        "I took direct action to get around the problem",
+        "Ik handelde vlug om het probleem te verhelpen",
+        "Nekaj sem naredila, da sem zaobšla problem",
+        "Nekaj sem naredil, da sem zaobšel problem",
+    ),
+    168: (
+        "I tried to come up with a strategy about what to do",
+        "Ik probeerde te verzinnen wat ik er aan kon doen",
+        "Skušala sem najti ustrezen način za rešitev situacije",
+        "Skušal sem najti ustrezen način za rešitev situacije",
+    ),
+    169: (
+        "I made a plan of action",
+        "Ik maakte een plan",
+        "Naredila sem načrt za delovanje",
+        "Naredil sem načrt za delovanje",
+    ),
+    170: (
+        "I thought hard about what steps to take",
+        "Ik dacht hard na over wat ik moest doen",
+        "Dobro sem premislila, katere korake moram narediti, da rešim problem",
+        "Dobro sem premislil, katere korake moram narediti, da rešim problem",
+    ),
+    171: (
+        "I thought about how I might best handle the problem",
+        "lk dacht na over hoe ik het probleem het best kon aanpakken",
+        "Razmišljala sem, kaj bi bilo najbolje narediti s problemom",
+        "Razmišljal sem, kaj bi bilo najbolje narediti s problemom",
+    ),
+    172: (
+        "I asked people who have had similar experiences what they did",
+        "Ik vroeg aan mensen met dergelijke ervaringen hoe zij reageerden",
+        "Vprašala sem posameznike s podobnimi izkušnjami, kaj so storili",
+        "Vprašal sem posameznike s podobnimi izkušnjami, kaj so storili",
+    ),
+    173: (
+        "I tried to get advice from someone about what to do",
+        "lk vroeg advies aan iemand",
+        "Pri drugih sem poskušala dobiti nasvet, kaj naj storim",
+        "Pri drugih sem poskušal dobiti nasvet, kaj naj storim",
+    ),
+    174: (
+        "I talked to someone to find out more about the situation",
+        "Ik sprak met iemand om meer te weten te komen over de situatie",
+        "Z nekom sem se pogovorila, da bi izvedela še kaj o svojem problemu",
+        "Z nekom sem se pogovoril, da bi izvedel še kaj o svojem problemu",
+    ),
+    175: (
+        "I talked to someone who could do something concrete about the problem",
+        "Ik sprak met iemand die iets aan het probleem kon doen",
+        "Pogovorila sem se s kom, ki bi lahko naredil kaj konkretnega",
+        "Pogovoril sem se s kom, ki bi lahko naredil kaj konkretnega",
+    ),
+    176: (
+        "I talked to someone about how I felt",
+        "Ik sprak met iemand over hoe ik mij voelde",
+        "Z nekom sem se pogovorila o tem, kako sem se počutila",
+        "Z nekom sem se pogovoril o tem, kako sem se počutil",
+    ),
+    177: (
+        "I tried to get emotional support from friends or relatives",
+        "Ik zocht steun bij vrienden of familie",
+        "Skušala sem dobiti čustveno podporo prijateljev ali sorodnikov",
+        "Skušal sem dobiti čustveno podporo prijateljev ali sorodnikov",
+    ),
+    178: (
+        "I discussed my feelings with someone",
+        "lk besprak mijn gevoelens met iemand",
+        "O svojih občutkih sem se z nekom pogovorila",
+        "O svojih občutkih sem se z nekom pogovoril",
+    ),
+    179: (
+        "I got sympathy and understanding from someone",
+        "Ik vroeg medeleven en begrip van iemand",
+        "Poiskala sem naklonjenost in razumevanje drugih",
+        "Poiskal sem naklonjenost in razumevanje drugih",
+    ),
+    180: (
+        "I got upset and let my emotions out",
+        "Ik raakte van streek",
+        "Razburila sem se in to tudi pokazala",
+        "Razburil sem se in to tudi pokazal",
+    ),
+    181: (
+        "I let my feelings out",
+        "Ik toonde mijn gevoelens",
+        "Svojim čustvom sem dala prosto pot",
+        "Svojim čustvom sem dal prosto pot",
+    ),
+    182: (
+        "I felt a lot of emotional distress and I found myself expressing",
+        "lk liet duidelijk blijken hoe ellendig ik mij voelde",
+        "Doživljala sem veliko stresa in opažala, da sem čustva",
+        "Doživljal sem veliko stresa in opažal, da sem čustva",
+    ),
+    183: (
+        "I got upset, and I was really aware of it",
+        "Ik merkte dat ik erg van streek was",
+        "Razburila sem se in razmišljala samo o tem",
+        "Razburil sem se in razmišljal samo o tem",
+    ),
+}
--- a/features/esm_JCQ.py
+++ b/features/esm_JCQ.py
@ -1,9 +1,11 @@
 import pandas as pd

+from features.esm import increment_answers
+
 JCQ_ORIGINAL_MAX = 4
 JCQ_ORIGINAL_MIN = 1

-dict_JCQ_demand_control_reverse = {
+DICT_JCQ_DEMAND_CONTROL_REVERSE = {
    75: (
        "I was NOT asked",
        "Men legde mij geen overdreven",
@ -40,10 +42,14 @@ def reverse_jcq_demand_control_scoring(
    df_esm_jcq_demand_control: pd.DataFrame,
 ) -> pd.DataFrame:
    """
-    This function recodes answers in Job content questionnaire by first incrementing them by 1,
-    to be in line with original (1-4) scoring.
-    Then, some answers are reversed (i.e. 1 becomes 4 etc.), because the questions are negatively phrased.
-    These answers are listed in dict_JCQ_demand_control_reverse and identified by their question ID.
+    Reverse JCQ demand and control answers.
+
+    This function recodes answers in Job content questionnaire
+        by first incrementing them by 1, to be in line with original (1-4) scoring.
+    Then, some answers are reversed (i.e. 1 becomes 4 etc.),
+        because the questions are negatively phrased.
+    These answers are listed in DICT_JCQ_DEMAND_CONTROL_REVERSE
+        and identified by their question ID.
    However, the existing data is checked against literal phrasing of these questions
        to protect against wrong numbering of questions (differing question IDs).

@ -55,7 +61,8 @@ def reverse_jcq_demand_control_scoring(
    Returns
    -------
    df_esm_jcq_demand_control: pd.DataFrame
-        The same dataframe with a column esm_user_score containing answers recoded and reversed.
+        The same dataframe with a column esm_user_score
+            containing answers recoded and reversed.
    """
    df_esm_jcq_demand_control_unique_answers = (
        df_esm_jcq_demand_control.groupby("question_id")
@ -64,7 +71,7 @@ def reverse_jcq_demand_control_scoring(
        .reset_index()
    )
    # Tabulate all possible answers to each question (group by question ID).
-    for q_id in dict_JCQ_demand_control_reverse.keys():
+    for q_id in DICT_JCQ_DEMAND_CONTROL_REVERSE.keys():
        # Look through all answers that need to be reversed.
        possible_answers = df_esm_jcq_demand_control_unique_answers.loc[
            df_esm_jcq_demand_control_unique_answers["question_id"] == q_id,
@ -72,7 +79,7 @@ def reverse_jcq_demand_control_scoring(
        ]
        # These are all answers to a given question (by q_id).
        answers_matches = possible_answers.str.startswith(
-            dict_JCQ_demand_control_reverse.get(q_id)
+            DICT_JCQ_DEMAND_CONTROL_REVERSE.get(q_id)
        )
        # See if they are expected, i.e. included in the dictionary.
        if ~answers_matches.all():
@ -82,18 +89,16 @@ def reverse_jcq_demand_control_scoring(
            # In case there is an unexpected answer, raise an exception.

    try:
-        df_esm_jcq_demand_control = df_esm_jcq_demand_control.assign(
-            esm_user_score=lambda x: x.esm_user_answer_numeric + 1
-        )
-        # Increment the original answer by 1
-        # to keep in line with traditional scoring (JCQ_ORIGINAL_MIN - JCQ_ORIGINAL_MAX).
+        df_esm_jcq_demand_control = increment_answers(df_esm_jcq_demand_control)
+        # Increment the original answer by 1 to keep in line
+        # with traditional scoring (from JCQ_ORIGINAL_MIN to JCQ_ORIGINAL_MAX).
        df_esm_jcq_demand_control[
            df_esm_jcq_demand_control["question_id"].isin(
-                dict_JCQ_demand_control_reverse.keys()
+                DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
            )
        ] = df_esm_jcq_demand_control[
            df_esm_jcq_demand_control["question_id"].isin(
-                dict_JCQ_demand_control_reverse.keys()
+                DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
            )
        ].assign(
            esm_user_score=lambda x: JCQ_ORIGINAL_MAX
--- a/features/esm_SAM.py
+++ b/features/esm_SAM.py
@ -3,6 +3,9 @@ import pandas as pd

 import features.esm

+SAM_ORIGINAL_MAX = 5
+SAM_ORIGINAL_MIN = 1
+
 QUESTIONNAIRE_ID_SAM = {
    "event_stress": 87,
    "event_threat": 88,
@ -20,10 +23,107 @@ GROUP_QUESTIONNAIRES_BY = [
    "device_id",
    "esm_session",
 ]
-# Each questionnaire occurs only once within each esm_session on the same device within the same participant.
+# Each questionnaire occurs only once within each esm_session on the same device
+# within the same participant.
+
+
+DICT_SAM_QUESTION_IDS = {
+    87: (
+        "Was there a particular event that created tension in you?",
+        "Was er een bepaalde gebeurtenis die spanning veroorzaakte?",
+        "Je prišlo do kakega dogodka, ki je v vas ustvaril napetost?",
+    ),
+    88: (
+        "Did this event make you feel anxious?",
+        "Voelde je je angstig door deze gebeurtenis?",
+        "Ste se zaradi tega dogodka počutili tesnob",
+    ),
+    89: (
+        "Will the outcome of this event be negative?",
+        "Zal de uitkomst van deze gebeurtenis negatief zijn?",
+        "Bo izid tega dogodka negativen?",
+    ),
+    90: (
+        "How threatening was this event?",
+        "Hoe bedreigend was deze gebeurtenis?",
+        "Kako grozeč je bil ta dogodek?",
+    ),
+    91: (
+        "Is this going to have a negative impact on you?",
+        "Zal dit een negatieve impact op je hebben?",
+        "Ali bo to negativno vplivalo na vas?",
+    ),
+    92: (
+        "Is this going to have a positive impact on you?",
+        "Zal dit een positief effect op je hebben?",
+        "Ali bo to pozitivno vplivalo na vas?",
+    ),
+    93: (
+        "How eager are you to tackle this event?",
+        "Hoe graag wil je deze gebeurtenis aanpakken?",
+        "Kako zagnani ste bili",
+    ),
+    94: (
+        "To what extent can you become a stronger person because of this event?",
+        "In welke mate kan je een sterkere persoon worden door deze gebeurtenis?",
+        "V kolikšni meri lahko zaradi tega dogodka postanete močnejša oseba?",
+    ),
+    95: (
+        "To what extent are you excited thinking about the outcome of this event?",
+        "In welke mate ben je enthousiast bij de gedachte",
+        "V kolikšni meri vas misel na izid tega dogodka navdušuje?",
+    ),
+    96: (
+        "At what time did this event occur?",
+        "Hoe laat vond deze gebeurtenis plaats?",
+        "Kdaj se je ta dogodek zgodil?",
+    ),
+    97: (
+        "How long did this event last?",
+        "Hoe lang duurde deze gebeurtenis?",
+        "Kako dolgo je trajal ta dogodek?",
+    ),
+    98: (
+        "Was/is this event work-related?",
+        "Was/is deze gebeurtenis werkgerelateerd?",
+        "Je (bil) ta dogodek povezan s službo?",
+        "Je bil ali je ta dogodek povezan s službo?",
+    ),
+    99: (
+        "Did this overall period create tension in you?",
+        "Heeft deze globale periode spanning veroorzaakt?",
+        "Je to obdobje kot celota v vas ustvarilo napetost?",
+        "Je to celo obdobje v vas ustvarilo napetost?",
+    ),
+    100: (
+        "To what extent do you perceive this overall period as stressful?",
+        "In welke mate ervaar je deze globale periode als stressvol?",
+        "V kolikšni meri ste to obdobje dojemali kot stresno?",
+        "V kolikšni meri ste celo to obdobje dojemali kot stresno?",
+    ),
+}


 def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
+    """
+    Extract information about stressful events.
+
+    Participants were asked: "Was there a particular event that created tension in you?"
+    Then a subset of questions related to this event followed.
+    This function goes through the follow-up questions one by one
+        and preprocesses them, so that it adds new columns to the dataframe.
+
+    Parameters
+    ----------
+    df_esm: pd.DataFrame
+        A raw dataframe of all ESM data.
+
+    Returns
+    -------
+    df_esm_events: pd.DataFrame
+        A cleaned up df of Stress Appraisal Measure items with additional columns.
+
+    """
    # 0. Select only questions from Stress Appraisal Measure.
    df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
    df_esm_sam = df_esm_preprocessed[
@ -78,7 +178,8 @@ def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:

 def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
-    This function calculates challenge and threat (two Stress Appraisal Measure subscales) means,
+    This function calculates challenge and threat
+        (two Stress Appraisal Measure subscales) means,
        for each ESM session (within participants and devices).
    It creates a grouped dataframe with means in two columns.

@ -90,7 +191,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
    Returns
    -------
    df_esm_event_threat_challenge_mean_wide: pd.DataFrame
-        A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
+        A dataframe of unique ESM sessions (by participants and devices)
+        with threat and challenge means.
    """
    # Select only threat and challenge assessments for events
    df_esm_event_threat_challenge = df_esm_sam_clean[
@ -112,8 +214,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
        aggfunc="mean",
    )
    # Drop unnecessary column values.
-    df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
-        1
+    df_esm_event_threat_challenge_mean_wide.columns = (
+        df_esm_event_threat_challenge_mean_wide.columns.get_level_values(1)
    )
    df_esm_event_threat_challenge_mean_wide.columns.name = None
    df_esm_event_threat_challenge_mean_wide.rename(
@ -189,10 +291,12 @@ def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:

 def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
-    This function only serves to convert the string datetime answer into a real datetime type.
-    Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
-    NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
-        the NaTs can be interpreted to mean this.
+    This function only serves to convert the string datetime answer
+        into a real datetime type.
+    Errors during this conversion are coerced, meaning that non-datetime answers
+        are assigned Not a Time (NaT).
+    NOTE: Since the only available non-datetime answer to this question was
+        "0 - I do not remember", the NaTs can be interpreted to mean this.

    Parameters
    ----------
@ -208,9 +312,13 @@ def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
        df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
    ].assign(
        event_time=lambda x: pd.to_datetime(
-            x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
+            x.esm_user_answer,
+            errors="coerce",
+            format="%Y-%m-%d %H:%M:%S %z",
+            exact=True,
        )
    )
+    # Example answer: 2020-09-29 00:05:00 +0200
    return df_esm_event_time


@ -241,9 +349,12 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
        == QUESTIONNAIRE_ID_SAM.get("event_duration")
    ].assign(
        event_duration=lambda x: pd.to_datetime(
-            x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
+            x.esm_user_answer.str.slice(start=0, stop=-6),
+            errors="coerce",
+            format="%Y-%m-%d %H:%M:%S",
        ).dt.time
    )
+    # Example answer: 2020-09-29 00:05:00 +0200
    # TODO Explore the values recorded in event_duration and possibly fix mistakes.
    # For example, participants reported setting 23:50:00 instead of 00:50:00.

@ -251,7 +362,7 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    # we can determine whether:
    #   - this event is still going on ("1 - It is still going on")
    #   - the participant couldn't remember it's duration ("0 - I do not remember")
-    # Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
+    # Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm
    # but only the numeric types of questions and answers.
    # Since this was of "datetime" type, convert these specific answers here again.
    df_esm_event_duration["event_duration_info"] = np.nan
@ -264,4 +375,5 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    return df_esm_event_duration


-# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
+# TODO: How many questions about the stressfulness of the period were asked
+#  and how does this relate to events?
--- a/machine_learning/classification_models.py
+++ b/machine_learning/classification_models.py
@ -1,71 +1,123 @@
-from sklearn.dummy import DummyClassifier
-from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
+import pandas as pd
+import xgboost as xg
 from lightgbm import LGBMClassifier
-import xgboost as xg 
+from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
+from sklearn.dummy import DummyClassifier

-class ClassificationModels():
-    
+
+class ClassificationModels:
    def __init__(self):
        self.cmodels = self.init_classification_models()
-        
+
    def get_cmodels(self):
        return self.cmodels

    def init_classification_models(self):
        cmodels = {
-            'dummy_classifier': {
-                'model': DummyClassifier(strategy="most_frequent"),
-                'metrics': [0, 0, 0, 0]
+            "dummy_classifier": {
+                "model": DummyClassifier(strategy="most_frequent"),
+                "metrics": [0, 0, 0, 0],
            },
-            'logistic_regression': {
-                'model': linear_model.LogisticRegression(max_iter=1000),
-                'metrics': [0, 0, 0, 0]
+            "logistic_regression": {
+                "model": linear_model.LogisticRegression(max_iter=1000),
+                "metrics": [0, 0, 0, 0],
            },
-            'support_vector_machine': {
-                'model': svm.SVC(),
-                'metrics': [0, 0, 0, 0]
+            "support_vector_machine": {"model": svm.SVC(), "metrics": [0, 0, 0, 0]},
+            "gaussian_naive_bayes": {
+                "model": naive_bayes.GaussianNB(),
+                "metrics": [0, 0, 0, 0],
            },
-            'gaussian_naive_bayes': {
-                'model': naive_bayes.GaussianNB(),
-                'metrics': [0, 0, 0, 0]
+            "stochastic_gradient_descent_classifier": {
+                "model": linear_model.SGDClassifier(),
+                "metrics": [0, 0, 0, 0],
            },
-            'stochastic_gradient_descent_classifier': {
-                'model': linear_model.SGDClassifier(),
-                'metrics': [0, 0, 0, 0]
+            "knn": {"model": neighbors.KNeighborsClassifier(), "metrics": [0, 0, 0, 0]},
+            "decision_tree": {
+                "model": tree.DecisionTreeClassifier(),
+                "metrics": [0, 0, 0, 0],
            },
-            'knn': {
-                'model': neighbors.KNeighborsClassifier(),
-                'metrics': [0, 0, 0, 0]
+            "random_forest_classifier": {
+                "model": ensemble.RandomForestClassifier(),
+                "metrics": [0, 0, 0, 0],
            },
-            'decision_tree': {
-                'model': tree.DecisionTreeClassifier(),
-                'metrics': [0, 0, 0, 0]
+            "gradient_boosting_classifier": {
+                "model": ensemble.GradientBoostingClassifier(),
+                "metrics": [0, 0, 0, 0],
            },
-            'random_forest_classifier': {
-                'model': ensemble.RandomForestClassifier(),
-                'metrics': [0, 0, 0, 0]
+            "lgbm_classifier": {"model": LGBMClassifier(), "metrics": [0, 0, 0, 0]},
+            "XGBoost_classifier": {
+                "model": xg.sklearn.XGBClassifier(),
+                "metrics": [0, 0, 0, 0],
            },
-            'gradient_boosting_classifier': {
-                'model': ensemble.GradientBoostingClassifier(),
-                'metrics': [0, 0, 0, 0]
-            },
-            'lgbm_classifier': {
-                'model': LGBMClassifier(),
-                'metrics': [0, 0, 0, 0]
-            },
-            'XGBoost_classifier': {
-                'model': xg.sklearn.XGBClassifier(),
-                'metrics': [0, 0, 0, 0]
-            }
        }
-        
+
        return cmodels
-    
+
    def get_total_models_scores(self, n_clusters=1):
+        scores = pd.DataFrame(columns=["method", "metric", "mean"])
        for model_title, model in self.cmodels.items():
+            scores_df = pd.DataFrame(columns=["method", "metric", "mean"])
            print("\n************************************\n")
            print("Current model:", model_title, end="\n")
-            print("Acc:", model['metrics'][0]/n_clusters)
-            print("Precision:", model['metrics'][1]/n_clusters)
-            print("Recall:", model['metrics'][2]/n_clusters)
-            print("F1:", model['metrics'][3]/n_clusters)
+            print("Acc:", model["metrics"][0] / n_clusters)
+            scores_df = pd.concat(
+                [
+                    scores_df,
+                    pd.DataFrame(
+                        {
+                            "method": model_title,
+                            "metric": "test_accuracy",
+                            "mean": model["metrics"][0] / n_clusters,
+                        },
+                        index=[0],
+                    ),
+                ],
+                ignore_index=True,
+            )
+            print("Precision:", model["metrics"][1] / n_clusters)
+            scores_df = pd.concat(
+                [
+                    scores_df,
+                    pd.DataFrame(
+                        {
+                            "method": model_title,
+                            "metric": "test_precision",
+                            "mean": model["metrics"][1] / n_clusters,
+                        },
+                        index=[0],
+                    ),
+                ],
+                ignore_index=True,
+            )
+            print("Recall:", model["metrics"][2] / n_clusters)
+            scores_df = pd.concat(
+                [
+                    scores_df,
+                    pd.DataFrame(
+                        {
+                            "method": model_title,
+                            "metric": "test_recall",
+                            "mean": model["metrics"][2] / n_clusters,
+                        },
+                        index=[0],
+                    ),
+                ],
+                ignore_index=True,
+            )
+            print("F1:", model["metrics"][3] / n_clusters)
+            scores_df = pd.concat(
+                [
+                    scores_df,
+                    pd.DataFrame(
+                        {
+                            "method": model_title,
+                            "metric": "test_f1",
+                            "mean": model["metrics"][3] / n_clusters,
+                        },
+                        index=[0],
+                    ),
+                ],
+                ignore_index=True,
+            )
+            scores = pd.concat([scores, scores_df])
+        return scores
--- a/machine_learning/cross_validation.py
+++ b/machine_learning/cross_validation.py
@ -49,8 +49,8 @@ class CrossValidation():

            data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
           
-        elif self.cv_method == "5kfold":
-            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
+        elif self.cv_method == "Stratified5kfold":
+            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None

        self.X, self.y, self.groups = data_X, data_y, data_groups

@ -71,7 +71,7 @@ class CrossValidation():
        
        if self.cv_method in ["logo", "half_logo"]:
            self.cv = LeaveOneGroupOut()
-        elif self.cv_method == "5kfold":
+        elif self.cv_method == "Stratified5kfold":
            self.cv = StratifiedKFold(n_splits=5, shuffle=True)


@ -118,4 +118,11 @@ class CrossValidation():
        """
        return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
    
+    def get_groups_sets(self, split):
+        
+        if self.groups is None:
+            return None, None
+        else:
+            return self.groups.iloc[split[0]], self.groups.iloc[split[1]]
+    
    
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@ -1,11 +1,13 @@
 import os
 import sys
+import warnings

 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd

-from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
+from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso 

@ -21,13 +23,15 @@ from sklearn.linear_model import Lasso

 class FeatureSelection:

-    def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
-        pass # TODO.... 
+    def __init__(self, X, y, groups):
+        self.X = X
+        self.y = y
+        self.groups = groups

    
-    def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
+    def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
        """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
-        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat 
+        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular 
        feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
        specified as a parameter.

@ -35,7 +39,11 @@ class FeatureSelection:
            df (DataFrame): Input data on which the predictions will be made.
            features (list): List of features to select the best/worst from
            method (str, optional): remove or add features.  Defaults to "remove".
-            ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and  metric. Defaults to "classification".
+            ml_category (str, optional): Either classification or regression ml problem controls the ML algorithm and  metric. 
+                Defaults to "classification".
+            ml_subcategory (str, optional): In case of classification '_bin' for binary classification 
+                and 'multi' for multiclass classification. For regression an empty string '' is sufficient. 
+                Defaults to "bin".
            metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
            stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to [].

@ -49,173 +57,189 @@ class FeatureSelection:
        
        best_feature = None
        
-        if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
-            raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
-        elif ml_type == "regression" and metric not in ['r2']:
+        # Validacije tipov ML in specificiranimi metrikami
+        if ml_category == "classification":
+            if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
+                raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
+            elif ml_subcategory == "multi":
+                ml_subcategory_error = False
+                if metric != "accuracy" and "_" in metric:          
+                    metric_s, metric_t = metric.split("_")
+                    if metric_s not in ['accuracy', 'precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
+                        ml_subcategory_error = True
+                else:
+                    ml_subcategory_error = True
+                    
+                if ml_subcategory_error:
+                    raise ValueError(""""Classification metric for multi-class classification must be specified precisely.
+                                     Available metric are: 'accuracy', 'precision', 'recall' and 'f1'.
+                                     Only accuracy must be specified as 'accuracy'.
+                                     For others please add appropriate suffixes: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'""")
+        elif ml_category == "regression" and metric not in ['r2']:
            raise ValueError("Regression metric not recognized. Please choose 'r2'")

        for feat in features:
            if method == "remove":
-                pred_features = [col for col in df.columns if feat != col] # All but feat
+                pred_features = [col for col in self.X.columns if feat != col] # All but feat
            elif method == "add":
                pred_features = [feat] + stored_features # Feat with stored features
            
-            X, y  = df.drop(columns=['target', 'pid'])[pred_features], df['target']
+            X  = self.X[pred_features].copy()
            
-            if ml_type == "classification":
+            if self.groups is not None:
+                cv = GroupKFold(n_splits=5)
+            else:
+                cv = StratifiedKFold(n_splits=5, shuffle=True)
+                
+            # See link about scoring for multiclassfication
+            # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
+            if ml_category == "classification":
                nb = GaussianNB()
                model_cv = cross_validate(
                    nb,
                    X=X,
-                    y=y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    y=self.y,
+                    cv=cv,
+                    groups=self.groups,
                    n_jobs=-1,
-                    scoring=('accuracy', 'precision', 'recall', 'f1')
+                    scoring=(metric)
                )
                
-                with warnings.catch_warnings():
-                    warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
-
-                    if metric == "accuracy":
-                        acc = np.mean(model_cv['test_accuracy'])
-                        acc_std = np.std(model_cv['test_accuracy'])
-                        
-                        if not best_feature or (acc > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = acc
-                            best_metric_score_std = acc_std
-                    
-                    elif metric == "precision":
-                        prec = np.mean(model_cv['test_precision'])
-                        prec_std = np.std(model_cv['test_precision'])
-                        
-                        if not best_feature or (prec > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = prec
-                            best_metric_score_std = prec_std
-                    
-                    elif metric == "recall":
-                        rec = np.mean(model_cv['test_recall'])
-                        rec_std = np.std(model_cv['test_recall'])
-                        
-                        if not best_feature or (rec > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = rec
-                            best_metric_score_std = rec_std
-                    
-                    else:
-                        f1 = np.mean(model_cv['test_f1'])
-                        f1_std = np.std(model_cv['test_f1'])
-                        
-                        if not best_feature or (f1 > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = f1
-                            best_metric_score_std = f1_std 
                                       
-            elif ml_type == "regression":
+            elif ml_category == "regression":
                lass = Lasso()
                model_cv = cross_validate(
                    lass,
                    X=X,
                    y=y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    cv=cv,
+                    groups=self.groups,
                    n_jobs=-1,
                    scoring=('r2')
                )

-                if metric == "r2":
-                    r2 = np.mean(model_cv['test_r2'])
-                    r2_std = np.std(model_cv['test_r2'])
-                    
-                    if not best_feature or (r2 > best_metric_score):
-                        best_feature = feat
-                        best_metric_score = r2
-                        best_metric_score_std = r2_std
            else:
                raise ValueError("ML type not yet implemented!")
+            
+            # Section of metrics' scores comparison. 
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
+
+                metric_score = np.nanmean(model_cv["test_score"])
+                metric_score_std = np.nanstd(model_cv["test_score"])
+                
+                if not best_feature or (metric_score > best_metric_score):
+                    best_feature = feat
+                    best_metric_score = metric_score
+                    best_metric_score_std = metric_score_std
                    
        return best_feature, best_metric_score, best_metric_score_std
    
    
-    def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
+    def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
+        """This method selects a set of features and returns them as a list. It returns number of features 
+        determined in the interval of [n_min, n_max]. 
        
-        n_features = df.shape[1] - 2 # -2 beacause pid and target are not considered
-        if n_max > n_features:
-            n_max = n_features
+        The method consists of two steps: 
+        (1) The method uses sklearn kBest method which selects k best features dependent on the ml_type parameter.
+        (2) The sequential features removal procedure is executed. Using the remaing features from (1).
+            The best score is detected using a removal procedure. The procedure sequentially removes the features 
+            that attribute the least to the choosen evaluation metric. If in this sequence the score ML score is 
+            improved the next feature is remove otherwise there is a tolerance criteria (n_tolerance) 
+            with which the next n removed features are inspected whether currently best score is improved.     
+
+        Args:
+            n_min (int, optional): Minimal amount of features returned.
+            n_max (int, optional): Maximal amount of features returned.
+            k (int, optional): Determines the k in the k-best features method. 
+                If None, SelectKBest feature selection does not execute.
+            ml_type(str, optional): Type of ML problem. Currently implemented options: 
+                'classification_bin', 'classification_multi', and 'regression_'
+            method (str, optional): "remove" or "add" features.  Defaults to "remove".
+            n_tolerance (int, optional): If the best score is not improved in n that is specified by this parameter
+                the method returns index of feature with current best score as a tipping point feature.
+            
+        Returns:
+            list: list of selected features
+        """        
+
+        if k is not None and k <= n_max:
+            raise ValueError("The k parameter needs to be greater than the n_max parameter.")
        
+        # Select k-best feature dependent on the type of ML task
+        ml_category, ml_subcategory = ml_type.split("_")
+
+        if k is not None:
+            if ml_category == "classification":
+                if ml_subcategory== "bin":
+                    selector = SelectKBest(mutual_info_classif, k=k)
+                elif ml_subcategory== "multi":
+                    selector = SelectKBest(f_classif, k=k)
+                else:
+                    raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
+            elif ml_category == "regression":
+                selector = SelectKBest(f_regression, k=k)
+            else:
+                raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
+            
+            selector.fit(self.X, self.y)
+            cols_idxs = selector.get_support(indices=True)
+            self.X = self.X.iloc[:,cols_idxs]
+        
+        print("All columns (after SelectKBest method):")
+        print(self.X.columns)
+        
+        # Sequential feature addition / removal
+        n_features = self.X.shape[1]
+        if n_max >= n_features:
+            n_max = n_features-1 # The algorithm removes at least one feature
+            
        if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
+            raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")
        
        if n_max < n_min:
            raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
        
-        features = df.columns.tolist()
-        features.remove("pid")
-        features.remove("target")
+        features = self.X.columns.tolist()
        feature_importance = []
        if method == "remove":
+            best_score = 0
+            best_feature_indx = None
+            i_worse = 0
            for i in reversed(range(n_features)):
                
+                if i+1 == n_min:
+                    break
+                
                best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
-                feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
+                    self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric)
+                    
+                feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
                
                features.remove(best_feature)
+                print("Features left:", i) 
+                
+                if i <= n_max:
+                    if best_metric_score >= best_score:
+                        best_score = best_metric_score
+                        best_feature_indx = i+1
+                        i_worse = 0
+                    else:
+                        i_worse += 1
+                    
+                    if i_worse == n_tolerance: 
+                        break  
                
            feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
-            
-            # Selekcijski kriterij značilk v rangu max-min
-            # Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
-            
-            # Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
-            
-            # "Tipping point" značilka mora biti v rangu max-min
-            
-            selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
-            selection_area.set_index(["i", "name"], inplace=True)
-            diffrences = selection_area.diff()
-            diffrences.dropna(how='any', inplace=True)
-            
-            # Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo 
-            cumulative_sumation = diffrences.cumsum()
-            tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]

-            # Zelo konzervativna metoda, ki ob prvem neizboljšanjem rezultata preneha z iskanjem boljše alternative 
-            tipping_feature_indx_2 = None
-            for indx, row in diffrences.iterrows():
-                if row["metric"] > 0:
-                    tipping_feature_indx_2 = indx
-                else: 
-                    break
-                
-            # Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score     
-            tipping_feature_indx_3 = None
-            cum_sum_score = 0
-            i_worse = 0
-            # TODO: morda bi bilo smisleno združiti diff, cumsum in scores stolpce ...
-            for indx, row in selection_area.iterrows():
-                if row["metric"] > 0:
-                    tipping_feature_indx_3 = indx
-                    cum_sum_score += row["metric"]
-                    i_worse = 0
-                else:
-                    i_worse += 1
-                
-                if i_worse == n_not_improve:
-                    break
-                    
-                    
-                
-                
-            
+            print(feature_importance_df)
+            print("best_feature_indx", best_feature_indx)
+            print("best_score", best_score)
+
+            features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
+            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]    
            
+            return selected_features
        
-
-    def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
-        pass
-
-    def vizualize_feature_selection_process():
-        pass
-
-    def execute_feature_selection_step():
-        pass
+        else:
+            raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
--- a/machine_learning/helper.py
+++ b/machine_learning/helper.py
@ -11,7 +11,13 @@ from sklearn import (
    svm,
 )
 from sklearn.dummy import DummyClassifier, DummyRegressor
-from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.metrics import confusion_matrix
+from sklearn.model_selection import (
+    BaseCrossValidator,
+    LeaveOneGroupOut,
+    StratifiedKFold,
+    cross_validate,
+)
 from xgboost import XGBClassifier, XGBRegressor


@ -73,7 +79,40 @@ def insert_row(df, row):
    return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)


-def prepare_regression_model_input(model_input, cv_method="logo"):
+def impute_encode_categorical_features(model_input: pd.DataFrame) -> pd.DataFrame:
+    categorical_feature_col_names = [
+        "gender",
+        "startlanguage",
+        "limesurvey_demand_control_ratio_quartile",
+    ]
+    additional_categorical_features = [
+        col
+        for col in model_input.columns
+        if "mostcommonactivity" in col or "homelabel" in col
+    ]
+    categorical_feature_col_names += additional_categorical_features
+
+    categorical_features = model_input[categorical_feature_col_names].copy()
+
+    mode_categorical_features = categorical_features.mode().iloc[0]
+    # fillna with mode
+    categorical_features = categorical_features.fillna(mode_categorical_features)
+    # one-hot encoding
+    categorical_features = categorical_features.apply(
+        lambda col: col.astype("category")
+    )
+    if not categorical_features.empty:
+        categorical_features = pd.get_dummies(categorical_features)
+
+    numerical_features = model_input.drop(categorical_feature_col_names, axis=1)
+
+    model_input = pd.concat([numerical_features, categorical_features], axis=1)
+    return model_input
+
+
+def prepare_sklearn_data_format(
+    model_input: pd.DataFrame, cv_method: str = "logo"
+) -> tuple:
    index_columns = [
        "local_segment",
        "local_segment_label",
@ -82,13 +121,7 @@ def prepare_regression_model_input(model_input, cv_method="logo"):
    ]
    model_input.set_index(index_columns, inplace=True)

-    if cv_method == "logo":
-        data_x, data_y, data_groups = (
-            model_input.drop(["target", "pid"], axis=1),
-            model_input["target"],
-            model_input["pid"],
-        )
-    else:
+    if cv_method == "half_logo":
        model_input["pid_index"] = model_input.groupby("pid").cumcount()
        model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")

@ -104,52 +137,53 @@ def prepare_regression_model_input(model_input, cv_method="logo"):
            model_input["target"],
            model_input["pid_half"],
        )
+    else:
+        data_x, data_y, data_groups = (
+            model_input.drop(["target", "pid"], axis=1),
+            model_input["target"],
+            model_input["pid"],
+        )
+    return data_x, data_y, data_groups

-    categorical_feature_colnames = [
-        "gender",
-        "startlanguage",
-        "limesurvey_demand_control_ratio_quartile",
-    ]
-    additional_categorical_features = [
-        col
-        for col in data_x.columns
-        if "mostcommonactivity" in col or "homelabel" in col
-    ]
-    categorical_feature_colnames += additional_categorical_features

-    categorical_features = data_x[categorical_feature_colnames].copy()
+def prepare_cross_validator(
+    data_x: pd.DataFrame,
+    data_y: pd.DataFrame,
+    data_groups: pd.DataFrame,
+    cv_method: str = "logo",
+) -> BaseCrossValidator:
+    if cv_method == "logo" or cv_method == "half_logo":
+        cv = LeaveOneGroupOut()
+        cv.get_n_splits(
+            data_x,
+            data_y,
+            groups=data_groups,
+        )
+    else:
+        cv = StratifiedKFold(n_splits=5, shuffle=True)
+    return cv

-    mode_categorical_features = categorical_features.mode().iloc[0]
-    # fillna with mode
-    categorical_features = categorical_features.fillna(mode_categorical_features)
-    # one-hot encoding
-    categorical_features = categorical_features.apply(
-        lambda col: col.astype("category")
+
+def aggregate_and_transpose(df: pd.DataFrame, statistics=None) -> pd.DataFrame:
+    if statistics is None:
+        statistics = ["max", "mean"]
+    return (
+        df.agg(statistics)
+        .transpose()
+        .reset_index()
+        .rename(columns={"index": "test_metric"})
    )
-    if not categorical_features.empty:
-        categorical_features = pd.get_dummies(categorical_features)
-
-    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
-
-    train_x = pd.concat([numerical_features, categorical_features], axis=1)
-
-    return train_x, data_y, data_groups


-def run_all_regression_models(input_csv):
-    # Prepare data
-    data_x, data_y, data_groups = prepare_regression_model_input(input_csv)
-
-    # Prepare cross validation
-    logo = LeaveOneGroupOut()
-    logo.get_n_splits(
-        data_x,
-        data_y,
-        groups=data_groups,
-    )
+def run_all_regression_models(
+    data_x: pd.DataFrame,
+    data_y: pd.DataFrame,
+    data_groups: pd.DataFrame,
+    cross_validator: BaseCrossValidator,
+) -> pd.DataFrame:
    metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
    test_metrics = ["test_" + metric for metric in metrics]
-    scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
+    scores = pd.DataFrame(columns=["method", "test_metric", "max", "nanmedian"])

    # Validate models
    dummy_regr = DummyRegressor(strategy="mean")
@ -158,7 +192,7 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
@ -166,17 +200,19 @@ def run_all_regression_models(input_csv):
    print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))

    scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "dummy"
    scores = pd.concat([scores, scores_df])
+    del dummy_regr
+    del dummy_regr_scores

-    lin_reg_rapids = linear_model.LinearRegression()
+    lin_reg = linear_model.LinearRegression()
    lin_reg_scores = cross_validate(
-        lin_reg_rapids,
+        lin_reg,
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
@ -184,9 +220,11 @@ def run_all_regression_models(input_csv):
    print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))

    scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "linear_reg"
    scores = pd.concat([scores, scores_df])
+    del lin_reg
+    del lin_reg_scores

    ridge_reg = linear_model.Ridge(alpha=0.5)
    ridge_reg_scores = cross_validate(
@ -194,16 +232,18 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("Ridge regression")

    scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "ridge_reg"
    scores = pd.concat([scores, scores_df])
+    del ridge_reg
+    del ridge_reg_scores

    lasso_reg = linear_model.Lasso(alpha=0.1)
    lasso_reg_score = cross_validate(
@ -211,16 +251,18 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("Lasso regression")

    scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "lasso_reg"
    scores = pd.concat([scores, scores_df])
+    del lasso_reg
+    del lasso_reg_score

    bayesian_ridge_reg = linear_model.BayesianRidge()
    bayesian_ridge_reg_score = cross_validate(
@ -228,16 +270,18 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("Bayesian Ridge")

    scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "bayesian_ridge"
    scores = pd.concat([scores, scores_df])
+    del bayesian_ridge_reg
+    del bayesian_ridge_reg_score

    ransac_reg = linear_model.RANSACRegressor()
    ransac_reg_score = cross_validate(
@ -245,27 +289,37 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("RANSAC (outlier robust regression)")

    scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "RANSAC"
    scores = pd.concat([scores, scores_df])
+    del ransac_reg
+    del ransac_reg_score

    svr = svm.SVR()
    svr_score = cross_validate(
-        svr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
+        svr,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        scoring=metrics,
    )
    print("Support vector regression")

    scores_df = pd.DataFrame(svr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "SVR"
    scores = pd.concat([scores, scores_df])
+    del svr
+    del svr_score

    kridge = kernel_ridge.KernelRidge()
    kridge_score = cross_validate(
@ -273,69 +327,130 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("Kernel Ridge regression")

    scores_df = pd.DataFrame(kridge_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "kernel_ridge"
    scores = pd.concat([scores, scores_df])
+    del kridge
+    del kridge_score

    gpr = gaussian_process.GaussianProcessRegressor()
    gpr_score = cross_validate(
-        gpr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
+        gpr,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        scoring=metrics,
    )
    print("Gaussian Process Regression")

    scores_df = pd.DataFrame(gpr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "gaussian_proc"
    scores = pd.concat([scores, scores_df])
+    del gpr
+    del gpr_score

    rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
    rfr_score = cross_validate(
-        rfr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
+        rfr,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        scoring=metrics,
    )
    print("Random Forest Regression")

    scores_df = pd.DataFrame(rfr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "random_forest"
    scores = pd.concat([scores, scores_df])
+    del rfr
+    del rfr_score

    xgb = XGBRegressor()
    xgb_score = cross_validate(
-        xgb, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
+        xgb,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        scoring=metrics,
    )
    print("XGBoost Regressor")

    scores_df = pd.DataFrame(xgb_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "XGBoost"
    scores = pd.concat([scores, scores_df])
+    del xgb
+    del xgb_score

    ada = ensemble.AdaBoostRegressor()
    ada_score = cross_validate(
-        ada, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
+        ada,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        scoring=metrics,
    )
    print("ADA Boost Regressor")

    scores_df = pd.DataFrame(ada_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "ADA_boost"
    scores = pd.concat([scores, scores_df])
+    del ada
+    del ada_score

    return scores


-def run_all_classification_models(data_x, data_y, data_groups, cv_method):
-    metrics = ["accuracy", "average_precision", "recall", "f1"]
+def confusion_matrix_scorer(clf, X, y):
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]}
+
+
+def aggregate_confusion_matrix(scores_dict: dict) -> pd.DataFrame:
+    scores_aggregated = aggregate_and_transpose(
+        pd.DataFrame(scores_dict), statistics=["sum"]
+    )
+    return scores_aggregated[
+        ~scores_aggregated.test_metric.isin(["fit_time", "score_time"])
+    ]
+
+
+def run_all_classification_models(
+    data_x: pd.DataFrame,
+    data_y: pd.DataFrame,
+    data_groups: pd.DataFrame,
+    cross_validator: BaseCrossValidator,
+):
+    data_y_value_counts = data_y.value_counts()
+    if len(data_y_value_counts) == 1:
+        raise (ValueError("There is only one unique value in data_y."))
+    if len(data_y_value_counts) == 2:
+        metrics = ["accuracy", "average_precision", "recall", "f1"]
+    else:
+        metrics = ["accuracy", "precision_micro", "recall_micro", "f1_micro"]
+
    test_metrics = ["test_" + metric for metric in metrics]

-    scores = pd.DataFrame(columns=["method", "max", "mean"])
+    scores = pd.DataFrame(columns=["method", "test_metric", "max", "mean"])

    dummy_class = DummyClassifier(strategy="most_frequent")

@ -344,17 +459,39 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        error_score="raise",
        scoring=metrics,
    )
+    dummy_confusion_matrix = cross_validate(
+        dummy_class,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        error_score="raise",
+        scoring=confusion_matrix_scorer,
+    )
    print("Dummy")

    scores_df = pd.DataFrame(dummy_score)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
-    scores_df["method"] = "Dummy"
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
+    scores_df = pd.concat(
+        [
+            scores_df,
+            aggregate_confusion_matrix(dummy_confusion_matrix).rename(
+                columns={"sum": "mean"}
+                # Note: the column is misleadingly renamed to get concise output.
+            ),
+        ]
+    )
+    scores_df["method"] = "dummy_classifier"
    scores = pd.concat([scores, scores_df])
+    del dummy_class
+    del dummy_score
+    del dummy_confusion_matrix

    logistic_regression = linear_model.LogisticRegression()

@ -363,16 +500,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
+    log_reg_confusion_matrix = cross_validate(
+        logistic_regression,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        scoring=confusion_matrix_scorer,
+    )
    print("Logistic regression")

    scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
-    scores_df["method"] = "logistic_reg"
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
+    scores_df = pd.concat(
+        [
+            scores_df,
+            aggregate_confusion_matrix(log_reg_confusion_matrix).rename(
+                columns={"sum": "mean"}
+                # Note: the column is misleadingly renamed to get concise output.
+            ),
+        ]
+    )
+    scores_df["method"] = "logistic_regression"
    scores = pd.concat([scores, scores_df])
+    del logistic_regression
+    del log_reg_scores
+    del log_reg_confusion_matrix

    svc = svm.SVC()

@ -381,16 +539,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
+    svc_confusion_matrix = cross_validate(
+        svc,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        scoring=confusion_matrix_scorer,
+    )
    print("Support Vector Machine")

    scores_df = pd.DataFrame(svc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
-    scores_df["method"] = "svc"
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
+    scores_df = pd.concat(
+        [
+            scores_df,
+            aggregate_confusion_matrix(svc_confusion_matrix).rename(
+                columns={"sum": "mean"}
+                # Note: the column is misleadingly renamed to get concise output.
+            ),
+        ]
+    )
+    scores_df["method"] = "SVC"
    scores = pd.concat([scores, scores_df])
+    del svc
+    del svc_scores
+    del svc_confusion_matrix

    gaussian_nb = naive_bayes.GaussianNB()

@ -399,16 +578,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
+    gaussian_nb_confusion_matrix = cross_validate(
+        gaussian_nb,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        scoring=confusion_matrix_scorer,
+    )
    print("Gaussian Naive Bayes")

    scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
+    scores_df = pd.concat(
+        [
+            scores_df,
+            aggregate_confusion_matrix(gaussian_nb_confusion_matrix).rename(
+                columns={"sum": "mean"}
+                # Note: the column is misleadingly renamed to get concise output.
+            ),
+        ]
+    )
    scores_df["method"] = "gaussian_naive_bayes"
    scores = pd.concat([scores, scores_df])
+    del gaussian_nb
+    del gaussian_nb_scores
+    del gaussian_nb_confusion_matrix

    sgdc = linear_model.SGDClassifier()

@ -417,16 +617,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
+    sgdc_confusion_matrix = cross_validate(
+        sgdc,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        scoring=confusion_matrix_scorer,
+    )
    print("Stochastic Gradient Descent")

    scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
-    scores_df["method"] = "stochastic_gradient_descent"
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
+    scores_df = pd.concat(
+        [
+            scores_df,
+            aggregate_confusion_matrix(sgdc_confusion_matrix).rename(
+                columns={"sum": "mean"}
+                # Note: the column is misleadingly renamed to get concise output.
+            ),
+        ]
+    )
+    scores_df["method"] = "stochastic_gradient_descent_classifier"
    scores = pd.concat([scores, scores_df])
+    del sgdc
+    del sgdc_scores
+    del sgdc_confusion_matrix

    rfc = ensemble.RandomForestClassifier()

@ -435,16 +656,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
+    rfc_confusion_matrix = cross_validate(
+        rfc,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        scoring=confusion_matrix_scorer,
+    )
    print("Random Forest")

    scores_df = pd.DataFrame(rfc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
-    scores_df["method"] = "random_forest"
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
+    scores_df = pd.concat(
+        [
+            scores_df,
+            aggregate_confusion_matrix(rfc_confusion_matrix).rename(
+                columns={"sum": "mean"}
+                # Note: the column is misleadingly renamed to get concise output.
+            ),
+        ]
+    )
+    scores_df["method"] = "random_forest_classifier"
    scores = pd.concat([scores, scores_df])
+    del rfc
+    del rfc_scores
+    del rfc_confusion_matrix

    xgb_classifier = XGBClassifier()

@ -453,15 +695,36 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
+    xgb_confusion_matrix = cross_validate(
+        xgb_classifier,
+        X=data_x,
+        y=data_y,
+        groups=data_groups,
+        cv=cross_validator,
+        n_jobs=-1,
+        scoring=confusion_matrix_scorer,
+    )
    print("XGBoost")

    scores_df = pd.DataFrame(xgb_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
-    scores_df["method"] = "xgboost"
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
+    scores_df = pd.concat(
+        [
+            scores_df,
+            aggregate_confusion_matrix(xgb_confusion_matrix).rename(
+                columns={"sum": "mean"}
+                # Note: the column is misleadingly renamed to get concise output.
+            ),
+        ]
+    )
+    scores_df["method"] = "XGBoost_classifier"
    scores = pd.concat([scores, scores_df])
+    del xgb_classifier
+    del xgb_scores
+    del xgb_confusion_matrix

    return scores
--- a/machine_learning/preprocessing.py
+++ b/machine_learning/preprocessing.py
@ -33,7 +33,7 @@ class Preprocessing:
        Args:
            categorical_features (DataFrame): DataFrame including only categorical columns.
            numerical_features (_type_): DataFrame including only numerical columns.
-            mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
+            mode (int): Mode of the column with which DataFrame is filled.

        Returns:
            DataFrame: Hot-One Encoded DataFrame.
@ -46,7 +46,7 @@ class Preprocessing:
        if not categorical_features.empty:
            categorical_features = pd.get_dummies(categorical_features)

-        return pd.concat([numerical_features, categorical_features], axis=1)
+        return pd.concat([numerical_features, categorical_features], axis=1), categorical_features.columns.tolist()


    def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
@ -68,20 +68,27 @@ class Preprocessing:
        categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]

        # For train set
-        
        train_X_categorical_features = self.train_X[categorical_columns].copy()
        train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
        mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
        
-        self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
+        self.train_X, train_cat_col_names = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
+        encoded_categorical_features = [col for col in self.train_X.columns if col.startswith(tuple(categorical_columns))]
        
        # For test set
-        
        test_X_categorical_features = self.test_X[categorical_columns].copy()
        test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
        
-        self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
+        self.test_X, test_cat_col_names = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)

+        # Create categorical columns that were not found in test set and fill them with 0        
+        missing_cols = [col for col in train_cat_col_names if col not in test_cat_col_names]
+        self.test_X[missing_cols] = 0
+        
+        # Sort column names alphabetically        
+        self.train_X = self.train_X.reindex(sorted(self.train_X.columns), axis=1)
+        self.test_X = self.test_X.reindex(sorted(self.test_X.columns), axis=1)
+        

    def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
        
--- a/presentation/event_stressful_detection_5fold.csv
+++ b/presentation/event_stressful_detection_5fold.csv
@ -1,29 +0,0 @@
-method,metric,max,mean
-Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
-Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
-Dummy,test_recall,0.0,0.0
-Dummy,test_f1,0.0,0.0
-logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
-logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
-logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
-logistic_reg,test_f1,0.3909774436090226,0.318943511424051
-svc,test_accuracy,0.8557046979865772,0.8548446932649828
-svc,test_average_precision,0.44514416839823046,0.4068200938341621
-svc,test_recall,0.0,0.0
-svc,test_f1,0.0,0.0
-gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
-gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
-gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
-gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
-stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
-stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
-stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
-stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
-random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
-random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
-random_forest,test_recall,0.4069767441860465,0.35356856455493185
-random_forest,test_f1,0.5691056910569107,0.5078402513053142
-xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
-xgboost,test_average_precision,0.7366643049075349,0.698622165966308
-xgboost,test_recall,0.5287356321839081,0.44346431435445066
-xgboost,test_f1,0.638888888888889,0.5633957169928393
--- a/presentation/event_stressful_detection_logo.csv
+++ b/presentation/event_stressful_detection_logo.csv
@ -1,29 +0,0 @@
-method,metric,max,mean
-Dummy,test_accuracy,1.0,0.8524114578096439
-Dummy,test_average_precision,0.7,0.14758854219035614
-Dummy,test_recall,0.0,0.0
-Dummy,test_f1,0.0,0.0
-logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
-logistic_reg,test_average_precision,1.0,0.44605167668563583
-logistic_reg,test_recall,1.0,0.25353566685532386
-logistic_reg,test_f1,0.823529411764706,0.27951926390778625
-svc,test_accuracy,1.0,0.8524114578096439
-svc,test_average_precision,0.9612401707068228,0.44179454944271934
-svc,test_recall,0.0,0.0
-svc,test_f1,0.0,0.0
-gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
-gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
-gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
-gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
-stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
-stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
-stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
-stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
-random_forest,test_accuracy,1.0,0.8722158105763481
-random_forest,test_average_precision,1.0,0.49817066323226833
-random_forest,test_recall,1.0,0.18161263127840668
-random_forest,test_f1,1.0,0.2508096532365307
-xgboost,test_accuracy,1.0,0.8812627400277729
-xgboost,test_average_precision,1.0,0.5505695112208401
-xgboost,test_recall,1.0,0.2896161238315027
-xgboost,test_f1,0.9411764705882353,0.36887408735855665
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,7 @@
+[tool.isort]
+profile = "black"
+py_version = 311
+skip_gitignore = "true"
+
+[tool.black]
+target-version = ["py311"]
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 63f5a526fce4d288499168e1701adadb8b885d82
+Subproject commit 059774bda10545a83ab282f59eb7a329fef9ee4c
--- a/setup.py
+++ b/setup.py
@ -1,8 +1,7 @@
 import os

-import sqlalchemy.engine.url
 from dotenv import load_dotenv
-from sqlalchemy import create_engine
+from sqlalchemy import URL, create_engine
 from sqlalchemy.orm import sessionmaker

 load_dotenv()
@ -11,7 +10,7 @@ testing: bool = False

 db_password = os.getenv("DB_PASSWORD")

-db_uri = sqlalchemy.engine.url.URL(
+db_uri = URL.create(
    drivername="postgresql+psycopg2",
    username="staw_db",
    password=db_password,
--- a/statistical_analysis/scale_reliability.rmd
+++ b/statistical_analysis/scale_reliability.rmd
@ -0,0 +1,60 @@
+---
+title: "Reliability of SAM threat and challenge and COPE"
+output: html_notebook
+---
+
+
+```{r libraries, message=FALSE, warning=FALSE, include=FALSE, cache=FALSE}
+library(conflicted)
+library(here)
+library(tidyverse)
+library(magrittr)
+library(lavaan)
+library(kableExtra)
+
+conflicts_prefer(
+    readr::col_factor,
+    purrr::discard,
+    dplyr::filter,
+    dplyr::lag,
+    purrr::set_names,
+    tidyr::extract,
+    kableExtra::group_rows
+)
+```
+
+```{r style, include=FALSE, cache=FALSE}
+styler::style_file(
+    here("statistical_analysis", "scale_reliability.Rmd"),
+    scope = "tokens",
+    indent_by = 4L
+)
+```
+
+The data were preprocessed and cleaned using [expl_esm_labels.py](../exploration/expl_esm_labels.py) script and read as csv here.
+
+```{r read_data}
+COL_TYPES <- cols(
+    .default = col_double(),
+    participant_id = col_factor(),
+    username = col_factor(),
+    device_id = col_factor(),
+    esm_trigger = col_factor(),
+    esm_instructions = col_factor(),
+    double_esm_user_answer_timestamp = col_double(),
+    datetime_lj = col_datetime(format = ""),
+    date_lj = col_date(format = ""),
+    time = col_factor(),
+    esm_user_answer = col_factor()
+)
+df_SAM <- read_csv(here("data", "raw", "df_esm_SAM_threat_challenge.csv"), col_types = COL_TYPES)
+df_COPE <- read_csv(here("data", "raw", "df_esm_COPE.csv"), col_types = COL_TYPES)
+```
+
+Demonstrate factor analysis for a single participant.
+
+```{r}
+df_COPE %>%
+	group_by(question_id, questionnaire_id) %>%
+	count()
+```
--- a/straw2analysis.Rproj
+++ b/straw2analysis.Rproj
@ -0,0 +1,20 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: No
+NumSpacesForTab: 4
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: XeLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+PythonType: conda
+PythonVersion: 3.11.3
+PythonPath: E:/ProgramData/mambaforge/envs/straw2analysis/python.exe
Author	SHA1	Message	Date
junos	777d30365e	Fix questions that were slightly different in the morning.	2023-07-03 21:29:09 +02:00
junos	ca85131ed2	Use refactored methods.	2023-07-03 21:18:15 +02:00
junos	2aa5c5cb07	Fix trailing whitespace.	2023-07-03 21:17:40 +02:00
junos	82b53bc0d3	Extract method to reuse.	2023-07-03 21:13:50 +02:00
junos	c688580fe8	Increment answers separately if needed.	2023-07-03 21:01:15 +02:00
junos	8c0b66eddc	Extract method to reuse and simplify.	2023-07-03 20:52:08 +02:00
junos	e3ff4846e1	Document functions.	2023-07-03 20:45:39 +02:00
junos	825380a47e	Add a function to fix SAM question IDs.	2023-07-03 20:41:48 +02:00
junos	ef26772038	Add SAM question IDs.	2023-07-03 20:38:20 +02:00
junos	64c05ec5ec	Fix COPE question IDs.	2023-07-03 20:27:33 +02:00
junos	c0236b251c	Fix question IDs.	2023-07-03 20:11:11 +02:00
junos	2f22f2052a	[WIP] Continue factor analysis.	2023-07-03 20:10:45 +02:00
junos	ec51d7d406	[WIP] Add a function to recode question IDs.	2023-07-03 19:57:28 +02:00
junos	2aca64aa09	Add COPE questions and their IDs.	2023-07-03 19:34:11 +02:00
junos	9a87a0a34a	Add instructions to export.	2023-07-03 19:19:10 +02:00
junos	91a9f20839	Read in data.	2023-07-03 18:44:45 +02:00
junos	e3be17e56e	Export COPE data.	2023-07-03 18:33:28 +02:00
junos	5c0e2e2621	Export SAM data.	2023-07-03 18:25:40 +02:00
junos	51201c2bc9	Disable jupytext pre-commit again.	2023-07-03 18:24:37 +02:00
junos	fed4b33611	Add more jupytext args.	2023-07-03 18:22:36 +02:00
junos	471ce7c2cb	Document extract_stressful_events.	2023-07-03 17:37:34 +02:00
junos	dbb2033f78	Add jupytext to pre-commit hooks.	2023-07-03 17:35:45 +02:00
junos	1b77fb119c	Fix an error introduced in `ae2ca63bc4`.	2023-07-03 17:17:56 +02:00
junos	ae2ca63bc4	Define QUESTIONNAIRE IDs and use them. Clean up docstrings.	2023-07-03 17:09:40 +02:00
junos	577f1330da	Add docstrings flake8 checks.	2023-07-03 16:49:35 +02:00
junos	4af360f411	Use conda Python environment with R.	2023-07-03 14:51:07 +02:00
junos	96bbe32f56	Add statistics for some scales.	2023-07-03 14:50:35 +02:00
junos	40170339c2	Add R project and sample Markdown notebook.	2023-07-03 14:28:28 +02:00
junos	7b0c0037f7	Add an old script to test JCQ reversal.	2023-07-03 14:23:55 +02:00
junos	db06584ddd	Improve removal of "medium" class.	2023-05-31 22:46:49 +02:00
junos	112d968715	Add baseline features.	2023-05-31 22:25:39 +02:00
junos	9cc6bf7c21	Add PCA for composite target.	2023-05-31 21:12:21 +02:00
junos	78807b941c	Add analysis for composite score of stress.	2023-05-31 21:00:55 +02:00
junos	a9af113c9c	Add confusion matrices for all methods.	2023-05-31 17:41:50 +02:00
junos	97113fe9ab	Sum up confusion matrix and illustrate use with dummy.	2023-05-31 17:27:49 +02:00
junos	bc78a1d498	Define a confusion matrix scorer.	2023-05-31 16:05:39 +02:00
junos	aca84b214d	Small corrections.	2023-05-19 03:19:42 +02:00
junos	aa13123136	Handle clustering classification the same as other classification models again.	2023-05-19 03:04:09 +02:00
junos	c51e0da0f7	Handle clustering classification the same as other classification models.	2023-05-19 02:52:56 +02:00
junos	a2401b5e36	Add multiclass scoring.	2023-05-19 01:34:34 +02:00
junos	70232949c3	Better handling of "medium" category.	2023-05-19 01:10:30 +02:00
junos	8a9595c615	Better handling of input filename again.	2023-05-18 22:53:58 +02:00
junos	045b9fa0dc	Better handling of input filename.	2023-05-18 19:03:53 +02:00
junos	bb4445f1a8	Add bins to output filename.	2023-05-18 18:58:19 +02:00
junos	1318ae3609	Rename methods to make them consistent with regression methods.	2023-05-18 18:55:31 +02:00
junos	45441c288d	Correct column name.	2023-05-18 18:47:56 +02:00
junos	fa45b30955	Set output path programmatically.	2023-05-18 18:40:54 +02:00
junos	2336edffb6	Retain metric names in final scores.	2023-05-18 18:40:06 +02:00
junos	b756ed5feb	Set more parameters as user-specified constants.	2023-05-18 18:06:32 +02:00
junos	cad28c3fe8	Set path programmatically.	2023-05-18 16:36:46 +02:00
junos	38a405d378	Add index when inserting one row.	2023-05-17 18:13:20 +02:00
junos	2c5a0b4157	Label plot axes.	2023-05-17 16:32:27 +02:00
junos	0409c9e982	Fix format specification.	2023-05-16 17:22:09 +02:00
junos	a7446cc34a	Specify columns to aggregate and save figures as pdfs.	2023-05-16 17:05:43 +02:00
junos	118e686491	Specify format directly as infer_datetime_format was deprecated.	2023-05-16 17:04:48 +02:00
junos	9417a1b9f1	Do not break markdown lines.	2023-05-16 16:37:34 +02:00
junos	7b5db88f1d	Remove and ignore results.	2023-05-16 16:22:29 +02:00
junos	0f8f0b0fb6	Update URL call.	2023-05-16 16:17:53 +02:00
junos	26c7d22b83	Add an option to save figures.	2023-05-16 16:17:06 +02:00
junos	87781840d4	Use concat instead of append which was deprecated.	2023-05-12 16:32:08 +02:00
junos	3091328fc5	Format comments.	2023-05-11 16:51:38 +02:00
junos	055e87dbac	Return scores for classification.	2023-05-10 23:51:12 +02:00
junos	f58d20ffc2	Update classification runner.	2023-05-10 23:17:44 +02:00
junos	075fdab9ea	Select segment and save results.	2023-05-10 23:00:03 +02:00
junos	91e7352480	Thoroughly refactor classification runner.	2023-05-10 22:50:00 +02:00
junos	35c09374dd	Free up memory during model building.	2023-05-10 21:44:40 +02:00
junos	b505fb2b6a	Thoroughly refactor regression runner.	2023-05-10 20:30:51 +02:00
junos	47b1ecdbb9	First format with black and then check with flake8.	2023-05-10 15:29:32 +02:00
junos	24744c288d	Extract one step of preparation into a separate function.	2023-05-10 15:28:09 +02:00
junos	caeaf03239	Provide data instead of csv input.	2023-05-10 15:20:33 +02:00
junos	cd5d8b6a10	Update rapids and add regex=True. Reformat debug_heatmap.	2023-05-10 15:12:27 +02:00
junos	3e38b64b45	Merge branch 'ml_pipeline'	2023-05-10 15:02:17 +02:00
junos	76071fd550	Start using pre-commit hooks.	2023-04-24 15:38:54 +02:00
Primoz	26804cf8ea	Repair preprocessing one hot encoding of test set.	2023-04-21 13:24:31 +02:00
Primoz	865225994b	Added testing section after feature selection.	2023-04-20 13:29:14 +02:00
Primoz	259be708aa	Improve the feature selection method with validations etc.	2023-04-20 13:26:20 +02:00
Primoz	0594993133	Add GroupKFold to feature selection CV. Start with generic metric calculation procedure.	2023-04-20 11:20:26 +02:00
Primoz	1cbc743cf7	Add kBest method to initially filter out the worst performing features. Update comments.	2023-04-20 10:12:16 +02:00
Primoz	2a8f1ee613	Merge branch 'ml_pipeline' of https://repo.ijs.si/junoslukan/straw2analysis into ml_pipeline	2023-04-19 15:56:52 +02:00
Primoz	ce13a9e13b	Implement feature selection method which is used in ML pipeline.	2023-04-19 15:56:34 +02:00