Fix questions that were slightly different in the morning.

Use refactored methods.
Fix trailing whitespace.
2023-07-03 21:29:09 +02:00 · 2023-07-03 21:18:15 +02:00 · 2023-07-03 21:17:40 +02:00 · 2023-07-03 21:13:50 +02:00 · 2023-07-03 21:01:15 +02:00 · 2023-07-03 20:52:08 +02:00
36 changed files with 2379 additions and 1514 deletions
--- a/.flake8
+++ b/.flake8
@ -0,0 +1,9 @@
 [flake8]
 max-line-length = 88
 extend-ignore =
    E203,
    # Ignore over-long docstring lines (this note originally named E501 while the code below is D501 - confirm which is intended)
    D501
 per-file-ignores =
    exploration/*.py:E501
 docstring-convention = numpy
--- a/.gitignore
+++ b/.gitignore
@ -12,12 +12,15 @@ __pycache__/
 /data/*input*.csv
 /data/daily*
 /data/intradaily*
 /data/raw
 /data/stressfulness_event*
 /data/30min*
 /presentation/*scores.csv
 /presentation/Results.ods
 /presentation/results/
 .Rproj.user
 .Rhistory
 /presentation/*.nb.html
 presentation/event_stressful_detection_half_loso.csv
 presentation/event_stressful_detection_loso.csv
 /statistical_analysis/scale_reliability.nb.html
--- a/.idea/codeStyles/Project.xml
+++ b/.idea/codeStyles/Project.xml
@ -0,0 +1,6 @@
 <component name="ProjectCodeStyleConfiguration">
  <code_scheme name="Project" version="173">
    <option name="RIGHT_MARGIN" value="150" />
    <option name="SOFT_MARGINS" value="88" />
  </code_scheme>
 </component>
--- a/.idea/codeStyles/codeStyleConfig.xml
+++ b/.idea/codeStyles/codeStyleConfig.xml
@ -0,0 +1,5 @@
 <component name="ProjectCodeStyleConfiguration">
  <state>
    <option name="USE_PER_PROJECT_SETTINGS" value="true" />
  </state>
 </component>
--- a/.idea/dictionaries/junos.xml
+++ b/.idea/dictionaries/junos.xml
@ -0,0 +1,3 @@
 <component name="ProjectDictionaryState">
  <dictionary name="junos" />
 </component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -17,6 +17,14 @@
            </RMarkdownRenderProfile>
          </value>
        </entry>
        <entry key="file://$PROJECT_DIR$/statistical_analysis/scale_reliability.rmd">
          <value>
            <RMarkdownRenderProfile>
              <option name="lastOutput" value="$PROJECT_DIR$/statistical_analysis/scale_reliability.nb.html" />
              <option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/statistical_analysis" />
            </RMarkdownRenderProfile>
          </value>
        </entry>
      </map>
    </option>
  </component>
--- a/.idea/rGraphicsSettings.xml
+++ b/.idea/rGraphicsSettings.xml
@ -0,0 +1,9 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="RGraphicsSettings">
    <option name="height" value="600" />
    <option name="resolution" value="75" />
    <option name="version" value="2" />
    <option name="width" value="960" />
  </component>
 </project>
--- a/.idea/rMarkdownGraphicsSettings.xml
+++ b/.idea/rMarkdownGraphicsSettings.xml
@ -0,0 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="RMarkdownGraphicsSettings">
    <option name="globalResolution" value="75" />
    <option name="version" value="2" />
  </component>
 </project>
--- a/.idea/rSettings.xml
+++ b/.idea/rSettings.xml
@ -0,0 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="RSettings">
    <option name="interpreterPath" value="C:\Program Files\R\R-4.3.1\bin\R.exe" />
  </component>
 </project>
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,30 @@
 repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        name: isort (python)
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
        language_version: python3
  - repo: https://github.com/pycqa/flake8
    rev: 6.0.0
    hooks:
      - id: flake8
 #  - repo: https://github.com/mwouts/jupytext
 #    rev: v1.14.7
 #    hooks:
 #      - id: jupytext
 #        args: [ --from, "py:percent", --to, "ipynb" ]
 #        additional_dependencies:
 #          - isort==5.12.0 # Matches hook
 #          - black==23.3.0
 #          - flake8==6.0.0
--- a/config/environment.yml
+++ b/config/environment.yml
@ -6,6 +6,7 @@ dependencies:
  - black
  - isort
  - flake8
  - flake8-docstrings
  - imbalanced-learn=0.10.0
  - jupyterlab
  - jupytext
@ -14,6 +15,7 @@ dependencies:
  - nodejs
  - pandas
  - psycopg2 >= 2.9.1
  - pre-commit
  - python-dotenv
  - pytz
  - pyprojroot
--- a/exploration/debug_heatmap.py
+++ b/exploration/debug_heatmap.py
@ -14,15 +14,9 @@
 # ---
 # %%
 import os, sys
 import importlib
 import pandas as pd
 import numpy as np
-# import plotly.graph_objects as go
+from rapids.src.features.utils.utils import chunk_episodes
 from importlib import util
 from pathlib import Path
 import yaml
 # %%
 phone_data_yield = pd.read_csv(
@ -36,23 +30,29 @@ time_segments_labels = pd.read_csv(
 # %%
 phone_data_yield["assigned_segments"] = phone_data_yield[
    "assigned_segments"
-].str.replace(r"_RR\d+SS#", "#")
+].str.replace(r"_RR\d+SS#", "#", regex=True)
 time_segments_labels["label"] = time_segments_labels["label"].str.replace(
-    r"_RR\d+SS$", ""
+    r"_RR\d+SS$", "", regex=True
 )
 # %% tags=[]
-def filter_data_by_segment(data, time_segment):
+def filter_data_by_segment(data, time_segment_current):
    data.dropna(subset=["assigned_segments"], inplace=True)
    if data.shape[0] == 0:  # data is empty
        data["local_segment"] = data["timestamps_segment"] = None
        return data
-    datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
+    datetime_regex = (
-    timestamps_regex = "[0-9]{13}"
+        r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
-    segment_regex = "\[({}#{},{};{},{})\]".format(
+    )
-        time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
+    timestamps_regex = r"[0-9]{13}"
    segment_regex = r"\[({}#{},{};{},{})\]".format(
        time_segment_current,
        datetime_regex,
        datetime_regex,
        timestamps_regex,
        timestamps_regex,
    )
    data["local_segment"] = data["assigned_segments"].str.extract(
        segment_regex, expand=True
@ -147,15 +147,18 @@ def getDataForPlot(phone_data_yield_per_segment):
        .fillna(0)
    )
-    # transpose the dataframe per local start datetime of the segment and discard the useless index layer
+    # transpose the dataframe per local start datetime of the segment
    # and discard the useless index layer
    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
        "local_segment_start_datetimes"
    )[["minutes_after_segment_start", "sensor"]].apply(
        lambda x: x.set_index("minutes_after_segment_start").transpose()
    )
-    phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
+    phone_data_yield_per_segment.index = (
        phone_data_yield_per_segment.index.get_level_values(
            "local_segment_start_datetimes"
        )
    )
    return phone_data_yield_per_segment
@ -227,9 +230,13 @@ phone_data_yield_per_segment.tail()
 # # A workaround
 # %%
-phone_data_yield_per_segment["local_segment_start_datetimes", "minutes_after_segment_start"] = phone_data_yield_per_segment[
+phone_data_yield_per_segment[
    "local_segment_start_datetimes", "minutes_after_segment_start"
 ] = phone_data_yield_per_segment[
    ["local_segment_start_datetimes", "minutes_after_segment_start"]
-].drop_duplicates(keep="first")
+].drop_duplicates(
    keep="first"
 )
 # %%
 phone_data_yield_per_segment.set_index(
@ -244,8 +251,9 @@ phone_data_yield_per_segment.head()
 # %% [markdown]
 # # Retry
 # %%
-def getDataForPlot(phone_data_yield_per_segment):
+def get_data_for_plot(phone_data_yield_per_segment):
    # calculate the length (in minute) of per segment instance
    phone_data_yield_per_segment["length"] = (
        phone_data_yield_per_segment["timestamps_segment"]
@ -292,7 +300,10 @@ def getDataForPlot(phone_data_yield_per_segment):
        full_index,
        names=("local_segment_start_datetimes", "minutes_after_segment_start"),
    )
-    phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(subset=["local_segment_start_datetimes", "minutes_after_segment_start"],keep="first")
+    phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(
        subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
        keep="first",
    )
    phone_data_yield_per_segment = (
        phone_data_yield_per_segment.set_index(
            ["local_segment_start_datetimes", "minutes_after_segment_start"]
@ -302,15 +313,18 @@ def getDataForPlot(phone_data_yield_per_segment):
        .fillna(0)
    )
-    # transpose the dataframe per local start datetime of the segment and discard the useless index layer
+    # transpose the dataframe per local start datetime of the segment
    # and discard the useless index layer
    phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
        "local_segment_start_datetimes"
    )[["minutes_after_segment_start", "sensor"]].apply(
        lambda x: x.set_index("minutes_after_segment_start").transpose()
    )
-    phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
+    phone_data_yield_per_segment.index = (
        phone_data_yield_per_segment.index.get_level_values(
            "local_segment_start_datetimes"
        )
    )
    return phone_data_yield_per_segment
@ -318,6 +332,6 @@ def getDataForPlot(phone_data_yield_per_segment):
 phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
 # %%
-data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
+data_for_plot_per_segment = get_data_for_plot(phone_data_yield_per_segment)
 # %%
--- a/exploration/expl_esm_labels.py
+++ b/exploration/expl_esm_labels.py
@ -7,7 +7,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -15,19 +15,33 @@
 # ---
 # %%
 import os
 import sys
 import datetime
 import seaborn as sns
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import participants.query_db
-from features.esm import *
+from features.esm import (
-from features.esm_JCQ import *
+    QUESTIONNAIRE_IDS,
-from features.esm_SAM import *
+    clean_up_esm,
    get_esm_data,
    increment_answers,
    preprocess_esm,
    reassign_question_ids,
 )
 from features.esm_COPE import DICT_COPE_QUESTION_IDS
 from features.esm_JCQ import reverse_jcq_demand_control_scoring
 from features.esm_SAM import DICT_SAM_QUESTION_IDS, extract_stressful_events
 # import os
 # import sys
 # nb_dir = os.path.split(os.getcwd())[0]
 # if nb_dir not in sys.path:
 #     sys.path.append(nb_dir)
 # %%
 save_figs = False
 export_data = True
 # %%
 participants_inactive_usernames = participants.query_db.get_usernames(
@ -43,8 +57,14 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)
 # %%
 df_esm_PANAS = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] == 8)
+    (
-    | (df_esm_preprocessed["questionnaire_id"] == 9)
+        df_esm_preprocessed["questionnaire_id"]
        == QUESTIONNAIRE_IDS["PANAS_positive_affect"]
    )
    | (
        df_esm_preprocessed["questionnaire_id"]
        == QUESTIONNAIRE_IDS["PANAS_negative_affect"]
    )
 ]
 df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)
@ -65,35 +85,47 @@ df_esm_PANAS_daily_means = (
 # %%
 df_esm_PANAS_summary_participant = (
    df_esm_PANAS_daily_means.groupby(["participant_id", "questionnaire_id"])
-    .agg(["mean", "median", "std"])
+    .esm_numeric_mean.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
 df_esm_PANAS_summary_participant.columns = df_esm_PANAS_summary_participant.columns.get_level_values(
    1
 )
 df_esm_PANAS_summary_participant[
-    "PANAS_subscale"
+    "PANAS subscale"
 ] = df_esm_PANAS_daily_means.questionnaire_id.astype("category").cat.rename_categories(
-    {8.0: "PA", 9.0: "NA"}
+    {8.0: "positive affect", 9.0: "negative affect"}
 )
 # %%
-sns.displot(
+df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["mean"]
-    data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS_subscale", binwidth=0.2
+
 # %%
 df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["std"]
 # %%
 df_esm_PANAS_summary_participant.query("std == 0")
 # %%
 fig1 = sns.displot(
    data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS subscale", binwidth=0.2
 )
 fig1.set_axis_labels(x_var="participant mean", y_var="frequency")
 if save_figs:
    fig1.figure.savefig("PANAS_mean_participant.pdf", dpi=300)
 # %%
 sns.displot(
    data=df_esm_PANAS_summary_participant,
    x="median",
-    hue="PANAS_subscale",
+    hue="PANAS subscale",
    binwidth=0.2,
 )
 # %%
-sns.displot(
+fig2 = sns.displot(
-    data=df_esm_PANAS_summary_participant, x="std", hue="PANAS_subscale", binwidth=0.05
+    data=df_esm_PANAS_summary_participant, x="std", hue="PANAS subscale", binwidth=0.05
 )
 fig2.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
 if save_figs:
    fig2.figure.savefig("PANAS_std_participant.pdf", dpi=300)
 # %%
 df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
@ -109,8 +141,14 @@ df_SAM_all.head()
 # %%
 df_esm_SAM = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 87)
+    (
-    & (df_esm_preprocessed["questionnaire_id"] <= 93)
+        df_esm_preprocessed["questionnaire_id"]
        >= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
    )
    & (
        df_esm_preprocessed["questionnaire_id"]
        <= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
    )
 ]
 df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
@ -118,9 +156,10 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
 # ## Stressful events
 # %%
-df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign(
+df_esm_SAM_event = df_esm_SAM_clean[
-    stressful_event=lambda x: (x.esm_user_answer_numeric > 0)
+    df_esm_SAM_clean["questionnaire_id"]
-)
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
 ].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0))
 # %%
 df_esm_SAM_daily_events = (
@ -131,20 +170,22 @@ df_esm_SAM_daily_events = (
 )
 # %% [markdown]
-# Calculate the daily mean of YES (1) or NO (0) answers to the question about a stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
+# Calculate the daily mean of YES (1) or NO (0) answers to the question about stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
 # %%
 df_esm_SAM_event_summary_participant = (
    df_esm_SAM_daily_events.groupby(["participant_id"])
-    .agg(["mean", "median", "std"])
+    .SAM_event_ratio.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
 df_esm_SAM_event_summary_participant.columns = df_esm_SAM_event_summary_participant.columns.get_level_values(
    1
 )
 # %%
-sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
+fig6 = sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
 fig6.set_axis_labels(
    x_var="participant proportion of stressful events", y_var="frequency"
 )
 if save_figs:
    fig6.figure.savefig("SAM_events_mean_participant.pdf", dpi=300)
 # %%
 sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
@ -155,7 +196,12 @@ sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
 # %% [markdown]
 # * Example of threat: "Did this event make you feel anxious?"
 # * Example of challenge: "How eager are you to tackle this event?"
-# * Possible answers: 0 - Not at all, 1 - Slightly, 2 - Moderately, 3 - Considerably, 4 - Extremely
+# * Possible answers:
 #   0 - Not at all,
 #   1 - Slightly,
 #   2 - Moderately,
 #   3 - Considerably,
 #   4 - Extremely
 # %%
 df_esm_SAM_daily = (
@ -167,27 +213,45 @@ df_esm_SAM_daily = (
 # %%
 df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
-    (df_esm_SAM_daily["questionnaire_id"] == 88)
+    (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
-    | (df_esm_SAM_daily["questionnaire_id"] == 89)
+    | (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
 ]
 # %%
 df_esm_SAM_summary_participant = (
    df_esm_SAM_daily.groupby(["participant_id", "questionnaire_id"])
-    .agg(["mean", "median", "std"])
+    .esm_numeric_mean.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
-df_esm_SAM_summary_participant.columns = df_esm_SAM_summary_participant.columns.get_level_values(
+
-    1
+# %%
 df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[
    df_esm_SAM_summary_participant["questionnaire_id"]
    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
 ]
 df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"]
 # %%
 df_esm_SAM_event_stressfulness_summary_participant.describe()["std"]
 # %%
 sns.displot(
    data=df_esm_SAM_event_stressfulness_summary_participant, x="mean", binwidth=0.2
 )
 # %%
 df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
-    (df_esm_SAM_summary_participant["questionnaire_id"] == 88)
+    (
-    | (df_esm_SAM_summary_participant["questionnaire_id"] == 89)
+        df_esm_SAM_summary_participant["questionnaire_id"]
        == QUESTIONNAIRE_IDS["appraisal_threat"]
    )
    | (
        df_esm_SAM_summary_participant["questionnaire_id"]
        == QUESTIONNAIRE_IDS["appraisal_challenge"]
    )
 ]
 df_esm_SAM_threat_challenge_summary_participant[
-    "event_subscale"
+    "event subscale"
 ] = df_esm_SAM_threat_challenge_summary_participant.questionnaire_id.astype(
    "category"
 ).cat.rename_categories(
@ -198,26 +262,84 @@ df_esm_SAM_threat_challenge_summary_participant[
 sns.displot(
    data=df_esm_SAM_threat_challenge_summary_participant,
    x="mean",
-    hue="event_subscale",
+    hue="event subscale",
    binwidth=0.2,
 )
 # %%
-sns.displot(
+fig3 = sns.displot(
    data=df_esm_SAM_threat_challenge_summary_participant,
    x="std",
-    hue="event_subscale",
+    hue="event subscale",
    binwidth=0.1,
 )
 fig3.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
 if save_figs:
    fig3.figure.savefig("SAM_std_participant.pdf", dpi=300)
 # %%
 df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
    "mean"
 ]
 # %%
 df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
    "std"
 ]
 # %%
 df_esm_SAM_clean.columns
 # %%
 df_esm_SAM_clean.esm_status.value_counts()
 # %%
 if export_data:
    # Export the cleaned SAM answers to CSV; skipped unless export_data is set.
    df_esm_SAM_fixed = reassign_question_ids(df_esm_SAM_clean, DICT_SAM_QUESTION_IDS)
    df_esm_SAM_fixed = increment_answers(df_esm_SAM_fixed)
    # Select the export columns and sort in one chained expression.
    # Sorting the column subset in place would mutate a frame derived from
    # df_esm_SAM_fixed and trigger pandas' SettingWithCopyWarning.
    df_esm_SAM_for_export = df_esm_SAM_fixed[
        [
            "participant_id",
            "username",
            "device_id",
            "_id",
            "esm_trigger",
            "esm_session",
            "esm_notification_id",
            "question_id",
            "questionnaire_id",
            "esm_instructions",
            "double_esm_user_answer_timestamp",
            "datetime_lj",
            "date_lj",
            "time",
            "esm_user_answer",
            "esm_user_answer_numeric",
        ]
    ].sort_values(by=["participant_id", "device_id", "_id"], ignore_index=True)
    print(df_esm_SAM_for_export.head())
    df_esm_SAM_for_export.to_csv(
        "../data/raw/df_esm_SAM_threat_challenge.csv", index=False
    )
 # %% [markdown]
 # ## Stressfulness of period
 # %%
 df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
-    df_esm_SAM_summary_participant["questionnaire_id"] == 93
+    df_esm_SAM_summary_participant["questionnaire_id"]
    == QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
 ]
 # %%
 df_esm_SAM_period_summary_participant.describe()["mean"]
 # %%
 df_esm_SAM_period_summary_participant.describe()["std"]
 # %%
 sns.displot(data=df_esm_SAM_period_summary_participant, x="mean", binwidth=0.2)
@ -229,8 +351,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1)
 # %%
 df_esm_JCQ_demand_control = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 10)
+    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"])
-    & (df_esm_preprocessed["questionnaire_id"] <= 11)
+    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"])
 ]
 df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)
@ -250,14 +372,11 @@ df_esm_JCQ_daily = (
 )
 df_esm_JCQ_summary_participant = (
    df_esm_JCQ_daily.groupby(["participant_id", "questionnaire_id"])
-    .agg(["mean", "median", "std"])
+    .esm_score_mean.agg(["mean", "median", "std"])
    .reset_index(col_level=1)
 )
 df_esm_JCQ_summary_participant.columns = df_esm_JCQ_summary_participant.columns.get_level_values(
    1
 )
 df_esm_JCQ_summary_participant[
-    "JCQ_subscale"
+    "JCQ subscale"
 ] = df_esm_JCQ_summary_participant.questionnaire_id.astype(
    "category"
 ).cat.rename_categories(
@ -265,11 +384,71 @@ df_esm_JCQ_summary_participant[
 )
 # %%
-sns.displot(
+df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["mean"]
    data=df_esm_JCQ_summary_participant, x="mean", hue="JCQ_subscale", binwidth=0.1,
 )
 # %%
-sns.displot(
+df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["std"]
-    data=df_esm_JCQ_summary_participant, x="std", hue="JCQ_subscale", binwidth=0.05,
+
 # %%
 fig4 = sns.displot(
    data=df_esm_JCQ_summary_participant,
    x="mean",
    hue="JCQ subscale",
    binwidth=0.1,
 )
 fig4.set_axis_labels(x_var="participant mean", y_var="frequency")
 if save_figs:
    fig4.figure.savefig("JCQ_mean_participant.pdf", dpi=300)
 # %%
 # Distribution of per-participant standard deviations, split by JCQ subscale.
 fig5 = sns.displot(
    data=df_esm_JCQ_summary_participant,
    x="std",
    hue="JCQ subscale",
    binwidth=0.05,
 )
 # Label the figure created in this cell. The call previously targeted fig6,
 # a different figure, which left this plot without axis labels.
 fig5.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
 if save_figs:
    fig5.figure.savefig("JCQ_std_participant.pdf", dpi=300)
 # %% [markdown]
 # # COPE Inventory
 # %%
 df_esm_COPE = df_esm_preprocessed[
    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"])
    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"])
 ]
 # %%
 df_esm_COPE_clean = clean_up_esm(df_esm_COPE)
 df_esm_COPE_clean = increment_answers(df_esm_COPE_clean)
 df_esm_COPE_fixed = reassign_question_ids(df_esm_COPE_clean, DICT_COPE_QUESTION_IDS)
 # %%
 if export_data:
    # Export the cleaned COPE answers to CSV; skipped unless export_data is set.
    # The sort is chained onto the column selection rather than done in place:
    # an in-place sort would mutate a frame derived from df_esm_COPE_fixed and
    # trigger pandas' SettingWithCopyWarning.
    df_esm_COPE_for_export = df_esm_COPE_fixed[
        [
            "participant_id",
            "username",
            "device_id",
            "_id",
            "esm_trigger",
            "esm_session",
            "esm_notification_id",
            "question_id",
            "questionnaire_id",
            "esm_instructions",
            "double_esm_user_answer_timestamp",
            "datetime_lj",
            "date_lj",
            "time",
            "esm_user_answer",
            "esm_user_answer_numeric",
        ]
    ].sort_values(by=["participant_id", "device_id", "_id"], ignore_index=True)
    print(df_esm_COPE_for_export.head())
    df_esm_COPE_for_export.to_csv("../data/raw/df_esm_COPE.csv", index=False)
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@ -20,30 +20,74 @@ import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import recall_score, f1_score
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 from machine_learning.cross_validation import CrossValidation
 from machine_learning.preprocessing import Preprocessing
 from machine_learning.feature_selection import FeatureSelection
 # %% 
 df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 df.set_index(index_columns, inplace=True)
 # Create binary target 
 bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
 nan_cols = df.columns[df.isna().any()].tolist()
 df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
 cv = CrossValidation(data=df, cv_method="logo")
 categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
 interval_feature_list, other_feature_list = [], []
-print(df.columns.tolist())
+# %%
 # Fit and score a random forest on a cross-validation split, once on all
 # features and once on the features chosen by FeatureSelection.
 for split in cv.get_splits():
    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
    # Encode the categorical columns, then fetch the resulting train/test sets
    # back from the Preprocessing object.
    pre = Preprocessing(train_X, train_y, test_X, test_y)
    pre.one_hot_encode_train_and_test_sets(categorical_columns)
    train_X, train_y, test_X, test_y = pre.get_train_test_sets()
    print(train_X.shape, test_X.shape)
    # Predict before feature selection
    rfc = RandomForestClassifier(n_estimators=10)
    rfc.fit(train_X, train_y)
    predictions = rfc.predict(test_X)
    print("Recall:", recall_score(test_y, predictions))
    print("F1:", f1_score(test_y, predictions))
    # Feature selection on train set
    # (test_groups is unpacked here but not used below)
    train_groups, test_groups = cv.get_groups_sets(split)
    fs = FeatureSelection(train_X, train_y, train_groups) 
    selected_features = fs.select_features(n_min=20, n_max=29, k=40,
                                           ml_type="classification_bin", 
                                           metric="recall", n_tolerance=20)
    # Restrict both sets to the selected feature columns.
    train_X = train_X[selected_features]
    test_X = test_X[selected_features]
    print(selected_features)
    print(len(selected_features))
    # Predict after feature selection
    # (this second model uses 500 trees, the first one used 10)
    rfc = RandomForestClassifier(n_estimators=500)
    rfc.fit(train_X, train_y)
    predictions = rfc.predict(test_X)
    print("Recall:", recall_score(test_y, predictions))
    print("F1:", f1_score(test_y, predictions))
    # Stop after the first split: only one split is evaluated.
    break
 # %%
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@ -6,457 +6,129 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
-# %matplotlib inline
+# from IPython.core.interactiveshell import InteractiveShell
-import os
+from pathlib import Path
 import sys
-import numpy as np
+# matplotlib inline
-import matplotlib.pyplot as plt
+# import os
 # import sys
 import pandas as pd
-from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
+from machine_learning.helper import (
-from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
+    impute_encode_categorical_features,
-from sklearn.dummy import DummyClassifier
+    prepare_cross_validator,
-from sklearn.impute import SimpleImputer
+    prepare_sklearn_data_format,
    run_all_classification_models,
 )
-from lightgbm import LGBMClassifier
+# InteractiveShell.ast_node_interactivity = "all"
 import xgboost as xg
 from IPython.core.interactiveshell import InteractiveShell
 InteractiveShell.ast_node_interactivity = "all"
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import machine_learning.helper
 # %% [markdown]
 # # RAPIDS models
 # %% [markdown]
 # ## Set script's parameters
 #
 # nb_dir = os.path.split(os.getcwd())[0]
 # if nb_dir not in sys.path:
 #     sys.path.append(nb_dir)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
 undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %%
-model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
+CV_METHOD = "logo"  # logo, half_logo, 5kfold
-# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
+# Cross-validation method (could be regarded as a hyperparameter)
 print("CV_METHOD: " + CV_METHOD)
 N_SL = 3  # Number of largest/smallest accuracies (of particular CV) outputs
 UNDERSAMPLING = False
 # (bool) If True this will train and test data on balanced dataset
 # (using undersampling method)
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+PATH_BASE = Path("E:/STRAWresults/20230415")
 model_input.set_index(index_columns, inplace=True)
 model_input['target'].value_counts()
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+SEGMENT_TYPE = "period"
 print("SEGMENT_TYPE: " + SEGMENT_TYPE)
 SEGMENT_LENGTH = "30_minutes_before"
 print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
 TARGET_VARIABLE = "JCQ_job_control"
 print("TARGET_VARIABLE: " + TARGET_VARIABLE)
 if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
    TARGET_VARIABLE += "_"
    TARGET_VARIABLE += SEGMENT_TYPE
 PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
 model_input = pd.read_csv(PATH_FULL)
 if SEGMENT_LENGTH == "daily":
    DAY_LENGTH = "daily"  # or "working"
    print(DAY_LENGTH)
    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 model_input["target"].value_counts()
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 # bins = [-10, 0, 10] # bins for z-scored targets
-bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+BINS = [-1, 0, 4]  # bins for stressfulness (0-4) target
-model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
+print("BINS: ", BINS)
-model_input['target'].value_counts(), edges
+model_input["target"], edges = pd.cut(
-# model_input = model_input[model_input['target'] != "medium"]
+    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
-model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
+)  # ['low', 'medium', 'high']
 print(model_input["target"].value_counts())
 # Encode the binned target as integers.
 # With REMOVE_MEDIUM the "medium" rows (if any) are dropped and the target
 # becomes binary: "low" -> 0, everything else -> 1.
 # Without it all three classes are kept: low -> 0, medium -> 1, high -> 2.
 REMOVE_MEDIUM = True
 if REMOVE_MEDIUM:
    # `"medium" in series` would test the index labels, not the values,
    # so the values are compared explicitly.
    is_medium = model_input["target"] == "medium"
    if is_medium.any():
        # Copy so that the assignment below does not write into a slice.
        model_input = model_input[~is_medium].copy()
    model_input["target"] = (
        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
    )
 else:
    model_input["target"] = model_input["target"].map(
        {"low": 0, "medium": 1, "high": 2}
    )
    print(model_input["target"].value_counts())
 model_input['target'].value_counts()
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
 # UnderSampling
-if undersampling:
+if UNDERSAMPLING:
-    no_stress = model_input[model_input['target'] == 0]
+    no_stress = model_input[model_input["target"] == 0]
-    stress = model_input[model_input['target'] == 1]
+    stress = model_input[model_input["target"] == 1]
    no_stress = no_stress.sample(n=len(stress))
    model_input = pd.concat([stress, no_stress], axis=0)
 #   model_input_new = pd.DataFrame(columns=model_input.columns)
 #   for pid in model_input["pid"].unique():
 #     stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
 #     no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
 #     if (len(stress) == 0):
 #       continue
 #     if (len(no_stress) == 0):
 #       continue
 #     model_input_new = pd.concat([model_input_new, stress], axis=0)
-#     no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
-#     # In case there are more stress samples than no_stress, take all instances of no_stress.
+model_input_encoded = impute_encode_categorical_features(model_input)
 #     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
 #     model_input = model_input_new   
 #     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 if cv_method_str == 'half_logo':
    model_input['pid_index'] = model_input.groupby('pid').cumcount()
    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
 else:
    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 categorical_feature_colnames = ["gender", "startlanguage"]
 additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
 categorical_feature_colnames += additional_categorical_features
 categorical_features = data_x[categorical_feature_colnames].copy()
 mode_categorical_features = categorical_features.mode().iloc[0]
 # fillna with mode
 categorical_features = categorical_features.fillna(mode_categorical_features)
 # one-hot encoding
 categorical_features = categorical_features.apply(lambda col: col.astype("category"))
 if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
 numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
 train_x = pd.concat([numerical_features, categorical_features], axis=1)
 train_x.dtypes
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
 if cv_method_str == 'logo' or cv_method_str == 'half_logo':
    cv_method = LeaveOneGroupOut()
    cv_method.get_n_splits(
        train_x,
        data_y,
        groups=data_groups,
    )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 # %% [markdown]
 # ### Baseline: Dummy Classifier (most frequent)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 dummy_class = DummyClassifier(strategy="most_frequent")
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 dummy_classifier = cross_validate(
    dummy_class,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
 print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
 print("Precision", np.mean(dummy_classifier['test_precision']))
 print("Recall", np.mean(dummy_classifier['test_recall']))
 print("F1", np.mean(dummy_classifier['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown] nteract={"transient": {"deleting": false}}
 # ### All models
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 # %%
-final_scores.index.name = "metric"
+data_x, data_y, data_groups = prepare_sklearn_data_format(
-final_scores = final_scores.set_index(["method", final_scores.index])
+    model_input_encoded, CV_METHOD
 final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
 # %% [markdown]
 # ### Logistic Regression
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 logistic_regression = linear_model.LogisticRegression()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 log_reg_scores = cross_validate(
    logistic_regression,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
 print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
 print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
 print("Precision", np.mean(log_reg_scores['test_precision']))
 print("Recall", np.mean(log_reg_scores['test_recall']))
 print("F1", np.mean(log_reg_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
-# %% [markdown]
+# %%
-# ### Support Vector Machine
+data_y.head()
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %%
-svc = svm.SVC()
+data_y.tail()
-
+# %%
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+data_y.shape
-svc_scores = cross_validate(
+# %%
-    svc,
+scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
-    X=imputer.fit_transform(train_x),
+# %%
-    y=data_y,
+PATH_OUTPUT = Path("..") / Path("presentation/results")
-    groups=data_groups,
+path_output_full = PATH_OUTPUT / (
-    cv=cv_method,
+    TARGET_VARIABLE
-    n_jobs=-1,
+    + "_"
-    scoring=('accuracy', 'precision', 'recall', 'f1')
+    + SEGMENT_LENGTH
    + "_classification"
    + str(BINS)
    + "_"
    + CV_METHOD
    + ".csv"
 )
-# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+scores.to_csv(path_output_full, index=False)
 print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
 print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
 print("Precision", np.mean(svc_scores['test_precision']))
 print("Recall", np.mean(svc_scores['test_recall']))
 print("F1", np.mean(svc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### Gaussian Naive Bayes
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 gaussian_nb = naive_bayes.GaussianNB()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 gaussian_nb_scores = cross_validate(
    gaussian_nb,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
 print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
 print("Precision", np.mean(gaussian_nb_scores['test_precision']))
 print("Recall", np.mean(gaussian_nb_scores['test_recall']))
 print("F1", np.mean(gaussian_nb_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### Stochastic Gradient Descent Classifier
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 sgdc = linear_model.SGDClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 sgdc_scores = cross_validate(
    sgdc,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
 print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
 print("Precision", np.mean(sgdc_scores['test_precision']))
 print("Recall", np.mean(sgdc_scores['test_recall']))
 print("F1", np.mean(sgdc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### K-nearest neighbors
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 knn = neighbors.KNeighborsClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 knn_scores = cross_validate(
    knn,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
 print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
 print("Precision", np.mean(knn_scores['test_precision']))
 print("Recall", np.mean(knn_scores['test_recall']))
 print("F1", np.mean(knn_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### Decision Tree
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 dtree = tree.DecisionTreeClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 dtree_scores = cross_validate(
    dtree,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
 print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
 print("Precision", np.mean(dtree_scores['test_precision']))
 print("Recall", np.mean(dtree_scores['test_recall']))
 print("F1", np.mean(dtree_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### Random Forest Classifier
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 rfc = ensemble.RandomForestClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 rfc_scores = cross_validate(
    rfc,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1'), 
    return_estimator=True
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
 print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
 print("Precision", np.mean(rfc_scores['test_precision']))
 print("Recall", np.mean(rfc_scores['test_recall']))
 print("F1", np.mean(rfc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### Feature importance (RFC)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns))
 for idx, estimator in enumerate(rfc_scores['estimator']):
    feature_importances = pd.DataFrame(estimator.feature_importances_,
                                       index = list(train_x.columns),
                                        columns=['importance'])
    # print("\nFeatures sorted by their score for estimator {}:".format(idx))
    # print(feature_importances.sort_values('importance', ascending=False).head(10))                                    
    rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean()
 pd.set_option('display.max_rows', 100)
 print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
 rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
 rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
 train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
 # %% [markdown]
 # ### Gradient Boosting Classifier
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 gbc = ensemble.GradientBoostingClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 gbc_scores = cross_validate(
    gbc,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
 print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
 print("Precision", np.mean(gbc_scores['test_precision']))
 print("Recall", np.mean(gbc_scores['test_recall']))
 print("F1", np.mean(gbc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### LGBM Classifier
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 lgbm = LGBMClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 lgbm_scores = cross_validate(
    lgbm,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
 print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
 print("Precision", np.mean(lgbm_scores['test_precision']))
 print("Recall", np.mean(lgbm_scores['test_recall']))
 print("F1", np.mean(lgbm_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### XGBoost Classifier
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 xgb_classifier = xg.sklearn.XGBClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 xgb_classifier_scores = cross_validate(
    xgb_classifier,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
 print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
 print("Precision", np.mean(xgb_classifier_scores['test_precision']))
 print("Recall", np.mean(xgb_classifier_scores['test_recall']))
 print("F1", np.mean(xgb_classifier_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
--- a/exploration/ml_pipeline_classification_composite.py
+++ b/exploration/ml_pipeline_classification_composite.py
@ -0,0 +1,177 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 from pathlib import Path
 import pandas as pd
 import seaborn as sns
 from sklearn.decomposition import PCA
 from machine_learning.helper import (
    impute_encode_categorical_features,
    prepare_cross_validator,
    prepare_sklearn_data_format,
    run_all_classification_models,
 )
 # %%
 # Cross-validation method (could be regarded as a hyperparameter).
 # Accepted values listed by the author: logo, half_logo, 5kfold.
 CV_METHOD = "logo"  # logo, half_logo, 5kfold
 print("CV_METHOD: " + CV_METHOD)
 N_SL = 3  # Number of largest/smallest accuracies (of particular CV) outputs
 # (bool) If True, the models are trained and tested on a class-balanced
 # data set obtained by undersampling.
 UNDERSAMPLING = False
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 # Root folder of the precomputed results (absolute, machine-specific path).
 PATH_BASE = Path("E:/STRAWresults/20230415")
 SEGMENT_TYPE = "period"
 print("SEGMENT_TYPE: " + SEGMENT_TYPE)
 # Sub-folder of PATH_BASE that holds the data for this segment length.
 SEGMENT_LENGTH = "30_minutes_before"
 print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
 PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"
 # Table of all sensor features; presumably includes baseline columns -- TODO confirm.
 all_features_with_baseline = pd.read_csv(PATH_FULL)
 # %%
 # Target variables that are combined into one composite target.
 # Each name selects a cleaned feature file and, prefixed with
 # "phone_esm_straw_", the target column inside it.
 TARGETS = [
    "PANAS_negative_affect_mean",
    "PANAS_positive_affect_mean",
    "JCQ_job_demand_mean",
    "JCQ_job_control_mean",
    "appraisal_stressfulness_period_mean",
 ]
 # %%
 # Start from the cleaned feature table of the first target, then attach only
 # the target column of every further target. The inner join on the
 # "local_segment" index keeps the segments present in all tables.
 all_features_cleaned = pd.DataFrame()
 for target in TARGETS:
    file_name = "all_sensor_features_cleaned_straw_py_(" + target + ").csv"
    PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / file_name
    current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
    if not all_features_cleaned.empty:
        target_column = current_features[("phone_esm_straw_" + target)]
        all_features_cleaned = all_features_cleaned.join(
            target_column,
            how="inner",
            rsuffix="_" + target,
        )
    else:
        # Nothing collected yet: take the complete table as the base.
        all_features_cleaned = current_features
    print(all_features_cleaned.shape)
 # %%
 # Reduce the target columns to a single principal component.
 pca = PCA(n_components=1)
 TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]
 pca.fit(all_features_cleaned[TARGETS_PREFIXED])
 # Share of the targets' variance captured by that component.
 print(pca.explained_variance_ratio_)
 # %%
 # Features only: the individual target columns are removed ...
 model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
 # ... and replaced by the component score as the new "target".
 # Note: fit_transform fits the PCA again on the same columns as above.
 model_input["target"] = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])
 # %%
 # Inspect the distribution of the composite target.
 sns.histplot(data=model_input, x="target")
 # %%
 model_input.target.quantile(0.6)
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 # Bin the composite (PCA score) target at 0: (-10, 0] -> "low", (0, 10] -> "high".
 # NOTE(review): scores outside (-10, 10] become NaN in pd.cut and are then
 # encoded as 1 by the binary encoding below -- confirm the score range.
 BINS = [-10, 0, 10]
 print("BINS: ", BINS)
 model_input["target"], edges = pd.cut(
    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
 )  # use ['low', 'medium', 'high'] together with three bins
 print(model_input["target"].value_counts())
 # With REMOVE_MEDIUM the "medium" rows (if any) are dropped and the target
 # becomes binary: "low" -> 0, everything else -> 1.
 # Without it all three classes are kept: low -> 0, medium -> 1, high -> 2.
 REMOVE_MEDIUM = True
 if REMOVE_MEDIUM:
    # `"medium" in series` would test the index labels, not the values,
    # so the values are compared explicitly.
    is_medium = model_input["target"] == "medium"
    if is_medium.any():
        # Copy so that the assignment below does not write into a slice.
        model_input = model_input[~is_medium].copy()
    model_input["target"] = (
        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
    )
 else:
    model_input["target"] = model_input["target"].map(
        {"low": 0, "medium": 1, "high": 2}
    )
    print(model_input["target"].value_counts())
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 # UnderSampling: balance the two classes by randomly down-sampling the
 # larger one to the size of the smaller one.
 if UNDERSAMPLING:
    no_stress = model_input[model_input["target"] == 0]
    stress = model_input[model_input["target"] == 1]
    # Sampling more rows than a class contains raises a ValueError, so always
    # sample from whichever class is larger.
    if len(no_stress) >= len(stress):
        no_stress = no_stress.sample(n=len(stress))
    else:
        stress = stress.sample(n=len(no_stress))
    model_input = pd.concat([stress, no_stress], axis=0)
 # %%
 # Read the model input of one single target; it is used below only as a
 # source of the columns that the composite model input lacks.
 TARGET_VARIABLE = "PANAS_negative_affect"
 print("TARGET_VARIABLE: " + TARGET_VARIABLE)
 PATH_FULL_HELP = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
 model_input_with_baseline = pd.read_csv(PATH_FULL_HELP, index_col="local_segment")
 # %%
 # Columns present in that file but missing from model_input
 # (presumably the baseline features -- TODO confirm).
 baseline_col_names = [
    col for col in model_input_with_baseline.columns if col not in model_input.columns
 ]
 print(baseline_col_names)
 # %%
 # Attach those columns by the "local_segment" index; the left join keeps
 # every row of model_input.
 model_input = model_input.join(
    model_input_with_baseline[baseline_col_names], how="left"
 )
 # Turn the "local_segment" index back into a regular column.
 model_input.reset_index(inplace=True)
 # %%
 # Project helper; judging by its name it imputes and encodes the
 # categorical features -- see machine_learning.helper for the details.
 model_input_encoded = impute_encode_categorical_features(model_input)
 # %%
 # Split the table into features, target and grouping labels for CV_METHOD.
 data_x, data_y, data_groups = prepare_sklearn_data_format(
    model_input_encoded, CV_METHOD
 )
 cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
 # %%
 # Quick look at the encoded target.
 data_y.head()
 # %%
 data_y.tail()
 # %%
 data_y.shape
 # %%
 # Evaluate all classification models with the prepared cross-validator.
 scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
 # %%
 # Write the scores to ../presentation/results/; the file name encodes the
 # segment length, the bin edges and the cross-validation method.
 PATH_OUTPUT = Path("..") / Path("presentation/results")
 # The results folder is not tracked by git, so create it when it is missing.
 PATH_OUTPUT.mkdir(parents=True, exist_ok=True)
 path_output_full = PATH_OUTPUT / (
    "composite_"
    + SEGMENT_LENGTH
    + "_classification"
    + str(BINS)
    + "_"
    + CV_METHOD
    + ".csv"
 )
 scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -14,80 +14,85 @@
 # ---
 # %% jupyter={"source_hidden": true}
-# %matplotlib inline
+from pathlib import Path
 import datetime
 import importlib
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
 from scipy import stats
 from sklearn.model_selection import LeaveOneGroupOut, cross_validate
 from sklearn.impute import SimpleImputer
 from sklearn.dummy import DummyClassifier
 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
 import xgboost as xg 
 from sklearn.cluster import KMeans
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_validate
 from IPython.core.interactiveshell import InteractiveShell
 InteractiveShell.ast_node_interactivity = "all"
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import machine_learning.labels
 import machine_learning.model
 from machine_learning.classification_models import ClassificationModels
-# %% [markdown]
+# %%
 # # RAPIDS models
 # %% [markdown]
 # ## Set script's parameters
-n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
+N_CLUSTERS = 4  # Number of clusters (could be regarded as a hyperparameter)
-cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+CV_METHOD = "logo"  # logo, half_logo, 5kfold
-n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+# Cross-validation method (could be regarded as a hyperparameter)
 N_SL = 1  # Number of largest/smallest accuracies (of particular CV) outputs
 # %%
 PATH_BASE = Path("E:/STRAWresults/20230415")
 SEGMENT_TYPE = "period"
 print("SEGMENT_TYPE: " + SEGMENT_TYPE)
 SEGMENT_LENGTH = "30_minutes_before"
 print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
 TARGET_VARIABLE = "appraisal_stressfulness"
 print("TARGET_VARIABLE: " + TARGET_VARIABLE)
 if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
    TARGET_VARIABLE += "_"
    TARGET_VARIABLE += SEGMENT_TYPE
 PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
 model_input = pd.read_csv(PATH_FULL)
 if SEGMENT_LENGTH == "daily":
    DAY_LENGTH = "daily"  # or "working"
    print(DAY_LENGTH)
    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
+index_columns = [
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
 ]
-clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
+CLUST_COL = "limesurvey_demand_control_ratio_quartile"
 print("CLUST_COL: " + CLUST_COL)
-model_input.columns[list(model_input.columns).index('age'):-1]
+BINS = [-1, 0, 4]
 print("BINS: " + str(BINS))
-lime_cols = [col for col in model_input if col.startswith('limesurvey')]
+model_input[CLUST_COL].describe()
 lime_cols
 lime_col = 'limesurvey_demand_control_ratio_quartile'
 clust_col = lime_col
 model_input[clust_col].describe()
 # %%
 model_input["target"].value_counts()
 # %% jupyter={"source_hidden": true}
 # Filter-out outlier rows by clust_col
 # model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
-uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
 uniq = uniq.dropna()
-plt.bar(uniq['pid'], uniq[clust_col])
+plt.bar(uniq["pid"], uniq[CLUST_COL])
 # %% jupyter={"source_hidden": true}
 # Get clusters by cluster col & and merge the clusters to main df
-km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
+km = KMeans(n_clusters=N_CLUSTERS).fit_predict(uniq.set_index("pid"))
 np.unique(km, return_counts=True)
-uniq['cluster'] = km
+uniq["cluster"] = km
 uniq
-model_input = model_input.merge(uniq[['pid', 'cluster']])   
+model_input = model_input.merge(uniq[["pid", "cluster"]])
 # %%
 model_input[["cluster", "target"]].value_counts().sort_index()
 # %% jupyter={"source_hidden": true}
 model_input.set_index(index_columns, inplace=True)
@ -98,31 +103,56 @@ cm = ClassificationModels()
 cmodels = cm.get_cmodels()
 # %% jupyter={"source_hidden": true}
-for k in range(n_clusters):
+for k in range(N_CLUSTERS):
    model_input_subset = model_input[model_input["cluster"] == k].copy()
-    bins = [-10, -1, 1, 10] # bins for z-scored targets
+    model_input_subset.loc[:, "target"] = pd.cut(
-    model_input_subset.loc[:, 'target'] = \
+        model_input_subset.loc[:, "target"],
-        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']
+        bins=BINS,
-    model_input_subset['target'].value_counts()
+        labels=["low", "high"],
-    model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
+        right=True,
-    model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
+    )  # ['low', 'medium', 'high']
    model_input_subset["target"].value_counts()
    # model_input_subset = model_input_subset[model_input_subset["target"] != "medium"]
    model_input_subset["target"] = (
        model_input_subset["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
    )
-    model_input_subset['target'].value_counts()
+    print(model_input_subset["target"].value_counts())
-    if cv_method_str == 'half_logo':
+    if CV_METHOD == "half_logo":
-        model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
+        model_input_subset["pid_index"] = model_input_subset.groupby("pid").cumcount()
-        model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
+        model_input_subset["pid_count"] = model_input_subset.groupby("pid")[
            "pid"
        ].transform("count")
-        model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
+        model_input_subset["pid_index"] = (
-        model_input_subset["pid_half"] = model_input_subset["pid"] + "_" +  model_input_subset["pid_index"].astype(int).astype(str)
+            model_input_subset["pid_index"] / model_input_subset["pid_count"] + 1
        ).round()
        model_input_subset["pid_half"] = (
            model_input_subset["pid"]
            + "_"
            + model_input_subset["pid_index"].astype(int).astype(str)
        )
-        data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"]
+        data_x, data_y, data_groups = (
            model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
            model_input_subset["target"],
            model_input_subset["pid_half"],
        )
    else:
-        data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]
+        data_x, data_y, data_groups = (
            model_input_subset.drop(["target", "pid"], axis=1),
            model_input_subset["target"],
            model_input_subset["pid"],
        )
    # Treat categorical features
    categorical_feature_colnames = ["gender", "startlanguage"]
-    additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+    additional_categorical_features = [
        col
        for col in data_x.columns
        if "mostcommonactivity" in col or "homelabel" in col
    ]
    categorical_feature_colnames += additional_categorical_features
    categorical_features = data_x[categorical_feature_colnames].copy()
@ -132,7 +162,9 @@ for k in range(n_clusters):
    categorical_features = categorical_features.fillna(mode_categorical_features)
    # one-hot encoding
-    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+    categorical_features = categorical_features.apply(
        lambda col: col.astype("category")
    )
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)
@ -140,8 +172,10 @@ for k in range(n_clusters):
    train_x = pd.concat([numerical_features, categorical_features], axis=1)
    # Establish cv method
-    cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
+    cv_method = StratifiedKFold(
-    if cv_method_str == 'logo' or cv_method_str == 'half_logo':
+        n_splits=5, shuffle=True
    )  # Defaults to 5 k-folds in cross_validate method
    if CV_METHOD == "logo" or CV_METHOD == "half_logo":
        cv_method = LeaveOneGroupOut()
        cv_method.get_n_splits(
            train_x,
@ -149,36 +183,57 @@ for k in range(n_clusters):
            groups=data_groups,
        )
-    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+    imputer = SimpleImputer(missing_values=np.nan, strategy="median")
    for model_title, model in cmodels.items():
        classifier = cross_validate(
-            model['model'],
+            model["model"],
            X=imputer.fit_transform(train_x),
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
-            error_score='raise',
+            error_score="raise",
-            scoring=('accuracy', 'precision', 'recall', 'f1')
+            scoring=("accuracy", "precision", "recall", "f1"),
        )
        print("\n-------------------------------------\n")
        print("Current cluster:", k, end="\n")
        print("Current model:", model_title, end="\n")
-        print("Acc", np.mean(classifier['test_accuracy']))
+        print("Acc", np.mean(classifier["test_accuracy"]))
-        print("Precision", np.mean(classifier['test_precision']))
+        print("Precision", np.mean(classifier["test_precision"]))
-        print("Recall", np.mean(classifier['test_recall']))
+        print("Recall", np.mean(classifier["test_recall"]))
-        print("F1", np.mean(classifier['test_f1']))
+        print("F1", np.mean(classifier["test_f1"]))
-        print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
+        print(
-        print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl]))
+            f"Largest {N_SL} ACC:",
            np.sort(-np.partition(-classifier["test_accuracy"], N_SL)[:N_SL])[::-1],
        )
        print(
            f"Smallest {N_SL} ACC:",
            np.sort(np.partition(classifier["test_accuracy"], N_SL)[:N_SL]),
        )
-        cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
+        cmodels[model_title]["metrics"][0] += np.mean(classifier["test_accuracy"])
-        cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
+        cmodels[model_title]["metrics"][1] += np.mean(classifier["test_precision"])
-        cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall'])
+        cmodels[model_title]["metrics"][2] += np.mean(classifier["test_recall"])
-        cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])
+        cmodels[model_title]["metrics"][3] += np.mean(classifier["test_f1"])
 # %% jupyter={"source_hidden": true}
 # Get overall results
-cm.get_total_models_scores(n_clusters=n_clusters)
+scores = cm.get_total_models_scores(n_clusters=N_CLUSTERS)
 # %%
 # Results are written to presentation/results, one level above the working directory.
 PATH_OUTPUT = Path("..") / Path("presentation/results")
 # The file name records every setting of this run: target, segment length,
 # CV method, target bins, clustering column and number of clusters.
 path_output_full = PATH_OUTPUT / "".join(
    [
        TARGET_VARIABLE,
        "_",
        SEGMENT_LENGTH,
        "_classification_",
        CV_METHOD,
        str(BINS),
        "_clust_",
        CLUST_COL,
        str(N_CLUSTERS),
        ".csv",
    ]
 )
 scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_classification_with_clustering_2_class.py
+++ b/exploration/ml_pipeline_classification_with_clustering_2_class.py
@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
@ -14,92 +14,83 @@
 # ---
 # %% jupyter={"source_hidden": true}
-# %matplotlib inline
+from pathlib import Path
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from scipy import stats
 from sklearn.model_selection import train_test_split
 from sklearn.impute import SimpleImputer
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 from sklearn.cluster import KMeans
-
+from sklearn.impute import SimpleImputer
-from IPython.core.interactiveshell import InteractiveShell
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
-InteractiveShell.ast_node_interactivity = "all"
+from sklearn.model_selection import train_test_split
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 from machine_learning.classification_models import ClassificationModels
-
+from machine_learning.helper import impute_encode_categorical_features
 # %% [markdown]
 # # RAPIDS models
 # %% [markdown]
 # # Useful method
 def treat_categorical_features(input_set):
    """
    Impute and one-hot encode the categorical columns of a data frame.

    The columns ``gender`` and ``startlanguage``, together with every column
    whose name contains ``mostcommonactivity`` or ``homelabel``, are treated as
    categorical. Missing values in them are replaced with the column mode and
    the columns are then expanded into dummy variables.

    Parameters
    ----------
    input_set : pandas.DataFrame
        Data containing at least the ``gender`` and ``startlanguage`` columns.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``input_set`` followed by the dummy columns.
    """
    nominal_names = ["gender", "startlanguage"]
    nominal_names.extend(
        name
        for name in input_set.columns
        if "mostcommonactivity" in name or "homelabel" in name
    )
    nominal = input_set[nominal_names].copy()
    # The first row of mode() holds one most-frequent value per column.
    most_frequent = nominal.mode().iloc[0]
    nominal = nominal.fillna(most_frequent)
    # Cast to category so that get_dummies expands every selected column.
    nominal = nominal.apply(lambda column: column.astype("category"))
    if not nominal.empty:
        nominal = pd.get_dummies(nominal)
    remaining = input_set.drop(columns=nominal_names)
    return pd.concat([remaining, nominal], axis=1)
 # %% [markdown]
 # ## Set script's parameters
 #
 # %%
 n_clusters = 3  # Number of clusters (could be regarded as a hyperparameter)
 n_sl = 3  # Number of largest/smallest accuracies (of particular CV) outputs
 # %%
 PATH_BASE = Path("E:/STRAWresults/20230415")
 SEGMENT_TYPE = "period"
 print("SEGMENT_TYPE: " + SEGMENT_TYPE)
 SEGMENT_LENGTH = "30_minutes_before"
 print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
 TARGET_VARIABLE = "appraisal_stressfulness"
 print("TARGET_VARIABLE: " + TARGET_VARIABLE)
 if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
    TARGET_VARIABLE += "_"
    TARGET_VARIABLE += SEGMENT_TYPE
 PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
 model_input = pd.read_csv(PATH_FULL)
 if SEGMENT_LENGTH == "daily":
    DAY_LENGTH = "daily"  # or "working"
    print(DAY_LENGTH)
    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+CLUST_COL = "limesurvey_demand_control_ratio"
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+print("CLUST_COL: " + CLUST_COL)
-clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
+BINS = [-1, 0, 4]
 print("BINS: " + str(BINS))
-model_input.columns[list(model_input.columns).index('age'):-1]
+index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
 ]
-lime_cols = [col for col in model_input if col.startswith('limesurvey')]
+model_input[CLUST_COL].describe()
 lime_cols
 lime_col = 'limesurvey_demand_control_ratio'
 clust_col = lime_col
 model_input[clust_col].describe()
 # %% jupyter={"source_hidden": true}
 # Filter-out outlier rows by clust_col
-model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+model_input = model_input[(np.abs(stats.zscore(model_input[CLUST_COL])) < 3)]
-uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
-plt.bar(uniq['pid'], uniq[clust_col])
+plt.bar(uniq["pid"], uniq[CLUST_COL])
 # %% jupyter={"source_hidden": true}
 # Get clusters by cluster col and merge the clusters into the main df
-km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
+km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index("pid"))
 np.unique(km, return_counts=True)
-uniq['cluster'] = km
+uniq["cluster"] = km
-uniq
+print(uniq)
-model_input = model_input.merge(uniq[['pid', 'cluster']])   
+model_input = model_input.merge(uniq[["pid", "cluster"]])
 # %% jupyter={"source_hidden": true}
 model_input.set_index(index_columns, inplace=True)
@ -109,44 +100,58 @@ model_input.set_index(index_columns, inplace=True)
 cm = ClassificationModels()
 cmodels = cm.get_cmodels()
 # %%
 model_input["target"].value_counts()
 # %% jupyter={"source_hidden": true}
 for k in range(n_clusters):
    model_input_subset = model_input[model_input["cluster"] == k].copy()
    # Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, separated by z-score of 0.
-    model_input_subset['numerical_target'] = model_input_subset['target']
+    # model_input_subset['numerical_target'] = model_input_subset['target']
    bins = [-10, 0, 10] # bins for z-scored targets
    model_input_subset.loc[:, 'target'] = \
        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
-    p15 = np.percentile(model_input_subset['numerical_target'], 15)
+    model_input_subset.loc[:, "target"] = pd.cut(
-    p85 = np.percentile(model_input_subset['numerical_target'], 85)
+        model_input_subset.loc[:, "target"], bins=BINS, labels=[0, 1], right=True
    )
    # p15 = np.percentile(model_input_subset['numerical_target'], 15)
    # p85 = np.percentile(model_input_subset['numerical_target'], 85)
    # Treat categorical features
-    model_input_subset = treat_categorical_features(model_input_subset)
+    model_input_subset = impute_encode_categorical_features(model_input_subset)
    # Split to train, validate, and test subsets
-    train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
+    # train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
-    test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
+    # test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
    train_set, test_set = train_test_split(
        model_input_subset,
        test_size=0.3,
        stratify=model_input_subset["pid"],
        random_state=42,
    )
-    train_set['target'].value_counts()
+    print(train_set["target"].value_counts())
-    test_set['target'].value_counts()
+    print(test_set["target"].value_counts())
    train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
-    validate_x, test_x, validate_y, test_y = \
+    validate_x, test_x, validate_y, test_y = train_test_split(
-        train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)
+        test_set.drop(["target", "pid"], axis=1),
        test_set["target"],
        test_size=0.50,
        random_state=42,
    )
    # Impute missing values
-    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+    imputer = SimpleImputer(missing_values=np.nan, strategy="median")
    # Fit the imputer on the training split only and reuse its statistics for
    # the validation and test splits. Re-fitting on each split would leak
    # held-out information and, because a fit discards columns that are
    # entirely missing, could leave the splits with different feature counts.
    train_x = imputer.fit_transform(train_x)
    validate_x = imputer.transform(validate_x)
    test_x = imputer.transform(test_x)
    for model_title, model in cmodels.items():
-        model['model'].fit(train_x, train_y)
+        model["model"].fit(train_x, train_y)
-        y_pred = model['model'].predict(validate_x)
+        y_pred = model["model"].predict(validate_x)
        acc = accuracy_score(validate_y, y_pred)
        prec = precision_score(validate_y, y_pred)
@ -161,11 +166,29 @@ for k in range(n_clusters):
        print("Recall", rec)
        print("F1", f1)
-        cmodels[model_title]['metrics'][0] += acc
+        cmodels[model_title]["metrics"][0] += acc
-        cmodels[model_title]['metrics'][1] += prec
+        cmodels[model_title]["metrics"][1] += prec
-        cmodels[model_title]['metrics'][2] += rec
+        cmodels[model_title]["metrics"][2] += rec
-        cmodels[model_title]['metrics'][3] += f1
+        cmodels[model_title]["metrics"][3] += f1
 # %% jupyter={"source_hidden": true}
 # Get overall results
-cm.get_total_models_scores(n_clusters=n_clusters)
+scores = cm.get_total_models_scores(n_clusters=n_clusters)
 # %%
 print(scores)
 # %%
 # Results are written to presentation/results, one level above the working directory.
 PATH_OUTPUT = Path("..") / Path("presentation/results")
 # The file name records the target, segment length, target bins, clustering
 # column and number of clusters used for this run.
 path_output_full = PATH_OUTPUT / (
    TARGET_VARIABLE
    + "_"
    + SEGMENT_LENGTH
    + "_classification"
    + str(BINS)
    + "_CLUST_"
    + CLUST_COL
    # A doubled "+" here would apply unary plus to a str and raise TypeError.
    + str(n_clusters)
    + ".csv"
 )
 scores.to_csv(path_output_full, index=False)
--- a/exploration/ml_pipeline_regression.py
+++ b/exploration/ml_pipeline_regression.py
@ -6,445 +6,61 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.13.0
+#       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
-# %% jupyter={"source_hidden": true}
+# %%
 # %matplotlib inline
 import os
 import sys
 import numpy as np
 import pandas as pd
 import xgboost as xg
 from machine_learning.helper import prepare_regression_model_input
 from sklearn import gaussian_process, kernel_ridge, linear_model, svm
 from sklearn.dummy import DummyRegressor
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import LeaveOneGroupOut, cross_validate
-# from IPython.core.interactiveshell import InteractiveShell
+from machine_learning.helper import (
-# InteractiveShell.ast_node_interactivity = "all"
+    impute_encode_categorical_features,
    prepare_cross_validator,
    prepare_sklearn_data_format,
    run_all_regression_models,
 )
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
-# %% jupyter={"source_hidden": true}
+# %%
 model_input = pd.read_csv(
    "../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
 )
-# %% jupyter={"source_hidden": true}
+# %%
-cv_method = "half_logo"  # logo, half_logo, 5kfold
+model_input = model_input[model_input["local_segment"].str.contains("daily")]
 train_x, data_y, data_groups = prepare_regression_model_input(model_input, cv_method)
 # %% jupyter={"source_hidden": true}
 logo = LeaveOneGroupOut()
 logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
 )
 # Defaults to 5 k folds in cross_validate method
 if cv_method != "logo" and cv_method != "half_logo":
    logo = None
 # %% jupyter={"source_hidden": true}
 sum(data_y.isna())
 # %% [markdown]
 # ### Baseline: Dummy Regression (mean)
 dummy_regr = DummyRegressor(strategy="mean")
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
 # %% jupyter={"source_hidden": true}
 dummy_regressor = cross_validate(
    dummy_regr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        "r2",
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
    ),
 )
 print(
    "Negative Mean Squared Error",
    np.median(dummy_regressor["test_neg_mean_squared_error"]),
 )
 print(
    "Negative Mean Absolute Error",
    np.median(dummy_regressor["test_neg_mean_absolute_error"]),
 )
 print(
    "Negative Root Mean Squared Error",
    np.median(dummy_regressor["test_neg_root_mean_squared_error"]),
 )
 print("R2", np.median(dummy_regressor["test_r2"]))
 # %% [markdown]
 # ### Linear Regression
 # %% jupyter={"source_hidden": true}
 lin_reg_rapids = linear_model.LinearRegression()
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
 # %% jupyter={"source_hidden": true}
 lin_reg_scores = cross_validate(
    lin_reg_rapids,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        "r2",
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
    ),
 )
 print(
    "Negative Mean Squared Error",
    np.median(lin_reg_scores["test_neg_mean_squared_error"]),
 )
 print(
    "Negative Mean Absolute Error",
    np.median(lin_reg_scores["test_neg_mean_absolute_error"]),
 )
 print(
    "Negative Root Mean Squared Error",
    np.median(lin_reg_scores["test_neg_root_mean_squared_error"]),
 )
 print("R2", np.median(lin_reg_scores["test_r2"]))
 # %% [markdown]
 # ### XGBRegressor Linear Regression
 # %% jupyter={"source_hidden": true}
 xgb_r = xg.XGBRegressor(objective="reg:squarederror", n_estimators=10)
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
 # %% jupyter={"source_hidden": true}
 xgb_reg_scores = cross_validate(
    xgb_r,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        "r2",
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
    ),
 )
 print(
    "Negative Mean Squared Error",
    np.median(xgb_reg_scores["test_neg_mean_squared_error"]),
 )
 print(
    "Negative Mean Absolute Error",
    np.median(xgb_reg_scores["test_neg_mean_absolute_error"]),
 )
 print(
    "Negative Root Mean Squared Error",
    np.median(xgb_reg_scores["test_neg_root_mean_squared_error"]),
 )
 print("R2", np.median(xgb_reg_scores["test_r2"]))
 # %% [markdown]
 # ### XGBRegressor Pseudo Huber Error Regression
 # %% jupyter={"source_hidden": true}
 xgb_psuedo_huber_r = xg.XGBRegressor(objective="reg:pseudohubererror", n_estimators=10)
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
 # %% jupyter={"source_hidden": true}
 xgb_psuedo_huber_reg_scores = cross_validate(
    xgb_psuedo_huber_r,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        "r2",
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
    ),
 )
 print(
    "Negative Mean Squared Error",
    np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_squared_error"]),
 )
 print(
    "Negative Mean Absolute Error",
    np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_absolute_error"]),
 )
 print(
    "Negative Root Mean Squared Error",
    np.median(xgb_psuedo_huber_reg_scores["test_neg_root_mean_squared_error"]),
 )
 print("R2", np.median(xgb_psuedo_huber_reg_scores["test_r2"]))
 # %% [markdown]
 # ### Ridge regression
 # %% jupyter={"source_hidden": true}
 ridge_reg = linear_model.Ridge(alpha=0.5)
 # %% tags=[] jupyter={"source_hidden": true}
 ridge_reg_scores = cross_validate(
    ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        "r2",
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
    ),
 )
 print(
    "Negative Mean Squared Error",
    np.median(ridge_reg_scores["test_neg_mean_squared_error"]),
 )
 print(
    "Negative Mean Absolute Error",
    np.median(ridge_reg_scores["test_neg_mean_absolute_error"]),
 )
 print(
    "Negative Root Mean Squared Error",
    np.median(ridge_reg_scores["test_neg_root_mean_squared_error"]),
 )
 print("R2", np.median(ridge_reg_scores["test_r2"]))
 # %% [markdown]
 # ### Lasso
 # %% jupyter={"source_hidden": true}
 lasso_reg = linear_model.Lasso(alpha=0.1)
 # %% jupyter={"source_hidden": true}
 lasso_reg_score = cross_validate(
    lasso_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        "r2",
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
    ),
 )
 print(
    "Negative Mean Squared Error",
    np.median(lasso_reg_score["test_neg_mean_squared_error"]),
 )
 print(
    "Negative Mean Absolute Error",
    np.median(lasso_reg_score["test_neg_mean_absolute_error"]),
 )
 print(
    "Negative Root Mean Squared Error",
    np.median(lasso_reg_score["test_neg_root_mean_squared_error"]),
 )
 print("R2", np.median(lasso_reg_score["test_r2"]))
 # %% [markdown]
 # ### Bayesian Ridge
 # %% jupyter={"source_hidden": true}
 bayesian_ridge_reg = linear_model.BayesianRidge()
 # %% jupyter={"source_hidden": true}
 bayesian_ridge_reg_score = cross_validate(
    bayesian_ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        "r2",
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
    ),
 )
 print(
    "Negative Mean Squared Error",
    np.median(bayesian_ridge_reg_score["test_neg_mean_squared_error"]),
 )
 print(
    "Negative Mean Absolute Error",
    np.median(bayesian_ridge_reg_score["test_neg_mean_absolute_error"]),
 )
 print(
    "Negative Root Mean Squared Error",
    np.median(bayesian_ridge_reg_score["test_neg_root_mean_squared_error"]),
 )
 print("R2", np.median(bayesian_ridge_reg_score["test_r2"]))
 # %% [markdown]
 # ### RANSAC (outlier robust regression)
 # %% jupyter={"source_hidden": true}
 ransac_reg = linear_model.RANSACRegressor()
 # %% jupyter={"source_hidden": true}
 ransac_reg_scores = cross_validate(
    ransac_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        "r2",
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
    ),
 )
 print(
    "Negative Mean Squared Error",
    np.median(ransac_reg_scores["test_neg_mean_squared_error"]),
 )
 print(
    "Negative Mean Absolute Error",
    np.median(ransac_reg_scores["test_neg_mean_absolute_error"]),
 )
 print(
    "Negative Root Mean Squared Error",
    np.median(ransac_reg_scores["test_neg_root_mean_squared_error"]),
 )
 print("R2", np.median(ransac_reg_scores["test_r2"]))
 # %% [markdown]
 # ### Support vector regression
 # %% jupyter={"source_hidden": true}
 svr = svm.SVR()
 # %% jupyter={"source_hidden": true}
 svr_scores = cross_validate(
    svr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        "r2",
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
    ),
 )
 print(
    "Negative Mean Squared Error", np.median(svr_scores["test_neg_mean_squared_error"])
 )
 print(
    "Negative Mean Absolute Error",
    np.median(svr_scores["test_neg_mean_absolute_error"]),
 )
 print(
    "Negative Root Mean Squared Error",
    np.median(svr_scores["test_neg_root_mean_squared_error"]),
 )
 print("R2", np.median(svr_scores["test_r2"]))
 # %% [markdown]
 # ### Kernel Ridge regression
 # %% jupyter={"source_hidden": true}
 kridge = kernel_ridge.KernelRidge()
 # %% jupyter={"source_hidden": true}
 kridge_scores = cross_validate(
    kridge,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        "r2",
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
    ),
 )
 print(
    "Negative Mean Squared Error",
    np.median(kridge_scores["test_neg_mean_squared_error"]),
 )
 print(
    "Negative Mean Absolute Error",
    np.median(kridge_scores["test_neg_mean_absolute_error"]),
 )
 print(
    "Negative Root Mean Squared Error",
    np.median(kridge_scores["test_neg_root_mean_squared_error"]),
 )
 print("R2", np.median(kridge_scores["test_r2"]))
 # %% [markdown]
 # ### Gaussian Process Regression
 # %% jupyter={"source_hidden": true}
 gpr = gaussian_process.GaussianProcessRegressor()
 # %% jupyter={"source_hidden": true}
 gpr_scores = cross_validate(
    gpr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        "r2",
        "neg_mean_squared_error",
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
    ),
 )
 print(
    "Negative Mean Squared Error", np.median(gpr_scores["test_neg_mean_squared_error"])
 )
 print(
    "Negative Mean Absolute Error",
    np.median(gpr_scores["test_neg_mean_absolute_error"]),
 )
 print(
    "Negative Root Mean Squared Error",
    np.median(gpr_scores["test_neg_root_mean_squared_error"]),
 )
 print("R2", np.median(gpr_scores["test_r2"]))
 # %%
 CV_METHOD = "logo"  # logo, half_logo, 5kfold
 model_input_encoded = impute_encode_categorical_features(model_input)
 # %%
 data_x, data_y, data_groups = prepare_sklearn_data_format(
    model_input_encoded, CV_METHOD
 )
 cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
 # %%
 data_y.head()
 # %%
 data_y.tail()
 # %%
 data_y.shape
 # %%
 scores = run_all_regression_models(data_x, data_y, data_groups, cross_validator)
 # %%
 scores.to_csv(
    "../presentation/JCQ_supervisor_support_regression_" + CV_METHOD + ".csv",
    index=False,
 )
--- a/exploration/test_JCQ_reversal.py
+++ b/exploration/test_JCQ_reversal.py
@ -0,0 +1,217 @@
 # ---
 # jupyter:
 #   jupytext:
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %%
 import pandas as pd
 from features.esm_JCQ import DICT_JCQ_DEMAND_CONTROL_REVERSE
 # %%
 limesurvey_questions = pd.read_csv(
    "E:/STRAWbaseline/survey637813+question_text.csv", header=None
 ).T
 # %%
 limesurvey_questions
 # %%
 limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(
    r"\.\s", expand=True, n=1
 )
 # %%
 limesurvey_questions
 # %%
 demand_reverse_lime_rows = (
    limesurvey_questions["text"].str.startswith(" [Od mene se ne zahteva,")
    | limesurvey_questions["text"].str.startswith(" [Imam dovolj časa, da končam")
    | limesurvey_questions["text"].str.startswith(
        " [Pri svojem delu se ne srečujem s konfliktnimi"
    )
 )
 control_reverse_lime_rows = limesurvey_questions["text"].str.startswith(
    " [Moje delo vključuje veliko ponavljajočega"
 ) | limesurvey_questions["text"].str.startswith(
    " [Pri svojem delu imam zelo malo svobode"
 )
 # %%
 # Take explicit copies of the selected rows: adding a column to a boolean-mask
 # selection of limesurvey_questions is a chained assignment, which triggers
 # SettingWithCopyWarning.
 # expand=False makes str.extract return a Series (there is a single capture
 # group), which is what a single new column expects.
 demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows].copy()
 demand_reverse_lime["qid"] = demand_reverse_lime["code"].str.extract(
    r"\[(\d+)\]", expand=False
 )
 control_reverse_lime = limesurvey_questions[control_reverse_lime_rows].copy()
 control_reverse_lime["qid"] = control_reverse_lime["code"].str.extract(
    r"\[(\d+)\]", expand=False
 )
 # %%
 limesurvey_questions.loc[89, "text"]
 # %%
 limesurvey_questions[limesurvey_questions["code"].str.startswith("JobEisen")]
 # %%
 demand_reverse_lime
 # %%
 control_reverse_lime
 # %%
 participant_info = pd.read_csv(
    "C:/Users/junos/Documents/FWO-ARRS/Analysis/straw2analysis/rapids/data/raw/p031/participant_baseline_raw.csv",
    parse_dates=["date_of_birth"],
 )
 # %%
 participant_info_t = participant_info.T
 # %%
 rows_baseline = participant_info_t.index
 # %%
 rows_demand = rows_baseline.str.startswith("JobEisen") & ~rows_baseline.str.endswith(
    "Time"
 )
 # %%
 rows_baseline[rows_demand]
 # %%
 limesurvey_control = (
    participant_info_t[rows_demand]
    .reset_index()
    .rename(columns={"index": "question", 0: "score_original"})
 )
 # %%
 limesurvey_control
 # %%
 limesurvey_control["qid"] = (
    limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
 )
 # %%
 limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
 # %%
 limesurvey_control["score"] = limesurvey_control["score_original"]
 # %%
 limesurvey_control["qid"][0]
 # %%
 rows_demand_reverse = limesurvey_control["qid"].isin(
    DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
 )
 limesurvey_control.loc[rows_demand_reverse, "score"] = (
    4 + 1 - limesurvey_control.loc[rows_demand_reverse, "score_original"]
 )
 # %%
 # Column-name prefixes of the two JCQ scales in the LimeSurvey baseline export
 # (the same prefixes are matched with str.startswith elsewhere in this script).
 JCQ_DEMAND = "JobEisen"
 JCQ_CONTROL = "JobControle"
 # Negatively phrased items of each scale, keyed by item number.
 # The values are the beginnings of the Slovenian item texts, including
 # the leading " [" that remains after splitting off the question code.
 # NOTE(review): the item numbers are presumably the bracketed numbers
 # of the LimeSurvey codes (extracted as "qid" above) - confirm.
 dict_JCQ_demand_control_reverse = {
    JCQ_DEMAND: {
        3: " [Od mene se ne zahteva,",
        4: " [Imam dovolj časa, da končam",
        5: " [Pri svojem delu se ne srečujem s konfliktnimi",
    },
    JCQ_CONTROL: {
        # Fixed: this prefix started with " |" instead of " [",
        # unlike the same item text used for row selection above.
        2: " [Moje delo vključuje veliko ponavljajočega",
        6: " [Pri svojem delu imam zelo malo svobode",
    },
 }
 # %%
 limesurvey_control
 # %%
 test = pd.DataFrame(
    data={"question": "one", "score_original": 3, "score": 3, "qid": 10}, index=[0]
 )
 # %%
 pd.concat([test, limesurvey_control]).reset_index()
 # %%
 limesurvey_control["score"].sum()
 # %%
 rows_demand_reverse
 # %%
 dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
 # %%
 limesurvey_control
 # %%
 # NOTE(review): presumably the smallest and the largest attainable
 # demand/control ratio - confirm the item counts behind 5, 9 and 4.
 DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
 DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
 # Thresholds per gender, numbered 0 to 4: the lower bound of the ratio,
 # three inner cut points and the upper bound of the ratio.
 JCQ_NORMS = {
    gender: dict(
        enumerate(
            (DEMAND_CONTROL_RATIO_MIN, *inner_cut_points, DEMAND_CONTROL_RATIO_MAX)
        )
    )
    for gender, inner_cut_points in (
        ("F", (0.45, 0.52, 0.62)),
        ("M", (0.41, 0.48, 0.56)),
    )
 }
 # %%
 JCQ_NORMS[participant_info.loc[0, "gender"]][0]
 # %%
 participant_info_t.index.str.startswith("JobControle")
 # %%
 columns_baseline = participant_info.columns
 # %%
 columns_demand = columns_baseline.str.startswith(
    "JobControle"
 ) & ~columns_baseline.str.endswith("Time")
 # %%
 columns_baseline[columns_demand]
 # %%
 # Copy the selected columns so that the "id" column added in the next cell
 # goes into an independent frame instead of a selection of participant_info
 # (which would trigger SettingWithCopyWarning).
 participant_control = participant_info.loc[:, columns_demand].copy()
 # %%
 # The row index serves as the identifier column for the wide-to-long reshape.
 participant_control["id"] = participant_control.index
 # %%
 participant_control
 # %%
 pd.wide_to_long(
    participant_control,
    stubnames="JobControle",
    i="id",
    j="qid",
    sep="[",
    suffix="(\\d+)]",
 )
--- a/features/esm.py
+++ b/features/esm.py
@ -20,11 +20,47 @@ ANSWER_DAY_OFF = "DayOff3421"
 ANSWER_SET_EVENING = "DayFinishedSetEvening"
 MAX_MORNING_LENGTH = 3
-# When the participants was not yet at work at the time of the first (morning) EMA,
+# When the participant was not yet at work at the time of the first (morning) EMA,
 # only three items were answered.
 # Two sleep related items and one indicating NOT starting work yet.
 # Daytime EMAs are all longer, in fact they always consist of at least 6 items.
 # Integer ID of each questionnaire, keyed by a descriptive name.
 # IDs 1 and 8-26 cover sleep quality, PANAS, JCQ, PFITS, UWES, COPE,
 # work-life balance, recovery experience and symptoms;
 # IDs 87-99 cover the stress appraisal and the work-related items.
 # NOTE(review): presumably these match the questionnaire_id values
 # stored with the ESM data - confirm.
 QUESTIONNAIRE_IDS = {
    "sleep_quality": 1,
    "PANAS_positive_affect": 8,
    "PANAS_negative_affect": 9,
    "JCQ_job_demand": 10,
    "JCQ_job_control": 11,
    "JCQ_supervisor_support": 12,
    "JCQ_coworker_support": 13,
    "PFITS_supervisor": 14,
    "PFITS_coworkers": 15,
    "UWES_vigor": 16,
    "UWES_dedication": 17,
    "UWES_absorption": 18,
    "COPE_active": 19,
    "COPE_support": 20,
    "COPE_emotions": 21,
    "balance_life_work": 22,
    "balance_work_life": 23,
    "recovery_experience_detachment": 24,
    "recovery_experience_relaxation": 25,
    "symptoms": 26,
    "appraisal_stressfulness_event": 87,
    "appraisal_threat": 88,
    "appraisal_challenge": 89,
    "appraisal_event_time": 90,
    "appraisal_event_duration": 91,
    "appraisal_event_work_related": 92,
    "appraisal_stressfulness_period": 93,
    "late_work": 94,
    "work_hours": 95,
    "left_work": 96,
    "activities": 97,
    "coffee_breaks": 98,
    "at_work_yet": 99,
 }
 def get_esm_data(usernames: Collection) -> pd.DataFrame:
    """
@ -52,6 +88,8 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame:
 def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Convert timestamps and expand JSON column.
    Convert timestamps into human-readable datetimes and dates
        and expand the JSON column into several Pandas DF columns.
@ -63,7 +101,8 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
-        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
+        A dataframe with added columns: datetime in Ljubljana timezone
            and all fields from ESM_JSON column.
    """
    df_esm = helper.get_date_from_timestamp(df_esm)
@ -76,31 +115,39 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
 def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.
-    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE
+
    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
        and SESSION_STATUS_COMPLETE
    This is done in three steps.
    First, the esm_status is considered.
-    If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
+    If any of the ESMs in a session has a status *other than* "answered",
        then this session is taken as unfinished.
    Second, the sessions which do not represent full questionnaires are identified.
-    These are sessions where participants only marked they are finished with the day or have not yet started working.
+    These are sessions where participants only marked they are finished with the day
        or have not yet started working.
    Third, the sessions with only one item are marked with their trigger.
-    We never offered questionnaires with single items, so we can be sure these are unfinished.
+    We never offered questionnaires with single items,
        so we can be sure these are unfinished.
    Finally, all sessions that remain are marked as completed.
-    By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.
+    By going through different possibilities in expl_esm_adherence.ipynb,
        this turned out to be a reasonable option.
    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
            which must include the session ID (esm_session).
    Returns
    -------
    df_session_counts: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
            with their statuses and the number of items.
    """
    sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
@ -155,17 +202,22 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat
 def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening.)
+    Classify EMA sessions into morning, workday, or evening.
    For each EMA session, determine the time of the first user answer
        and its time type (morning, workday, or evening).
    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
            which must include the session ID (esm_session).
    Returns
    -------
    df_session_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
            with their time type and timestamp of first answer.
    """
    df_session_time = (
        df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
@ -179,13 +231,17 @@ def classify_sessions_by_completion_time(
    df_esm_preprocessed: pd.DataFrame,
 ) -> pd.DataFrame:
    """
-    The point of this function is to not only classify sessions by using the previously defined functions.
+    Classify sessions and correct the time type.
    The point of this function is to not only classify sessions
        by using the previously defined functions.
    It also serves to "correct" the time type of some EMA sessions.
    A morning questionnaire could seamlessly transition into a daytime questionnaire,
        if the participant was already at work.
    In this case, the "time" label changed mid-session.
-    Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
+    Because of the way classify_sessions_by_time works,
        this questionnaire was classified as "morning".
    But for all intents and purposes, it can be treated as a "daytime" EMA.
    The way this scenario is differentiated from a true "morning" questionnaire,
@ -194,13 +250,16 @@ def classify_sessions_by_completion_time(
    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
            which must include the session ID (esm_session).
    Returns
    -------
    df_session_counts_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
-            their time type (with some morning EMAs reclassified) and timestamp of first answer.
+            the number of items,
            their time type (with some morning EMAs reclassified)
            and timestamp of first answer.
    """
    df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
@ -219,7 +278,8 @@ def classify_sessions_by_completion_time(
 def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    This function eliminates invalid ESM responses.
+    Eliminate invalid ESM responses.
    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".
@ -256,3 +316,100 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
        )
    )
    return df_esm_clean
 def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1):
    """
    Shift numeric answers to restore the original scoring of a scale.

    User answers are always stored with 0 as the lowest value,
        whereas some scales were originally scored differently,
        such as starting from 1.
    Adding a constant makes the scores comparable to reference values.

    Parameters
    ----------
    df_esm_clean: pd.DataFrame
        A cleaned ESM dataframe, which must include esm_user_answer_numeric.
    increment_by:
        A number to add to every numeric user answer.

    Returns
    -------
    df_esm_clean: pd.DataFrame
        A copy of the dataframe with an added column esm_user_score.
        If esm_user_answer_numeric is missing, a hint is printed
            and the dataframe is returned as it was passed in.
    """

    def shifted_answers(frame):
        # Attribute access is deliberate: a missing column raises
        # the AttributeError that is handled below.
        return frame.esm_user_answer_numeric + increment_by

    try:
        return df_esm_clean.assign(esm_user_score=shifted_answers)
    except AttributeError as error:
        print("Please, clean the dataframe first using features.esm.clean_up_esm.")
        print(error)
        return df_esm_clean
 def reassign_question_ids(
    df_esm_cleaned: pd.DataFrame, question_ids_content: dict
 ) -> pd.DataFrame:
    """
    Fix question IDs to match their actual content.

    Unfortunately, when altering the protocol to adapt to the COVID pandemic,
        we did not retain the original question IDs.
    This means that the IDs for participants before 2021
        differ from those for the rest of the participants.

    This function first checks that every question recorded under an ID
        listed in question_ids_content starts with one of the strings
        expected for that ID.
    Then, the ID of every row is reassigned by matching the beginning
        of its question text ("instructions") against the dictionary.

    Parameters
    ----------
    df_esm_cleaned: pd.DataFrame
        A cleaned up dataframe, which must include
            the columns question_id and esm_instructions.
    question_ids_content: dict
        A dictionary, linking question IDs with their content ("instructions").
        Each value is a string or a tuple of strings
            with which the corresponding question may start.

    Returns
    -------
    df_esm_fixed: pd.DataFrame
        A copy of the dataframe with fixed question IDs.
        Rows whose instructions match none of the dictionary entries get None.

    Raises
    ------
    KeyError
        If a question recorded under one of the listed IDs
            does not start with any of the strings expected for that ID.
    """
    # Tabulate all distinct questions occurring under each question ID.
    # The counts get their own name, so that reset_index cannot clash
    # with the index level called esm_instructions.
    df_esm_unique_questions = (
        df_esm_cleaned.groupby("question_id")
        .esm_instructions.value_counts()
        .rename("n_occurrences")
        .reset_index()
    )

    # First, check that we anticipated all esm instructions.
    for q_id, expected_beginnings in question_ids_content.items():
        # These are all questions ("instructions") recorded under this ID.
        actual_questions = df_esm_unique_questions.loc[
            df_esm_unique_questions["question_id"] == q_id,
            "esm_instructions",
        ]
        # See if they are expected, i.e. included in the dictionary.
        questions_matches = actual_questions.str.startswith(expected_beginnings)
        # The match mask must be tested here, not the question texts themselves.
        if not questions_matches.all():
            # In case there is an unexpected question, raise an exception.
            print("One of the questions that occur in the data was undefined.")
            print("This were the questions found in the data: ")
            raise KeyError(actual_questions[~questions_matches])

    def find_question_id(instructions):
        # The first dictionary entry matching the beginning of the text wins.
        return next(
            (
                key
                for key, values in question_ids_content.items()
                if instructions.startswith(values)
            ),
            None,
        )

    # Next, replace question IDs.
    df_esm_fixed = df_esm_cleaned.copy()
    df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].apply(
        find_question_id
    )
    return df_esm_fixed
--- a/features/esm_COPE.py
+++ b/features/esm_COPE.py
@ -0,0 +1,125 @@
 # NOTE(review): presumably the highest and the lowest answer value
 # in the original COPE scoring - confirm.
 COPE_ORIGINAL_MAX = 4
 COPE_ORIGINAL_MIN = 1
 # Beginnings of the COPE question texts, keyed by question ID.
 # Each tuple lists the English and the Dutch phrasing, followed by
 # two Slovenian phrasings (apparently the feminine and the masculine form).
 # NOTE(review): the Dutch strings of IDs 171, 173, 178 and 182 start with
 # a lowercase "l" ("lk") rather than a capital "I" ("Ik"). Presumably this
 # mirrors the text as it is stored in the data; since these strings look
 # like prefixes for matching, confirm against the data before changing them.
 DICT_COPE_QUESTION_IDS = {
    164: (
        "I took additional action to try to get rid of the problem",
        "Ik deed extra mijn best om er iets aan te doen",
        "Vložila sem dodaten napor, da bi rešila problem",
        "Vložil sem dodaten napor, da bi rešil problem",
    ),
    165: (
        "I concentrated my efforts on doing something about it",
        "Ik probeerde de situatie te verbeteren",
        "Svoje sile sem usmerila v reševanje nastale situacije",
        "Svoje sile sem usmeril v reševanje nastale situacije",
    ),
    166: (
        "I did what had to be done, one step at a time",
        "Ik deed stap voor stap wat nodig was",
        "Naredila sem, kar je bilo potrebno – korak za korakom",
        "Naredil sem, kar je bilo potrebno – korak za korakom",
    ),
    167: (
        "I took direct action to get around the problem",
        "Ik handelde vlug om het probleem te verhelpen",
        "Nekaj sem naredila, da sem zaobšla problem",
        "Nekaj sem naredil, da sem zaobšel problem",
    ),
    168: (
        "I tried to come up with a strategy about what to do",
        "Ik probeerde te verzinnen wat ik er aan kon doen",
        "Skušala sem najti ustrezen način za rešitev situacije",
        "Skušal sem najti ustrezen način za rešitev situacije",
    ),
    169: (
        "I made a plan of action",
        "Ik maakte een plan",
        "Naredila sem načrt za delovanje",
        "Naredil sem načrt za delovanje",
    ),
    170: (
        "I thought hard about what steps to take",
        "Ik dacht hard na over wat ik moest doen",
        "Dobro sem premislila, katere korake moram narediti, da rešim problem",
        "Dobro sem premislil, katere korake moram narediti, da rešim problem",
    ),
    171: (
        "I thought about how I might best handle the problem",
        "lk dacht na over hoe ik het probleem het best kon aanpakken",
        "Razmišljala sem, kaj bi bilo najbolje narediti s problemom",
        "Razmišljal sem, kaj bi bilo najbolje narediti s problemom",
    ),
    172: (
        "I asked people who have had similar experiences what they did",
        "Ik vroeg aan mensen met dergelijke ervaringen hoe zij reageerden",
        "Vprašala sem posameznike s podobnimi izkušnjami, kaj so storili",
        "Vprašal sem posameznike s podobnimi izkušnjami, kaj so storili",
    ),
    173: (
        "I tried to get advice from someone about what to do",
        "lk vroeg advies aan iemand",
        "Pri drugih sem poskušala dobiti nasvet, kaj naj storim",
        "Pri drugih sem poskušal dobiti nasvet, kaj naj storim",
    ),
    174: (
        "I talked to someone to find out more about the situation",
        "Ik sprak met iemand om meer te weten te komen over de situatie",
        "Z nekom sem se pogovorila, da bi izvedela še kaj o svojem problemu",
        "Z nekom sem se pogovoril, da bi izvedel še kaj o svojem problemu",
    ),
    175: (
        "I talked to someone who could do something concrete about the problem",
        "Ik sprak met iemand die iets aan het probleem kon doen",
        "Pogovorila sem se s kom, ki bi lahko naredil kaj konkretnega",
        "Pogovoril sem se s kom, ki bi lahko naredil kaj konkretnega",
    ),
    176: (
        "I talked to someone about how I felt",
        "Ik sprak met iemand over hoe ik mij voelde",
        "Z nekom sem se pogovorila o tem, kako sem se počutila",
        "Z nekom sem se pogovoril o tem, kako sem se počutil",
    ),
    177: (
        "I tried to get emotional support from friends or relatives",
        "Ik zocht steun bij vrienden of familie",
        "Skušala sem dobiti čustveno podporo prijateljev ali sorodnikov",
        "Skušal sem dobiti čustveno podporo prijateljev ali sorodnikov",
    ),
    178: (
        "I discussed my feelings with someone",
        "lk besprak mijn gevoelens met iemand",
        "O svojih občutkih sem se z nekom pogovorila",
        "O svojih občutkih sem se z nekom pogovoril",
    ),
    179: (
        "I got sympathy and understanding from someone",
        "Ik vroeg medeleven en begrip van iemand",
        "Poiskala sem naklonjenost in razumevanje drugih",
        "Poiskal sem naklonjenost in razumevanje drugih",
    ),
    180: (
        "I got upset and let my emotions out",
        "Ik raakte van streek",
        "Razburila sem se in to tudi pokazala",
        "Razburil sem se in to tudi pokazal",
    ),
    181: (
        "I let my feelings out",
        "Ik toonde mijn gevoelens",
        "Svojim čustvom sem dala prosto pot",
        "Svojim čustvom sem dal prosto pot",
    ),
    182: (
        "I felt a lot of emotional distress and I found myself expressing",
        "lk liet duidelijk blijken hoe ellendig ik mij voelde",
        "Doživljala sem veliko stresa in opažala, da sem čustva",
        "Doživljal sem veliko stresa in opažal, da sem čustva",
    ),
    183: (
        "I got upset, and I was really aware of it",
        "Ik merkte dat ik erg van streek was",
        "Razburila sem se in razmišljala samo o tem",
        "Razburil sem se in razmišljal samo o tem",
    ),
 }
--- a/features/esm_JCQ.py
+++ b/features/esm_JCQ.py
@ -1,9 +1,11 @@
 import pandas as pd
 from features.esm import increment_answers
 JCQ_ORIGINAL_MAX = 4
 JCQ_ORIGINAL_MIN = 1
-dict_JCQ_demand_control_reverse = {
+DICT_JCQ_DEMAND_CONTROL_REVERSE = {
    75: (
        "I was NOT asked",
        "Men legde mij geen overdreven",
@ -40,10 +42,14 @@ def reverse_jcq_demand_control_scoring(
    df_esm_jcq_demand_control: pd.DataFrame,
 ) -> pd.DataFrame:
    """
-    This function recodes answers in Job content questionnaire by first incrementing them by 1,
+    Reverse JCQ demand and control answers.
-    to be in line with original (1-4) scoring.
+
-    Then, some answers are reversed (i.e. 1 becomes 4 etc.), because the questions are negatively phrased.
+    This function recodes answers in Job content questionnaire
-    These answers are listed in dict_JCQ_demand_control_reverse and identified by their question ID.
+        by first incrementing them by 1, to be in line with original (1-4) scoring.
    Then, some answers are reversed (i.e. 1 becomes 4 etc.),
        because the questions are negatively phrased.
    These answers are listed in DICT_JCQ_DEMAND_CONTROL_REVERSE
        and identified by their question ID.
    However, the existing data is checked against literal phrasing of these questions
        to protect against wrong numbering of questions (differing question IDs).
@ -55,7 +61,8 @@ def reverse_jcq_demand_control_scoring(
    Returns
    -------
    df_esm_jcq_demand_control: pd.DataFrame
-        The same dataframe with a column esm_user_score containing answers recoded and reversed.
+        The same dataframe with a column esm_user_score
            containing answers recoded and reversed.
    """
    df_esm_jcq_demand_control_unique_answers = (
        df_esm_jcq_demand_control.groupby("question_id")
@ -64,7 +71,7 @@ def reverse_jcq_demand_control_scoring(
        .reset_index()
    )
    # Tabulate all possible answers to each question (group by question ID).
-    for q_id in dict_JCQ_demand_control_reverse.keys():
+    for q_id in DICT_JCQ_DEMAND_CONTROL_REVERSE.keys():
        # Look through all answers that need to be reversed.
        possible_answers = df_esm_jcq_demand_control_unique_answers.loc[
            df_esm_jcq_demand_control_unique_answers["question_id"] == q_id,
@ -72,7 +79,7 @@ def reverse_jcq_demand_control_scoring(
        ]
        # These are all answers to a given question (by q_id).
        answers_matches = possible_answers.str.startswith(
-            dict_JCQ_demand_control_reverse.get(q_id)
+            DICT_JCQ_DEMAND_CONTROL_REVERSE.get(q_id)
        )
        # See if they are expected, i.e. included in the dictionary.
        if ~answers_matches.all():
@ -82,18 +89,16 @@ def reverse_jcq_demand_control_scoring(
            # In case there is an unexpected answer, raise an exception.
    try:
-        df_esm_jcq_demand_control = df_esm_jcq_demand_control.assign(
+        df_esm_jcq_demand_control = increment_answers(df_esm_jcq_demand_control)
-            esm_user_score=lambda x: x.esm_user_answer_numeric + 1
+        # Increment the original answer by 1 to keep in line
-        )
+        # with traditional scoring (from JCQ_ORIGINAL_MIN to JCQ_ORIGINAL_MAX).
        # Increment the original answer by 1
        # to keep in line with traditional scoring (JCQ_ORIGINAL_MIN - JCQ_ORIGINAL_MAX).
        df_esm_jcq_demand_control[
            df_esm_jcq_demand_control["question_id"].isin(
-                dict_JCQ_demand_control_reverse.keys()
+                DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
            )
        ] = df_esm_jcq_demand_control[
            df_esm_jcq_demand_control["question_id"].isin(
-                dict_JCQ_demand_control_reverse.keys()
+                DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
            )
        ].assign(
            esm_user_score=lambda x: JCQ_ORIGINAL_MAX
--- a/features/esm_SAM.py
+++ b/features/esm_SAM.py
@ -3,6 +3,9 @@ import pandas as pd
 import features.esm
 SAM_ORIGINAL_MAX = 5
 SAM_ORIGINAL_MIN = 1
 QUESTIONNAIRE_ID_SAM = {
    "event_stress": 87,
    "event_threat": 88,
@ -20,10 +23,107 @@ GROUP_QUESTIONNAIRES_BY = [
    "device_id",
    "esm_session",
 ]
-# Each questionnaire occurs only once within each esm_session on the same device within the same participant.
+# Each questionnaire occurs only once within each esm_session on the same device
 # within the same participant.
 DICT_SAM_QUESTION_IDS = {
    87: (
        "Was there a particular event that created tension in you?",
        "Was er een bepaalde gebeurtenis die spanning veroorzaakte?",
        "Je prišlo do kakega dogodka, ki je v vas ustvaril napetost?",
    ),
    88: (
        "Did this event make you feel anxious?",
        "Voelde je je angstig door deze gebeurtenis?",
        "Ste se zaradi tega dogodka počutili tesnob",
    ),
    89: (
        "Will the outcome of this event be negative?",
        "Zal de uitkomst van deze gebeurtenis negatief zijn?",
        "Bo izid tega dogodka negativen?",
    ),
    90: (
        "How threatening was this event?",
        "Hoe bedreigend was deze gebeurtenis?",
        "Kako grozeč je bil ta dogodek?",
    ),
    91: (
        "Is this going to have a negative impact on you?",
        "Zal dit een negatieve impact op je hebben?",
        "Ali bo to negativno vplivalo na vas?",
    ),
    92: (
        "Is this going to have a positive impact on you?",
        "Zal dit een positief effect op je hebben?",
        "Ali bo to pozitivno vplivalo na vas?",
    ),
    93: (
        "How eager are you to tackle this event?",
        "Hoe graag wil je deze gebeurtenis aanpakken?",
        "Kako zagnani ste bili",
    ),
    94: (
        "To what extent can you become a stronger person because of this event?",
        "In welke mate kan je een sterkere persoon worden door deze gebeurtenis?",
        "V kolikšni meri lahko zaradi tega dogodka postanete močnejša oseba?",
    ),
    95: (
        "To what extent are you excited thinking about the outcome of this event?",
        "In welke mate ben je enthousiast bij de gedachte",
        "V kolikšni meri vas misel na izid tega dogodka navdušuje?",
    ),
    96: (
        "At what time did this event occur?",
        "Hoe laat vond deze gebeurtenis plaats?",
        "Kdaj se je ta dogodek zgodil?",
    ),
    97: (
        "How long did this event last?",
        "Hoe lang duurde deze gebeurtenis?",
        "Kako dolgo je trajal ta dogodek?",
    ),
    98: (
        "Was/is this event work-related?",
        "Was/is deze gebeurtenis werkgerelateerd?",
        "Je (bil) ta dogodek povezan s službo?",
        "Je bil ali je ta dogodek povezan s službo?",
    ),
    99: (
        "Did this overall period create tension in you?",
        "Heeft deze globale periode spanning veroorzaakt?",
        "Je to obdobje kot celota v vas ustvarilo napetost?",
        "Je to celo obdobje v vas ustvarilo napetost?",
    ),
    100: (
        "To what extent do you perceive this overall period as stressful?",
        "In welke mate ervaar je deze globale periode als stressvol?",
        "V kolikšni meri ste to obdobje dojemali kot stresno?",
        "V kolikšni meri ste celo to obdobje dojemali kot stresno?",
    ),
 }
 def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Extract information about stressful events.
    Participants were asked: "Was there a particular event that created tension in you?"
    Then a subset of questions related to this event followed.
    This function goes through the follow-up questions one by one
        and preprocesses them, so that it adds new columns to the dataframe.
    Parameters
    ----------
    df_esm: pd.DataFrame
        A raw dataframe of all ESM data.
    Returns
    -------
    df_esm_events: pd.DataFrame
        A cleaned up df of Stress Appraisal Measure items with additional columns.
    """
    # 0. Select only questions from Stress Appraisal Measure.
    df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
    df_esm_sam = df_esm_preprocessed[
@ -78,7 +178,8 @@ def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
 def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
-    This function calculates challenge and threat (two Stress Appraisal Measure subscales) means,
+    This function calculates challenge and threat
        (two Stress Appraisal Measure subscales) means,
        for each ESM session (within participants and devices).
    It creates a grouped dataframe with means in two columns.
@ -90,7 +191,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
    Returns
    -------
    df_esm_event_threat_challenge_mean_wide: pd.DataFrame
-        A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
+        A dataframe of unique ESM sessions (by participants and devices)
        with threat and challenge means.
    """
    # Select only threat and challenge assessments for events
    df_esm_event_threat_challenge = df_esm_sam_clean[
@ -112,8 +214,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
        aggfunc="mean",
    )
    # Drop unnecessary column values.
-    df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
+    df_esm_event_threat_challenge_mean_wide.columns = (
-        1
+        df_esm_event_threat_challenge_mean_wide.columns.get_level_values(1)
    )
    df_esm_event_threat_challenge_mean_wide.columns.name = None
    df_esm_event_threat_challenge_mean_wide.rename(
@ -189,10 +291,12 @@ def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
 def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
-    This function only serves to convert the string datetime answer into a real datetime type.
+    This function only serves to convert the string datetime answer
-    Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
+        into a real datetime type.
-    NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
+    Errors during this conversion are coerced, meaning that non-datetime answers
-        the NaTs can be interpreted to mean this.
+        are assigned Not a Time (NaT).
    NOTE: Since the only available non-datetime answer to this question was
        "0 - I do not remember", the NaTs can be interpreted to mean this.
    Parameters
    ----------
@ -208,9 +312,13 @@ def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
        df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
    ].assign(
        event_time=lambda x: pd.to_datetime(
-            x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
+            x.esm_user_answer,
            errors="coerce",
            format="%Y-%m-%d %H:%M:%S %z",
            exact=True,
        )
    )
    # Example answer: 2020-09-29 00:05:00 +0200
    return df_esm_event_time
@ -241,9 +349,12 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
        == QUESTIONNAIRE_ID_SAM.get("event_duration")
    ].assign(
        event_duration=lambda x: pd.to_datetime(
-            x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
+            x.esm_user_answer.str.slice(start=0, stop=-6),
            errors="coerce",
            format="%Y-%m-%d %H:%M:%S",
        ).dt.time
    )
    # Example answer: 2020-09-29 00:05:00 +0200
    # TODO Explore the values recorded in event_duration and possibly fix mistakes.
    # For example, participants reported setting 23:50:00 instead of 00:50:00.
@ -251,7 +362,7 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    # we can determine whether:
    #   - this event is still going on ("1 - It is still going on")
    #   - the participant couldn't remember its duration ("0 - I do not remember")
-    # Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
+    # Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm
    # but only the numeric types of questions and answers.
    # Since this was of "datetime" type, convert these specific answers here again.
    df_esm_event_duration["event_duration_info"] = np.nan
@ -264,4 +375,5 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    return df_esm_event_duration
-# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
+# TODO: How many questions about the stressfulness of the period were asked
 #  and how does this relate to events?
--- a/machine_learning/classification_models.py
+++ b/machine_learning/classification_models.py
@ -1,10 +1,11 @@
-from sklearn.dummy import DummyClassifier
+import pandas as pd
 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
 from lightgbm import LGBMClassifier
 import xgboost as xg
 from lightgbm import LGBMClassifier
 from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
 from sklearn.dummy import DummyClassifier
 class ClassificationModels():
 class ClassificationModels:
    def __init__(self):
        self.cmodels = self.init_classification_models()
@ -13,59 +14,110 @@ class ClassificationModels():
    def init_classification_models(self):
        cmodels = {
-            'dummy_classifier': {
+            "dummy_classifier": {
-                'model': DummyClassifier(strategy="most_frequent"),
+                "model": DummyClassifier(strategy="most_frequent"),
-                'metrics': [0, 0, 0, 0]
+                "metrics": [0, 0, 0, 0],
            },
-            'logistic_regression': {
+            "logistic_regression": {
-                'model': linear_model.LogisticRegression(max_iter=1000),
+                "model": linear_model.LogisticRegression(max_iter=1000),
-                'metrics': [0, 0, 0, 0]
+                "metrics": [0, 0, 0, 0],
            },
-            'support_vector_machine': {
+            "support_vector_machine": {"model": svm.SVC(), "metrics": [0, 0, 0, 0]},
-                'model': svm.SVC(),
+            "gaussian_naive_bayes": {
-                'metrics': [0, 0, 0, 0]
+                "model": naive_bayes.GaussianNB(),
                "metrics": [0, 0, 0, 0],
            },
-            'gaussian_naive_bayes': {
+            "stochastic_gradient_descent_classifier": {
-                'model': naive_bayes.GaussianNB(),
+                "model": linear_model.SGDClassifier(),
-                'metrics': [0, 0, 0, 0]
+                "metrics": [0, 0, 0, 0],
            },
-            'stochastic_gradient_descent_classifier': {
+            "knn": {"model": neighbors.KNeighborsClassifier(), "metrics": [0, 0, 0, 0]},
-                'model': linear_model.SGDClassifier(),
+            "decision_tree": {
-                'metrics': [0, 0, 0, 0]
+                "model": tree.DecisionTreeClassifier(),
                "metrics": [0, 0, 0, 0],
            },
-            'knn': {
+            "random_forest_classifier": {
-                'model': neighbors.KNeighborsClassifier(),
+                "model": ensemble.RandomForestClassifier(),
-                'metrics': [0, 0, 0, 0]
+                "metrics": [0, 0, 0, 0],
            },
-            'decision_tree': {
+            "gradient_boosting_classifier": {
-                'model': tree.DecisionTreeClassifier(),
+                "model": ensemble.GradientBoostingClassifier(),
-                'metrics': [0, 0, 0, 0]
+                "metrics": [0, 0, 0, 0],
            },
-            'random_forest_classifier': {
+            "lgbm_classifier": {"model": LGBMClassifier(), "metrics": [0, 0, 0, 0]},
-                'model': ensemble.RandomForestClassifier(),
+            "XGBoost_classifier": {
-                'metrics': [0, 0, 0, 0]
+                "model": xg.sklearn.XGBClassifier(),
                "metrics": [0, 0, 0, 0],
            },
            'gradient_boosting_classifier': {
                'model': ensemble.GradientBoostingClassifier(),
                'metrics': [0, 0, 0, 0]
            },
            'lgbm_classifier': {
                'model': LGBMClassifier(),
                'metrics': [0, 0, 0, 0]
            },
            'XGBoost_classifier': {
                'model': xg.sklearn.XGBClassifier(),
                'metrics': [0, 0, 0, 0]
            }
        }
        return cmodels
    def get_total_models_scores(self, n_clusters=1):
        scores = pd.DataFrame(columns=["method", "metric", "mean"])
        for model_title, model in self.cmodels.items():
            scores_df = pd.DataFrame(columns=["method", "metric", "mean"])
            print("\n************************************\n")
            print("Current model:", model_title, end="\n")
-            print("Acc:", model['metrics'][0]/n_clusters)
+            print("Acc:", model["metrics"][0] / n_clusters)
-            print("Precision:", model['metrics'][1]/n_clusters)
+            scores_df = pd.concat(
-            print("Recall:", model['metrics'][2]/n_clusters)
+                [
-            print("F1:", model['metrics'][3]/n_clusters)
+                    scores_df,
                    pd.DataFrame(
                        {
                            "method": model_title,
                            "metric": "test_accuracy",
                            "mean": model["metrics"][0] / n_clusters,
                        },
                        index=[0],
                    ),
                ],
                ignore_index=True,
            )
            print("Precision:", model["metrics"][1] / n_clusters)
            scores_df = pd.concat(
                [
                    scores_df,
                    pd.DataFrame(
                        {
                            "method": model_title,
                            "metric": "test_precision",
                            "mean": model["metrics"][1] / n_clusters,
                        },
                        index=[0],
                    ),
                ],
                ignore_index=True,
            )
            print("Recall:", model["metrics"][2] / n_clusters)
            scores_df = pd.concat(
                [
                    scores_df,
                    pd.DataFrame(
                        {
                            "method": model_title,
                            "metric": "test_recall",
                            "mean": model["metrics"][2] / n_clusters,
                        },
                        index=[0],
                    ),
                ],
                ignore_index=True,
            )
            print("F1:", model["metrics"][3] / n_clusters)
            scores_df = pd.concat(
                [
                    scores_df,
                    pd.DataFrame(
                        {
                            "method": model_title,
                            "metric": "test_f1",
                            "mean": model["metrics"][3] / n_clusters,
                        },
                        index=[0],
                    ),
                ],
                ignore_index=True,
            )
            scores = pd.concat([scores, scores_df])
        return scores
--- a/machine_learning/cross_validation.py
+++ b/machine_learning/cross_validation.py
@ -49,8 +49,8 @@ class CrossValidation():
            data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
-        elif self.cv_method == "5kfold":
+        elif self.cv_method == "Stratified5kfold":
-            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
+            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None
        self.X, self.y, self.groups = data_X, data_y, data_groups
@ -71,7 +71,7 @@ class CrossValidation():
        if self.cv_method in ["logo", "half_logo"]:
            self.cv = LeaveOneGroupOut()
-        elif self.cv_method == "5kfold":
+        elif self.cv_method == "Stratified5kfold":
            self.cv = StratifiedKFold(n_splits=5, shuffle=True)
@ -118,4 +118,11 @@ class CrossValidation():
        """
        return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
    def get_groups_sets(self, split):
        """Return the group labels belonging to the two parts of one CV split.

        Args:
            split: Indexable whose first element holds the positional indices of
                the training part and whose second element holds those of the
                test part.

        Returns:
            tuple: ``(train_groups, test_groups)`` taken from ``self.groups`` with
                ``iloc``, or ``(None, None)`` when ``self.groups`` is ``None``.
        """
        # Guard clause: without groups there is nothing to slice.
        if self.groups is None:
            return None, None

        train_groups = self.groups.iloc[split[0]]
        test_groups = self.groups.iloc[split[1]]
        return train_groups, test_groups
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@ -1,11 +1,13 @@
 import os
 import sys
 import warnings
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
-from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
 from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso 
@ -21,13 +23,15 @@ from sklearn.linear_model import Lasso
 class FeatureSelection:
-    def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
+    def __init__(self, X, y, groups):
-        pass # TODO.... 
+        self.X = X
        self.y = y
        self.groups = groups
-    def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
+    def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
        """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
-        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat 
+        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular 
        feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
        specified as a parameter.
@ -35,7 +39,11 @@ class FeatureSelection:
            df (DataFrame): Input data on which the predictions will be made.
            features (list): List of features to select the best/worst from
            method (str, optional): remove or add features.  Defaults to "remove".
-            ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and  metric. Defaults to "classification".
+            ml_category (str, optional): Either classification or regression ml problem controls the ML algorithm and  metric. 
                Defaults to "classification".
            ml_subcategory (str, optional): For classification, 'bin' for binary classification
                and 'multi' for multiclass classification. For regression an empty string '' is sufficient.
                Defaults to "bin".
            metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
            stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to [].
@ -49,173 +57,189 @@ class FeatureSelection:
        best_feature = None
-        if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
+        # Validate the ML type against the specified metric
        if ml_category == "classification":
            if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
                raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
-        elif ml_type == "regression" and metric not in ['r2']:
+            elif ml_subcategory == "multi":
                ml_subcategory_error = False
                if metric != "accuracy" and "_" in metric:          
                    metric_s, metric_t = metric.split("_")
                    if metric_s not in ['accuracy', 'precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
                        ml_subcategory_error = True
                else:
                    ml_subcategory_error = True
                if ml_subcategory_error:
                    raise ValueError(""""Classification metric for multi-class classification must be specified precisely.
                                     Available metric are: 'accuracy', 'precision', 'recall' and 'f1'.
                                     Only accuracy must be specified as 'accuracy'.
                                     For others please add appropriate suffixes: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'""")
        elif ml_category == "regression" and metric not in ['r2']:
            raise ValueError("Regression metric not recognized. Please choose 'r2'")
        for feat in features:
            if method == "remove":
-                pred_features = [col for col in df.columns if feat != col] # All but feat
+                pred_features = [col for col in self.X.columns if feat != col] # All but feat
            elif method == "add":
                pred_features = [feat] + stored_features # Feat with stored features
-            X, y  = df.drop(columns=['target', 'pid'])[pred_features], df['target']
+            X  = self.X[pred_features].copy()
-            if ml_type == "classification":
+            if self.groups is not None:
                cv = GroupKFold(n_splits=5)
            else:
                cv = StratifiedKFold(n_splits=5, shuffle=True)
            # See link about scoring for multiclassfication
            # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
            if ml_category == "classification":
                nb = GaussianNB()
                model_cv = cross_validate(
                    nb,
                    X=X,
-                    y=y,
+                    y=self.y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    cv=cv,
                    groups=self.groups,
                    n_jobs=-1,
-                    scoring=('accuracy', 'precision', 'recall', 'f1')
+                    scoring=(metric)
                )
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
-                    if metric == "accuracy":
+            elif ml_category == "regression":
                        acc = np.mean(model_cv['test_accuracy'])
                        acc_std = np.std(model_cv['test_accuracy'])
                        if not best_feature or (acc > best_metric_score):
                            best_feature = feat
                            best_metric_score = acc
                            best_metric_score_std = acc_std
                    elif metric == "precision":
                        prec = np.mean(model_cv['test_precision'])
                        prec_std = np.std(model_cv['test_precision'])
                        if not best_feature or (prec > best_metric_score):
                            best_feature = feat
                            best_metric_score = prec
                            best_metric_score_std = prec_std
                    elif metric == "recall":
                        rec = np.mean(model_cv['test_recall'])
                        rec_std = np.std(model_cv['test_recall'])
                        if not best_feature or (rec > best_metric_score):
                            best_feature = feat
                            best_metric_score = rec
                            best_metric_score_std = rec_std
                    else:
                        f1 = np.mean(model_cv['test_f1'])
                        f1_std = np.std(model_cv['test_f1'])
                        if not best_feature or (f1 > best_metric_score):
                            best_feature = feat
                            best_metric_score = f1
                            best_metric_score_std = f1_std 
            elif ml_type == "regression":
                lass = Lasso()
                model_cv = cross_validate(
                    lass,
                    X=X,
                    y=y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    cv=cv,
                    groups=self.groups,
                    n_jobs=-1,
                    scoring=('r2')
                )
                if metric == "r2":
                    r2 = np.mean(model_cv['test_r2'])
                    r2_std = np.std(model_cv['test_r2'])
                    if not best_feature or (r2 > best_metric_score):
                        best_feature = feat
                        best_metric_score = r2
                        best_metric_score_std = r2_std
            else:
                raise ValueError("ML type not yet implemented!")
            # Section of metrics' scores comparison. 
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
                metric_score = np.nanmean(model_cv["test_score"])
                metric_score_std = np.nanstd(model_cv["test_score"])
                if not best_feature or (metric_score > best_metric_score):
                    best_feature = feat
                    best_metric_score = metric_score
                    best_metric_score_std = metric_score_std
        return best_feature, best_metric_score, best_metric_score_std
-    def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
+    def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
        """This method selects a set of features and returns them as a list. It returns number of features 
        determined in the interval of [n_min, n_max]. 
-        n_features = df.shape[1] - 2 # -2 beacause pid and target are not considered
+        The method consists of two steps: 
-        if n_max > n_features:
+        (1) The method uses sklearn kBest method which selects k best features dependent on the ml_type parameter.
-            n_max = n_features
+        (2) The sequential features removal procedure is executed. Using the remaing features from (1).
            The best score is detected using a removal procedure. The procedure sequentially removes the features
            that contribute the least to the chosen evaluation metric. If the ML score improves in this sequence,
            the next feature is removed; otherwise there is a tolerance criterion (n_tolerance) under which
            the next n removed features are inspected to see whether the current best score is improved.
        Args:
            n_min (int, optional): Minimal amount of features returned.
            n_max (int, optional): Maximal amount of features returned.
            k (int, optional): Determines the k in the k-best features method. 
                If None, SelectKBest feature selection does not execute.
            ml_type(str, optional): Type of ML problem. Currently implemented options: 
                'classification_bin', 'classification_multi', and 'regression_'
            method (str, optional): "remove" or "add" features.  Defaults to "remove".
            n_tolerance (int, optional): If the best score is not improved within the number of consecutive
                removals given by this parameter, the search stops and the index of the feature with the
                current best score is used as the tipping point.
        Returns:
            list: list of selected features
        """        
        if k is not None and k <= n_max:
            raise ValueError("The k parameter needs to be greater than the n_max parameter.")
        # Select k-best feature dependent on the type of ML task
        ml_category, ml_subcategory = ml_type.split("_")
        if k is not None:
            if ml_category == "classification":
                if ml_subcategory== "bin":
                    selector = SelectKBest(mutual_info_classif, k=k)
                elif ml_subcategory== "multi":
                    selector = SelectKBest(f_classif, k=k)
                else:
                    raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
            elif ml_category == "regression":
                selector = SelectKBest(f_regression, k=k)
            else:
                raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
            selector.fit(self.X, self.y)
            cols_idxs = selector.get_support(indices=True)
            self.X = self.X.iloc[:,cols_idxs]
        print("All columns (after SelectKBest method):")
        print(self.X.columns)
        # Sequential feature addition / removal
        n_features = self.X.shape[1]
        if n_max >= n_features:
            n_max = n_features-1 # The algorithm removes at least one feature
        if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
+            raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")
        if n_max < n_min:
            raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
-        features = df.columns.tolist()
+        features = self.X.columns.tolist()
        features.remove("pid")
        features.remove("target")
        feature_importance = []
        if method == "remove":
            best_score = 0
            best_feature_indx = None
            i_worse = 0
            for i in reversed(range(n_features)):
-                best_feature, best_metric_score, best_metric_score_std = \
+                if i+1 == n_min:
                    self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
                feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
                features.remove(best_feature)
            feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
            # Feature selection criterion within the max-min range
            # E.g., pick the best score in this range. Or keep going while the score in this range increases by 0.0X; once it no longer does, select that feature set.
            # The feature set will be selected from i=1 to i=index_of_selected_feature
            # The "tipping point" feature must lie within the max-min range
            selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
            selection_area.set_index(["i", "name"], inplace=True)
            diffrences = selection_area.diff()
            diffrences.dropna(how='any', inplace=True)
            # Perhaps a cumulative sum as well? Simply take the index with the highest value
            cumulative_sumation = diffrences.cumsum()
            tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
            # A very conservative method that stops searching for a better alternative at the first non-improving result
            tipping_feature_indx_2 = None
            for indx, row in diffrences.iterrows():
                if row["metric"] > 0:
                    tipping_feature_indx_2 = indx
                else: 
                    break
-            # Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score     
+                best_feature, best_metric_score, best_metric_score_std = \
-            tipping_feature_indx_3 = None
+                    self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric)
-            cum_sum_score = 0
+                    
-            i_worse = 0
+                feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
-            # TODO: morda bi bilo smisleno združiti diff, cumsum in scores stolpce ...
+                
-            for indx, row in selection_area.iterrows():
+                features.remove(best_feature)
-                if row["metric"] > 0:
+                print("Features left:", i) 
-                    tipping_feature_indx_3 = indx
+                
-                    cum_sum_score += row["metric"]
+                if i <= n_max:
                    if best_metric_score >= best_score:
                        best_score = best_metric_score
                        best_feature_indx = i+1
                        i_worse = 0
                    else:
                        i_worse += 1
-                if i_worse == n_not_improve:
+                    if i_worse == n_tolerance: 
                        break  
            feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
            print(feature_importance_df)
            print("best_feature_indx", best_feature_indx)
            print("best_score", best_score)
            features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]    
            return selected_features
-            
+        else:
-        
+            raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
    def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
        """Placeholder: not implemented yet (the body is only ``pass``).

        NOTE(review): the signature has no ``self`` although the definition sits
        at method indentation, and ``with_cols=[]`` is a mutable default;
        revisit both when this is implemented.
        """
        pass
    def vizualize_feature_selection_process():
        """Placeholder: not implemented yet (the body is only ``pass``).

        NOTE(review): takes no ``self`` although the definition sits at method
        indentation; confirm the intended signature when implementing.
        """
        pass
    def execute_feature_selection_step():
        """Placeholder: not implemented yet (the body is only ``pass``).

        NOTE(review): takes no ``self`` although the definition sits at method
        indentation; confirm the intended signature when implementing.
        """
        pass
--- a/machine_learning/helper.py
+++ b/machine_learning/helper.py
@ -11,7 +11,13 @@ from sklearn import (
    svm,
 )
 from sklearn.dummy import DummyClassifier, DummyRegressor
-from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import (
    BaseCrossValidator,
    LeaveOneGroupOut,
    StratifiedKFold,
    cross_validate,
 )
 from xgboost import XGBClassifier, XGBRegressor
@ -73,7 +79,40 @@ def insert_row(df, row):
    return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
-def prepare_regression_model_input(model_input, cv_method="logo"):
+def impute_encode_categorical_features(model_input: pd.DataFrame) -> pd.DataFrame:
    categorical_feature_col_names = [
        "gender",
        "startlanguage",
        "limesurvey_demand_control_ratio_quartile",
    ]
    additional_categorical_features = [
        col
        for col in model_input.columns
        if "mostcommonactivity" in col or "homelabel" in col
    ]
    categorical_feature_col_names += additional_categorical_features
    categorical_features = model_input[categorical_feature_col_names].copy()
    mode_categorical_features = categorical_features.mode().iloc[0]
    # fillna with mode
    categorical_features = categorical_features.fillna(mode_categorical_features)
    # one-hot encoding
    categorical_features = categorical_features.apply(
        lambda col: col.astype("category")
    )
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)
    numerical_features = model_input.drop(categorical_feature_col_names, axis=1)
    model_input = pd.concat([numerical_features, categorical_features], axis=1)
    return model_input
 def prepare_sklearn_data_format(
    model_input: pd.DataFrame, cv_method: str = "logo"
 ) -> tuple:
    index_columns = [
        "local_segment",
        "local_segment_label",
@ -82,13 +121,7 @@ def prepare_regression_model_input(model_input, cv_method="logo"):
    ]
    model_input.set_index(index_columns, inplace=True)
-    if cv_method == "logo":
+    if cv_method == "half_logo":
        data_x, data_y, data_groups = (
            model_input.drop(["target", "pid"], axis=1),
            model_input["target"],
            model_input["pid"],
        )
    else:
        model_input["pid_index"] = model_input.groupby("pid").cumcount()
        model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
@ -104,52 +137,53 @@ def prepare_regression_model_input(model_input, cv_method="logo"):
            model_input["target"],
            model_input["pid_half"],
        )
-
+    else:
-    categorical_feature_colnames = [
+        data_x, data_y, data_groups = (
-        "gender",
+            model_input.drop(["target", "pid"], axis=1),
-        "startlanguage",
+            model_input["target"],
-        "limesurvey_demand_control_ratio_quartile",
+            model_input["pid"],
    ]
    additional_categorical_features = [
        col
        for col in data_x.columns
        if "mostcommonactivity" in col or "homelabel" in col
    ]
    categorical_feature_colnames += additional_categorical_features
    categorical_features = data_x[categorical_feature_colnames].copy()
    mode_categorical_features = categorical_features.mode().iloc[0]
    # fillna with mode
    categorical_features = categorical_features.fillna(mode_categorical_features)
    # one-hot encoding
    categorical_features = categorical_features.apply(
        lambda col: col.astype("category")
        )
-    if not categorical_features.empty:
+    return data_x, data_y, data_groups
        categorical_features = pd.get_dummies(categorical_features)
    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
    train_x = pd.concat([numerical_features, categorical_features], axis=1)
    return train_x, data_y, data_groups
-def run_all_regression_models(input_csv):
+def prepare_cross_validator(
-    # Prepare data
+    data_x: pd.DataFrame,
-    data_x, data_y, data_groups = prepare_regression_model_input(input_csv)
+    data_y: pd.DataFrame,
-
+    data_groups: pd.DataFrame,
-    # Prepare cross validation
+    cv_method: str = "logo",
-    logo = LeaveOneGroupOut()
+) -> BaseCrossValidator:
-    logo.get_n_splits(
+    if cv_method == "logo" or cv_method == "half_logo":
        cv = LeaveOneGroupOut()
        cv.get_n_splits(
            data_x,
            data_y,
            groups=data_groups,
        )
    else:
        cv = StratifiedKFold(n_splits=5, shuffle=True)
    return cv
 def aggregate_and_transpose(df: pd.DataFrame, statistics=None) -> pd.DataFrame:
    """
    Aggregate every column of ``df`` and return one row per original column.

    Parameters
    ----------
    df: pd.DataFrame
        The dataframe whose columns are aggregated.
    statistics: list, optional
        Aggregations passed to ``DataFrame.agg``. Defaults to ``["max", "mean"]``.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` and one column per statistic, with the
        original column names stored in a ``test_metric`` column.
    """
    chosen_statistics = ["max", "mean"] if statistics is None else statistics
    aggregated = df.agg(chosen_statistics)
    # After transposing, the former column names form the index; expose them
    # as a regular column.
    per_metric = aggregated.transpose().reset_index()
    return per_metric.rename(columns={"index": "test_metric"})
 def run_all_regression_models(
    data_x: pd.DataFrame,
    data_y: pd.DataFrame,
    data_groups: pd.DataFrame,
    cross_validator: BaseCrossValidator,
 ) -> pd.DataFrame:
    metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
    test_metrics = ["test_" + metric for metric in metrics]
-    scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
+    scores = pd.DataFrame(columns=["method", "test_metric", "max", "nanmedian"])
    # Validate models
    dummy_regr = DummyRegressor(strategy="mean")
@ -158,7 +192,7 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
@ -166,17 +200,19 @@ def run_all_regression_models(input_csv):
    print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
    scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "dummy"
    scores = pd.concat([scores, scores_df])
    del dummy_regr
    del dummy_regr_scores
-    lin_reg_rapids = linear_model.LinearRegression()
+    lin_reg = linear_model.LinearRegression()
    lin_reg_scores = cross_validate(
-        lin_reg_rapids,
+        lin_reg,
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
@ -184,9 +220,11 @@ def run_all_regression_models(input_csv):
    print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
    scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "linear_reg"
    scores = pd.concat([scores, scores_df])
    del lin_reg
    del lin_reg_scores
    ridge_reg = linear_model.Ridge(alpha=0.5)
    ridge_reg_scores = cross_validate(
@ -194,16 +232,18 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("Ridge regression")
    scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "ridge_reg"
    scores = pd.concat([scores, scores_df])
    del ridge_reg
    del ridge_reg_scores
    lasso_reg = linear_model.Lasso(alpha=0.1)
    lasso_reg_score = cross_validate(
@ -211,16 +251,18 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("Lasso regression")
    scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "lasso_reg"
    scores = pd.concat([scores, scores_df])
    del lasso_reg
    del lasso_reg_score
    bayesian_ridge_reg = linear_model.BayesianRidge()
    bayesian_ridge_reg_score = cross_validate(
@ -228,16 +270,18 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("Bayesian Ridge")
    scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "bayesian_ridge"
    scores = pd.concat([scores, scores_df])
    del bayesian_ridge_reg
    del bayesian_ridge_reg_score
    ransac_reg = linear_model.RANSACRegressor()
    ransac_reg_score = cross_validate(
@ -245,27 +289,37 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("RANSAC (outlier robust regression)")
    scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "RANSAC"
    scores = pd.concat([scores, scores_df])
    del ransac_reg
    del ransac_reg_score
    svr = svm.SVR()
    svr_score = cross_validate(
-        svr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
+        svr,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("Support vector regression")
    scores_df = pd.DataFrame(svr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "SVR"
    scores = pd.concat([scores, scores_df])
    del svr
    del svr_score
    kridge = kernel_ridge.KernelRidge()
    kridge_score = cross_validate(
@ -273,69 +327,130 @@ def run_all_regression_models(input_csv):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=logo,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("Kernel Ridge regression")
    scores_df = pd.DataFrame(kridge_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "kernel_ridge"
    scores = pd.concat([scores, scores_df])
    del kridge
    del kridge_score
    gpr = gaussian_process.GaussianProcessRegressor()
    gpr_score = cross_validate(
-        gpr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
+        gpr,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("Gaussian Process Regression")
    scores_df = pd.DataFrame(gpr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "gaussian_proc"
    scores = pd.concat([scores, scores_df])
    del gpr
    del gpr_score
    rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
    rfr_score = cross_validate(
-        rfr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
+        rfr,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("Random Forest Regression")
    scores_df = pd.DataFrame(rfr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "random_forest"
    scores = pd.concat([scores, scores_df])
    del rfr
    del rfr_score
    xgb = XGBRegressor()
    xgb_score = cross_validate(
-        xgb, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
+        xgb,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("XGBoost Regressor")
    scores_df = pd.DataFrame(xgb_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "XGBoost"
    scores = pd.concat([scores, scores_df])
    del xgb
    del xgb_score
    ada = ensemble.AdaBoostRegressor()
    ada_score = cross_validate(
-        ada, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
+        ada,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    print("ADA Boost Regressor")
    scores_df = pd.DataFrame(ada_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "ADA_boost"
    scores = pd.concat([scores, scores_df])
    del ada
    del ada_score
    return scores
-def run_all_classification_models(data_x, data_y, data_groups, cv_method):
+def confusion_matrix_scorer(clf, X, y):
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]}
 def aggregate_confusion_matrix(scores_dict: dict) -> pd.DataFrame:
    """Sum cross-validation scores per metric and drop the timing rows.

    Parameters
    ----------
    scores_dict : dict
        Mapping that is converted to a DataFrame and summed column by column.

    Returns
    -------
    pd.DataFrame
        Rows of the summed scores whose ``test_metric`` is neither
        ``"fit_time"`` nor ``"score_time"``.
    """
    summed = aggregate_and_transpose(pd.DataFrame(scores_dict), statistics=["sum"])
    is_timing_row = summed.test_metric.isin(["fit_time", "score_time"])
    return summed[~is_timing_row]
 def run_all_classification_models(
    data_x: pd.DataFrame,
    data_y: pd.DataFrame,
    data_groups: pd.DataFrame,
    cross_validator: BaseCrossValidator,
 ):
    data_y_value_counts = data_y.value_counts()
    if len(data_y_value_counts) == 1:
        raise (ValueError("There is only one unique value in data_y."))
    if len(data_y_value_counts) == 2:
        metrics = ["accuracy", "average_precision", "recall", "f1"]
    else:
        metrics = ["accuracy", "precision_micro", "recall_micro", "f1_micro"]
    test_metrics = ["test_" + metric for metric in metrics]
-    scores = pd.DataFrame(columns=["method", "max", "mean"])
+    scores = pd.DataFrame(columns=["method", "test_metric", "max", "mean"])
    dummy_class = DummyClassifier(strategy="most_frequent")
@ -344,17 +459,39 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        error_score="raise",
        scoring=metrics,
    )
    dummy_confusion_matrix = cross_validate(
        dummy_class,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        error_score="raise",
        scoring=confusion_matrix_scorer,
    )
    print("Dummy")
    scores_df = pd.DataFrame(dummy_score)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df["method"] = "Dummy"
+    scores_df = pd.concat(
        [
            scores_df,
            aggregate_confusion_matrix(dummy_confusion_matrix).rename(
                columns={"sum": "mean"}
                # Note: the column is misleadingly renamed to get concise output.
            ),
        ]
    )
    scores_df["method"] = "dummy_classifier"
    scores = pd.concat([scores, scores_df])
    del dummy_class
    del dummy_score
    del dummy_confusion_matrix
    logistic_regression = linear_model.LogisticRegression()
@ -363,16 +500,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    log_reg_confusion_matrix = cross_validate(
        logistic_regression,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        scoring=confusion_matrix_scorer,
    )
    print("Logistic regression")
    scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df["method"] = "logistic_reg"
+    scores_df = pd.concat(
        [
            scores_df,
            aggregate_confusion_matrix(log_reg_confusion_matrix).rename(
                columns={"sum": "mean"}
                # Note: the column is misleadingly renamed to get concise output.
            ),
        ]
    )
    scores_df["method"] = "logistic_regression"
    scores = pd.concat([scores, scores_df])
    del logistic_regression
    del log_reg_scores
    del log_reg_confusion_matrix
    svc = svm.SVC()
@ -381,16 +539,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    svc_confusion_matrix = cross_validate(
        svc,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        scoring=confusion_matrix_scorer,
    )
    print("Support Vector Machine")
    scores_df = pd.DataFrame(svc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df["method"] = "svc"
+    scores_df = pd.concat(
        [
            scores_df,
            aggregate_confusion_matrix(svc_confusion_matrix).rename(
                columns={"sum": "mean"}
                # Note: the column is misleadingly renamed to get concise output.
            ),
        ]
    )
    scores_df["method"] = "SVC"
    scores = pd.concat([scores, scores_df])
    del svc
    del svc_scores
    del svc_confusion_matrix
    gaussian_nb = naive_bayes.GaussianNB()
@ -399,16 +578,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    gaussian_nb_confusion_matrix = cross_validate(
        gaussian_nb,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        scoring=confusion_matrix_scorer,
    )
    print("Gaussian Naive Bayes")
    scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
    scores_df = pd.concat(
        [
            scores_df,
            aggregate_confusion_matrix(gaussian_nb_confusion_matrix).rename(
                columns={"sum": "mean"}
                # Note: the column is misleadingly renamed to get concise output.
            ),
        ]
    )
    scores_df["method"] = "gaussian_naive_bayes"
    scores = pd.concat([scores, scores_df])
    del gaussian_nb
    del gaussian_nb_scores
    del gaussian_nb_confusion_matrix
    sgdc = linear_model.SGDClassifier()
@ -417,16 +617,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    sgdc_confusion_matrix = cross_validate(
        sgdc,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        scoring=confusion_matrix_scorer,
    )
    print("Stochastic Gradient Descent")
    scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df["method"] = "stochastic_gradient_descent"
+    scores_df = pd.concat(
        [
            scores_df,
            aggregate_confusion_matrix(sgdc_confusion_matrix).rename(
                columns={"sum": "mean"}
                # Note: the column is misleadingly renamed to get concise output.
            ),
        ]
    )
    scores_df["method"] = "stochastic_gradient_descent_classifier"
    scores = pd.concat([scores, scores_df])
    del sgdc
    del sgdc_scores
    del sgdc_confusion_matrix
    rfc = ensemble.RandomForestClassifier()
@ -435,16 +656,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    rfc_confusion_matrix = cross_validate(
        rfc,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        scoring=confusion_matrix_scorer,
    )
    print("Random Forest")
    scores_df = pd.DataFrame(rfc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df["method"] = "random_forest"
+    scores_df = pd.concat(
        [
            scores_df,
            aggregate_confusion_matrix(rfc_confusion_matrix).rename(
                columns={"sum": "mean"}
                # Note: the column is misleadingly renamed to get concise output.
            ),
        ]
    )
    scores_df["method"] = "random_forest_classifier"
    scores = pd.concat([scores, scores_df])
    del rfc
    del rfc_scores
    del rfc_confusion_matrix
    xgb_classifier = XGBClassifier()
@ -453,15 +695,36 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
        X=data_x,
        y=data_y,
        groups=data_groups,
-        cv=cv_method,
+        cv=cross_validator,
        n_jobs=-1,
        scoring=metrics,
    )
    xgb_confusion_matrix = cross_validate(
        xgb_classifier,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cross_validator,
        n_jobs=-1,
        scoring=confusion_matrix_scorer,
    )
    print("XGBoost")
    scores_df = pd.DataFrame(xgb_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
-    scores_df["method"] = "xgboost"
+    scores_df = pd.concat(
        [
            scores_df,
            aggregate_confusion_matrix(xgb_confusion_matrix).rename(
                columns={"sum": "mean"}
                # Note: the column is misleadingly renamed to get concise output.
            ),
        ]
    )
    scores_df["method"] = "XGBoost_classifier"
    scores = pd.concat([scores, scores_df])
    del xgb_classifier
    del xgb_scores
    del xgb_confusion_matrix
    return scores
--- a/machine_learning/preprocessing.py
+++ b/machine_learning/preprocessing.py
@ -33,7 +33,7 @@ class Preprocessing:
        Args:
            categorical_features (DataFrame): DataFrame including only categorical columns.
            numerical_features (DataFrame): DataFrame including only numerical columns.
-            mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
+            mode (int): Mode of the column with which DataFrame is filled.
        Returns:
            tuple: One-hot encoded DataFrame and the list of encoded categorical column names.
@ -46,7 +46,7 @@ class Preprocessing:
        if not categorical_features.empty:
            categorical_features = pd.get_dummies(categorical_features)
-        return pd.concat([numerical_features, categorical_features], axis=1)
+        return pd.concat([numerical_features, categorical_features], axis=1), categorical_features.columns.tolist()
    def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
@ -68,19 +68,26 @@ class Preprocessing:
        categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
        # For train set
        train_X_categorical_features = self.train_X[categorical_columns].copy()
        train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
        mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
-        self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
+        self.train_X, train_cat_col_names = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
        encoded_categorical_features = [col for col in self.train_X.columns if col.startswith(tuple(categorical_columns))]
        # For test set
        test_X_categorical_features = self.test_X[categorical_columns].copy()
        test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
-        self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
+        self.test_X, test_cat_col_names = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
        # Create categorical columns that were not found in test set and fill them with 0        
        missing_cols = [col for col in train_cat_col_names if col not in test_cat_col_names]
        self.test_X[missing_cols] = 0
        # Sort column names alphabetically        
        self.train_X = self.train_X.reindex(sorted(self.train_X.columns), axis=1)
        self.test_X = self.test_X.reindex(sorted(self.test_X.columns), axis=1)
    def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
--- a/presentation/event_stressful_detection_5fold.csv
+++ b/presentation/event_stressful_detection_5fold.csv
@ -1,29 +0,0 @@
 method,metric,max,mean
 Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
 Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
 Dummy,test_recall,0.0,0.0
 Dummy,test_f1,0.0,0.0
 logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
 logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
 logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
 logistic_reg,test_f1,0.3909774436090226,0.318943511424051
 svc,test_accuracy,0.8557046979865772,0.8548446932649828
 svc,test_average_precision,0.44514416839823046,0.4068200938341621
 svc,test_recall,0.0,0.0
 svc,test_f1,0.0,0.0
 gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
 gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
 gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
 gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
 stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
 stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
 stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
 stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
 random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
 random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
 random_forest,test_recall,0.4069767441860465,0.35356856455493185
 random_forest,test_f1,0.5691056910569107,0.5078402513053142
 xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
 xgboost,test_average_precision,0.7366643049075349,0.698622165966308
 xgboost,test_recall,0.5287356321839081,0.44346431435445066
 xgboost,test_f1,0.638888888888889,0.5633957169928393
--- a/presentation/event_stressful_detection_logo.csv
+++ b/presentation/event_stressful_detection_logo.csv
@ -1,29 +0,0 @@
 method,metric,max,mean
 Dummy,test_accuracy,1.0,0.8524114578096439
 Dummy,test_average_precision,0.7,0.14758854219035614
 Dummy,test_recall,0.0,0.0
 Dummy,test_f1,0.0,0.0
 logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
 logistic_reg,test_average_precision,1.0,0.44605167668563583
 logistic_reg,test_recall,1.0,0.25353566685532386
 logistic_reg,test_f1,0.823529411764706,0.27951926390778625
 svc,test_accuracy,1.0,0.8524114578096439
 svc,test_average_precision,0.9612401707068228,0.44179454944271934
 svc,test_recall,0.0,0.0
 svc,test_f1,0.0,0.0
 gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
 gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
 gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
 gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
 stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
 stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
 stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
 stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
 random_forest,test_accuracy,1.0,0.8722158105763481
 random_forest,test_average_precision,1.0,0.49817066323226833
 random_forest,test_recall,1.0,0.18161263127840668
 random_forest,test_f1,1.0,0.2508096532365307
 xgboost,test_accuracy,1.0,0.8812627400277729
 xgboost,test_average_precision,1.0,0.5505695112208401
 xgboost,test_recall,1.0,0.2896161238315027
 xgboost,test_f1,0.9411764705882353,0.36887408735855665
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,7 @@
 [tool.isort]
 profile = "black"
 py_version = 311
 skip_gitignore = "true"
 [tool.black]
 target-version = ["py311"]
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 63f5a526fce4d288499168e1701adadb8b885d82
+Subproject commit 059774bda10545a83ab282f59eb7a329fef9ee4c
--- a/setup.py
+++ b/setup.py
@ -1,8 +1,7 @@
 import os
 import sqlalchemy.engine.url
 from dotenv import load_dotenv
-from sqlalchemy import create_engine
+from sqlalchemy import URL, create_engine
 from sqlalchemy.orm import sessionmaker
 load_dotenv()
@ -11,7 +10,7 @@ testing: bool = False
 db_password = os.getenv("DB_PASSWORD")
-db_uri = sqlalchemy.engine.url.URL(
+db_uri = URL.create(
    drivername="postgresql+psycopg2",
    username="staw_db",
    password=db_password,
--- a/statistical_analysis/scale_reliability.rmd
+++ b/statistical_analysis/scale_reliability.rmd
@ -0,0 +1,60 @@
 ---
 title: "Reliability of SAM threat and challenge and COPE"
 output: html_notebook
 ---
 ```{r libraries, message=FALSE, warning=FALSE, include=FALSE, cache=FALSE}
 library(conflicted)
 library(here)
 library(tidyverse)
 library(magrittr)
 library(lavaan)
 library(kableExtra)
 conflicts_prefer(
    readr::col_factor,
    purrr::discard,
    dplyr::filter,
    dplyr::lag,
    purrr::set_names,
    tidyr::extract,
    kableExtra::group_rows
 )
 ```
 ```{r style, include=FALSE, cache=FALSE}
 styler::style_file(
    here("statistical_analysis", "scale_reliability.Rmd"),
    scope = "tokens",
    indent_by = 4L
 )
 ```
 The data were preprocessed and cleaned using the [expl_esm_labels.py](../exploration/expl_esm_labels.py) script and are read in from CSV files here.
 ```{r read_data}
 COL_TYPES <- cols(
    .default = col_double(),
    participant_id = col_factor(),
    username = col_factor(),
    device_id = col_factor(),
    esm_trigger = col_factor(),
    esm_instructions = col_factor(),
    double_esm_user_answer_timestamp = col_double(),
    datetime_lj = col_datetime(format = ""),
    date_lj = col_date(format = ""),
    time = col_factor(),
    esm_user_answer = col_factor()
 )
 df_SAM <- read_csv(here("data", "raw", "df_esm_SAM_threat_challenge.csv"), col_types = COL_TYPES)
 df_COPE <- read_csv(here("data", "raw", "df_esm_COPE.csv"), col_types = COL_TYPES)
 ```
 Demonstrate factor analysis for a single participant.
 ```{r}
 df_COPE %>%
 	group_by(question_id, questionnaire_id) %>%
 	count()
 ```
--- a/straw2analysis.Rproj
+++ b/straw2analysis.Rproj
@ -0,0 +1,20 @@
 Version: 1.0
 RestoreWorkspace: Default
 SaveWorkspace: Default
 AlwaysSaveHistory: Default
 EnableCodeIndexing: Yes
 UseSpacesForTab: No
 NumSpacesForTab: 4
 Encoding: UTF-8
 RnwWeave: Sweave
 LaTeX: XeLaTeX
 AutoAppendNewline: Yes
 StripTrailingWhitespace: Yes
 PythonType: conda
 PythonVersion: 3.11.3
 PythonPath: E:/ProgramData/mambaforge/envs/straw2analysis/python.exe
Author	SHA1	Message	Date
junos	777d30365e	Fix questions that were slightly different in the morning.	2023-07-03 21:29:09 +02:00
junos	ca85131ed2	Use refactored methods.	2023-07-03 21:18:15 +02:00
junos	2aa5c5cb07	Fix trailing whitespace.	2023-07-03 21:17:40 +02:00
junos	82b53bc0d3	Extract method to reuse.	2023-07-03 21:13:50 +02:00
junos	c688580fe8	Increment answers separately if needed.	2023-07-03 21:01:15 +02:00
junos	8c0b66eddc	Extract method to reuse and simplify.	2023-07-03 20:52:08 +02:00
junos	e3ff4846e1	Document functions.	2023-07-03 20:45:39 +02:00
junos	825380a47e	Add a function to fix SAM question IDs.	2023-07-03 20:41:48 +02:00
junos	ef26772038	Add SAM question IDs.	2023-07-03 20:38:20 +02:00
junos	64c05ec5ec	Fix COPE question IDs.	2023-07-03 20:27:33 +02:00
junos	c0236b251c	Fix question IDs.	2023-07-03 20:11:11 +02:00
junos	2f22f2052a	[WIP] Continue factor analysis.	2023-07-03 20:10:45 +02:00
junos	ec51d7d406	[WIP] Add a function to recode question IDs.	2023-07-03 19:57:28 +02:00
junos	2aca64aa09	Add COPE questions and their IDs.	2023-07-03 19:34:11 +02:00
junos	9a87a0a34a	Add instructions to export.	2023-07-03 19:19:10 +02:00
junos	91a9f20839	Read in data.	2023-07-03 18:44:45 +02:00
junos	e3be17e56e	Export COPE data.	2023-07-03 18:33:28 +02:00
junos	5c0e2e2621	Export SAM data.	2023-07-03 18:25:40 +02:00
junos	51201c2bc9	Disable jupytext pre-commit again.	2023-07-03 18:24:37 +02:00
junos	fed4b33611	Add more jupytext args.	2023-07-03 18:22:36 +02:00
junos	471ce7c2cb	Document extract_stressful_events.	2023-07-03 17:37:34 +02:00
junos	dbb2033f78	Add jupytext to pre-commit hooks.	2023-07-03 17:35:45 +02:00
junos	1b77fb119c	Fix an error introduced in `ae2ca63bc4`.	2023-07-03 17:17:56 +02:00
junos	ae2ca63bc4	Define QUESTIONNAIRE IDs and use them. Clean up docstrings.	2023-07-03 17:09:40 +02:00
junos	577f1330da	Add docstrings flake8 checks.	2023-07-03 16:49:35 +02:00
junos	4af360f411	Use conda Python environment with R.	2023-07-03 14:51:07 +02:00
junos	96bbe32f56	Add statistics for some scales.	2023-07-03 14:50:35 +02:00
junos	40170339c2	Add R project and sample Markdown notebook.	2023-07-03 14:28:28 +02:00
junos	7b0c0037f7	Add an old script to test JCQ reversal.	2023-07-03 14:23:55 +02:00
junos	db06584ddd	Improve removal of "medium" class.	2023-05-31 22:46:49 +02:00
junos	112d968715	Add baseline features.	2023-05-31 22:25:39 +02:00
junos	9cc6bf7c21	Add PCA for composite target.	2023-05-31 21:12:21 +02:00
junos	78807b941c	Add analysis for composite score of stress.	2023-05-31 21:00:55 +02:00
junos	a9af113c9c	Add confusion matrices for all methods.	2023-05-31 17:41:50 +02:00
junos	97113fe9ab	Sum up confusion matrix and illustrate use with dummy.	2023-05-31 17:27:49 +02:00
junos	bc78a1d498	Define a confusion matrix scorer.	2023-05-31 16:05:39 +02:00
junos	aca84b214d	Small corrections.	2023-05-19 03:19:42 +02:00
junos	aa13123136	Handle clustering classification the same as other classification models again.	2023-05-19 03:04:09 +02:00
junos	c51e0da0f7	Handle clustering classification the same as other classification models.	2023-05-19 02:52:56 +02:00
junos	a2401b5e36	Add multiclass scoring.	2023-05-19 01:34:34 +02:00
junos	70232949c3	Better handling of "medium" category.	2023-05-19 01:10:30 +02:00
junos	8a9595c615	Better handling of input filename again.	2023-05-18 22:53:58 +02:00
junos	045b9fa0dc	Better handling of input filename.	2023-05-18 19:03:53 +02:00
junos	bb4445f1a8	Add bins to output filename.	2023-05-18 18:58:19 +02:00
junos	1318ae3609	Rename methods to make them consistent with regression methods.	2023-05-18 18:55:31 +02:00
junos	45441c288d	Correct column name.	2023-05-18 18:47:56 +02:00
junos	fa45b30955	Set output path programmatically.	2023-05-18 18:40:54 +02:00
junos	2336edffb6	Retain metric names in final scores.	2023-05-18 18:40:06 +02:00
junos	b756ed5feb	Set more parameters as user-specified constants.	2023-05-18 18:06:32 +02:00
junos	cad28c3fe8	Set path programmatically.	2023-05-18 16:36:46 +02:00
junos	38a405d378	Add index when inserting one row.	2023-05-17 18:13:20 +02:00
junos	2c5a0b4157	Label plot axes.	2023-05-17 16:32:27 +02:00
junos	0409c9e982	Fix format specification.	2023-05-16 17:22:09 +02:00
junos	a7446cc34a	Specify columns to aggregate and save figures as pdfs.	2023-05-16 17:05:43 +02:00
junos	118e686491	Specify format directly as infer_datetime_format was deprecated.	2023-05-16 17:04:48 +02:00
junos	9417a1b9f1	Do not break markdown lines.	2023-05-16 16:37:34 +02:00
junos	7b5db88f1d	Remove and ignore results.	2023-05-16 16:22:29 +02:00
junos	0f8f0b0fb6	Update URL call.	2023-05-16 16:17:53 +02:00
junos	26c7d22b83	Add an option to save figures.	2023-05-16 16:17:06 +02:00
junos	87781840d4	Use concat instead of append which was deprecated.	2023-05-12 16:32:08 +02:00
junos	3091328fc5	Format comments.	2023-05-11 16:51:38 +02:00
junos	055e87dbac	Return scores for classification.	2023-05-10 23:51:12 +02:00
junos	f58d20ffc2	Update classification runner.	2023-05-10 23:17:44 +02:00
junos	075fdab9ea	Select segment and save results.	2023-05-10 23:00:03 +02:00
junos	91e7352480	Thoroughly refactor classification runner.	2023-05-10 22:50:00 +02:00
junos	35c09374dd	Free up memory during model building.	2023-05-10 21:44:40 +02:00
junos	b505fb2b6a	Thoroughly refactor regression runner.	2023-05-10 20:30:51 +02:00
junos	47b1ecdbb9	First format with black and then check with flake8.	2023-05-10 15:29:32 +02:00
junos	24744c288d	Extract one step of preparation into a separate function.	2023-05-10 15:28:09 +02:00
junos	caeaf03239	Provide data instead of csv input.	2023-05-10 15:20:33 +02:00
junos	cd5d8b6a10	Update rapids and add regex=True. Reformat debug_heatmap.	2023-05-10 15:12:27 +02:00
junos	3e38b64b45	Merge branch 'ml_pipeline'	2023-05-10 15:02:17 +02:00
junos	76071fd550	Start using pre-commit hooks.	2023-04-24 15:38:54 +02:00
Primoz	26804cf8ea	Repair preprocessing one hot encoding of test set.	2023-04-21 13:24:31 +02:00
Primoz	865225994b	Added testing section after feature selection.	2023-04-20 13:29:14 +02:00
Primoz	259be708aa	Improve the feature selection method with validations etc.	2023-04-20 13:26:20 +02:00
Primoz	0594993133	Add GroupKFold to feature selection CV. Start with generic metric calculation procedure.	2023-04-20 11:20:26 +02:00
Primoz	1cbc743cf7	Add kBest method to initially filter out the worst performing features. Update comments.	2023-04-20 10:12:16 +02:00
Primoz	2a8f1ee613	Merge branch 'ml_pipeline' of https://repo.ijs.si/junoslukan/straw2analysis into ml_pipeline	2023-04-19 15:56:52 +02:00
Primoz	ce13a9e13b	Implement feature selection method which is used in ML pipeline.	2023-04-19 15:56:34 +02:00
		`@ -1 +1 @@`
			`Subproject commit 63f5a526fce4d288499168e1701adadb8b885d82`				`Subproject commit 059774bda10545a83ab282f59eb7a329fef9ee4c`