Merge branch 'ml_pipeline'

2023-04-18 15:55:03 +02:00 · 2023-04-18 15:55:03 +02:00 · 297eb45933
parent c0904dd681 0b16aa6fe4
commit 297eb45933
45 changed files with 4782 additions and 951 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,7 +5,19 @@ __pycache__/
 /exploration/*.ipynb
 /config/*.ipynb
 /statistical_analysis/*.ipynb
+/presentation/*.ipynb
 /machine_learning/intermediate_results/
 /data/features/
 /data/baseline/
 /data/*input*.csv
+/data/daily*
+/data/intradaily*
+/data/stressfulness_event*
+/data/30min*
+/presentation/*scores.csv
+/presentation/Results.ods
+.Rproj.user
+.Rhistory
+/presentation/*.nb.html
+presentation/event_stressful_detection_half_loso.csv
+presentation/event_stressful_detection_loso.csv
--- a/config/environment.yml
+++ b/config/environment.yml
@ -7,8 +7,10 @@ dependencies:
  - black
  - isort
  - flake8
+  - imbalanced-learn=0.10.0
  - jupyterlab
  - jupytext
+  - lightgbm
  - mypy
  - nodejs
  - pandas
--- a/data/input_PANAS_negative_affect_mean.csv
+++ b/data/input_PANAS_negative_affect_mean.csv
--- a/exploration/all_sensors_sequential_addition_scores.xlsx
+++ b/exploration/all_sensors_sequential_addition_scores.xlsx
--- a/exploration/ex_all_feat_ml_pipeline.py
+++ b/exploration/ex_all_feat_ml_pipeline.py
@ -1,473 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.13.0
-#   kernelspec:
-#     display_name: straw2analysis
-#     language: python
-#     name: straw2analysis
-# ---
-
-# %% jupyter={"source_hidden": true}
-# %matplotlib inline
-import datetime
-import importlib
-import os
-import sys
-
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-import yaml
-from pyprojroot import here
-from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
-from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
-from sklearn.metrics import mean_squared_error, r2_score
-from sklearn.impute import SimpleImputer
-from xgboost import XGBRegressor
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
-
-import machine_learning.features_sensor
-import machine_learning.labels
-import machine_learning.model
-
-# %% [markdown]
-# # RAPIDS models
-
-# %% [markdown]
-# ## PANAS negative affect
-
-# %% jupyter={"source_hidden": true}
-# model_input = pd.read_csv("../data/input_PANAS_NA.csv") # Nestandardizirani podatki
-model_input = pd.read_csv("../data/z_input_PANAS_NA.csv") # Standardizirani podatki
-
-# %% [markdown]
-# ### NaNs before dropping cols and rows
-
-# %% jupyter={"source_hidden": true}
-sns.set(rc={"figure.figsize":(16, 8)})
-sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
-
-# %% jupyter={"source_hidden": true}
-nan_cols = list(model_input.loc[:, model_input.isna().all()].columns)
-nan_cols
-
-# %% jupyter={"source_hidden": true}
-model_input.dropna(axis=1, how="all", inplace=True)
-model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
-
-# %% [markdown]
-# ### NaNs after dropping NaN cols and rows where target is NaN
-
-# %% jupyter={"source_hidden": true}
-sns.set(rc={"figure.figsize":(16, 8)})
-sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
-
-# %% jupyter={"source_hidden": true}
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-#if "pid" in model_input.columns:
-#    index_columns.append("pid")
-model_input.set_index(index_columns, inplace=True)
-
-data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
-
-# %% jupyter={"source_hidden": true}
-categorical_feature_colnames = ["gender", "startlanguage"]
-
-# %% jupyter={"source_hidden": true}
-categorical_features = data_x[categorical_feature_colnames].copy()
-
-# %% jupyter={"source_hidden": true}
-mode_categorical_features = categorical_features.mode().iloc[0]
-
-# %% jupyter={"source_hidden": true}
-# fillna with mode
-categorical_features = categorical_features.fillna(mode_categorical_features)
-
-# %% jupyter={"source_hidden": true}
-# one-hot encoding
-categorical_features = categorical_features.apply(lambda col: col.astype("category"))
-if not categorical_features.empty:
-    categorical_features = pd.get_dummies(categorical_features)
-
-# %% jupyter={"source_hidden": true}
-numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x = pd.concat([numerical_features, categorical_features], axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x.dtypes
-
-# %% jupyter={"source_hidden": true}
-logo = LeaveOneGroupOut()
-logo.get_n_splits(
-    train_x,
-    data_y,
-    groups=data_groups,
-)
-
-# %% jupyter={"source_hidden": true}
-sum(data_y.isna())
-
-# %% [markdown]
-# ### Linear Regression
-
-# %% jupyter={"source_hidden": true}
-lin_reg_rapids = linear_model.LinearRegression()
-
-# %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
-
-# %% jupyter={"source_hidden": true}
-lin_reg_scores = cross_val_score(
-    lin_reg_rapids,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring='r2'
-)
-lin_reg_scores
-np.median(lin_reg_scores)
-
-# %% [markdown]
-# ### Ridge regression
-
-# %% jupyter={"source_hidden": true}
-ridge_reg = linear_model.Ridge(alpha=.5)
-
-# %% tags=[] jupyter={"source_hidden": true}
-ridge_reg_scores = cross_val_score(
-    ridge_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(ridge_reg_scores)
-
-# %% [markdown]
-# ### Lasso
-
-# %% jupyter={"source_hidden": true}
-lasso_reg = linear_model.Lasso(alpha=0.1)
-
-# %% jupyter={"source_hidden": true}
-lasso_reg_score = cross_val_score(
-    lasso_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(lasso_reg_score)
-
-# %% [markdown]
-# ### Bayesian Ridge
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg = linear_model.BayesianRidge()
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg_score = cross_val_score(
-    bayesian_ridge_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(bayesian_ridge_reg_score)
-
-# %% [markdown]
-# ### RANSAC (outlier robust regression)
-
-# %% jupyter={"source_hidden": true}
-ransac_reg = linear_model.RANSACRegressor()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-    ransac_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-    )
-)
-
-# %% [markdown]
-# ### Support vector regression
-
-# %% jupyter={"source_hidden": true}
-svr = svm.SVR()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-    svr,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-    )
-)
-
-# %% [markdown]
-# ### Kernel Ridge regression
-
-# %% jupyter={"source_hidden": true}
-kridge = kernel_ridge.KernelRidge()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-        kridge,
-        X=imputer.fit_transform(train_x),
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-)
-# %% [markdown]
-# ### Gaussian Process Regression
-
-# %% jupyter={"source_hidden": true}
-gpr = gaussian_process.GaussianProcessRegressor()
-
-# %% jupyter={"source_hidden": true}
-
-np.median(
-    cross_val_score(
-        gpr,
-        X=imputer.fit_transform(train_x),
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-)
-# %%
-def insert_row(df, row):
-    return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
-
-# %%
-def run_all_models(input_csv):
-    # Prepare data
-    model_input = pd.read_csv(input_csv)
-    model_input.dropna(axis=1, how="all", inplace=True)
-    model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
-
-    index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-    model_input.set_index(index_columns, inplace=True)
-
-    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
-
-    categorical_feature_colnames = ["gender", "startlanguage"]
-    categorical_features = data_x[categorical_feature_colnames].copy()
-    mode_categorical_features = categorical_features.mode().iloc[0]
-    # fillna with mode
-    categorical_features = categorical_features.fillna(mode_categorical_features)
-    # one-hot encoding
-    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
-    if not categorical_features.empty:
-        categorical_features = pd.get_dummies(categorical_features)
-
-    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
-
-    train_x = pd.concat([numerical_features, categorical_features], axis=1)
-    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
-    train_x_imputed = imputer.fit_transform(train_x)
-
-    # Prepare cross validation
-    logo = LeaveOneGroupOut()
-    logo.get_n_splits(
-        train_x,
-        data_y,
-        groups=data_groups,
-    )
-    scores = pd.DataFrame(columns=["method", "median", "max"])
-
-    # Validate models
-    lin_reg_rapids = linear_model.LinearRegression()
-    lin_reg_scores = cross_val_score(
-        lin_reg_rapids,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring='r2'
-    )
-    print("Linear regression:")
-    print(np.median(lin_reg_scores))
-    scores = insert_row(scores, ["Linear regression",np.median(lin_reg_scores),np.max(lin_reg_scores)])
-
-    ridge_reg = linear_model.Ridge(alpha=.5)
-    ridge_reg_scores = cross_val_score(
-        ridge_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Ridge regression")
-    print(np.median(ridge_reg_scores))
-    scores = insert_row(scores, ["Ridge regression",np.median(ridge_reg_scores),np.max(ridge_reg_scores)])
-
-    lasso_reg = linear_model.Lasso(alpha=0.1)
-    lasso_reg_score = cross_val_score(
-        lasso_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Lasso regression")
-    print(np.median(lasso_reg_score))
-    scores = insert_row(scores, ["Lasso regression",np.median(lasso_reg_score),np.max(lasso_reg_score)])
-
-    bayesian_ridge_reg = linear_model.BayesianRidge()
-    bayesian_ridge_reg_score = cross_val_score(
-        bayesian_ridge_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Bayesian Ridge")
-    print(np.median(bayesian_ridge_reg_score))
-    scores = insert_row(scores, ["Bayesian Ridge",np.median(bayesian_ridge_reg_score),np.max(bayesian_ridge_reg_score)])
-
-    ransac_reg = linear_model.RANSACRegressor()
-    ransac_reg_score = cross_val_score(
-        ransac_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("RANSAC (outlier robust regression)")
-    print(np.median(ransac_reg_score))
-    scores = insert_row(scores, ["RANSAC",np.median(ransac_reg_score),np.max(ransac_reg_score)])
-
-    svr = svm.SVR()
-    svr_score = cross_val_score(
-        svr,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Support vector regression")
-    print(np.median(svr_score))
-    scores = insert_row(scores, ["Support vector regression",np.median(svr_score),np.max(svr_score)])
-
-    kridge = kernel_ridge.KernelRidge()
-    kridge_score = cross_val_score(
-        kridge,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Kernel Ridge regression")
-    print(np.median(kridge_score))
-    scores = insert_row(scores, ["Kernel Ridge regression",np.median(kridge_score),np.max(kridge_score)])
-
-    gpr = gaussian_process.GaussianProcessRegressor()
-    gpr_score = cross_val_score(
-        gpr,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Gaussian Process Regression")
-    print(np.median(gpr_score))
-    scores = insert_row(scores, ["Gaussian Process Regression",np.median(gpr_score),np.max(gpr_score)])
-
-    rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
-    rfr_score = cross_val_score(
-        rfr,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Random Forest Regression")
-    print(np.median(rfr_score))
-    scores = insert_row(scores, ["Random Forest Regression",np.median(rfr_score),np.max(rfr_score)])
-
-    xgb = XGBRegressor()
-    xgb_score = cross_val_score(
-        xgb,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("XGBoost Regressor")
-    print(np.median(xgb_score))
-    scores = insert_row(scores, ["XGBoost Regressor",np.median(xgb_score),np.max(xgb_score)])
-
-    ada = ensemble.AdaBoostRegressor()
-    ada_score = cross_val_score(
-        ada,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("ADA Boost Regressor")
-    print(np.median(ada_score))
-    scores = insert_row(scores, ["ADA Boost Regressor",np.median(ada_score),np.max(ada_score)])
-
-    return scores
-
-
-
-
--- a/exploration/expl_features_analysis.py
+++ b/exploration/expl_features_analysis.py
@ -0,0 +1,318 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %matplotlib inline
+
+import os, sys, math
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+from sklearn.tree import DecisionTreeClassifier
+from sklearn import tree
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
+
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+def calc_entropy(column):
+    """
+    Calculate entropy given a pandas series, list, or numpy array.
+    """
+    # Compute the counts of each unique value in the column
+    counts = np.bincount(column)
+    # Divide by the total column length to get a probability
+    probabilities = counts / len(column)
+    
+    # Initialize the entropy to 0
+    entropy = 0
+    # Loop through the probabilities, and add each one to the total entropy
+    for prob in probabilities:
+        if prob > 0:
+            # use log from math and set base to 2
+            entropy += prob * math.log(prob, 2)
+    
+    return -entropy
+
+
+def calc_information_gain(data, split_name, target_name):
+    """
+    Calculate information gain given a data set, column to split on, and target
+    """
+    # Calculate the original entropy
+    original_entropy = calc_entropy(data[target_name])
+    #Find the unique values in the column
+    values = data[split_name].unique()
+    
+    # Make two subsets of the data, based on the unique values
+    left_split = data[data[split_name] == values[0]]
+    right_split = data[data[split_name] == values[1]]
+    
+    # Loop through the splits and calculate the subset entropies
+    to_subtract = 0
+    for subset in [left_split, right_split]:
+        prob = (subset.shape[0] / data.shape[0]) 
+        to_subtract += prob * calc_entropy(subset[target_name])
+    
+    # Return information gain
+    return original_entropy - to_subtract
+
+
+def get_information_gains(data, target_name):
+  #Intialize an empty dictionary for information gains
+  information_gains = {}
+  
+  #Iterate through each column name in our list
+  for col in list(data.columns):
+    #Find the information gain for the column
+    information_gain = calc_information_gain(data, col, target_name)
+    #Add the information gain to our dictionary using the column name as the ekey                                         
+    information_gains[col] = information_gain
+  
+  #Return the key with the highest value                                          
+  #return max(information_gains, key=information_gains.get)
+  
+  return information_gains
+
+def n_features_with_highest_info_gain(info_gain_dict, n=None):
+    """
+    Get n-features that have highest information gain
+    """
+    if n is None:
+        n = len(info_gain_dict)
+    import heapq
+    n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
+    return {feature[0]: feature[1] for feature in n_largest}
+
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+# Load the model input and index it by the segment-identifying columns.
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
+
+# Categorical columns: the two fixed ones plus every column whose name
+# contains "mostcommonactivity" or "homelabel".
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+categorical_features = model_input[categorical_feature_colnames].copy()
+# First row of mode() holds one most frequent value per column.
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+# Replace the raw categorical columns with their one-hot encoded versions.
+numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
+model_input = pd.concat([numerical_features, categorical_features], axis=1)
+
+# Binarize the target: values in (-1, 0] get label 0, values in (0, 4] get label 1
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True) 
+print(model_input['target'].value_counts(), edges)
+
+# %%
+# Information gain of every column of model_input (the target column itself
+# and any remaining non-feature columns are included in the loop).
+info_gains = get_information_gains(model_input, 'target')
+
+# %% [markdown]
+# Present the feature importance results
+
+# %%
+print("Total columns:", len(info_gains))
+# Frequency of each distinct information-gain value
+print(pd.Series(info_gains).value_counts())
+
+# NOTE(review): n=189 is hard-coded; presumably tied to the number of columns
+# in this particular input file - confirm before reusing with other data.
+n_features_with_highest_info_gain(info_gains, n=189)
+
+# %%
+def compute_impurity(feature, impurity_criterion):
+    """
+    This function calculates impurity of a feature.
+    Supported impurity criteria: 'entropy', 'gini'
+    input: feature (this needs to be a Pandas series)
+    output: feature impurity
+    """
+    probs = feature.value_counts(normalize=True)
+    
+    if impurity_criterion == 'entropy':
+        impurity = -1 * np.sum(np.log2(probs) * probs)
+    elif impurity_criterion == 'gini':
+        impurity = 1 - np.sum(np.square(probs))
+    else:
+        raise ValueError('Unknown impurity criterion')
+        
+    return impurity
+
+
+def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
+    """
+    This function calculates information gain for splitting on 
+    a particular descriptive feature for a given dataset
+    and a given impurity criteria.
+    Supported split criterion: 'entropy', 'gini'
+    """
+    if print_flag:
+        print('target feature:', target)
+        print('descriptive_feature:', descriptive_feature)
+        print('split criterion:', split_criterion)
+            
+    target_entropy = compute_impurity(df[target], split_criterion)
+
+    # we define two lists below:
+    # entropy_list to store the entropy of each partition
+    # weight_list to store the relative number of observations in each partition
+    entropy_list = list()
+    weight_list = list()
+    
+    # loop over each level of the descriptive feature
+    # to partition the dataset with respect to that level
+    # and compute the entropy and the weight of the level's partition
+    for level in df[descriptive_feature].unique():
+        df_feature_level = df[df[descriptive_feature] == level]
+        entropy_level = compute_impurity(df_feature_level[target], split_criterion)
+        entropy_list.append(round(entropy_level, 3))
+        weight_level = len(df_feature_level) / len(df)
+        weight_list.append(round(weight_level, 3))
+
+    # print('impurity of partitions:', entropy_list)
+    # print('weights of partitions:', weight_list)
+
+    feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
+    
+    information_gain = target_entropy - feature_remaining_impurity
+    
+    if print_flag:
+        print('impurity of partitions:', entropy_list)
+        print('weights of partitions:', weight_list)
+        print('remaining impurity:', feature_remaining_impurity)
+        print('information gain:', information_gain)
+        print('====================')
+        
+    return information_gain
+
+
+def calc_information_gain_2(data, split_name, target_name, split_criterion):
+    """
+    Calculate information gain given a data set, column to split on, and target
+    """
+    # Calculate the original impurity
+    original_impurity = compute_impurity(data[target_name], split_criterion)
+    #Find the unique values in the column
+    values = data[split_name].unique()
+    
+    # Make two subsets of the data, based on the unique values
+    left_split = data[data[split_name] == values[0]]
+    right_split = data[data[split_name] == values[1]]
+    
+    # Loop through the splits and calculate the subset impurities
+    to_subtract = 0
+    for subset in [left_split, right_split]:
+        prob = (subset.shape[0] / data.shape[0]) 
+        to_subtract += prob * compute_impurity(subset[target_name], split_criterion) 
+    
+    # Return information gain
+    return original_impurity - to_subtract
+
+
+def get_information_gains_2(data, target_name, split_criterion):
+  #Intialize an empty dictionary for information gains
+  information_gains = {}
+  
+  #Iterate through each column name in our list
+  for feature in list(data.columns):
+    #Find the information gain for the column
+    information_gain = calc_information_gain_2(model_input, target_name, feature, split_criterion)
+    #Add the information gain to our dictionary using the column name as the ekey                                         
+    information_gains[feature] = information_gain
+  
+  #Return the key with the highest value                                          
+  #return max(information_gains, key=information_gains.get)
+  
+  return information_gains
+
+# %% [markdown]
+# Present the feature importance results from other methods
+
+# %%
+# Impurity criterion used for both the target impurity and the information gains
+split_criterion = 'entropy'
+print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
+information_gains = get_information_gains_2(model_input, 'target', split_criterion)
+# Frequency of each distinct information-gain value, highest gain first
+print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
+# n is omitted, so all features are returned, ordered by decreasing gain
+n_features_with_highest_info_gain(information_gains)
+
+# %%
+# Present the feature importance using a decision tree
+# (the tree is fitted with the impurity criterion set in split_criterion, i.e. entropy)
+split_criterion = 'entropy'
+print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
+
+X, y  = model_input.drop(columns=['target', 'pid']), model_input['target']
+# Fill missing feature values with the per-column median
+imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+X = imputer.fit_transform(X)
+# Only the training part (75 % of the rows) is kept; the test part is discarded
+X, _, y, _ =  train_test_split(X, y, random_state=19, test_size=0.25)
+
+
+clf = DecisionTreeClassifier(criterion=split_criterion)
+clf.fit(X, y)
+
+# Unnormalized importances, one value per feature column
+feat_importance = clf.tree_.compute_feature_importances(normalize=False)
+print("feat importance = ", feat_importance)
+print("shape", feat_importance.shape)
+# Map the feature names (same column selection as used for X) to their importances
+tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
+info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
+# Show only the features with a positive importance
+info_gains_dict[info_gains_dict > 0]
+
+# %%
+# Binarizacija vrednosti tree Information Gain-a
+bins = [-0.1, 0, 0.1] # bins for target's correlations with features
+cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True) 
+plt.title(f"Tree information gains by value ({split_criterion})")
+cut_info_gains.value_counts().plot(kind='bar', color='purple')
+plt.xticks(rotation=45, ha='right')
+print(cut_info_gains.value_counts())
+
+
+pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))
+
+# %%
+# Plot the top levels (max_depth=3) of the fitted decision tree, labelling the
+# nodes with the feature names, and save the figure to disk
+plt.figure(figsize=(12,12))
+tree.plot_tree(clf,
+               feature_names = list(model_input.drop(columns=['target', 'pid']).columns), 
+               class_names=True,
+               filled = True, fontsize=5, max_depth=3)
+
+# No file extension is given, so the output format is left to matplotlib's defaults
+plt.savefig('tree_high_dpi', dpi=800)
+
+
+# %% [markdown]
+# Present the feature importance by correlation with target
+
+# NOTE(review): this code directly follows a `# %% [markdown]` marker without
+# its own `# %%` code-cell marker - confirm the notebook conversion treats it as code.
+# Absolute correlation of every feature column with the binarized target
+corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
+# corrs.sort_values(ascending=False)
+
+# Bin the correlation values into three strength categories
+# NOTE(review): correlations above 0.3 (and exactly 0) fall outside these bins
+# and become NaN, so they are missing from the counts below - confirm this is intended.
+# NOTE(review): the first label reads 'very week'; presumably 'very weak' was meant.
+bins = [0, 0.1, 0.2, 0.3] # bins for target's correlations with features
+cut_corrs = pd.cut(corrs, bins=bins, labels=['very week (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True) 
+plt.title("Target's correlations with features")
+cut_corrs.value_counts().plot(kind='bar')
+plt.xticks(rotation=45, ha='right')
+print(cut_corrs.value_counts())
+print(corrs[corrs > 0.1]) # corrs holds absolute values, so this also covers strong negative correlations
+# %%
+
+# %%
--- a/exploration/expl_features_groups_analysis.py
+++ b/exploration/expl_features_groups_analysis.py
@ -0,0 +1,328 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %matplotlib inline
+
+import os, sys, math
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.impute import SimpleImputer
+from sklearn.naive_bayes import GaussianNB  
+from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
+from sklearn import metrics 
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+# Load the model input and index it by the segment-identifying columns
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
+
+# Categorical columns: the two fixed ones plus every column whose name mentions
+# "mostcommonactivity" or "homelabel"
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+categorical_features = model_input[categorical_feature_colnames].copy()
+# First row of .mode() = the most frequent value of each categorical column
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+# Replace the raw categorical columns with their one-hot encoded versions
+numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
+model_input = pd.concat([numerical_features, categorical_features], axis=1)
+
+# Binarize the target: (-1, 0] -> 0, (0, 4] -> 1
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True) 
+
+# Sanity check: list the columns that are not numeric and compare the frame shape with its numeric part
+print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
+print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
+
+
+# %%
+# Add the "demo_" prefix to the demographic features so they can be selected by substring like the sensor groups
+demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile', 
+                'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
+
+# Map every old column name to its prefixed name and rename in place
+new_names = [(col, "demo_"+col) for col in demo_features]
+model_input.rename(columns=dict(new_names), inplace=True)
+
+# Same list again, now with the prefixed names.
+# NOTE(review): this list also contains 'target', which is not a demographic feature - confirm this is intended.
+demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio', 
+                'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M', 
+                'demo_startlanguage_nl', 'demo_startlanguage_sl']
+
+# %%
+# Get phone and non-phone columns
+import warnings
+
+def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
+    """
+    This function makes predictions with sensor groups. 
+    It takes in a dataframe (df), a list of group substrings (groups_substrings) 
+    and an optional parameter include_group (default is True). 
+    It creates a list of columns in the dataframe that contain the group substrings, 
+    while excluding the 'pid' and 'target' columns. It then splits the data into training 
+    and test sets, using a test size of 0.25 for the first split and 0.2 for the second split. 
+    A SimpleImputer is used to fill in missing values with median values. 
+    A LogisticRegression is then used to fit the training set and make predictions 
+    on the test set. Finally, accuracy, precision, recall and F1 scores are printed 
+    for each substring group depending on whether or not include_group 
+    is set to True or False.
+
+    """
+    
+    best_sensor = None
+    best_recall_score, best_f1_score = None, None
+
+    for fgroup_substr in groups_substrings:
+        if fgroup_substr is None:
+            feature_group_cols = list(df.columns)
+            feature_group_cols.remove("pid")
+            feature_group_cols.remove("target")
+        else: 
+            if include_group:
+                feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
+            else:    
+                feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
+
+
+        X, y  = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
+        X, _, y, _ =  train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
+        
+        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+        
+        nb = GaussianNB()
+        model_cv = cross_validate(
+            nb,
+            X=imputer.fit_transform(X),
+            y=y,
+            cv=StratifiedKFold(n_splits=5, shuffle=True),
+            n_jobs=-1,
+            scoring=('accuracy', 'precision', 'recall', 'f1')
+        )
+        X_train, X_test, y_train, y_test =  train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
+        
+
+        if print_flag:
+            if include_group:
+                print("\nPrediction with", fgroup_substr)
+            else:
+                print("\nPrediction without", fgroup_substr)
+
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
+
+            acc = np.mean(model_cv['test_accuracy'])
+            acc_std = np.std(model_cv['test_accuracy'])
+            
+            prec = np.mean(model_cv['test_precision'])
+            prec_std = np.std(model_cv['test_precision'])
+            
+            rec = np.mean(model_cv['test_recall'])
+            rec_std = np.std(model_cv['test_recall'])
+            
+            f1 = np.mean(model_cv['test_f1'])
+            f1_std = np.std(model_cv['test_f1'])
+
+        if print_flag:
+            print("************************************************")
+            print(f"Accuracy: {acc} (sd={acc_std})")
+            print(f"Precison: {prec} (sd={prec_std})")
+            print(f"Recall: {rec} (sd={rec_std})")
+            print(f"F1: {f1} (sd={f1_std})\n")
+
+        if (not best_recall_score and not best_f1_score) or (rec > best_recall_score):
+            best_sensor = fgroup_substr
+            best_recall_score, best_f1_score = rec, f1
+            best_recall_score_std, best_f1_score_std = rec_std, f1_std
+        
+    return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std 
+
+# %% [markdown]
+# ### sensor big feature groups (phone, empatica, demographical)
+big_groups_substr = ["phone_", "empatica_", "demo_"]
+make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)
+
+# %% [markdown]
+# ### Empatica sensor groups
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
+# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
+
+# %% [markdown]
+# ### Phone sensor groups
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
+# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", 
+#                 "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
+
+# %%
+# Write all the sensors (phone, empatica), separate other (demographic) cols also
+
+sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
+                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
+# %%
+def find_sensor_group_features_importance(model_input, sensor_groups_strings):
+    """
+    This function finds the importance of sensor groups for a given model input. It takes two parameters: 
+    model_input and sensor_groups_strings. It creates an empty list called sensor_importance_scores, 
+    which will be populated with tuples containing the best sensor, its recall score, and its F1 score. 
+    It then makes a copy of the model input and the sensor groups strings. It then loops through each group 
+    in the list of strings, creating a list of important columns from the sensor importance scores list. 
+    It then calls make_predictions_with_sensor_groups to determine the best sensor, its recall score, 
+    and its F1 score. These values are added to the sensor importance scores list as a tuple. The function 
+    then removes that best sensor from the list of strings before looping again until all groups have been evaluated. 
+    Finally, it returns the populated list of tuples containing all sensors' scores. 
+    """
+    sensor_importance_scores = []
+    model_input = model_input.copy()
+    sensor_groups_strings = sensor_groups_strings.copy()
+    groups_len = len(sensor_groups_strings)
+    for i in range(groups_len):
+        important_cols = [col[0] for col in sensor_importance_scores]
+        with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
+        
+
+        best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std  = \
+            make_predictions_with_sensor_groups(model_input, 
+            groups_substrings=sensor_groups_strings, include_group=True, 
+            with_cols=with_cols)
+        sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std ))
+        print(f"\nAdded sensor: {best_sensor}\n")
+        sensor_groups_strings.remove(best_sensor)
+    
+    return sensor_importance_scores
+
+
+# %%
+# Method for sorting list of tuples into 3 lists
+def sort_tuples_to_lists(list_of_tuples):
+    """
+    sort_tuples_to_lists(list_of_tuples) is a method that takes in a list of tuples as an argument 
+    and sorts them into three separate lists. The first list, xs, contains the first element 
+    of each tuple. The second list, yrecall, contains the second element of each tuple rounded 
+    to 4 decimal places. The third list, y_fscore, contains the third element of each tuple 
+    rounded to 4 decimal places. The method returns all three lists. 
+    """
+    xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
+    for a_tuple in list_of_tuples:
+        xs.append(a_tuple[0])
+        y_recall.append(round(a_tuple[1], 4))
+        y_fscore.append(round(a_tuple[2], 4))
+        recall_std.append(round(a_tuple[3], 4))
+        fscore_std.append(round(a_tuple[4], 4))
+    return xs, y_recall, y_fscore, recall_std, fscore_std
+
+def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+                                                        title="Sequential addition of features and its F1, and recall scores"):
+    """
+    This function plots the sequential progress of feature addition scores using two subplots. 
+    The first subplot is for recall scores and the second subplot is for F1-scores. 
+    The parameters xs, yrecall, and yfscore are used to plot the data on the respective axes. 
+    The title of the plot can be specified by the user using the parameter title. 
+    The maximum recall index and maximum F1-score index are also plotted using a black dot. 
+    The figure size is set to 18.5 inches in width and 10.5 inches in height, 
+    and the x-axis labels are rotated by 90 degrees. Finally, the plot is displayed 
+    using plt.show().
+    """
+    
+    fig, ax = plt.subplots(nrows=2, sharex=True)
+    ax[0].plot(xs, np.array(y_recall)+np.array(recall_std), linestyle=":", color='m') # Upper SD
+    ax[0].plot(xs, y_recall, color='red')
+    ax[0].plot(xs, np.array(y_recall)-np.array(recall_std), linestyle=":", color='m') # Lower SD
+    mrec_indx = np.argmax(y_recall)
+    ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
+    ax[0].legend(["Upper std", "Mean Recall", "Lower std"])
+
+    ax[1].plot(xs, np.array(y_fscore)+np.array(fscore_std), linestyle=":", color='c') # Upper SD
+    ax[1].plot(xs, y_fscore)
+    ax[1].plot(xs, np.array(y_fscore)-np.array(fscore_std), linestyle=":", color='c') # Lower SD
+    mfscore_indx = np.argmax(y_fscore)
+    ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
+    ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])
+    
+    fig.set_size_inches(18.5, 10.5)
+
+    ax[0].title.set_text('Recall scores')
+    ax[1].title.set_text('F1-scores')
+    plt.suptitle(title, fontsize=14)
+    plt.xticks(rotation=90)
+    plt.show()
+
+# %%
+sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
+                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
+
+# sensors_features_groups = ["phone_", "empatica_", "demo_"]
+
+# %%
+# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
+sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
+xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)
+
+# %% [markdown]
+# ### Visualize sensors groups F1 and recall scores
+print(sensor_groups_importance_scores)
+plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+                                                    title="Sequential addition of sensors and its F1, and recall scores")
+
+# %%
+# Take the most important feature group and investigate it feature-by-feature
+best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
+best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
+
+# best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
+
+# xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(best_sensor_features_scores)
+
+# %% [markdown]
+# ### Visualize best sensor's F1 and recall scores
+# print(best_sensor_features_scores)
+# plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+#                                                     title="Best sensor addition it's features with F1 and recall scores")
+
+# %%
+# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
+# It also saves the sequence of scores for all sensors' features in excel file
+seq_columns = ["sensor_name", "feature_sequence", "recall", "f1_score"]
+feature_sequence = pd.DataFrame(columns=seq_columns)
+for i, sensor_group in enumerate(sensor_groups_importance_scores):
+
+    current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
+    current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
+    xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(current_sensor_features_scores)
+    feature_sequence = pd.concat([feature_sequence, pd.DataFrame({"sensor_name":sensor_group[0], "feature_sequence": [xs], "recall": [y_recall], 
+                                                             "f1_score": [y_fscore], "recall_std": [recall_std], "f1_std": [fscore_std]})])
+
+    plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std, 
+    title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores")
+
+feature_sequence.to_excel("all_sensors_sequential_addition_scores.xlsx", index=False)
+
+# %%
+# TODO: method that reads data from the excel file, specified above, and then the method,
+# that selects only features that are max a thresh[%] below the max value (best for recall
+# possibly for f1). This method should additionally take threshold parameter.
+
+# %%
+
--- a/exploration/expl_stress_event.py
+++ b/exploration/expl_stress_event.py
@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+import os
+import sys
+import datetime
+import math
+
+import seaborn as sns
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+import participants.query_db
+from features.esm import *
+from features.esm_JCQ import *
+from features.esm_SAM import *
+
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+# %%
+participants_inactive_usernames = participants.query_db.get_usernames(
+    collection_start=datetime.date.fromisoformat("2020-08-01")
+)
+df_esm_inactive = get_esm_data(participants_inactive_usernames)
+
+# %%
+df_esm_preprocessed = preprocess_esm(df_esm_inactive)
+
+
+# %% [markdown]
+# Investigate stressfulness events
+# %%
+# Per (device_id, esm_session): questionnaire length in seconds (timestamps are divided by 1000, i.e. treated as milliseconds)
+extracted_ers = df_esm_preprocessed.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
+extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
+session_start_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].min().to_frame().rename(columns={'timestamp': 'session_start_timestamp'}) # questionnaire start timestamp
+session_end_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
+# Answers to questionnaire 90 are used as the stress-event time, answers to questionnaire 91 as its duration
+se_time = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
+se_duration = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
+
+# Make se_durations to the appropriate lengths
+
+# Extract the target; answers to questionnaire 87 are used as 'appraisal_stressfulness_event'.
+# NOTE(review): the original note mentions 3 targets to be transferred to the cleaning script; only this one is extracted here.
+df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].columns
+se_stressfulness_event_tg = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'appraisal_stressfulness_event'})
+
+# Session timestamps and the target are inner-joined (sessions without them are dropped);
+# event time and duration are left-joined, so they stay NaN for sessions without such answers
+extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
+                                .join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
+                                .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
+                                .join(se_time, on=['device_id', 'esm_session'], how='left') \
+                                .join(se_duration, on=['device_id', 'esm_session'], how='left') \
+
+# Filter-out the sessions that are not useful. Because of the ambiguity this excludes: 
+# (1) straw event times that are marked as "0 - I don't remember"
+# (2) straw event durations that are marked as "0 - I don't remember" 
+# (3) durations whose answer starts with "Removed "
+extracted_ers = extracted_ers[(~extracted_ers.se_time.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("Removed "))]
+extracted_ers.reset_index(drop=True, inplace=True)
+
+# Add default duration in case if participant answered that no stressful event occured
+# (no code for this follows in this cell)
+
+# Prepare data to fit the data structure in the CSV file ...
+# Add the event time as the start of the questionnaire if no stress event occured
+extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])
+# Type could be an int (timestamp [ms]) which stays the same, and datetime str which is converted to timestamp in miliseconds 
+# NOTE(review): isinstance(x, int) matches only Python ints; a float or NumPy integer timestamp would be passed to
+# pd.to_datetime instead - confirm the element types of se_time after the fillna above.
+extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
+extracted_ers['shift_direction'] = -1
+
+""">>>>> begin section (could be optimized) <<<<<"""
+
+# Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire
+# is taken as end time of the segment. Else the user input duration is taken. 
+# The raw answer is kept in temp_duration for later inspection.
+extracted_ers['temp_duration'] = extracted_ers['se_duration']
+extracted_ers['se_duration'] = \
+    np.where(
+        extracted_ers['se_duration'].astype(str).str.startswith("1 - "),
+        extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'], 
+        extracted_ers['se_duration']
+    )
+
+# Convert se_duration to seconds: int values (millisecond differences from above) are divided by 1000 and rounded up;
+# every other value is parsed with pd.to_datetime and its hour and minute parts are converted to seconds.
+# NOTE(review): the same isinstance(x, int) caveat as for event_timestamp applies here.
+
+extracted_ers['se_duration'] = \
+    extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else abs(pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60)
+
+# Check whether min se_duration is at least the same duration as the ioi. Filter-out the rest.
+# (no code for this follows in this cell)
+
+""">>>>> end section <<<<<"""
+
+# %% [markdown]
+# Count negative values of duration
+# Row counts: all sessions, sessions with a (non-NaN) duration, negative durations and zero durations
+print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
+print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
+print("Count negative durations (invalid se_time user input):", extracted_ers[extracted_ers['se_duration'] < 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
+print("Count 0 durations:", extracted_ers[extracted_ers['se_duration'] == 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
+# Bare expressions below are shown by the notebook: number of non-positive durations, then the rows themselves
+extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]
+extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']]
+
+# Histogram of the durations (plotted) and the raw histogram counts / bin edges (displayed)
+ax = extracted_ers.hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)
+hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna())
+hist
+bin_edges
+
+# Keep only non-negative durations; rows with a NaN duration are dropped as well, because NaN >= 0 is False
+extracted_ers = extracted_ers[extracted_ers['se_duration'] >= 0]
+
+# %%
+# bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos'  ..... right=False
+# Bin edges for se_duration in seconds; every label names the upper edge of its (right-closed) interval
+bins = [-100000000, -0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
+
+extracted_ers['bins'], edges = pd.cut(extracted_ers.se_duration, bins=bins, labels=['neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
+# Distribution of the duration classes; dropna() removes every row that has a NaN in any column
+sns.displot(
+    data=extracted_ers.dropna(),
+    x="bins",
+    binwidth=0.1,
+)
+
+# %% [markdown]
+extracted_ers[extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'] >= 0]
+extracted_ers['se_time'].value_counts()
+pd.set_option('display.max_rows', 100)
+# Tukaj nas zanima, koliko so oddaljeni časi stresnega dogodka od konca vprašalnika. 
+extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()] # Remove no stress events
+extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'])
+
+print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
+print("Count negative durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] < 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']])
+print("Count 0 durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] == 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
+
+extracted_ers[extracted_ers['diff_se_time_session_end'] < 0]['diff_se_time_session_end']
+# extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)]
+bins2 = [-100000, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
+extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
+extracted_ers['bins2']
+sns.displot(
+    data=extracted_ers.dropna(),
+    x="bins2",
+    binwidth=0.1,
+)
+
+extracted_ers.shape
+extracted_ers.dropna().shape
+
+print()
+
+
+# %%
+# Numeric target = first character of the answer converted to int
+# (presumably the answers look like "<digit> - <text>" - TODO confirm)
+extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int)
+# Correlation between the event duration and the numeric stressfulness target
+print("duration-target (corr):", extracted_ers['se_duration'].corr(extracted_ers['appraisal_stressfulness_event_num']))
+
+# %%
+# Explore groupby participants?
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@ -0,0 +1,49 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% 
+import sys, os
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+from machine_learning.cross_validation import CrossValidation
+from machine_learning.preprocessing import Preprocessing
+
+# %% 
+# Load the model input and index it by the segment-identifying columns
+df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+df.set_index(index_columns, inplace=True)
+
+# presumably "logo" selects leave-one-group-out cross-validation - TODO confirm in machine_learning.cross_validation
+cv = CrossValidation(data=df, cv_method="logo")
+
+categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
+# NOTE(review): these two lists are not used anywhere in this cell
+interval_feature_list, other_feature_list = [], []
+
+print(df.columns.tolist())
+
+# Try the preprocessing on the first split only: one-hot encode the categorical columns
+# of the train and test sets, fetch the transformed sets back and stop (see the break)
+for split in cv.get_splits():
+    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
+    pre = Preprocessing(train_X, train_y, test_X, test_y)
+    pre.one_hot_encode_train_and_test_sets(categorical_columns)
+    train_X, train_y, test_X, test_y = pre.get_train_test_sets()
+    break
+
+# %%
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@ -0,0 +1,462 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %matplotlib inline
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
+from sklearn.dummy import DummyClassifier
+from sklearn.impute import SimpleImputer
+
+from lightgbm import LGBMClassifier
+import xgboost as xg
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+    
+import machine_learning.helper
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## Set script's parameters
+#
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+# Cross-validation method: 'logo', 'half_logo', or anything else for stratified 5-fold
+# (could be regarded as a hyperparameter).
+cv_method_str = 'logo'
+# Number of largest/smallest per-fold accuracies printed for each model.
+n_sl = 3
+# (bool) If True, train and test on a class-balanced dataset obtained by undersampling.
+undersampling = False
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Read the event-level stressfulness input table.
+model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
+# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Index the table by the segment-describing columns, then show the raw target distribution.
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input = model_input.set_index(index_columns)
+model_input['target'].value_counts()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# bins = [-10, 0, 10] # bins for z-scored targets
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+# Right-closed bins: (-1, 0] -> 'low', (0, 4] -> 'high'.
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
+model_input['target'].value_counts(), edges
+# model_input = model_input[model_input['target'] != "medium"]
+# pd.cut yields NaN for missing or out-of-range targets. Drop those rows explicitly;
+# converting through str would turn them into "nan" and silently label them as class 1.
+model_input = model_input.dropna(subset=['target']).copy()
+# Binary label: 0 = 'low', 1 = 'high'.
+model_input['target'] = (model_input['target'] == 'high').astype(int)
+
+model_input['target'].value_counts()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# UnderSampling: balance the two classes by randomly discarding rows of the larger one.
+if undersampling:
+    no_stress = model_input[model_input['target'] == 0]
+    stress = model_input[model_input['target'] == 1]
+
+    # Sample down to the size of the smaller class. Requesting more rows than a
+    # class holds would raise, which happened whenever 'stress' was the majority.
+    n_per_class = min(len(stress), len(no_stress))
+    no_stress = no_stress.sample(n=n_per_class)
+    if len(stress) > n_per_class:
+        stress = stress.sample(n=n_per_class)
+    model_input = pd.concat([stress,no_stress], axis=0)
+
+#   model_input_new = pd.DataFrame(columns=model_input.columns)
+#   for pid in model_input["pid"].unique():
+#     stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
+#     no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
+#     if (len(stress) == 0):
+#       continue
+#     if (len(no_stress) == 0):
+#       continue
+#     model_input_new = pd.concat([model_input_new, stress], axis=0)
+    
+#     no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
+#     # In case there are more stress samples than no_stress, take all instances of no_stress.
+#     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
+#     model_input = model_input_new   
+#     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
+
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+if cv_method_str == 'half_logo':
+    # Row position within each participant and the participant's total row count.
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+
+    # index / count lies in [0, 1); adding 1 and rounding labels each row as half 1 or half 2.
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+
+    # Every helper column must stay out of the feature matrix. 'pid_count' was
+    # previously left in and leaked a per-participant quantity into the features.
+    helper_columns = ["target", "pid", "pid_index", "pid_count", "pid_half"]
+    data_x, data_y, data_groups = model_input.drop(helper_columns, axis=1), model_input["target"], model_input["pid_half"]
+else:
+    # Group by participant; features are everything except the target and the id.
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Categorical columns: two fixed ones plus every column whose name mentions
+# "mostcommonactivity" or "homelabel".
+additional_categorical_features = [
+    col for col in data_x.columns
+    if any(tag in col for tag in ("mostcommonactivity", "homelabel"))
+]
+categorical_feature_colnames = ["gender", "startlanguage"] + additional_categorical_features
+
+categorical_features = data_x[categorical_feature_colnames].copy()
+# Most frequent value of each categorical column (first one when there are ties).
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# Fill gaps with the mode, then cast every column to the category dtype.
+categorical_features = (
+    categorical_features
+    .fillna(mode_categorical_features)
+    .apply(lambda column: column.astype("category"))
+)
+# One-hot encode unless there is nothing to encode.
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+# Final feature table: untouched numerical columns followed by the encoded ones.
+numerical_features = data_x.drop(columns=categorical_feature_colnames)
+train_x = pd.concat([numerical_features, categorical_features], axis=1)
+train_x.dtypes
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Pick the splitter: leave-one-group-out for the two group-based modes, otherwise shuffled stratified 5-fold.
+if cv_method_str in ('logo', 'half_logo'):
+    cv_method = LeaveOneGroupOut()
+    cv_method.get_n_splits(train_x, data_y, groups=data_groups)
+else:
+    cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Missing feature values (NaN) are replaced by the column median.
+imputer = SimpleImputer(strategy='median', missing_values=np.nan)
+
+# %% [markdown]
+# ### Baseline: Dummy Classifier (most frequent)
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+# Baseline that always predicts the most frequent class.
+dummy_class = DummyClassifier(strategy="most_frequent")
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Cross-validate the baseline on the imputed feature matrix.
+dummy_classifier = cross_validate(
+    estimator=dummy_class, X=imputer.fit_transform(train_x), y=data_y,
+    groups=data_groups, cv=cv_method,
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    n_jobs=-1, error_score='raise',
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Summary of the per-fold scores.
+dummy_acc = dummy_classifier['test_accuracy']
+print("Acc (median)", np.nanmedian(dummy_acc))
+print("Acc (mean)", np.mean(dummy_acc))
+for metric_name, metric_key in (("Precision", 'test_precision'), ("Recall", 'test_recall'), ("F1", 'test_f1')):
+    print(metric_name, np.mean(dummy_classifier[metric_key]))
+# n_sl best and worst fold accuracies.
+dummy_top = -np.partition(-dummy_acc, n_sl)[:n_sl]
+dummy_bottom = np.partition(dummy_acc, n_sl)[:n_sl]
+print(f"Largest {n_sl} ACC:", np.sort(dummy_top)[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(dummy_bottom))
+
+# %% [markdown] nteract={"transient": {"deleting": false}}
+# ### All models
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+# Evaluate every classifier known to the project helper on the imputed features.
+# NOTE(review): the shape of the returned table is defined in machine_learning.helper; the code
+# below relies on it having a "method" column and metric names in its index - confirm there.
+final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+# %%
+# Name the existing index "metric" and make ("method", "metric") the row index before exporting.
+final_scores.index.name = "metric"
+final_scores = final_scores.set_index(["method", final_scores.index])
+# The output file name carries the cross-validation method used.
+final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
+
+# %% [markdown]
+# ### Logistic Regression
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Logistic regression with library-default settings.
+logistic_regression = linear_model.LogisticRegression()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Cross-validate on the imputed feature matrix.
+log_reg_scores = cross_validate(
+    estimator=logistic_regression, X=imputer.fit_transform(train_x), y=data_y,
+    groups=data_groups, cv=cv_method,
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    n_jobs=-1,
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Summary of the per-fold scores.
+log_reg_acc = log_reg_scores['test_accuracy']
+print("Acc (median)", np.nanmedian(log_reg_acc))
+print("Acc (mean)", np.mean(log_reg_acc))
+for metric_name, metric_key in (("Precision", 'test_precision'), ("Recall", 'test_recall'), ("F1", 'test_f1')):
+    print(metric_name, np.mean(log_reg_scores[metric_key]))
+# n_sl best and worst fold accuracies.
+log_reg_top = -np.partition(-log_reg_acc, n_sl)[:n_sl]
+log_reg_bottom = np.partition(log_reg_acc, n_sl)[:n_sl]
+print(f"Largest {n_sl} ACC:", np.sort(log_reg_top)[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(log_reg_bottom))
+
+# %% [markdown]
+# ### Support Vector Machine
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Support vector classifier with library-default settings.
+svc = svm.SVC()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Cross-validate on the imputed feature matrix.
+svc_scores = cross_validate(
+    estimator=svc, X=imputer.fit_transform(train_x), y=data_y,
+    groups=data_groups, cv=cv_method,
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    n_jobs=-1,
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Summary of the per-fold scores.
+svc_acc = svc_scores['test_accuracy']
+print("Acc (median)", np.nanmedian(svc_acc))
+print("Acc (mean)", np.mean(svc_acc))
+for metric_name, metric_key in (("Precision", 'test_precision'), ("Recall", 'test_recall'), ("F1", 'test_f1')):
+    print(metric_name, np.mean(svc_scores[metric_key]))
+# n_sl best and worst fold accuracies.
+svc_top = -np.partition(-svc_acc, n_sl)[:n_sl]
+svc_bottom = np.partition(svc_acc, n_sl)[:n_sl]
+print(f"Largest {n_sl} ACC:", np.sort(svc_top)[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(svc_bottom))
+
+# %% [markdown]
+# ### Gaussian Naive Bayes
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Gaussian naive Bayes with library-default settings.
+gaussian_nb = naive_bayes.GaussianNB()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Cross-validate on the imputed feature matrix; any failing fold raises.
+gaussian_nb_scores = cross_validate(
+    estimator=gaussian_nb, X=imputer.fit_transform(train_x), y=data_y,
+    groups=data_groups, cv=cv_method,
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    n_jobs=-1, error_score='raise',
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Summary of the per-fold scores.
+gaussian_nb_acc = gaussian_nb_scores['test_accuracy']
+print("Acc (median)", np.nanmedian(gaussian_nb_acc))
+print("Acc (mean)", np.mean(gaussian_nb_acc))
+for metric_name, metric_key in (("Precision", 'test_precision'), ("Recall", 'test_recall'), ("F1", 'test_f1')):
+    print(metric_name, np.mean(gaussian_nb_scores[metric_key]))
+# n_sl best and worst fold accuracies.
+gaussian_nb_top = -np.partition(-gaussian_nb_acc, n_sl)[:n_sl]
+gaussian_nb_bottom = np.partition(gaussian_nb_acc, n_sl)[:n_sl]
+print(f"Largest {n_sl} ACC:", np.sort(gaussian_nb_top)[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(gaussian_nb_bottom))
+
+# %% [markdown]
+# ### Stochastic Gradient Descent Classifier
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Linear classifier trained with stochastic gradient descent, library-default settings.
+sgdc = linear_model.SGDClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Cross-validate on the imputed feature matrix; any failing fold raises.
+sgdc_scores = cross_validate(
+    estimator=sgdc, X=imputer.fit_transform(train_x), y=data_y,
+    groups=data_groups, cv=cv_method,
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    n_jobs=-1, error_score='raise',
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Summary of the per-fold scores.
+sgdc_acc = sgdc_scores['test_accuracy']
+print("Acc (median)", np.nanmedian(sgdc_acc))
+print("Acc (mean)", np.mean(sgdc_acc))
+for metric_name, metric_key in (("Precision", 'test_precision'), ("Recall", 'test_recall'), ("F1", 'test_f1')):
+    print(metric_name, np.mean(sgdc_scores[metric_key]))
+# n_sl best and worst fold accuracies.
+sgdc_top = -np.partition(-sgdc_acc, n_sl)[:n_sl]
+sgdc_bottom = np.partition(sgdc_acc, n_sl)[:n_sl]
+print(f"Largest {n_sl} ACC:", np.sort(sgdc_top)[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(sgdc_bottom))
+
+# %% [markdown]
+# ### K-nearest neighbors
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# K-nearest-neighbours classifier with library-default settings.
+knn = neighbors.KNeighborsClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Cross-validate on the imputed feature matrix; any failing fold raises.
+knn_scores = cross_validate(
+    estimator=knn, X=imputer.fit_transform(train_x), y=data_y,
+    groups=data_groups, cv=cv_method,
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    n_jobs=-1, error_score='raise',
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Summary of the per-fold scores.
+knn_acc = knn_scores['test_accuracy']
+print("Acc (median)", np.nanmedian(knn_acc))
+print("Acc (mean)", np.mean(knn_acc))
+for metric_name, metric_key in (("Precision", 'test_precision'), ("Recall", 'test_recall'), ("F1", 'test_f1')):
+    print(metric_name, np.mean(knn_scores[metric_key]))
+# n_sl best and worst fold accuracies.
+knn_top = -np.partition(-knn_acc, n_sl)[:n_sl]
+knn_bottom = np.partition(knn_acc, n_sl)[:n_sl]
+print(f"Largest {n_sl} ACC:", np.sort(knn_top)[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(knn_bottom))
+
+# %% [markdown]
+# ### Decision Tree
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Decision tree classifier with library-default settings.
+dtree = tree.DecisionTreeClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Cross-validate on the imputed feature matrix; any failing fold raises.
+dtree_scores = cross_validate(
+    estimator=dtree, X=imputer.fit_transform(train_x), y=data_y,
+    groups=data_groups, cv=cv_method,
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    n_jobs=-1, error_score='raise',
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Summary of the per-fold scores.
+dtree_acc = dtree_scores['test_accuracy']
+print("Acc (median)", np.nanmedian(dtree_acc))
+print("Acc (mean)", np.mean(dtree_acc))
+for metric_name, metric_key in (("Precision", 'test_precision'), ("Recall", 'test_recall'), ("F1", 'test_f1')):
+    print(metric_name, np.mean(dtree_scores[metric_key]))
+# n_sl best and worst fold accuracies.
+dtree_top = -np.partition(-dtree_acc, n_sl)[:n_sl]
+dtree_bottom = np.partition(dtree_acc, n_sl)[:n_sl]
+print(f"Largest {n_sl} ACC:", np.sort(dtree_top)[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(dtree_bottom))
+
+# %% [markdown]
+# ### Random Forest Classifier
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Random forest classifier with library-default settings.
+rfc = ensemble.RandomForestClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Cross-validate on the imputed feature matrix; the fitted per-fold estimators
+# are kept because the feature-importance cell reads them.
+rfc_scores = cross_validate(
+    estimator=rfc, X=imputer.fit_transform(train_x), y=data_y,
+    groups=data_groups, cv=cv_method,
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    n_jobs=-1, error_score='raise', return_estimator=True,
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Summary of the per-fold scores.
+rfc_acc = rfc_scores['test_accuracy']
+print("Acc (median)", np.nanmedian(rfc_acc))
+print("Acc (mean)", np.mean(rfc_acc))
+for metric_name, metric_key in (("Precision", 'test_precision'), ("Recall", 'test_recall'), ("F1", 'test_f1')):
+    print(metric_name, np.mean(rfc_scores[metric_key]))
+# n_sl best and worst fold accuracies.
+rfc_top = -np.partition(-rfc_acc, n_sl)[:n_sl]
+rfc_bottom = np.partition(rfc_acc, n_sl)[:n_sl]
+print(f"Largest {n_sl} ACC:", np.sort(rfc_top)[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(rfc_bottom))
+
+# %% [markdown]
+# ### Feature importance (RFC)
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Mean feature importance over the forests fitted in the individual CV folds.
+# All folds are stacked and averaged in one step so that each fold has equal weight.
+# (Repeatedly averaging the running result with the next fold would weight later
+# folds more heavily, and seeding the table with feature-named columns adds
+# all-NaN columns to it.)
+fold_importances = np.vstack([estimator.feature_importances_ for estimator in rfc_scores['estimator']])
+rfc_es_fimp = pd.DataFrame(
+    {'importance': fold_importances.mean(axis=0)},
+    index=list(train_x.columns),
+)
+
+pd.set_option('display.max_rows', 100)
+print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
+
+# 30 most important features.
+rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
+
+# 30 least important features.
+rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
+
+# Value distribution of one specific feature, inspected by hand.
+train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
+
+# %% [markdown]
+# ### Gradient Boosting Classifier
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Gradient boosting classifier with library-default settings.
+gbc = ensemble.GradientBoostingClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Cross-validate on the imputed feature matrix; any failing fold raises.
+gbc_scores = cross_validate(
+    estimator=gbc, X=imputer.fit_transform(train_x), y=data_y,
+    groups=data_groups, cv=cv_method,
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    n_jobs=-1, error_score='raise',
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Summary of the per-fold scores.
+gbc_acc = gbc_scores['test_accuracy']
+print("Acc (median)", np.nanmedian(gbc_acc))
+print("Acc (mean)", np.mean(gbc_acc))
+for metric_name, metric_key in (("Precision", 'test_precision'), ("Recall", 'test_recall'), ("F1", 'test_f1')):
+    print(metric_name, np.mean(gbc_scores[metric_key]))
+# n_sl best and worst fold accuracies.
+gbc_top = -np.partition(-gbc_acc, n_sl)[:n_sl]
+gbc_bottom = np.partition(gbc_acc, n_sl)[:n_sl]
+print(f"Largest {n_sl} ACC:", np.sort(gbc_top)[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(gbc_bottom))
+
+# %% [markdown]
+# ### LGBM Classifier
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# LightGBM classifier with library-default settings.
+lgbm = LGBMClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Cross-validate on the imputed feature matrix; any failing fold raises.
+lgbm_scores = cross_validate(
+    estimator=lgbm, X=imputer.fit_transform(train_x), y=data_y,
+    groups=data_groups, cv=cv_method,
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    n_jobs=-1, error_score='raise',
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Summary of the per-fold scores.
+lgbm_acc = lgbm_scores['test_accuracy']
+print("Acc (median)", np.nanmedian(lgbm_acc))
+print("Acc (mean)", np.mean(lgbm_acc))
+for metric_name, metric_key in (("Precision", 'test_precision'), ("Recall", 'test_recall'), ("F1", 'test_f1')):
+    print(metric_name, np.mean(lgbm_scores[metric_key]))
+# n_sl best and worst fold accuracies.
+lgbm_top = -np.partition(-lgbm_acc, n_sl)[:n_sl]
+lgbm_bottom = np.partition(lgbm_acc, n_sl)[:n_sl]
+print(f"Largest {n_sl} ACC:", np.sort(lgbm_top)[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(lgbm_bottom))
+
+# %% [markdown]
+# ### XGBoost Classifier
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# XGBoost classifier (scikit-learn wrapper) with library-default settings.
+xgb_classifier = xg.sklearn.XGBClassifier()
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Cross-validate on the imputed feature matrix; any failing fold raises.
+xgb_classifier_scores = cross_validate(
+    estimator=xgb_classifier, X=imputer.fit_transform(train_x), y=data_y,
+    groups=data_groups, cv=cv_method,
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    n_jobs=-1, error_score='raise',
+)
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# Summary of the per-fold scores.
+xgb_acc = xgb_classifier_scores['test_accuracy']
+print("Acc (median)", np.nanmedian(xgb_acc))
+print("Acc (mean)", np.mean(xgb_acc))
+for metric_name, metric_key in (("Precision", 'test_precision'), ("Recall", 'test_recall'), ("F1", 'test_f1')):
+    print(metric_name, np.mean(xgb_classifier_scores[metric_key]))
+# n_sl best and worst fold accuracies.
+xgb_top = -np.partition(-xgb_acc, n_sl)[:n_sl]
+xgb_bottom = np.partition(xgb_acc, n_sl)[:n_sl]
+print(f"Largest {n_sl} ACC:", np.sort(xgb_top)[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(xgb_bottom))
+
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@ -0,0 +1,184 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from scipy import stats
+
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.impute import SimpleImputer
+
+from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
+import xgboost as xg 
+
+from sklearn.cluster import KMeans
+
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.labels
+import machine_learning.model
+from machine_learning.classification_models import ClassificationModels
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## Set script's parameters
+# %%
+# The assignments get their own code cell: placed directly under a markdown cell
+# header they are not reliably executed when the script is opened as a notebook.
+n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
+cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %% jupyter={"source_hidden": true}
+# Load the model input table; the index columns are only named here and applied in a later cell.
+model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+
+# First candidate for the clustering column: the column with the largest variance.
+# This value is overwritten a few lines below.
+clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
+
+# Column names from 'age' up to (but excluding) the last column, shown for inspection.
+model_input.columns[list(model_input.columns).index('age'):-1]
+
+# All columns whose name starts with 'limesurvey', shown for inspection.
+lime_cols = [col for col in model_input if col.startswith('limesurvey')]
+lime_cols
+# Clustering column actually used from here on.
+lime_col = 'limesurvey_demand_control_ratio_quartile'
+clust_col = lime_col
+
+model_input[clust_col].describe()
+
+
+# %% jupyter={"source_hidden": true}
+
+# Filter-out outlier rows by clust_col 
+#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+
+# One row per distinct (clustering value, participant) pair, without missing values.
+uniq = (
+    model_input[[clust_col, 'pid']]
+    .drop_duplicates()
+    .reset_index(drop=True)
+    .dropna()
+)
+plt.bar(uniq['pid'], uniq[clust_col])
+
+# %% jupyter={"source_hidden": true}
+# Cluster on the clustering column and attach each pid's cluster label to the main table.
+kmeans_model = KMeans(n_clusters=n_clusters)
+km = kmeans_model.fit_predict(uniq.set_index('pid'))
+np.unique(km, return_counts=True)
+uniq['cluster'] = km
+uniq
+
+pid_clusters = uniq[['pid', 'cluster']]
+model_input = model_input.merge(pid_clusters)   
+
+# %% jupyter={"source_hidden": true}
+# Index the table by the segment-describing columns.
+model_input = model_input.set_index(index_columns)
+
+# %% jupyter={"source_hidden": true}
+# Dictionary of classification models, taken from the project's model collection.
+cm = ClassificationModels()
+cmodels = cm.get_cmodels()
+
+# %% jupyter={"source_hidden": true}
+# StratifiedKFold is used for the k-fold option below but is missing from this
+# script's top-level imports, so it is imported here.
+from sklearn.model_selection import StratifiedKFold
+
+# Train and evaluate every model separately inside each cluster of participants.
+for k in range(n_clusters):
+    model_input_subset = model_input[model_input["cluster"] == k].copy()
+    bins = [-10, -1, 1, 10] # bins for z-scored targets
+    # Left-closed bins: [-10, -1) -> 'low', [-1, 1) -> 'medium', [1, 10) -> 'high'.
+    model_input_subset.loc[:, 'target'] = \
+        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']
+    # Keep only the two extreme classes. This also removes rows whose target is
+    # missing or outside the bins, which would otherwise end up labelled as class 1.
+    model_input_subset = model_input_subset[model_input_subset['target'].isin(['low', 'high'])].copy()
+    model_input_subset['target'] = (model_input_subset['target'] == 'high').astype(int)
+
+    if cv_method_str == 'half_logo':
+        # Row position within each participant and the participant's total row count.
+        model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
+        model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
+
+        # index / count lies in [0, 1); adding 1 and rounding labels each row as half 1 or half 2.
+        model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
+        model_input_subset["pid_half"] = model_input_subset["pid"] + "_" +  model_input_subset["pid_index"].astype(int).astype(str)
+
+        # All helper columns (including 'pid_count') must stay out of the features.
+        helper_columns = ["target", "pid", "pid_index", "pid_count", "pid_half"]
+        data_x, data_y, data_groups = model_input_subset.drop(helper_columns, axis=1), model_input_subset["target"], model_input_subset["pid_half"]
+    else:
+        data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]
+
+    # Treat categorical features
+    categorical_feature_colnames = ["gender", "startlanguage"]
+    additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+    categorical_feature_colnames += additional_categorical_features
+
+    categorical_features = data_x[categorical_feature_colnames].copy()
+    mode_categorical_features = categorical_features.mode().iloc[0]
+
+    # fillna with mode
+    categorical_features = categorical_features.fillna(mode_categorical_features)
+
+    # one-hot encoding
+    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+    if not categorical_features.empty:
+        categorical_features = pd.get_dummies(categorical_features)
+
+    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+    train_x = pd.concat([numerical_features, categorical_features], axis=1)
+
+    # Establish cv method
+    if cv_method_str in ('logo', 'half_logo'):
+        cv_method = LeaveOneGroupOut()
+    else:
+        cv_method = StratifiedKFold(n_splits=5, shuffle=True)
+
+    # Median-impute once per cluster; the result is the same for every model.
+    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+    imputed_x = imputer.fit_transform(train_x)
+
+    for model_title, model in cmodels.items():
+
+        classifier = cross_validate(
+            model['model'],
+            X=imputed_x,
+            y=data_y,
+            groups=data_groups,
+            cv=cv_method,
+            n_jobs=-1,
+            error_score='raise',
+            scoring=('accuracy', 'precision', 'recall', 'f1')
+        )
+
+        print("\n-------------------------------------\n")
+        print("Current cluster:", k, end="\n")
+        print("Current model:", model_title, end="\n")
+        print("Acc", np.mean(classifier['test_accuracy']))
+        print("Precision", np.mean(classifier['test_precision']))
+        print("Recall", np.mean(classifier['test_recall']))
+        print("F1", np.mean(classifier['test_f1']))
+        print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
+        print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl]))
+
+        # Accumulate the per-cluster means: accuracy, precision, recall, F1 (in that order).
+        cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
+        cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
+        cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall'])
+        cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])
+
+# %% jupyter={"source_hidden": true}
+# Get overall results
+cm.get_total_models_scores(n_clusters=n_clusters)
--- a/exploration/ml_pipeline_classification_with_clustering_2_class.py
+++ b/exploration/ml_pipeline_classification_with_clustering_2_class.py
@ -0,0 +1,171 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+from scipy import stats
+
+from sklearn.model_selection import train_test_split
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+
+from sklearn.cluster import KMeans
+
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+from machine_learning.classification_models import ClassificationModels
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# # Useful method
+# %%
+def treat_categorical_features(input_set):
+    """Return ``input_set`` with its categorical columns one-hot encoded.
+
+    The categorical columns are "gender", "startlanguage" and every column whose
+    name contains "mostcommonactivity" or "homelabel". Missing values in them are
+    filled with the column mode before encoding. All other columns are returned
+    unchanged and come first in the result.
+
+    A frame without rows is returned with its categorical columns left
+    un-encoded, instead of failing on the mode lookup.
+    """
+    categorical_feature_colnames = ["gender", "startlanguage"]
+    categorical_feature_colnames += [
+        col for col in input_set.columns if "mostcommonactivity" in col or "homelabel" in col
+    ]
+
+    categorical_features = input_set[categorical_feature_colnames].copy()
+    numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
+
+    # An empty frame has no mode, so the fill/encode steps only run when there is data.
+    if not categorical_features.empty:
+        # fillna with mode (first mode when there are ties)
+        mode_categorical_features = categorical_features.mode().iloc[0]
+        categorical_features = categorical_features.fillna(mode_categorical_features)
+
+        # one-hot encoding
+        categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+        categorical_features = pd.get_dummies(categorical_features)
+
+    return pd.concat([numerical_features, categorical_features], axis=1)
+
+# %% [markdown]
+# ## Set script's parameters
+# %%
+# The assignments get their own code cell: placed directly under a markdown cell
+# header they are not reliably executed when the script is opened as a notebook.
+n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
+n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %% jupyter={"source_hidden": true}
+# Load the model input table; the index columns are only named here and applied in a later cell.
+model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+
+# First candidate for the clustering column: the column with the largest variance.
+# This value is overwritten a few lines below.
+clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
+
+# Column names from 'age' up to (but excluding) the last column, shown for inspection.
+model_input.columns[list(model_input.columns).index('age'):-1]
+
+# All columns whose name starts with 'limesurvey', shown for inspection.
+lime_cols = [col for col in model_input if col.startswith('limesurvey')]
+lime_cols
+# Clustering column actually used from here on.
+lime_col = 'limesurvey_demand_control_ratio'
+clust_col = lime_col
+
+model_input[clust_col].describe()
+
+
+# %% jupyter={"source_hidden": true}
+
+# Filter-out outlier rows by clust_col 
+# Keep rows whose clustering value lies within three standard deviations of the mean.
+clust_abs_zscore = np.abs(stats.zscore(model_input[clust_col]))
+model_input = model_input[clust_abs_zscore < 3]
+
+# One row per distinct (clustering value, participant) pair.
+uniq = (
+    model_input[[clust_col, 'pid']]
+    .drop_duplicates()
+    .reset_index(drop=True)
+)
+plt.bar(uniq['pid'], uniq[clust_col])
+
+# %% jupyter={"source_hidden": true}
+# Cluster on the clustering column and attach each pid's cluster label to the main table.
+kmeans_model = KMeans(n_clusters=n_clusters)
+km = kmeans_model.fit_predict(uniq.set_index('pid'))
+np.unique(km, return_counts=True)
+uniq['cluster'] = km
+uniq
+
+pid_clusters = uniq[['pid', 'cluster']]
+model_input = model_input.merge(pid_clusters)   
+
+# %% jupyter={"source_hidden": true}
+# Index the table by the segment-describing columns.
+model_input = model_input.set_index(index_columns)
+
+# %% jupyter={"source_hidden": true}
+# Dictionary of classification models, taken from the project's model collection.
+cm = ClassificationModels()
+cmodels = cm.get_cmodels()
+# %% jupyter={"source_hidden": true}
+# Train every model on the middle of the target distribution of each cluster and
+# score it on the held-out tails.
+for k in range(n_clusters):
+    model_input_subset = model_input[model_input["cluster"] == k].copy()
+
+    # Rows at or below the 15th and at or above the 85th percentile of the numerical
+    # target form the held-out set; the rest is the training set.
+    # Only two classes, separated by a z-score of 0.
+    model_input_subset['numerical_target'] = model_input_subset['target']
+    bins = [-10, 0, 10] # bins for z-scored targets
+    model_input_subset.loc[:, 'target'] = \
+        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
+
+    p15 = np.percentile(model_input_subset['numerical_target'], 15)
+    p85 = np.percentile(model_input_subset['numerical_target'], 85)
+
+    # Treat categorical features
+    model_input_subset = treat_categorical_features(model_input_subset)
+
+    # Split to train, validate, and test subsets
+    in_middle = (model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)
+    in_tails = (model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)
+    train_set = model_input_subset[in_middle].drop(['numerical_target'], axis=1)
+    test_set = model_input_subset[in_tails].drop(['numerical_target'], axis=1)
+
+    train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
+
+    # The held-out tails are halved into a validation and a test part (fixed seed).
+    validate_x, test_x, validate_y, test_y = \
+        train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)
+
+    # Impute missing values: the medians are learned on the training set only and
+    # then applied unchanged to the validation and test sets. Re-fitting on those
+    # sets would use their own statistics and could drop a different set of
+    # all-NaN columns, leaving the feature matrices inconsistent.
+    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+
+    train_x = imputer.fit_transform(train_x)
+    validate_x = imputer.transform(validate_x)
+    test_x = imputer.transform(test_x)
+
+    for model_title, model in cmodels.items():
+        model['model'].fit(train_x, train_y)
+        y_pred = model['model'].predict(validate_x)
+
+        acc = accuracy_score(validate_y, y_pred)
+        prec = precision_score(validate_y, y_pred)
+        rec = recall_score(validate_y, y_pred)
+        f1 = f1_score(validate_y, y_pred)
+
+        print("\n-------------------------------------\n")
+        print("Current cluster:", k, end="\n")
+        print("Current model:", model_title, end="\n")
+        print("Acc", acc)
+        print("Precision", prec)
+        print("Recall", rec)
+        print("F1", f1)
+
+        # Accumulate the per-cluster scores: accuracy, precision, recall, F1 (in that order).
+        cmodels[model_title]['metrics'][0] += acc
+        cmodels[model_title]['metrics'][1] += prec
+        cmodels[model_title]['metrics'][2] += rec
+        cmodels[model_title]['metrics'][3] += f1
+
+# %% jupyter={"source_hidden": true}
+# Get overall results
+cm.get_total_models_scores(n_clusters=n_clusters)
--- a/exploration/ml_pipeline_regression.py
+++ b/exploration/ml_pipeline_regression.py
@ -0,0 +1,355 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+import yaml
+from pyprojroot import here
+from sklearn import linear_model, svm, kernel_ridge, gaussian_process
+from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.impute import SimpleImputer
+from sklearn.dummy import DummyRegressor
+import xgboost as xg
+from IPython.core.interactiveshell import InteractiveShell
+# Display the value of every top-level expression in a cell, not only the last one.
+InteractiveShell.ast_node_interactivity = "all"
+
+# Put the parent of the current working directory on the import path so that the
+# `machine_learning` package imported below can be found
+# (presumably the notebook is run from a sub-directory of the repository — verify).
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.features_sensor
+import machine_learning.labels
+import machine_learning.model
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## JCQ job demand
+
+# %% jupyter={"source_hidden": true}
+model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+
+# %% jupyter={"source_hidden": true}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+#if "pid" in model_input.columns:
+#    index_columns.append("pid")
+model_input.set_index(index_columns, inplace=True)
+
+cv_method = 'half_logo' # logo, half_logo, 5kfold
+if cv_method == 'logo':
+    # Leave-one-group-out over participants: each pid is one group.
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+else:
+    # Split the rows of every participant into two groups, "<pid>_1" and "<pid>_2".
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+
+    # Relative position inside the participant (0 <= position < 1) shifted by one
+    # and rounded: rows in the first half get 1, the remaining rows get 2.
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+
+    # Every helper column is removed from the features. `pid_count` used to be left
+    # in, which handed the models the number of rows of each participant as a feature.
+    helper_columns = ["target", "pid", "pid_index", "pid_count", "pid_half"]
+    data_x, data_y, data_groups = model_input.drop(helper_columns, axis=1), model_input["target"], model_input["pid_half"]
+
+# %% jupyter={"source_hidden": true}
+# Columns treated as categorical: two fixed ones plus every column whose name
+# mentions the most common activity or the home label.
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [
+    col
+    for col in data_x.columns
+    if any(marker in col for marker in ("mostcommonactivity", "homelabel"))
+]
+categorical_feature_colnames.extend(additional_categorical_features)
+
+# %% jupyter={"source_hidden": true}
+categorical_features = data_x[categorical_feature_colnames].copy()
+
+# %% jupyter={"source_hidden": true}
+# Most frequent value of every categorical column (first row of the mode table)
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# %% jupyter={"source_hidden": true}
+# Missing categories are replaced by the most frequent value of their column
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# %% jupyter={"source_hidden": true}
+# One-hot encode the categorical columns (skipped when there is nothing to encode)
+categorical_features = categorical_features.apply(lambda column: column.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+# %% jupyter={"source_hidden": true}
+numerical_features = data_x.drop(columns=categorical_feature_colnames)
+
+# %% jupyter={"source_hidden": true}
+# Final feature matrix: numerical columns followed by the one-hot encoded ones
+train_x = pd.concat((numerical_features, categorical_features), axis="columns")
+
+# %% jupyter={"source_hidden": true}
+train_x.dtypes
+
+# %% jupyter={"source_hidden": true}
+logo = LeaveOneGroupOut()
+# Top-level expression: the notebook shows how many splits the groups produce
+logo.get_n_splits(X=train_x, y=data_y, groups=data_groups)
+
+# For any other cv_method the splitter is replaced by None, which makes
+# cross_validate fall back to its default of 5 folds
+if cv_method not in ('logo', 'half_logo'):
+    logo = None
+
+# %% jupyter={"source_hidden": true}
+# Number of missing target values
+sum(data_y.isna())
+
+# %% [markdown]
+# ### Shared evaluation helper
+
+# %% jupyter={"source_hidden": true}
+# Metrics computed for every model.
+_SCORING = ('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+# (printed label, key in the dictionary returned by cross_validate), in print order.
+_REPORTED_SCORES = (
+    ("Negative Mean Squared Error", 'test_neg_mean_squared_error'),
+    ("Negative Mean Absolute Error", 'test_neg_mean_absolute_error'),
+    ("Negative Root Mean Squared Error", 'test_neg_root_mean_squared_error'),
+    ("R2", 'test_r2'),
+)
+
+# Missing feature values are replaced by the mean of their column.
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+
+def _cross_validate_and_report(model):
+    """Cross-validate ``model`` on the imputed features and print the median scores.
+
+    The features ``train_x`` are mean-imputed, the model is evaluated with the
+    splitter ``logo`` (``None`` means the 5-fold default of ``cross_validate``)
+    using ``data_groups`` as groups, and the median over the folds is printed
+    for every metric in ``_SCORING``.
+
+    Args:
+        model: Unfitted scikit-learn compatible regressor.
+
+    Returns:
+        dict: The dictionary returned by ``cross_validate``.
+    """
+    scores = cross_validate(
+        model,
+        X=imputer.fit_transform(train_x),
+        y=data_y,
+        groups=data_groups,
+        cv=logo,
+        n_jobs=-1,
+        scoring=_SCORING,
+    )
+    for label, key in _REPORTED_SCORES:
+        # nanmedian: a fold whose score is NaN must not turn the whole summary
+        # into NaN (same aggregation as ml_pipeline_stress_event_cleaned.py).
+        print(label, np.nanmedian(scores[key]))
+    return scores
+
+
+# %% [markdown]
+# ### Baseline: Dummy Regression (mean)
+
+# %% jupyter={"source_hidden": true}
+dummy_regr = DummyRegressor(strategy="mean")
+dummy_regressor = _cross_validate_and_report(dummy_regr)
+
+# %% [markdown]
+# ### Linear Regression
+
+# %% jupyter={"source_hidden": true}
+lin_reg_rapids = linear_model.LinearRegression()
+lin_reg_scores = _cross_validate_and_report(lin_reg_rapids)
+
+# %% [markdown]
+# ### XGBRegressor Linear Regression
+
+# %% jupyter={"source_hidden": true}
+xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=10)
+xgb_reg_scores = _cross_validate_and_report(xgb_r)
+
+# %% [markdown]
+# ### XGBRegressor Pseudo Huber Error Regression
+
+# %% jupyter={"source_hidden": true}
+xgb_psuedo_huber_r = xg.XGBRegressor(objective='reg:pseudohubererror', n_estimators=10)
+xgb_psuedo_huber_reg_scores = _cross_validate_and_report(xgb_psuedo_huber_r)
+
+# %% [markdown]
+# ### Ridge regression
+
+# %% jupyter={"source_hidden": true}
+ridge_reg = linear_model.Ridge(alpha=.5)
+ridge_reg_scores = _cross_validate_and_report(ridge_reg)
+
+# %% [markdown]
+# ### Lasso
+
+# %% jupyter={"source_hidden": true}
+lasso_reg = linear_model.Lasso(alpha=0.1)
+lasso_reg_score = _cross_validate_and_report(lasso_reg)
+
+# %% [markdown]
+# ### Bayesian Ridge
+
+# %% jupyter={"source_hidden": true}
+bayesian_ridge_reg = linear_model.BayesianRidge()
+bayesian_ridge_reg_score = _cross_validate_and_report(bayesian_ridge_reg)
+
+# %% [markdown]
+# ### RANSAC (outlier robust regression)
+
+# %% jupyter={"source_hidden": true}
+ransac_reg = linear_model.RANSACRegressor()
+ransac_reg_scores = _cross_validate_and_report(ransac_reg)
+
+# %% [markdown]
+# ### Support vector regression
+
+# %% jupyter={"source_hidden": true}
+svr = svm.SVR()
+svr_scores = _cross_validate_and_report(svr)
+
+# %% [markdown]
+# ### Kernel Ridge regression
+
+# %% jupyter={"source_hidden": true}
+kridge = kernel_ridge.KernelRidge()
+kridge_scores = _cross_validate_and_report(kridge)
+
+# %% [markdown]
+# ### Gaussian Process Regression
+
+# %% jupyter={"source_hidden": true}
+gpr = gaussian_process.GaussianProcessRegressor()
+gpr_scores = _cross_validate_and_report(gpr)
+
+# %%
--- a/exploration/ml_pipeline_stress_event_cleaned.py
+++ b/exploration/ml_pipeline_stress_event_cleaned.py
@ -0,0 +1,359 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+import yaml
+from pyprojroot import here
+from sklearn import linear_model, svm, kernel_ridge, gaussian_process
+from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut, cross_val_score, cross_validate
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.impute import SimpleImputer
+from sklearn.dummy import DummyRegressor
+import xgboost as xg
+from IPython.core.interactiveshell import InteractiveShell
+# Display the value of every top-level expression in a cell, not only the last one.
+InteractiveShell.ast_node_interactivity = "all"
+
+# Put the parent of the current working directory on the import path so that the
+# `machine_learning` package imported below can be found
+# (presumably the notebook is run from a sub-directory of the repository — verify).
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.features_sensor
+import machine_learning.labels
+import machine_learning.model
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## Appraisal of event stressfulness
+
+# %% jupyter={"source_hidden": true}
+model_input = pd.read_csv("../data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
+
+# %% jupyter={"source_hidden": true}
+
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+
+model_input.set_index(index_columns, inplace=True)
+
+cv_method = 'half_logo'
+if cv_method == 'logo':
+    # Leave-one-group-out over participants: each pid is one group.
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+else:
+    # Split the rows of every participant into two groups, "<pid>_1" and "<pid>_2".
+    # (A leftover filter on p037/p064/p092 whose result was discarded was removed here.)
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+
+    # Relative position inside the participant (0 <= position < 1) shifted by one
+    # and rounded: rows in the first half get 1, the remaining rows get 2.
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+
+    # Every helper column is removed from the features. `pid_count` used to be left
+    # in, which handed the models the number of rows of each participant as a feature.
+    helper_columns = ["target", "pid", "pid_index", "pid_count", "pid_half"]
+    data_x, data_y, data_groups = model_input.drop(helper_columns, axis=1), model_input["target"], model_input["pid_half"]
+
+# %% jupyter={"source_hidden": true}
+# Columns treated as categorical: two fixed ones plus every column whose name
+# mentions the most common activity or the home label.
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [
+    col
+    for col in data_x.columns
+    if any(marker in col for marker in ("mostcommonactivity", "homelabel"))
+]
+categorical_feature_colnames.extend(additional_categorical_features)
+
+# %% jupyter={"source_hidden": true}
+categorical_features = data_x[categorical_feature_colnames].copy()
+
+# %% jupyter={"source_hidden": true}
+# Most frequent value of every categorical column (first row of the mode table)
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# %% jupyter={"source_hidden": true}
+# Missing categories are replaced by the most frequent value of their column
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# %% jupyter={"source_hidden": true}
+# One-hot encode the categorical columns (skipped when there is nothing to encode)
+categorical_features = categorical_features.apply(lambda column: column.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+# %% jupyter={"source_hidden": true}
+numerical_features = data_x.drop(columns=categorical_feature_colnames)
+
+# %% jupyter={"source_hidden": true}
+# Final feature matrix: numerical columns followed by the one-hot encoded ones
+train_x = pd.concat((numerical_features, categorical_features), axis="columns")
+
+# %% jupyter={"source_hidden": true}
+train_x.dtypes
+
+# %% jupyter={"source_hidden": true}
+logo = LeaveOneGroupOut()
+# Top-level expression: the notebook shows how many splits the groups produce
+logo.get_n_splits(X=train_x, y=data_y, groups=data_groups)
+
+# For any other cv_method the splitter is replaced by None, which makes
+# cross_validate fall back to its default of 5 folds
+if cv_method not in ('logo', 'half_logo'):
+    logo = None
+
+# %% jupyter={"source_hidden": true}
+# Number of missing target values
+sum(data_y.isna())
+
+# %% [markdown]
+# ### Shared evaluation helper
+
+# %% jupyter={"source_hidden": true}
+# Metrics computed for every model.
+_SCORING = ('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+# (printed label, key in the dictionary returned by cross_validate), in print order.
+_REPORTED_SCORES = (
+    ("Negative Mean Squared Error", 'test_neg_mean_squared_error'),
+    ("Negative Mean Absolute Error", 'test_neg_mean_absolute_error'),
+    ("Negative Root Mean Squared Error", 'test_neg_root_mean_squared_error'),
+    ("R2", 'test_r2'),
+)
+
+# Missing feature values are replaced by the mean of their column.
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+
+def _cross_validate_and_report(model):
+    """Cross-validate ``model`` on the imputed features and print the median scores.
+
+    The features ``train_x`` are mean-imputed, the model is evaluated with the
+    splitter ``logo`` (``None`` means the 5-fold default of ``cross_validate``)
+    using ``data_groups`` as groups, and the NaN-ignoring median over the folds
+    is printed for every metric in ``_SCORING``.
+
+    Args:
+        model: Unfitted scikit-learn compatible regressor.
+
+    Returns:
+        dict: The dictionary returned by ``cross_validate``.
+    """
+    scores = cross_validate(
+        model,
+        X=imputer.fit_transform(train_x),
+        y=data_y,
+        groups=data_groups,
+        cv=logo,
+        n_jobs=-1,
+        scoring=_SCORING,
+    )
+    for label, key in _REPORTED_SCORES:
+        print(label, np.nanmedian(scores[key]))
+    return scores
+
+
+# %% [markdown]
+# ### Baseline: Dummy Regression (mean)
+
+# %% jupyter={"source_hidden": true}
+dummy_regr = DummyRegressor(strategy="mean")
+# Kept under its own name: it used to be stored as `lin_reg_scores` and was
+# overwritten by the linear regression cell below.
+dummy_regressor = _cross_validate_and_report(dummy_regr)
+
+# %% [markdown]
+# ### Linear Regression
+
+# %% jupyter={"source_hidden": true}
+lin_reg_rapids = linear_model.LinearRegression()
+lin_reg_scores = _cross_validate_and_report(lin_reg_rapids)
+
+# %% [markdown]
+# ### XGBRegressor Linear Regression
+
+# %% jupyter={"source_hidden": true}
+xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=10)
+xgb_reg_scores = _cross_validate_and_report(xgb_r)
+
+# %% [markdown]
+# ### XGBRegressor Pseudo Huber Error Regression
+
+# %% jupyter={"source_hidden": true}
+xgb_psuedo_huber_r = xg.XGBRegressor(objective='reg:pseudohubererror', n_estimators=10)
+xgb_psuedo_huber_reg_scores = _cross_validate_and_report(xgb_psuedo_huber_r)
+
+# %% [markdown]
+# ### Ridge regression
+
+# %% jupyter={"source_hidden": true}
+ridge_reg = linear_model.Ridge(alpha=.5)
+ridge_reg_scores = _cross_validate_and_report(ridge_reg)
+
+# %% [markdown]
+# ### Lasso
+
+# %% jupyter={"source_hidden": true}
+lasso_reg = linear_model.Lasso(alpha=0.1)
+lasso_reg_score = _cross_validate_and_report(lasso_reg)
+
+# %% [markdown]
+# ### Bayesian Ridge
+
+# %% jupyter={"source_hidden": true}
+bayesian_ridge_reg = linear_model.BayesianRidge()
+bayesian_ridge_reg_score = _cross_validate_and_report(bayesian_ridge_reg)
+
+# %% [markdown]
+# ### RANSAC (outlier robust regression)
+
+# %% jupyter={"source_hidden": true}
+ransac_reg = linear_model.RANSACRegressor()
+ransac_reg_scores = _cross_validate_and_report(ransac_reg)
+
+# %% [markdown]
+# ### Support vector regression
+
+# %% jupyter={"source_hidden": true}
+svr = svm.SVR()
+svr_scores = _cross_validate_and_report(svr)
+
+# %% [markdown]
+# ### Kernel Ridge regression
+
+# %% jupyter={"source_hidden": true}
+kridge = kernel_ridge.KernelRidge()
+kridge_scores = _cross_validate_and_report(kridge)
+
+# %% [markdown]
+# ### Gaussian Process Regression
+
+# %% jupyter={"source_hidden": true}
+gpr = gaussian_process.GaussianProcessRegressor()
+gpr_scores = _cross_validate_and_report(gpr)
+
+# %%
--- a/exploration/tree_high_dpi.png
+++ b/exploration/tree_high_dpi.png
--- a/machine_learning/classification_models.py
+++ b/machine_learning/classification_models.py
@ -0,0 +1,71 @@
+from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
+from lightgbm import LGBMClassifier
+import xgboost as xg 
+
+class ClassificationModels():
+    """Registry of classifiers, each paired with four running metric totals.
+
+    ``cmodels`` maps a model title to ``{'model': estimator, 'metrics': [...]}``.
+    The ``metrics`` list starts at ``[0, 0, 0, 0]`` and is reported by
+    ``get_total_models_scores`` in the order accuracy, precision, recall, F1.
+    """
+
+    def __init__(self):
+        self.cmodels = self.init_classification_models()
+
+    def get_cmodels(self):
+        """Return the dictionary of registered models."""
+        return self.cmodels
+
+    def init_classification_models(self):
+        """Build a fresh registry with unfitted estimators and zeroed metrics.
+
+        Returns:
+            dict: Model title -> ``{'model': estimator, 'metrics': [0, 0, 0, 0]}``.
+        """
+        estimators = (
+            ('dummy_classifier', DummyClassifier(strategy="most_frequent")),
+            ('logistic_regression', linear_model.LogisticRegression(max_iter=1000)),
+            ('support_vector_machine', svm.SVC()),
+            ('gaussian_naive_bayes', naive_bayes.GaussianNB()),
+            ('stochastic_gradient_descent_classifier', linear_model.SGDClassifier()),
+            ('knn', neighbors.KNeighborsClassifier()),
+            ('decision_tree', tree.DecisionTreeClassifier()),
+            ('random_forest_classifier', ensemble.RandomForestClassifier()),
+            ('gradient_boosting_classifier', ensemble.GradientBoostingClassifier()),
+            ('lgbm_classifier', LGBMClassifier()),
+            ('XGBoost_classifier', xg.sklearn.XGBClassifier()),
+        )
+        # Every entry gets its own metrics list so the totals are never shared.
+        return {
+            title: {'model': estimator, 'metrics': [0, 0, 0, 0]}
+            for title, estimator in estimators
+        }
+
+    def get_total_models_scores(self, n_clusters=1):
+        """Print each model's four metric totals divided by ``n_clusters``.
+
+        Args:
+            n_clusters (int, optional): Divisor applied to every total. Defaults to 1.
+        """
+        metric_labels = ("Acc:", "Precision:", "Recall:", "F1:")
+        for model_title, model in self.cmodels.items():
+            print("\n************************************\n")
+            print("Current model:", model_title, end="\n")
+            for position, label in enumerate(metric_labels):
+                print(label, model['metrics'][position]/n_clusters)
--- a/machine_learning/cross_validation.py
+++ b/machine_learning/cross_validation.py
@ -0,0 +1,121 @@
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
+
+class CrossValidation():
+    """Creates cross-validation splits from a DataFrame with 'target' and 'pid' columns.
+
+    Supported methods are 'logo' (leave one pid out), 'half_logo' (leave one half of a
+    pid's rows out) and '5kfold' (stratified 5-fold, which ignores the groups).
+    """
+
+
+    def __init__(self, data=None, cv_method='logo'):
+        """Initializes the cv_method argument and optionally prepares the data if supplied.
+
+        Args:
+            data (DataFrame, optional): Pandas DataFrame with target, pid columns and other features as columns.
+                Defaults to None.
+            cv_method (str, optional): String of cross validation method; options are 'logo', 'half_logo' and '5kfold'.
+                Defaults to 'logo'.
+        """
+        # Defined up front so that get_splits() can report "no data" instead of failing on a missing attribute.
+        self.data = None
+        self.X = self.y = self.groups = None
+
+        self.initialize_cv_method(cv_method)
+
+        if data is not None:
+            self.prepare_data(data)
+
+
+    def prepare_data(self, data):
+        """Splits the data into features, target and groups according to cv_method.
+
+        For 'half_logo' the helper columns 'pid_index', 'pid_count' and 'pid_half' are added to a
+        copy of the data (the caller's DataFrame is left untouched); 'pid_half' becomes the group
+        label and none of the helper columns is kept among the features.
+
+        Args:
+            data (DataFrame): Pandas DataFrame with target, pid columns and other features as columns.
+        """
+        helper_columns = []
+        if self.cv_method == "half_logo":
+            data = data.copy()
+            data['pid_index'] = data.groupby('pid').cumcount()
+            data['pid_count'] = data.groupby('pid')['pid'].transform('count')
+
+            # Relative position within the pid (in [0, 1)) shifted by one and rounded: 1 = first half, 2 = second half.
+            data["pid_index"] = (data['pid_index'] / data['pid_count'] + 1).round()
+            # astype(str) keeps this working when pid is numeric.
+            data["pid_half"] = data["pid"].astype(str) + "_" + data["pid_index"].astype(int).astype(str)
+
+            helper_columns = ["pid_index", "pid_count", "pid_half"]
+            data_groups = data["pid_half"]
+        else:
+            # 'logo' and '5kfold' both use the plain pid as group label.
+            data_groups = data["pid"]
+
+        self.data = data
+        self.X = data.drop(["target", "pid"] + helper_columns, axis=1)
+        self.y = data["target"]
+        self.groups = data_groups
+
+
+    def initialize_cv_method(self, cv_method):
+        """Stores cv_method and creates the matching sklearn splitter.
+
+        Args:
+            cv_method (str): The type of cross-validation method to use; options are 'logo', 'half_logo' and '5kfold'.
+
+        Raises:
+            ValueError: If cv_method is not in the list of available methods.
+        """
+
+        self.cv_method = cv_method
+        if self.cv_method not in ["logo", "half_logo", "5kfold"]:
+            raise ValueError("Invalid cv_method input. Correct values are: 'logo', 'half_logo', '5kfold'")
+
+        if self.cv_method in ["logo", "half_logo"]:
+            self.cv = LeaveOneGroupOut()
+        elif self.cv_method == "5kfold":
+            # NOTE(review): no random_state is set, so the folds differ between runs.
+            self.cv = StratifiedKFold(n_splits=5, shuffle=True)
+
+
+    def get_splits(self):
+        """Returns a generator object containing the cross-validation splits.
+
+        Raises:
+            ValueError: If no (or empty) data has been set.
+
+        """
+        data = getattr(self, "data", None)
+        if data is None or data.empty:
+            raise ValueError("No data has been set. Use 'prepare_data(data)' method to set the data.")
+        return self.cv.split(self.X, self.y, self.groups)
+
+
+    def get_data(self):
+        """data getter
+
+        Returns:
+            Pandas DataFrame: The data stored by prepare_data (including helper columns for 'half_logo').
+        """
+        return self.data
+
+
+    def get_x_y_groups(self):
+        """X, y, and groups data getter
+
+        Returns:
+            tuple: Feature DataFrame X, target Series y and group-label Series groups.
+        """
+        return self.X, self.y, self.groups
+
+
+    def get_train_test_sets(self, split):
+        """Gets train and test sets for one split by positional indexing into X and y.
+
+        Args:
+            split (tuple of indices): One item of the split generator (see get_splits method);
+                split[0] holds the train positions and split[1] the test positions.
+
+        Returns:
+            tuple of Pandas objects: train_X, train_y, test_X, test_y.
+        """
+        train_positions, test_positions = split[0], split[1]
+        return (
+            self.X.iloc[train_positions],
+            self.y.iloc[train_positions],
+            self.X.iloc[test_positions],
+            self.y.iloc[test_positions],
+        )
+    
+    
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@ -0,0 +1,221 @@
+import os
+import sys
+import warnings
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.linear_model import Lasso
+from sklearn.model_selection import KFold, StratifiedKFold, cross_validate
+from sklearn.naive_bayes import GaussianNB
+
+
+""" Feature selection pipeline: methods that can be used in a wrapper method alongside other wrapper contents (hyperparameter tuning etc.).
+
+(1) Establish methods for each of the steps in feature selection protocol.
+(2) Ensure that above methods are given only a part of data and use appropriate random seeds - to later simulate use case in production. 
+(3) Implement a method which gives graphical exploration of (1) (a) and (b) steps of the feature selection.
+(4) Prepare a core method that can be fit into a wrapper (see sklearn wrapper methods) and integrates methods from (1)
+
+"""
+
+class FeatureSelection:
+    """Greedy feature-selection helpers (work in progress).
+
+    None of the selection routines use instance state, so they are static methods and can be
+    called either on the class or on an instance.
+    """
+
+    def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
+        pass # TODO....
+
+
+    @staticmethod
+    def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=None):
+        """Selects the best feature by cross-validating on the feature set with or without each candidate.
+
+        With method "remove", each candidate is left out in turn and the model is fitted on all
+        other feature columns of df. With method "add", each candidate is fitted together with
+        stored_features. The candidate whose run gives the highest mean of the chosen metric
+        over 5 shuffled folds is returned.
+
+        Args:
+            df (DataFrame): Input data; must contain a 'target' column. 'target' and 'pid' are never used as features.
+            features (list): Candidate features to select the best/worst from.
+            method (str, optional): "remove" or "add". Defaults to "remove".
+            ml_type (str, optional): "classification" (GaussianNB) or "regression" (Lasso). Defaults to "classification".
+            metric (str, optional): Metric used to rank the candidates. Defaults to "recall".
+            stored_features (list, optional): For method "add", the features that had been previously added. Defaults to None (no stored features).
+
+        Raises:
+            ValueError: If the metric is not recognised for the selected ml_type.
+            ValueError: If an unknown ml_type or method is chosen, or features is empty.
+
+        Returns:
+            tuple: name of the best feature, best feature score, best feature score standard deviation.
+                All three are None if no candidate could be evaluated (e.g. "remove" with a single feature column).
+        """
+        if ml_type == "classification":
+            if metric not in ['accuracy', 'precision', 'recall', 'f1']:
+                raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
+        elif ml_type == "regression":
+            if metric not in ['r2']:
+                raise ValueError("Regression metric not recognized. Please choose 'r2'")
+        else:
+            raise ValueError("ML type not yet implemented!")
+
+        if method not in ("remove", "add"):
+            raise ValueError("Method not recognized. Please choose 'remove' or 'add'")
+        if not features:
+            raise ValueError("At least one candidate feature is required.")
+
+        stored_features = [] if stored_features is None else list(stored_features)
+        # 'target' and 'pid' must never end up among the predictors.
+        feature_columns = [col for col in df.columns if col not in ('target', 'pid')]
+        y = df['target']
+
+        best_feature = None
+        best_metric_score = None
+        best_metric_score_std = None
+
+        for feat in features:
+            if method == "remove":
+                pred_features = [col for col in feature_columns if feat != col] # All but feat
+            else:
+                pred_features = [feat] + stored_features # Feat with stored features
+
+            if not pred_features:
+                # Nothing left to fit on (removing the only feature column).
+                continue
+
+            if ml_type == "classification":
+                estimator = GaussianNB()
+                cv = StratifiedKFold(n_splits=5, shuffle=True)
+                scoring = ('accuracy', 'precision', 'recall', 'f1')
+            else:
+                estimator = Lasso()
+                # Stratification is only defined for class labels, so plain KFold is used for regression.
+                cv = KFold(n_splits=5, shuffle=True)
+                # A tuple (not a bare string) makes cross_validate report the score under 'test_r2'.
+                scoring = ('r2',)
+
+            # The filter has to be active while cross_validate runs, because that is where the warning is emitted.
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
+                model_cv = cross_validate(
+                    estimator,
+                    X=df[pred_features],
+                    y=y,
+                    cv=cv,
+                    n_jobs=-1,
+                    scoring=scoring
+                )
+
+            fold_scores = model_cv["test_" + metric]
+            score = np.mean(fold_scores)
+            score_std = np.std(fold_scores)
+
+            if best_feature is None or (score > best_metric_score):
+                best_feature = feat
+                best_metric_score = score
+                best_metric_score_std = score_std
+
+        return best_feature, best_metric_score, best_metric_score_std
+
+
+    @staticmethod
+    def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
+        """Runs backward elimination and proposes tipping points for the size of the feature set.
+
+        Starting from all feature columns of df, the feature whose removal gives the highest recall
+        (see select_best_feature) is removed repeatedly until a single feature is left. Each step is
+        recorded as (i, name, metric, metric_sd), where i is the number of features before the removal.
+        Three candidate tipping points are then computed on the rows with n_min < i <= n_max.
+
+        NOTE(review): which of the three criteria should drive the final selection is still open;
+        all of them are returned so the caller can decide.
+
+        Args:
+            df (DataFrame): Data with 'pid', 'target' and feature columns.
+            n_min (int, optional): Lower bound of the considered feature-set size. Defaults to 20.
+            n_max (int, optional): Upper bound of the considered feature-set size (capped at the number of features). Defaults to 50.
+            method (str, optional): Only "remove" is implemented. Defaults to "remove".
+            n_not_improve (int, optional): Number of consecutive non-improving steps tolerated by the "patience" criterion. Defaults to 10.
+
+        Raises:
+            ValueError: If n_min exceeds the number of features or n_max is smaller than n_min.
+            NotImplementedError: If method is not "remove".
+
+        Returns:
+            tuple: (feature_importance_df, tipping_points) where tipping_points is a dict with the keys
+                'cumulative', 'first_drop' and 'patience'; each value is an (i, name) index tuple or None.
+        """
+        n_features = df.shape[1] - 2 # -2 because pid and target are not considered
+        if n_max > n_features:
+            n_max = n_features
+
+        if n_min > n_features:
+            raise ValueError("The number of features in the dataframe must be at least n_min.")
+
+        if n_max < n_min:
+            raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
+
+        if method != "remove":
+            raise NotImplementedError("Only the 'remove' method is implemented in select_features.")
+
+        features = df.columns.tolist()
+        features.remove("pid")
+        features.remove("target")
+
+        feature_importance = []
+        # Stop at one remaining feature: removing it would leave nothing to fit the model on.
+        while len(features) > 1:
+            n_remaining = len(features)
+            # Restrict the frame to the features still in play so that removals accumulate.
+            best_feature, best_metric_score, best_metric_score_std = \
+                FeatureSelection.select_best_feature(
+                    df[features + ["pid", "target"]], features,
+                    method=method, ml_type="classification", metric="recall"
+                )
+            feature_importance.append((n_remaining, best_feature, best_metric_score, best_metric_score_std))
+
+            features.remove(best_feature)
+
+        feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
+
+        # Selection criterion for features within the min-max range.
+        # E.g. pick the best score in that range, or continue while the score in that range keeps
+        # rising by 0.0X and select the feature set at which it stops rising.
+
+        # The feature set will be chosen from i=1 to i=index of the selected feature.
+
+        # The "tipping point" feature must lie within the min-max range.
+
+        in_range = (feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)
+        selection_area = feature_importance_df[in_range].set_index(["i", "name"])
+        diffrences = selection_area.diff().dropna(how='any')
+
+        # Perhaps also a cumulative sum, where simply the index with the highest value is taken?
+        cumulative_sumation = diffrences.cumsum()
+        tipping_feature_indx_1 = None
+        if not cumulative_sumation.empty:
+            tipping_feature_indx_1 = cumulative_sumation["metric"].idxmax()
+
+        # Very conservative method that stops searching for a better alternative at the first non-improvement.
+        tipping_feature_indx_2 = None
+        for indx, row in diffrences.iterrows():
+            if row["metric"] > 0:
+                tipping_feature_indx_2 = indx
+            else:
+                break
+
+        # Method that gives n_not_improve features the chance to beat the best score so far.
+        tipping_feature_indx_3 = None
+        best_score_so_far = None
+        i_worse = 0
+        # TODO: it might make sense to combine the diff, cumsum and scores columns ...
+        for indx, row in selection_area.iterrows():
+            if best_score_so_far is None or row["metric"] > best_score_so_far:
+                tipping_feature_indx_3 = indx
+                best_score_so_far = row["metric"]
+                i_worse = 0
+            else:
+                i_worse += 1
+
+            if i_worse == n_not_improve:
+                break
+
+        tipping_points = {
+            "cumulative": tipping_feature_indx_1,
+            "first_drop": tipping_feature_indx_2,
+            "patience": tipping_feature_indx_3,
+        }
+        return feature_importance_df, tipping_points
+
+
+    @staticmethod
+    def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=None, print_flag=False):
+        """Placeholder, not implemented yet."""
+        pass
+
+    @staticmethod
+    def vizualize_feature_selection_process():
+        """Placeholder, not implemented yet."""
+        pass
+
+    @staticmethod
+    def execute_feature_selection_step():
+        """Placeholder, not implemented yet."""
+        pass
--- a/machine_learning/helper.py
+++ b/machine_learning/helper.py
@ -1,6 +1,15 @@
 from pathlib import Path
+from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.impute import SimpleImputer
+from sklearn.dummy import DummyRegressor, DummyClassifier
+
+from xgboost import XGBRegressor, XGBClassifier
+import xgboost as xg

 import pandas as pd
+import numpy as np


 def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
@ -55,3 +64,396 @@ def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> P
    export_filename = filename_prefix + "_" + data_type + ".csv"
    full_path = folder / export_filename
    return full_path
+
+def insert_row(df, row):
+    """Return a new DataFrame consisting of df followed by row, with a fresh 0..n-1 index.
+
+    row is interpreted positionally against df's columns.
+    """
+    row_frame = pd.DataFrame([row], columns=df.columns)
+    return pd.concat([df, row_frame], ignore_index=True)
+
+def prepare_regression_model_input(input_csv):
+    """Read the model input CSV and return (features, target, groups).
+
+    The four local_segment* columns become the index, 'target' and 'pid' are split off as
+    target and groups, and the categorical columns are mode-imputed and one-hot encoded.
+    Numerical columns are passed through unchanged and placed before the encoded ones.
+    """
+    segment_index = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+    model_input = pd.read_csv(input_csv).set_index(segment_index)
+
+    data_y = model_input["target"]
+    data_groups = model_input["pid"]
+    data_x = model_input.drop(["target", "pid"], axis=1)
+
+    # Fixed categorical columns plus every column whose name mentions an activity or home label.
+    categorical_feature_colnames = ["gender", "startlanguage", "limesurvey_demand_control_ratio_quartile"]
+    categorical_feature_colnames.extend(
+        name for name in data_x.columns if "mostcommonactivity" in name or "homelabel" in name
+    )
+    #TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
+    encoded = data_x[categorical_feature_colnames].copy()
+    # Missing categories are replaced by the (first) mode of their column.
+    encoded = encoded.fillna(encoded.mode().iloc[0])
+    encoded = encoded.apply(lambda column: column.astype("category"))
+    if not encoded.empty:
+        encoded = pd.get_dummies(encoded)
+
+    numeric = data_x.drop(categorical_feature_colnames, axis=1)
+
+    return pd.concat([numeric, encoded], axis=1), data_y, data_groups
+
+
+def run_all_regression_models(input_csv):
+    """Cross-validate a fixed set of regressors with leave-one-group-out and summarise their scores.
+
+    Args:
+        input_csv: Passed to prepare_regression_model_input, which reads it with pandas.read_csv.
+
+    Returns:
+        DataFrame: One row per (model, test metric) with the columns "method", "max" and
+            "nanmedian"; the index holds the metric names ("test_r2", ...).
+    """
+    # Prepare data
+    data_x, data_y, data_groups = prepare_regression_model_input(input_csv)
+
+    # Prepare cross validation
+    logo = LeaveOneGroupOut()
+    metrics = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']
+    test_metrics = ["test_" + metric for metric in metrics]
+
+    # (value for the "method" column, progress message, estimator, also print the median R^2?)
+    models = [
+        ("dummy", "Dummy model:", DummyRegressor(strategy="mean"), True),
+        ("linear_reg", "Linear regression:", linear_model.LinearRegression(), True),
+        ("ridge_reg", "Ridge regression", linear_model.Ridge(alpha=.5), False),
+        ("lasso_reg", "Lasso regression", linear_model.Lasso(alpha=0.1), False),
+        ("bayesian_ridge", "Bayesian Ridge", linear_model.BayesianRidge(), False),
+        ("RANSAC", "RANSAC (outlier robust regression)", linear_model.RANSACRegressor(), False),
+        ("SVR", "Support vector regression", svm.SVR(), False),
+        ("kernel_ridge", "Kernel Ridge regression", kernel_ridge.KernelRidge(), False),
+        ("gaussian_proc", "Gaussian Process Regression", gaussian_process.GaussianProcessRegressor(), False),
+        ("random_forest", "Random Forest Regression", ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1), False),
+        ("XGBoost", "XGBoost Regressor", XGBRegressor(), False),
+        ("ADA_boost", "ADA Boost Regressor", ensemble.AdaBoostRegressor(), False),
+    ]
+
+    # The empty frame goes first so that the result keeps the "method", "max", "nanmedian" column order.
+    score_frames = [pd.DataFrame(columns=["method", "max", "nanmedian"])]
+
+    # Validate models
+    for method, message, estimator, report_r2 in models:
+        cv_scores = cross_validate(
+            estimator,
+            X=data_x,
+            y=data_y,
+            groups=data_groups,
+            cv=logo,
+            n_jobs=-1,
+            scoring=metrics
+        )
+        print(message)
+        if report_r2:
+            print("R^2: ", np.nanmedian(cv_scores['test_r2']))
+
+        # Reduce the per-fold scores to their maximum and NaN-ignoring median, one row per metric.
+        scores_df = pd.DataFrame(cv_scores)[test_metrics]
+        scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+        scores_df["method"] = method
+        score_frames.append(scores_df)
+
+    return pd.concat(score_frames)
+
+
+def run_all_classification_models(data_x, data_y, data_groups, cv_method):
+    """Cross-validate a fixed set of classifiers and summarise their scores.
+
+    Args:
+        data_x: Feature matrix passed to cross_validate as X.
+        data_y: Target passed to cross_validate as y.
+        data_groups: Group labels passed to cross_validate as groups.
+        cv_method: Passed to cross_validate as cv (presumably a splitter such as LeaveOneGroupOut; verify against the caller).
+
+    Returns:
+        DataFrame: One row per (model, test metric) with the columns "method", "max" and
+            "mean"; the index holds the metric names ("test_accuracy", ...).
+    """
+    metrics = ['accuracy', 'average_precision', 'recall', 'f1']
+    test_metrics = ["test_" + metric for metric in metrics]
+
+    # (value for the "method" column, progress message, estimator, extra cross_validate arguments)
+    models = [
+        # Only the dummy baseline is run with error_score='raise'; the others keep cross_validate's default.
+        ("Dummy", "Dummy", DummyClassifier(strategy="most_frequent"), {"error_score": 'raise'}),
+        ("logistic_reg", "Logistic regression", linear_model.LogisticRegression(), {}),
+        ("svc", "Support Vector Machine", svm.SVC(), {}),
+        ("gaussian_naive_bayes", "Gaussian Naive Bayes", naive_bayes.GaussianNB(), {}),
+        ("stochastic_gradient_descent", "Stochastic Gradient Descent", linear_model.SGDClassifier(), {}),
+        ("random_forest", "Random Forest", ensemble.RandomForestClassifier(), {}),
+        ("xgboost", "XGBoost", XGBClassifier(), {}),
+    ]
+
+    # The empty frame goes first so that the result keeps the "method", "max", "mean" column order.
+    score_frames = [pd.DataFrame(columns=["method", "max", "mean"])]
+
+    for method, message, estimator, extra_arguments in models:
+        cv_scores = cross_validate(
+            estimator,
+            X=data_x,
+            y=data_y,
+            groups=data_groups,
+            cv=cv_method,
+            n_jobs=-1,
+            scoring=metrics,
+            **extra_arguments
+        )
+        print(message)
+
+        # Reduce the per-fold scores to their maximum and mean, one row per metric.
+        scores_df = pd.DataFrame(cv_scores)[test_metrics]
+        scores_df = scores_df.agg(['max', 'mean']).transpose()
+        scores_df["method"] = method
+        score_frames.append(scores_df)
+
+    return pd.concat(score_frames)
--- a/machine_learning/preprocessing.py
+++ b/machine_learning/preprocessing.py
@ -0,0 +1,126 @@
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+class Preprocessing:
+    """Preprocessing steps for one train/test split (e.g. a single CV iteration).
+
+    The class is blind to the test data: every statistic used to fill missing values
+    (mode, mean, median) is computed on the train set only and then applied to both sets.
+    """
+
+
+    def __init__(self, train_X, train_y, test_X, test_y):
+        self.train_X = train_X
+        self.train_y = train_y
+        self.test_X = test_X
+        self.test_y = test_y
+
+
+    def one_hot_encoder(self, categorical_features, numerical_features, mode):
+        """One-hot encodes the categorical columns and appends them to the numerical ones.
+
+        Missing categorical values are first filled with ``mode``; each column is then cast to
+        the category dtype and expanded with pd.get_dummies().
+
+        Args:
+            categorical_features (DataFrame): DataFrame including only categorical columns.
+            numerical_features (DataFrame): DataFrame including only numerical columns.
+            mode (Series): Fill value per categorical column (the column modes). TODO: check mode results
+
+        Returns:
+            DataFrame: Numerical columns followed by the one-hot encoded categorical columns.
+        """
+        # Fill missing values with the supplied mode
+        categorical_features = categorical_features.fillna(mode)
+
+        # one-hot encoding
+        categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+        if not categorical_features.empty:
+            categorical_features = pd.get_dummies(categorical_features)
+
+        return pd.concat([numerical_features, categorical_features], axis=1)
+
+
+    def one_hot_encode_train_and_test_sets(self, categorical_columns=("gender", "startlanguage", "mostcommonactivity", "homelabel")):
+        """One-hot encodes the categorical columns of the train and test sets in place.
+
+        Missing values in both sets are filled with the train-set mode. After encoding, the test
+        set is aligned to the train set's columns: categories that only occur in the train set
+        become all-zero columns and categories unseen in training are dropped, so that a model
+        fitted on train_X can be applied to test_X.
+
+        Args:
+            categorical_columns (iterable of str, optional): Names of the categorical columns; only
+                those present in the train set are used (exact name match).
+                Defaults to ("gender", "startlanguage", "mostcommonactivity", "homelabel").
+                NOTE(review): other code in this project matches "mostcommonactivity"/"homelabel"
+                as substrings of column names; confirm that exact matching is intended here.
+        """
+        categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
+        if not categorical_columns:
+            # Nothing to encode; the mode of an empty frame has no first row to take.
+            return
+
+        # For train set
+        train_X_categorical_features = self.train_X[categorical_columns].copy()
+        train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
+        mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
+
+        self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
+
+        # For test set
+        test_X_categorical_features = self.test_X[categorical_columns].copy()
+        test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
+
+        encoded_test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
+        # get_dummies only creates columns for categories present in its input, so align to train.
+        self.test_X = encoded_test_X.reindex(columns=self.train_X.columns, fill_value=0)
+
+
+    def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
+        """Fills missing values: means for interval features, medians for the other features.
+
+        All statistics come from the train set. If groupby_feature is given, each missing value
+        is filled with the statistic of its group (e.g. participant) in the train set; values
+        whose group is absent from the train set, or has no observed value for that column, fall
+        back to the statistic over the whole train set. If groupby_feature is None (or empty),
+        only the whole-train-set statistic is used.
+
+        Args:
+            interval_feature_list (list): Columns to impute with the mean.
+            other_feature_list (list): Columns to impute with the median.
+            groupby_feature (str, optional): Column holding the group label. Defaults to "pid".
+
+        Raises:
+            KeyError: If groupby_feature is given but missing from the train or test set.
+        """
+        if groupby_feature:
+            for set_name, frame in (("train", self.train_X), ("test", self.test_X)):
+                if groupby_feature not in frame.columns:
+                    raise KeyError(
+                        f"Column '{groupby_feature}' is missing from the {set_name} set; "
+                        "pass groupby_feature=None to impute without grouping."
+                    )
+
+        # Interval numerical features
+        self._impute_columns(list(interval_feature_list), "mean", groupby_feature)
+        # Other features
+        self._impute_columns(list(other_feature_list), "median", groupby_feature)
+
+
+    def _impute_columns(self, columns, statistic, groupby_feature):
+        """Computes ``statistic`` on the train set and fills ``columns`` of both sets with it."""
+        if not columns:
+            return
+
+        overall_values = self.train_X[columns].agg(statistic)
+        group_values = None
+        if groupby_feature:
+            group_values = self.train_X.groupby(groupby_feature)[columns].agg(statistic)
+
+        self.train_X = self._fill_missing(self.train_X, columns, overall_values, group_values, groupby_feature)
+        self.test_X = self._fill_missing(self.test_X, columns, overall_values, group_values, groupby_feature)
+
+
+    @staticmethod
+    def _fill_missing(frame, columns, overall_values, group_values, groupby_feature):
+        """Returns a copy of ``frame`` with ``columns`` filled by group values first, overall values second."""
+        frame = frame.copy()
+        block = frame[columns]
+        if group_values is not None:
+            # One row of group statistics per row of frame; groups unknown to the train set give NaN rows.
+            per_row_values = group_values.reindex(frame[groupby_feature].to_numpy()).to_numpy()
+            block = block.where(block.notna(), per_row_values)
+        frame[columns] = block.fillna(overall_values)
+        return frame
+
+
+    def get_train_test_sets(self):
+        """Train and test sets getter
+
+        Returns:
+            tuple of Pandas DataFrames: Gets train test sets in traditional sklearn format.
+        """
+        return self.train_X, self.train_y, self.test_X, self.test_y
+        
+        
+
--- a/machine_learning/prox_comm_PANAS_nb.ipynb
+++ b/machine_learning/prox_comm_PANAS_nb.ipynb
--- a/presentation/ApplicationCategories.R
+++ b/presentation/ApplicationCategories.R
@ -0,0 +1,51 @@
+# Downloads the "app_categories" table from the project's PostgreSQL database,
+# prints a quick overview of it and cleans up a few Play Store genre labels.
+library(conflicted)
+library(yaml)
+# NOTE(review): RPostgreSQL is attached here, but the driver and the connection
+# below come from the RPostgres package (called via `::`) -- confirm both are needed.
+library(RPostgreSQL)
+library(tidyverse)
+# resolve name clashes in favour of the dplyr versions
+conflicts_prefer(
+  dplyr::filter,
+  dplyr::lag
+)
+library(magrittr)
+
+# read the password from file
+credentials <- yaml.load_file("../rapids/credentials.yaml")
+pw <- credentials$PSQL_STRAW$password
+
+# load the PostgreSQL driver
+drv <- RPostgres::Postgres()
+
+# create a connection to the postgres database
+# note that "con" is used by every database call below
+con <- RPostgres::dbConnect(drv,
+  dbname = "staw",
+  host = "eol.ijs.si", port = 5432,
+  user = "staw_db", password = pw
+)
+
+rm(pw, credentials) # removes the password (and the rest of the credentials) from the workspace
+
+# check that the app_categories table exists
+dbExistsTable(con, "app_categories")
+
+# fetch the whole table into a local data frame
+df_app_categories <- tbl(con, "app_categories") %>%
+  collect()
+
+# quick look at the data and at the frequency of each genre
+head(df_app_categories)
+table(df_app_categories$play_store_genre)
+
+# Correct some mistakes: collapse the duplicated "Education" labels and
+# relabel "not_found" as "System"; all other values are kept unchanged.
+df_app_categories %<>% mutate(
+  play_store_genre = {
+    function(x) {
+      case_when(
+        x == "Education,Education" ~ "Education",
+        x == "EducationEducation" ~ "Education",
+        x == "not_found" ~ "System",
+        .default = x
+      )
+    }
+  }(play_store_genre)
+)
+
+# close the database connection
+dbDisconnect(con)
--- a/presentation/StressfulEvents.Rmd
+++ b/presentation/StressfulEvents.Rmd
@ -0,0 +1,103 @@
+---
+title: "Stressful event detection"
+output: html_notebook
+---
+
+```{r chunk_options, include=FALSE}
+knitr::opts_chunk$set(
+  comment = "#>", echo = FALSE, fig.width = 6
+)
+```
+
+```{r libraries, include=FALSE}
+library(knitr)
+library(kableExtra)
+library(stringr)
+library(RColorBrewer)
+library(magrittr)
+library(tidyverse)
+```
+
+```{r fig_setup, include=FALSE}
+accent <- RColorBrewer::brewer.pal(7, "Accent")
+```
+
+
+```{r read_data, include=FALSE}
+# NOTE: absolute, machine-specific path -- adjust before knitting on another computer.
+podatki <- read_csv("E:/STRAWresults/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
+# Treat the participant ID as a factor ("podatki" is presumably Slovenian for "data").
+podatki %<>% mutate(pid = as_factor(pid))
+```
+
+# Event descriptions
+
+Participants were asked "Was there a particular event that created tension in you?" with the following options:
+
+- 0 - No	
+- 1 - Yes, slightly	
+- 2 - Yes, moderately	
+- 3 - Yes, considerably	
+- 4 - Yes, extremely
+
+If they answered anything but "No", they were also asked about the event's perceived threat (e.g. "Did this event make you feel anxious?") and challenge (e.g. "How eager are you to tackle this event?"). 
+We only consider general "stressfulness" in this presentation.
+
+Most of the time, nothing stressful happened:
+ 
+```{r target_table}
+# Frequency table of the target values, rendered as a compact (not full-width) table.
+kable(table(podatki$target), col.names = c("stressfulness", "frequency")) %>% 
+  kable_styling(full_width = FALSE)
+```
+
+Most participants had somewhere between 0 and 10 stressful events.
+
+```{r target_distribution}
+# Histogram of the number of rows with target > 0 per participant.
+podatki %>% 
+  group_by(pid) %>% 
+  summarise(no_of_events = sum(target > 0)) %>% 
+  ggplot(aes(no_of_events)) +
+  geom_histogram(binwidth = 1, fill = accent[1]) +
+  coord_cartesian(expand = FALSE) +
+  labs(x = "Number of events per participant") +
+  theme_classic()
+```
+
+When a stressful event occurred, participants mostly perceived it as slightly to moderately stressful on average.
+
+```{r mean_stressfulness_distribution}
+# Histogram of each participant's mean target value, computed only over rows with target > 0.
+podatki %>% 
+  filter(target > 0) %>% 
+  group_by(pid) %>% 
+  summarise(mean_stressfulness = mean(target)) %>% 
+  ggplot(aes(mean_stressfulness)) +
+  geom_histogram(binwidth = 0.1, fill = accent[1]) +
+  coord_cartesian(expand = FALSE) +
+  labs(x = "Mean stressfulness per participant") +
+  theme_classic()
+```
+
+# Problem description
+
+We are trying to predict whether a stressful event occurred, i.e. stressfulness > 0, or not (stressfulness == 0).
+First, we set up leave-one-subject-out validation and use the original distribution of the class variable.
+
+For this, the majority classifier has a mean accuracy of 0.85 (and median 0.90), while the F1-score, precision and recall are all 0.
+
+We also have the option to validate the results differently, such as with "half-loso", i.e. leaving half of the subject's data in the training set and using only the other half for testing, or with k-fold cross-validation.
+Additionally, we can undersample the majority class to balance the dataset.
+
+# Results
+## Leave one subject out, original distribution
+
+```{r event_detection}
+# Read the scores; col_types "ffdd" parses the four columns as
+# factor, factor, double, double.
+scores <- read_csv("event_stressful_detection_loso.csv", col_types = "ffdd")
+# Drop the "max" column and spread the metrics into one column each
+# (filled with the "mean" values), then strip the "test_" prefix from the names.
+scores_wide <- scores %>% 
+  select(!max) %>% 
+  pivot_wider(names_from = metric, 
+              names_sep = "_",
+              values_from = mean) %>% 
+  rename_all(~str_replace(.,"^test_",""))
+# Render the table with two decimals and highlight its fourth column.
+kable(scores_wide, digits = 2) %>% 
+  column_spec(4, color = 'white', background = 'black') %>% 
+  kable_styling(full_width = TRUE)
+```
+
--- a/presentation/classification.py
+++ b/presentation/classification.py
@ -0,0 +1,127 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.dummy import DummyClassifier
+from sklearn.impute import SimpleImputer
+
+import xgboost as xg
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+from pathlib import Path
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.labels
+import machine_learning.model
+from machine_learning.helper import run_all_classification_models
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## Set script's parameters
+#
+
+# %%
+# 'logo' selects leave-one-group-out, 'halflogo' additionally splits each
+# participant's rows into two groups; any other value leaves the CV method unset.
+cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+# NOTE(review): n_sl is not referenced anywhere else in this script.
+n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %% jupyter={"source_hidden": true}
+filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
+model_input = pd.read_csv(filename)
+
+# %% jupyter={"source_hidden": true}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input.set_index(index_columns, inplace=True)
+model_input['target'].value_counts()
+
+# %% jupyter={"source_hidden": true}
+bins = [-10, -1, 1, 10] # bins for z-scored targets
+# bins = [0, 1, 4] # bins for stressfulness (1-4) target
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'medium', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
+model_input['target'].value_counts(), edges
+model_input = model_input[model_input['target'] != "medium"]
+model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
+
+model_input['target'].value_counts()
+
+if cv_method_str == 'halflogo':
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
+else:
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+
+
+# %% jupyter={"source_hidden": true}
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+categorical_features = data_x[categorical_feature_colnames].copy()
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+train_x = pd.concat([numerical_features, categorical_features], axis=1)
+
+# %% jupyter={"source_hidden": true}
+cv_method = None # Defaults to 5 k-folds in cross_validate method
+if cv_method_str == 'logo' or cv_method_str == 'half_logo':
+    cv_method = LeaveOneGroupOut()
+    cv_method.get_n_splits(
+        train_x,
+        data_y,
+        groups=data_groups,
+    )
+
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+
+# %%
+final_scores = run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
+
+# %%
+final_scores.index.name = "metric"
+final_scores = final_scores.set_index(["method", final_scores.index])
+final_scores.to_csv("event_stressfulness_lmh_lh_scores.csv")
--- a/presentation/event_stressful_detection_5fold.csv
+++ b/presentation/event_stressful_detection_5fold.csv
@ -0,0 +1,29 @@
+method,metric,max,mean
+Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
+Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
+Dummy,test_recall,0.0,0.0
+Dummy,test_f1,0.0,0.0
+logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
+logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
+logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
+logistic_reg,test_f1,0.3909774436090226,0.318943511424051
+svc,test_accuracy,0.8557046979865772,0.8548446932649828
+svc,test_average_precision,0.44514416839823046,0.4068200938341621
+svc,test_recall,0.0,0.0
+svc,test_f1,0.0,0.0
+gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
+gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
+gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
+gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
+stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
+stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
+stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
+stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
+random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
+random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
+random_forest,test_recall,0.4069767441860465,0.35356856455493185
+random_forest,test_f1,0.5691056910569107,0.5078402513053142
+xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
+xgboost,test_average_precision,0.7366643049075349,0.698622165966308
+xgboost,test_recall,0.5287356321839081,0.44346431435445066
+xgboost,test_f1,0.638888888888889,0.5633957169928393
--- a/presentation/event_stressful_detection_logo.csv
+++ b/presentation/event_stressful_detection_logo.csv
@ -0,0 +1,29 @@
+method,metric,max,mean
+Dummy,test_accuracy,1.0,0.8524114578096439
+Dummy,test_average_precision,0.7,0.14758854219035614
+Dummy,test_recall,0.0,0.0
+Dummy,test_f1,0.0,0.0
+logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
+logistic_reg,test_average_precision,1.0,0.44605167668563583
+logistic_reg,test_recall,1.0,0.25353566685532386
+logistic_reg,test_f1,0.823529411764706,0.27951926390778625
+svc,test_accuracy,1.0,0.8524114578096439
+svc,test_average_precision,0.9612401707068228,0.44179454944271934
+svc,test_recall,0.0,0.0
+svc,test_f1,0.0,0.0
+gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
+gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
+gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
+gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
+stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
+stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
+stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
+stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
+random_forest,test_accuracy,1.0,0.8722158105763481
+random_forest,test_average_precision,1.0,0.49817066323226833
+random_forest,test_recall,1.0,0.18161263127840668
+random_forest,test_f1,1.0,0.2508096532365307
+xgboost,test_accuracy,1.0,0.8812627400277729
+xgboost,test_average_precision,1.0,0.5505695112208401
+xgboost,test_recall,1.0,0.2896161238315027
+xgboost,test_f1,0.9411764705882353,0.36887408735855665
--- a/presentation/event_stressfulness.py
+++ b/presentation/event_stressfulness.py
@ -0,0 +1,60 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: Python 3.10.8 ('straw2analysis')
+#     language: python
+#     name: python3
+# ---
+
+# %%
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+import yaml
+from pyprojroot import here
+from sklearn import linear_model, svm, kernel_ridge, gaussian_process
+from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut, cross_val_score, cross_validate
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.impute import SimpleImputer
+from sklearn.dummy import DummyRegressor
+import xgboost as xg
+
+from pathlib import Path
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.features_sensor
+import machine_learning.labels
+import machine_learning.model
+import machine_learning.helper
+
+
+
+# %% tags=["active-ipynb"]
+# NOTE: this cell is tagged "active-ipynb" and therefore commented out in the .py
+# representation; `filename` is only defined when running the paired notebook.
+# The second assignment overrides the first one.
+# filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
+# filename = Path('C:/Users/Primoz/VSCodeProjects/straw2analysis/data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv')
+
+# %%
+# Run all regression models on the chosen input file.
+final_scores = machine_learning.helper.run_all_regression_models(filename)
+
+# %%
+# Index the scores by (method, metric).
+final_scores.index.name = "metric"
+final_scores = final_scores.set_index(["method", final_scores.index])
+
+# %%
+final_scores.to_csv("event_stressfulness_scores.csv")
--- a/presentation/plots/d18NArfr_PCA.pdf
+++ b/presentation/plots/d18NArfr_PCA.pdf
--- a/presentation/plots/d18NArfr_hist.pdf
+++ b/presentation/plots/d18NArfr_hist.pdf
--- a/presentation/plots/d18NArfr_relplot.pdf
+++ b/presentation/plots/d18NArfr_relplot.pdf
--- a/presentation/plots/d18demandBayRidge_PCA.pdf
+++ b/presentation/plots/d18demandBayRidge_PCA.pdf
--- a/presentation/plots/d18demandBayRidge_relplot.pdf
+++ b/presentation/plots/d18demandBayRidge_relplot.pdf
--- a/presentation/plots/d18demandBayridge_hist.pdf
+++ b/presentation/plots/d18demandBayridge_hist.pdf
--- a/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
+++ b/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
--- a/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
+++ b/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
--- a/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
+++ b/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
--- a/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
+++ b/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
--- a/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
+++ b/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
--- a/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
+++ b/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
--- a/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
+++ b/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
--- a/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
+++ b/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
--- a/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
+++ b/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
--- a/presentation/presentation.Rproj
+++ b/presentation/presentation.Rproj
@ -0,0 +1,17 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+
+SpellingDictionary: en_GB
--- a/presentation/prox_comm_PANAS_nb.py
+++ b/presentation/prox_comm_PANAS_nb.py
@ -0,0 +1,131 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+# %matplotlib inline
+import yaml
+from sklearn import linear_model
+from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
+import os
+import importlib
+import matplotlib.pyplot as plt
+import sys
+import numpy as np
+import seaborn as sns
+import pandas as pd
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+# %%
+from machine_learning import pipeline, features_sensor, labels, model
+
+# %%
+importlib.reload(labels)
+
+# %%
+with open("./config/prox_comm_PANAS_features.yaml", "r") as file:
+    sensor_features_params = yaml.safe_load(file)
+sensor_features = features_sensor.SensorFeatures(**sensor_features_params)
+#sensor_features.set_sensor_data()
+sensor_features.calculate_features(cached=True)
+
+# %%
+all_features = sensor_features.get_features("all","all")
+
+# %%
+with open("./config/prox_comm_PANAS_labels.yaml", "r") as file:
+    labels_params = yaml.safe_load(file)
+labels_current = labels.Labels(**labels_params)
+#labels_current.set_labels()
+labels_current.aggregate_labels(cached=True)
+
+# %%
+# Validate a plain linear regression, grouping the rows by participant
+# (cv_name="loso"; presumably leave-one-subject-out -- see ModelValidation).
+model_validation = model.ModelValidation(
+    sensor_features.get_features("all", "all"),
+    labels_current.get_aggregated_labels(),
+    group_variable="participant_id",
+    cv_name="loso",
+)
+model_validation.model = linear_model.LinearRegression()
+model_validation.set_cv_method()
+
+# %%
+# Scores per fold (R2 judging by the variable name -- confirm in ModelValidation.cross_validate).
+model_loso_r2 = model_validation.cross_validate()
+
+# %%
+print(model_loso_r2)
+print(np.mean(model_loso_r2))
+
+# %%
+# Folds with a positive score.
+model_loso_r2[model_loso_r2 > 0]
+
+# %%
+logo = LeaveOneGroupOut()
+
+# %%
+# Turn the index levels into columns and drop them, leaving only the values.
+try_X = model_validation.X.reset_index().drop(["participant_id","date_lj"], axis=1)
+try_y = model_validation.y.reset_index().drop(["participant_id","date_lj"], axis=1)
+
+# %%
+# The scorer returns the negated MAE, hence the multiplication by -1.
+model_loso_mean_absolute_error = -1 * cross_val_score(
+estimator=model_validation.model,
+X=try_X,
+y=try_y,
+groups=model_validation.groups,
+cv=logo.split(X=try_X, y=try_y, groups=model_validation.groups), 
+scoring='neg_mean_absolute_error'
+)
+
+# %%
+model_loso_mean_absolute_error
+
+# %%
+np.median(model_loso_mean_absolute_error)
+
+# %%
+# Fit on the complete data set ...
+model_validation.model.fit(try_X, try_y)
+
+# %%
+# ... and predict the same rows, i.e. these are in-sample predictions.
+Y_predicted = model_validation.model.predict(try_X)
+
+# %%
+# Put true and predicted values side by side, then reshape to long format:
+# the "NA_" stub is split off and the suffix ("true"/"predicted") goes to "value".
+try_y.rename(columns={"NA": "NA_true"}, inplace=True)
+try_y["NA_predicted"] = Y_predicted
+NA_long = pd.wide_to_long(
+    try_y.reset_index(),
+    i="index",
+    j="value",
+    stubnames="NA",
+    sep="_",
+    suffix=".+",
+)
+
+# %%
+# Overlaid histograms of true and predicted values.
+g1 = sns.displot(NA_long, x="NA", hue="value", binwidth=0.1, height=5, aspect=1.5)
+sns.move_legend(g1, "upper left", bbox_to_anchor=(.55, .45))
+g1.set_axis_labels("Daily mean", "Day count")
+
+# display() is provided by IPython/Jupyter; it is not defined in a plain Python run.
+display(g1)
+g1.savefig("prox_comm_PANAS_predictions.pdf")
+
+# %%
+from sklearn.metrics import mean_absolute_error
+
+# %%
+# MAE of the in-sample predictions, for comparison with the per-fold values below.
+mean_absolute_error(try_y["NA_true"], try_y["NA_predicted"])
+
+# %%
+model_loso_mean_absolute_error
--- a/presentation/results_presentation.py
+++ b/presentation/results_presentation.py
@ -0,0 +1,163 @@
+# %%
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+import yaml
+from pyprojroot import here
+from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
+from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate, cross_val_predict
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.impute import SimpleImputer
+from sklearn.dummy import DummyRegressor
+from sklearn.decomposition import PCA
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.helper
+
+# %%
+segment = "intradaily_30_min"
+target = "JCQ_job_demand"
+csv_name = "./data/" + segment + "_all_targets/input_" + target + "_mean.csv"
+#csv_name = "./data/daily_18_hours_all_targets/input_JCQ_job_demand_mean.csv"
+
+# %%
+data_x, data_y, data_groups = machine_learning.helper.prepare_model_input(csv_name)
+
+# %%
+data_y.head()
+
+# %%
+scores = machine_learning.helper.run_all_models(csv_name)
+
+
+# %% jupyter={"source_hidden": true}
+logo = LeaveOneGroupOut()
+logo.get_n_splits(
+    data_x,
+    data_y,
+    groups=data_groups,
+)
+
+# %% [markdown]
+# ### Baseline: Dummy Regression (mean)
+dummy_regr = DummyRegressor(strategy="mean")
+
+# %% jupyter={"source_hidden": true}
+lin_reg_scores = cross_validate(
+    dummy_regr,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
+print("R2", np.median(lin_reg_scores['test_r2']))
+
+##################
+# %%
+# Random forest: cross-validate with the shared splitter and report the median
+# of every metric across folds.
+chosen_model = "Random Forest"
+rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
+rfr_score = cross_validate(
+    rfr,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+print("Negative Mean Squared Error", np.median(rfr_score['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(rfr_score['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(rfr_score['test_neg_root_mean_squared_error']))
+print("R2", np.median(rfr_score['test_r2']))
+
+# %%
+# Out-of-fold predictions of the random forest.
+y_predicted = cross_val_predict(rfr, data_x, data_y, groups=data_groups, cv=logo)
+#########################
+# %%
+# Bayesian ridge. NOTE: running this section overwrites `chosen_model` and
+# `y_predicted`, so the plots further down describe whichever model ran last.
+chosen_model = "Bayesian Ridge"
+bayesian_ridge_reg = linear_model.BayesianRidge()
+bayesian_ridge_reg_score = cross_validate(
+    bayesian_ridge_reg,
+    X=data_x,
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+)
+# The mean squared error is computed above but not printed for this model.
+print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
+print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
+
+# %%
+# Out-of-fold predictions of the Bayesian ridge model.
+y_predicted = cross_val_predict(bayesian_ridge_reg, data_x, data_y, groups=data_groups, cv=logo)
+
+# %%
+# Combine the true target with the group column and the latest predictions.
+# NOTE: `data_y` is rebound here, so re-running this cell without re-running
+# the data preparation operates on the already modified frame.
+data_y = pd.DataFrame(pd.concat([data_y, data_groups], axis=1))
+data_y.rename(columns={"target": "y_true"}, inplace=True)
+data_y["y_predicted"] = y_predicted
+
+# %%
+data_y.head()
+
+# %%
+# Scatter plot of predicted against true values.
+g1 = sns.relplot(data=data_y, x="y_true", y="y_predicted")
+#g1.set_axis_labels("true", "predicted")
+#g1.map(plt.axhline, y=0, color=".7", dashes=(2, 1), zorder=0)
+#g1.map(plt.axline, xy1=(0,0), slope=1)
+g1.set(title=",".join([segment, target, chosen_model]))
+# display() is provided by IPython/Jupyter; it is not defined in a plain Python run.
+display(g1)
+# The last element already starts with "_", so the file name contains "__relplot.pdf".
+g1.savefig("_".join([segment, target, chosen_model, "_relplot.pdf"]))
+
+# %%
+# Long format: the "y_" stub is split off and the suffix ("true"/"predicted")
+# is stored in the "value" index level.
+data_y_long = pd.wide_to_long(
+    data_y.reset_index(),
+    i=["local_segment", "pid"],
+    j="value",
+    stubnames="y",
+    sep="_",
+    suffix=".+",
+)
+
+# %%
+data_y_long.head()
+# %%
+# Overlaid histograms of true and predicted values.
+g2 = sns.displot(data_y_long, x="y", hue="value", binwidth=0.1, height=5, aspect=1.5)
+sns.move_legend(g2, "upper left", bbox_to_anchor=(.55, .45))
+g2.set(title=",".join([segment, target, chosen_model]))
+g2.savefig("_".join([segment, target, chosen_model, "hist.pdf"]))
+
+# %%
+pca = PCA(n_components=2)
+pca.fit(data_x)
+print(pca.explained_variance_ratio_)
+
+# %%
+data_x_pca = pca.fit_transform(data_x)
+data_pca = pd.DataFrame(pd.concat([data_y.reset_index()["y_true"], pd.DataFrame(data_x_pca, columns = {"pca_0", "pca_1"})], axis=1))
+
+# %%
+data_pca
+# %%
+
+g3 = sns.relplot(data = data_pca, x = "pca_0", y = "pca_1", hue = "y_true", palette = sns.color_palette("Spectral", as_cmap=True))
+g3.set(title=",".join([segment, target, chosen_model]) + "\n variance explained = " + str(round(sum(pca.explained_variance_ratio_), 2)))
+g3.savefig("_".join([segment, target, chosen_model, "_PCA.pdf"]))
+
+# %%