From 8defb271c97c07bc50f46baebb53cb8a6694b645 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Mon, 21 Nov 2022 11:23:47 +0100
Subject: [PATCH 01/11] Extend ml pipeline scripts with two additional CV
 methods.

---
 exploration/ml_pipeline_daily.py              | 19 ++++++++++---
 .../ml_pipeline_daily_cleaned_intradaily.py   | 18 +++++++++++--
 .../ml_pipeline_stress_event_cleaned.py       | 27 +++++++++++++------
 3 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/exploration/ml_pipeline_daily.py b/exploration/ml_pipeline_daily.py
index db4ab7e..e12cc1f 100644
--- a/exploration/ml_pipeline_daily.py
+++ b/exploration/ml_pipeline_daily.py
@@ -72,11 +72,20 @@ sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
 
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-#if "pid" in model_input.columns:
-#    index_columns.append("pid")
+
 model_input.set_index(index_columns, inplace=True)
 
-data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+cv_method = '5kfold'
+if cv_method == 'half_logo':
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+else:
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
 
 # %% jupyter={"source_hidden": true}
 categorical_feature_colnames = ["gender", "startlanguage"]
@@ -114,6 +123,10 @@ logo.get_n_splits(
     groups=data_groups,
 )
 
+# Defaults to 5 k folds in cross_validate method
+if cv_method != 'logo' and cv_method != 'half_logo':
+    logo = None
+
 # %% jupyter={"source_hidden": true}
 sum(data_y.isna())
 
diff --git a/exploration/ml_pipeline_daily_cleaned_intradaily.py b/exploration/ml_pipeline_daily_cleaned_intradaily.py
index 3e27620..dccdfd8 100644
--- a/exploration/ml_pipeline_daily_cleaned_intradaily.py
+++ b/exploration/ml_pipeline_daily_cleaned_intradaily.py
@@ -50,7 +50,7 @@ import machine_learning.model
 # ## PANAS negative affect
 
 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_PANAS_negative_affect_mean.csv")
+model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
 
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
@@ -58,7 +58,17 @@ index_columns = ["local_segment", "local_segment_label", "local_segment_start_da
 #    index_columns.append("pid")
 model_input.set_index(index_columns, inplace=True)
 
-data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+cv_method = '5kfold' # 'logo', 'half_logo', or any other value for the default k-fold
+if cv_method == 'logo':
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+else:
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+    # Relative row position within each participant, rounded to 1 (first half) or 2 (second half)
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+    # pid_count is only a helper column; drop it so it is not used as a feature
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
 
 # %% jupyter={"source_hidden": true}
 categorical_feature_colnames = ["gender", "startlanguage"]
@@ -98,6 +108,10 @@ logo.get_n_splits(
     groups=data_groups,
 )
 
+# Defaults to 5 k folds in cross_validate method
+if cv_method != 'logo' and cv_method != 'half_logo':
+    logo = None
+
 # %% jupyter={"source_hidden": true}
 sum(data_y.isna())
 
diff --git a/exploration/ml_pipeline_stress_event_cleaned.py b/exploration/ml_pipeline_stress_event_cleaned.py
index 3b6cd6d..9bef7f9 100644
--- a/exploration/ml_pipeline_stress_event_cleaned.py
+++ b/exploration/ml_pipeline_stress_event_cleaned.py
@@ -53,12 +53,25 @@ import machine_learning.model
 model_input = pd.read_csv("../data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
 
 # %% jupyter={"source_hidden": true}
+
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-#if "pid" in model_input.columns:
-#    index_columns.append("pid")
+
 model_input.set_index(index_columns, inplace=True)
 
-data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+cv_method = 'half_logo' # 'logo', 'half_logo', or any other value for the default k-fold
+if cv_method == 'logo':
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+else:
+    # Split every participant's rows (in row order) into two halves and use
+    # "<pid>_1" / "<pid>_2" as the cross-validation groups.
+
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+    # pid_count is only a helper column; drop it so it is not used as a feature
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
 
 # %% jupyter={"source_hidden": true}
 categorical_feature_colnames = ["gender", "startlanguage"]
@@ -97,12 +110,10 @@ logo.get_n_splits(
     data_y,
     groups=data_groups,
 )
-logo.split(
-    train_x,
-    data_y,
-    groups=data_groups,
-)
 
+# Defaults to 5 k folds in cross_validate method
+if cv_method != 'logo' and cv_method != 'half_logo':
+    logo = None
 
 # %% jupyter={"source_hidden": true}
 sum(data_y.isna())

From ae0f54ecc26a296e8f8572e7af3396cd48bcc6af Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Mon, 21 Nov 2022 11:41:11 +0100
Subject: [PATCH 02/11] Combine different segment scripts and set ml pipeline
 as a regression problem.

---
 exploration/ml_pipeline_daily.py              | 284 ---------------
 .../ml_pipeline_daily_cleaned_daily.py        | 332 ------------------
 ...ntradaily.py => ml_pipeline_regression.py} |  10 +-
 3 files changed, 5 insertions(+), 621 deletions(-)
 delete mode 100644 exploration/ml_pipeline_daily.py
 delete mode 100644 exploration/ml_pipeline_daily_cleaned_daily.py
 rename exploration/{ml_pipeline_daily_cleaned_intradaily.py => ml_pipeline_regression.py} (97%)

diff --git a/exploration/ml_pipeline_daily.py b/exploration/ml_pipeline_daily.py
deleted file mode 100644
index e12cc1f..0000000
--- a/exploration/ml_pipeline_daily.py
+++ /dev/null
@@ -1,284 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.13.0
-#   kernelspec:
-#     display_name: straw2analysis
-#     language: python
-#     name: straw2analysis
-# ---
-
-# %% jupyter={"source_hidden": true}
-# %matplotlib inline
-import datetime
-import importlib
-import os
-import sys
-
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-import yaml
-from pyprojroot import here
-from sklearn import linear_model, svm, kernel_ridge, gaussian_process
-from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
-from sklearn.metrics import mean_squared_error, r2_score
-from sklearn.impute import SimpleImputer
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
-
-import machine_learning.features_sensor
-import machine_learning.labels
-import machine_learning.model
-
-# %% [markdown]
-# # RAPIDS models
-
-# %% [markdown]
-# ## PANAS negative affect
-
-# %% jupyter={"source_hidden": true}
-# model_input = pd.read_csv("../data/input_PANAS_NA.csv") # Nestandardizirani podatki - pred temeljitim čiščenjem
-model_input = pd.read_csv("../data/z_input_PANAS_NA.csv") # Standardizirani podatki - pred temeljitim čiščenjem
-# %% [markdown]
-# ### NaNs before dropping cols and rows
-
-# %% jupyter={"source_hidden": true}
-sns.set(rc={"figure.figsize":(16, 8)})
-sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
-
-# %% jupyter={"source_hidden": true}
-nan_cols = list(model_input.loc[:, model_input.isna().all()].columns)
-nan_cols
-
-# %% jupyter={"source_hidden": true}
-model_input.dropna(axis=1, how="all", inplace=True)
-model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
-
-# %% [markdown]
-# ### NaNs after dropping NaN cols and rows where target is NaN
-
-# %% jupyter={"source_hidden": true}
-sns.set(rc={"figure.figsize":(16, 8)})
-sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
-
-# %% jupyter={"source_hidden": true}
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-
-model_input.set_index(index_columns, inplace=True)
-
-cv_method = '5kfold'
-if cv_method == 'half_logo':
-    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
-else:
-    model_input['pid_index'] = model_input.groupby('pid').cumcount()
-    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
-
-    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
-    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
-
-    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
-
-# %% jupyter={"source_hidden": true}
-categorical_feature_colnames = ["gender", "startlanguage"]
-
-# %% jupyter={"source_hidden": true}
-categorical_features = data_x[categorical_feature_colnames].copy()
-
-# %% jupyter={"source_hidden": true}
-mode_categorical_features = categorical_features.mode().iloc[0]
-
-# %% jupyter={"source_hidden": true}
-# fillna with mode
-categorical_features = categorical_features.fillna(mode_categorical_features)
-
-# %% jupyter={"source_hidden": true}
-# one-hot encoding
-categorical_features = categorical_features.apply(lambda col: col.astype("category"))
-if not categorical_features.empty:
-    categorical_features = pd.get_dummies(categorical_features)
-
-# %% jupyter={"source_hidden": true}
-numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x = pd.concat([numerical_features, categorical_features], axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x.dtypes
-
-# %% jupyter={"source_hidden": true}
-logo = LeaveOneGroupOut()
-logo.get_n_splits(
-    train_x,
-    data_y,
-    groups=data_groups,
-)
-
-# Defaults to 5 k folds in cross_validate method
-if cv_method != 'logo' and cv_method != 'half_logo':
-    logo = None
-
-# %% jupyter={"source_hidden": true}
-sum(data_y.isna())
-
-# %% [markdown]
-# ### Linear Regression
-
-# %% jupyter={"source_hidden": true}
-lin_reg_rapids = linear_model.LinearRegression()
-
-# %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
-
-# %% jupyter={"source_hidden": true}
-lin_reg_scores = cross_val_score(
-    lin_reg_rapids,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring='r2'
-)
-lin_reg_scores
-np.median(lin_reg_scores)
-
-# %% [markdown]
-# ### Ridge regression
-
-# %% jupyter={"source_hidden": true}
-ridge_reg = linear_model.Ridge(alpha=.5)
-
-# %% tags=[] jupyter={"source_hidden": true}
-ridge_reg_scores = cross_val_score(
-    ridge_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(ridge_reg_scores)
-
-# %% [markdown]
-# ### Lasso
-
-# %% jupyter={"source_hidden": true}
-lasso_reg = linear_model.Lasso(alpha=0.1)
-
-# %% jupyter={"source_hidden": true}
-lasso_reg_score = cross_val_score(
-    lasso_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(lasso_reg_score)
-
-# %% [markdown]
-# ### Bayesian Ridge
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg = linear_model.BayesianRidge()
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg_score = cross_val_score(
-    bayesian_ridge_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(bayesian_ridge_reg_score)
-
-# %% [markdown]
-# ### RANSAC (outlier robust regression)
-
-# %% jupyter={"source_hidden": true}
-ransac_reg = linear_model.RANSACRegressor()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-    ransac_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-    )
-)
-
-# %% [markdown]
-# ### Support vector regression
-
-# %% jupyter={"source_hidden": true}
-svr = svm.SVR()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-    svr,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-    )
-)
-
-# %% [markdown]
-# ### Kernel Ridge regression
-
-# %% jupyter={"source_hidden": true}
-kridge = kernel_ridge.KernelRidge()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-        kridge,
-        X=imputer.fit_transform(train_x),
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-)
-# %% [markdown]
-# ### Gaussian Process Regression
-
-# %% jupyter={"source_hidden": true}
-gpr = gaussian_process.GaussianProcessRegressor()
-
-# %% jupyter={"source_hidden": true}
-
-np.median(
-    cross_val_score(
-        gpr,
-        X=imputer.fit_transform(train_x),
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-)
-# %%
diff --git a/exploration/ml_pipeline_daily_cleaned_daily.py b/exploration/ml_pipeline_daily_cleaned_daily.py
deleted file mode 100644
index 37b973a..0000000
--- a/exploration/ml_pipeline_daily_cleaned_daily.py
+++ /dev/null
@@ -1,332 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.13.0
-#   kernelspec:
-#     display_name: straw2analysis
-#     language: python
-#     name: straw2analysis
-# ---
-
-# %% jupyter={"source_hidden": true}
-# %matplotlib inline
-import datetime
-import importlib
-import os
-import sys
-
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-import yaml
-from pyprojroot import here
-from sklearn import linear_model, svm, kernel_ridge, gaussian_process
-from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
-from sklearn.metrics import mean_squared_error, r2_score
-from sklearn.impute import SimpleImputer
-from sklearn.dummy import DummyRegressor
-import xgboost as xg
-from IPython.core.interactiveshell import InteractiveShell
-InteractiveShell.ast_node_interactivity = "all"
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
-
-import machine_learning.features_sensor
-import machine_learning.labels
-import machine_learning.model
-
-# %% [markdown]
-# # RAPIDS models
-
-# %% [markdown]
-# ## PANAS negative affect
-
-# %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/daily_18_hours_all_targets/input_PANAS_negative_affect_mean.csv")
-
-# %% jupyter={"source_hidden": true}
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-#if "pid" in model_input.columns:
-#    index_columns.append("pid")
-model_input.set_index(index_columns, inplace=True)
-
-data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
-
-# %% jupyter={"source_hidden": true}
-categorical_feature_colnames = ["gender", "startlanguage"]
-additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
-categorical_feature_colnames += additional_categorical_features
-
-# %% jupyter={"source_hidden": true}
-categorical_features = data_x[categorical_feature_colnames].copy()
-
-# %% jupyter={"source_hidden": true}
-mode_categorical_features = categorical_features.mode().iloc[0]
-
-# %% jupyter={"source_hidden": true}
-# fillna with mode
-categorical_features = categorical_features.fillna(mode_categorical_features)
-
-# %% jupyter={"source_hidden": true}
-# one-hot encoding
-categorical_features = categorical_features.apply(lambda col: col.astype("category"))
-if not categorical_features.empty:
-    categorical_features = pd.get_dummies(categorical_features)
-
-# %% jupyter={"source_hidden": true}
-numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x = pd.concat([numerical_features, categorical_features], axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x.dtypes
-
-# %% jupyter={"source_hidden": true}
-logo = LeaveOneGroupOut()
-logo.get_n_splits(
-    train_x,
-    data_y,
-    groups=data_groups,
-)
-
-# %% jupyter={"source_hidden": true}
-sum(data_y.isna())
-
-# %% [markdown]
-# ### Baseline: Dummy Regression (mean)
-dummy_regr = DummyRegressor(strategy="mean")
-
-# %% jupyter={"source_hidden": true}
-lin_reg_scores = cross_validate(
-    dummy_regr,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(lin_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### Linear Regression
-
-# %% jupyter={"source_hidden": true}
-lin_reg_rapids = linear_model.LinearRegression()
-
-# %% jupyter={"source_hidden": true}
-lin_reg_scores = cross_validate(
-    lin_reg_rapids,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(lin_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### XGBRegressor Linear Regression
-# %% jupyter={"source_hidden": true}
-xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10)
-
-# %% jupyter={"source_hidden": true}
-xgb_reg_scores = cross_validate(
-    xgb_r,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(xgb_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(xgb_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(xgb_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(xgb_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### XGBRegressor Pseudo Huber Error Regression
-# %% jupyter={"source_hidden": true}
-xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10)
-
-# %% jupyter={"source_hidden": true}
-xgb_psuedo_huber_reg_scores = cross_validate(
-    xgb_psuedo_huber_r,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(xgb_psuedo_huber_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### Ridge regression
-
-# %% jupyter={"source_hidden": true}
-ridge_reg = linear_model.Ridge(alpha=.5)
-
-# %% tags=[] jupyter={"source_hidden": true}
-ridge_reg_scores = cross_validate(
-    ridge_reg,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(ridge_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(ridge_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(ridge_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(ridge_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### Lasso
-
-# %% jupyter={"source_hidden": true}
-lasso_reg = linear_model.Lasso(alpha=0.1)
-
-# %% jupyter={"source_hidden": true}
-lasso_reg_score = cross_validate(
-    lasso_reg,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(lasso_reg_score['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(lasso_reg_score['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(lasso_reg_score['test_neg_root_mean_squared_error']))
-print("R2", np.median(lasso_reg_score['test_r2']))
-
-# %% [markdown]
-# ### Bayesian Ridge
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg = linear_model.BayesianRidge()
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg_score = cross_validate(
-    bayesian_ridge_reg,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
-print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
-
-# %% [markdown]
-# ### RANSAC (outlier robust regression)
-
-# %% jupyter={"source_hidden": true}
-ransac_reg = linear_model.RANSACRegressor()
-
-# %% jupyter={"source_hidden": true}
-ransac_reg_scores = cross_validate(
-    ransac_reg,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(ransac_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(ransac_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(ransac_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(ransac_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### Support vector regression
-
-# %% jupyter={"source_hidden": true}
-svr = svm.SVR()
-
-# %% jupyter={"source_hidden": true}
-svr_scores = cross_validate(
-    svr,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(svr_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(svr_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(svr_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(svr_scores['test_r2']))
-
-# %% [markdown]
-# ### Kernel Ridge regression
-
-# %% jupyter={"source_hidden": true}
-kridge = kernel_ridge.KernelRidge()
-
-# %% jupyter={"source_hidden": true}
-kridge_scores = cross_validate(
-    kridge,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(kridge_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(kridge_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(kridge_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(kridge_scores['test_r2']))
-
-# %% [markdown]
-# ### Gaussian Process Regression
-
-# %% jupyter={"source_hidden": true}
-gpr = gaussian_process.GaussianProcessRegressor()
-
-# %% jupyter={"source_hidden": true}
-
-gpr_scores = cross_validate(
-    gpr,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(gpr_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(gpr_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(gpr_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(gpr_scores['test_r2']))
-
-# %%
diff --git a/exploration/ml_pipeline_daily_cleaned_intradaily.py b/exploration/ml_pipeline_regression.py
similarity index 97%
rename from exploration/ml_pipeline_daily_cleaned_intradaily.py
rename to exploration/ml_pipeline_regression.py
index dccdfd8..21d02cb 100644
--- a/exploration/ml_pipeline_daily_cleaned_intradaily.py
+++ b/exploration/ml_pipeline_regression.py
@@ -123,7 +123,7 @@ dummy_regr = DummyRegressor(strategy="mean")
 imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
 
 # %% jupyter={"source_hidden": true}
-lin_reg_scores = cross_validate(
+dummy_regressor = cross_validate(
     dummy_regr,
     X=imputer.fit_transform(train_x),
     y=data_y,
@@ -132,10 +132,10 @@ lin_reg_scores = cross_validate(
     n_jobs=-1,
     scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
-print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(lin_reg_scores['test_r2']))
+print("Negative Mean Squared Error", np.median(dummy_regressor['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(dummy_regressor['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(dummy_regressor['test_neg_root_mean_squared_error']))
+print("R2", np.median(dummy_regressor['test_r2']))
 
 # %% [markdown]
 # ### Linear Regression

From 40029a8205311d54f4e046325aba920df2a8b5dd Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Mon, 21 Nov 2022 14:47:19 +0100
Subject: [PATCH 03/11] Add a script for ml classification pipeline.

---
 exploration/ml_pipeline_classification.py | 356 ++++++++++++++++++++++
 exploration/ml_pipeline_regression.py     |   4 +-
 2 files changed, 358 insertions(+), 2 deletions(-)
 create mode 100644 exploration/ml_pipeline_classification.py

diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
new file mode 100644
index 0000000..140464b
--- /dev/null
+++ b/exploration/ml_pipeline_classification.py
@@ -0,0 +1,356 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.dummy import DummyClassifier
+from sklearn.impute import SimpleImputer
+
+from lightgbm import LGBMClassifier
+import xgboost as xg
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.labels
+import machine_learning.model
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## PANAS negative affect
+
+# %% jupyter={"source_hidden": true}
+model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+
+# %% jupyter={"source_hidden": true}
+bins = [-4, -1, 1, 4] # bins for z-scored targets
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'medium', 'high'], retbins=True, right=False)
+model_input['target'].value_counts(), edges
+model_input = model_input[model_input['target'] != "medium"]
+model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
+
+model_input['target'].value_counts()
+
+# %% jupyter={"source_hidden": true}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+#if "pid" in model_input.columns:
+#    index_columns.append("pid")
+model_input.set_index(index_columns, inplace=True)
+
+# %% jupyter={"source_hidden": true}
+cv_method = '5kfold'
+if cv_method == 'logo':
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+else:
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
+
+# %% jupyter={"source_hidden": true}
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [] #[col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+categorical_features = data_x[categorical_feature_colnames].copy()
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+train_x = pd.concat([numerical_features, categorical_features], axis=1)
+train_x.dtypes
+
+# %% jupyter={"source_hidden": true}
+logo = LeaveOneGroupOut()
+logo.get_n_splits(
+    train_x,
+    data_y,
+    groups=data_groups,
+)
+
+# Defaults to 5 k-folds in cross_validate method
+if cv_method != 'logo' and cv_method != 'half_logo':
+    logo = None
+
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+
+# %% [markdown]
+# ### Baseline: Dummy Classifier (most frequent)
+dummy_class = DummyClassifier(strategy="most_frequent")
+
+# %% jupyter={"source_hidden": true}
+dummy_classifier = cross_validate(
+    dummy_class,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('accuracy', 'average_precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.median(dummy_classifier['test_accuracy']))
+print("Precision", np.median(dummy_classifier['test_average_precision']))
+print("Recall", np.median(dummy_classifier['test_recall']))
+print("F1", np.median(dummy_classifier['test_f1']))
+
+# %% [markdown]
+# ### Logistic Regression
+
+# %% jupyter={"source_hidden": true}
+logistic_regression = linear_model.LogisticRegression()
+
+# %% jupyter={"source_hidden": true}
+log_reg_scores = cross_validate(
+    logistic_regression,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.median(log_reg_scores['test_accuracy']))
+print("Precision", np.median(log_reg_scores['test_precision']))
+print("Recall", np.median(log_reg_scores['test_recall']))
+print("F1", np.median(log_reg_scores['test_f1']))
+
+# %% [markdown]
+# ### Support Vector Machine
+
+# %% jupyter={"source_hidden": true}
+svc = svm.SVC()
+
+# %% jupyter={"source_hidden": true}
+svc_scores = cross_validate(
+    svc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.median(svc_scores['test_accuracy']))
+print("Precision", np.median(svc_scores['test_precision']))
+print("Recall", np.median(svc_scores['test_recall']))
+print("F1", np.median(svc_scores['test_f1']))
+
+# %% [markdown]
+# ### Gaussian Naive Bayes
+
+# %% jupyter={"source_hidden": true}
+gaussian_nb = naive_bayes.GaussianNB()
+
+# %% jupyter={"source_hidden": true}
+gaussian_nb_scores = cross_validate(
+    gaussian_nb,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.median(gaussian_nb_scores['test_accuracy']))
+print("Precision", np.median(gaussian_nb_scores['test_precision']))
+print("Recall", np.median(gaussian_nb_scores['test_recall']))
+print("F1", np.median(gaussian_nb_scores['test_f1']))
+
+# %% [markdown]
+# ### Stochastic Gradient Descent Classifier
+
+# %% jupyter={"source_hidden": true}
+sgdc = linear_model.SGDClassifier()
+
+# %% jupyter={"source_hidden": true}
+sgdc_scores = cross_validate(
+    sgdc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.median(sgdc_scores['test_accuracy']))
+print("Precision", np.median(sgdc_scores['test_precision']))
+print("Recall", np.median(sgdc_scores['test_recall']))
+print("F1", np.median(sgdc_scores['test_f1']))
+
+# %% [markdown]
+# ### K-nearest neighbors
+
+# %% jupyter={"source_hidden": true}
+knn = neighbors.KNeighborsClassifier()
+
+# %% jupyter={"source_hidden": true}
+knn_scores = cross_validate( # Nekaj ne funkcionira pravilno
+    knn,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+    # error_score='raise'
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.median(knn_scores['test_accuracy']))
+print("Precision", np.median(knn_scores['test_precision']))
+print("Recall", np.median(knn_scores['test_recall']))
+print("F1", np.median(knn_scores['test_f1']))
+
+# %% [markdown]
+# ### Decision Tree
+
+# %% jupyter={"source_hidden": true}
+dtree = tree.DecisionTreeClassifier()
+
+# %% jupyter={"source_hidden": true}
+dtree_scores = cross_validate(
+    dtree,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.median(dtree_scores['test_accuracy']))
+print("Precision", np.median(dtree_scores['test_precision']))
+print("Recall", np.median(dtree_scores['test_recall']))
+print("F1", np.median(dtree_scores['test_f1']))
+
+# %% [markdown]
+# ### Random Forest Classifier
+
+# %% jupyter={"source_hidden": true}
+rfc = ensemble.RandomForestClassifier()
+
+# %% jupyter={"source_hidden": true}
+rfc_scores = cross_validate(
+    rfc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.median(rfc_scores['test_accuracy']))
+print("Precision", np.median(rfc_scores['test_precision']))
+print("Recall", np.median(rfc_scores['test_recall']))
+print("F1", np.median(rfc_scores['test_f1']))
+
+# %% [markdown]
+# ### Gradient Boosting Classifier
+
+# %% jupyter={"source_hidden": true}
+gbc = ensemble.GradientBoostingClassifier()
+
+# %% jupyter={"source_hidden": true}
+gbc_scores = cross_validate(
+    gbc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.median(gbc_scores['test_accuracy']))
+print("Precision", np.median(gbc_scores['test_precision']))
+print("Recall", np.median(gbc_scores['test_recall']))
+print("F1", np.median(gbc_scores['test_f1']))
+
+# %% [markdown]
+# ### LGBM Classifier
+
+# %% jupyter={"source_hidden": true}
+lgbm = LGBMClassifier()
+
+# %% jupyter={"source_hidden": true}
+lgbm_scores = cross_validate(
+    lgbm,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.median(lgbm_scores['test_accuracy']))
+print("Precision", np.median(lgbm_scores['test_precision']))
+print("Recall", np.median(lgbm_scores['test_recall']))
+print("F1", np.median(lgbm_scores['test_f1']))
+
+# %% [markdown]
+# ### XGBoost Classifier
+
+# %% jupyter={"source_hidden": true}
+xgb_classifier = xg.sklearn.XGBClassifier()
+
+# %% jupyter={"source_hidden": true}
+xgb_classifier_scores = cross_validate(
+    xgb_classifier,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=logo,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.median(xgb_classifier_scores['test_accuracy']))
+print("Precision", np.median(xgb_classifier_scores['test_precision']))
+print("Recall", np.median(xgb_classifier_scores['test_recall']))
+print("F1", np.median(xgb_classifier_scores['test_f1']))
diff --git a/exploration/ml_pipeline_regression.py b/exploration/ml_pipeline_regression.py
index 21d02cb..98b2e3f 100644
--- a/exploration/ml_pipeline_regression.py
+++ b/exploration/ml_pipeline_regression.py
@@ -58,8 +58,8 @@ index_columns = ["local_segment", "local_segment_label", "local_segment_start_da
 #    index_columns.append("pid")
 model_input.set_index(index_columns, inplace=True)
 
-cv_method = '5kfold'
-if cv_method == 'half_logo':
+cv_method = 'half_logo' # logo, half_logo, 5kfold
+if cv_method == 'logo':
     data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
 else:
     model_input['pid_index'] = model_input.groupby('pid').cumcount()

From 183758cd37681eafee174c602f056ba08c227581 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Tue, 22 Nov 2022 14:31:49 +0100
Subject: [PATCH 04/11] Improve general ml classification pipeline script.

---
 exploration/ml_pipeline_classification.py | 104 ++++++++++++++--------
 1 file changed, 67 insertions(+), 37 deletions(-)

diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
index 140464b..fc2fb81 100644
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@@ -52,25 +52,20 @@ import machine_learning.model
 model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
 
 # %% jupyter={"source_hidden": true}
-bins = [-4, -1, 1, 4] # bins for z-scored targets
-model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'medium', 'high'], retbins=True, right=False)
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input.set_index(index_columns, inplace=True)
+
+# %% jupyter={"source_hidden": true}
+bins = [-10, -1, 1, 10] # bins for z-scored targets
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'medium', 'high'], retbins=True, right=False) #['low', 'medium', 'high']
 model_input['target'].value_counts(), edges
 model_input = model_input[model_input['target'] != "medium"]
 model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
 
 model_input['target'].value_counts()
 
-# %% jupyter={"source_hidden": true}
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-#if "pid" in model_input.columns:
-#    index_columns.append("pid")
-model_input.set_index(index_columns, inplace=True)
-
-# %% jupyter={"source_hidden": true}
-cv_method = '5kfold'
-if cv_method == 'logo':
-    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
-else:
+cv_method_str = 'logo' # logo, halflogo, 5kfold
+if cv_method_str == 'halflogo':
     model_input['pid_index'] = model_input.groupby('pid').cumcount()
     model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
 
@@ -78,6 +73,9 @@ else:
     model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
 
     data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
+else:
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+
 
 # %% jupyter={"source_hidden": true}
 categorical_feature_colnames = ["gender", "startlanguage"]
@@ -100,19 +98,21 @@ train_x = pd.concat([numerical_features, categorical_features], axis=1)
 train_x.dtypes
 
 # %% jupyter={"source_hidden": true}
-logo = LeaveOneGroupOut()
-logo.get_n_splits(
-    train_x,
-    data_y,
-    groups=data_groups,
-)
-
-# Defaults to 5 k-folds in cross_validate method
-if cv_method != 'logo' and cv_method != 'half_logo':
-    logo = None
+cv_method = None # Defaults to 5 k-folds in cross_validate method
+if cv_method_str == 'logo' or cv_method_str == 'half_logo':
+    cv_method = LeaveOneGroupOut()
+    cv_method.get_n_splits(
+        train_x,
+        data_y,
+        groups=data_groups,
+    )
+# %% jupyter={"source_hidden": true}
+# %% [markdown]
+# ### Set n for nlargest and nsmallest
+n = 5
 
 # %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 
 # %% [markdown]
 # ### Baseline: Dummy Classifier (most frequent)
@@ -124,8 +124,9 @@ dummy_classifier = cross_validate(
     X=imputer.fit_transform(train_x),
     y=data_y,
     groups=data_groups,
-    cv=logo,
+    cv=cv_method,
     n_jobs=-1,
+    error_score='raise',
     scoring=('accuracy', 'average_precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
@@ -133,6 +134,8 @@ print("Acc", np.median(dummy_classifier['test_accuracy']))
 print("Precision", np.median(dummy_classifier['test_average_precision']))
 print("Recall", np.median(dummy_classifier['test_recall']))
 print("F1", np.median(dummy_classifier['test_f1']))
+print("Largest 5 ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n)[:n])[::-1])
+print("Smallest 5 ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n)[:n]))
 
 # %% [markdown]
 # ### Logistic Regression
@@ -146,7 +149,7 @@ log_reg_scores = cross_validate(
     X=imputer.fit_transform(train_x),
     y=data_y,
     groups=data_groups,
-    cv=logo,
+    cv=cv_method,
     n_jobs=-1,
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
@@ -155,6 +158,8 @@ print("Acc", np.median(log_reg_scores['test_accuracy']))
 print("Precision", np.median(log_reg_scores['test_precision']))
 print("Recall", np.median(log_reg_scores['test_recall']))
 print("F1", np.median(log_reg_scores['test_f1']))
+print("Largest 5 ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n)[:n])[::-1])
+print("Smallest 5 ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n)[:n]))
 
 # %% [markdown]
 # ### Support Vector Machine
@@ -168,7 +173,7 @@ svc_scores = cross_validate(
     X=imputer.fit_transform(train_x),
     y=data_y,
     groups=data_groups,
-    cv=logo,
+    cv=cv_method,
     n_jobs=-1,
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
@@ -177,6 +182,8 @@ print("Acc", np.median(svc_scores['test_accuracy']))
 print("Precision", np.median(svc_scores['test_precision']))
 print("Recall", np.median(svc_scores['test_recall']))
 print("F1", np.median(svc_scores['test_f1']))
+print("Largest 5 ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n)[:n])[::-1])
+print("Smallest 5 ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n)[:n]))
 
 # %% [markdown]
 # ### Gaussian Naive Bayes
@@ -190,8 +197,9 @@ gaussian_nb_scores = cross_validate(
     X=imputer.fit_transform(train_x),
     y=data_y,
     groups=data_groups,
-    cv=logo,
+    cv=cv_method,
     n_jobs=-1,
+    error_score='raise',
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
@@ -199,6 +207,8 @@ print("Acc", np.median(gaussian_nb_scores['test_accuracy']))
 print("Precision", np.median(gaussian_nb_scores['test_precision']))
 print("Recall", np.median(gaussian_nb_scores['test_recall']))
 print("F1", np.median(gaussian_nb_scores['test_f1']))
+print("Largest 5 ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n)[:n])[::-1])
+print("Smallest 5 ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n)[:n]))
 
 # %% [markdown]
 # ### Stochastic Gradient Descent Classifier
@@ -212,8 +222,9 @@ sgdc_scores = cross_validate(
     X=imputer.fit_transform(train_x),
     y=data_y,
     groups=data_groups,
-    cv=logo,
+    cv=cv_method,
     n_jobs=-1,
+    error_score='raise',
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
@@ -221,6 +232,8 @@ print("Acc", np.median(sgdc_scores['test_accuracy']))
 print("Precision", np.median(sgdc_scores['test_precision']))
 print("Recall", np.median(sgdc_scores['test_recall']))
 print("F1", np.median(sgdc_scores['test_f1']))
+print("Largest 5 ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n)[:n])[::-1])
+print("Smallest 5 ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n)[:n]))
 
 # %% [markdown]
 # ### K-nearest neighbors
@@ -229,21 +242,23 @@ print("F1", np.median(sgdc_scores['test_f1']))
 knn = neighbors.KNeighborsClassifier()
 
 # %% jupyter={"source_hidden": true}
-knn_scores = cross_validate( # Nekaj ne funkcionira pravilno
+knn_scores = cross_validate(
     knn,
     X=imputer.fit_transform(train_x),
     y=data_y,
     groups=data_groups,
-    cv=logo,
+    cv=cv_method,
     n_jobs=-1,
+    error_score='raise',
     scoring=('accuracy', 'precision', 'recall', 'f1')
-    # error_score='raise'
 )
 # %% jupyter={"source_hidden": true}
 print("Acc", np.median(knn_scores['test_accuracy']))
 print("Precision", np.median(knn_scores['test_precision']))
 print("Recall", np.median(knn_scores['test_recall']))
 print("F1", np.median(knn_scores['test_f1']))
+print("Largest 5 ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n)[:n])[::-1])
+print("Smallest 5 ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n)[:n]))
 
 # %% [markdown]
 # ### Decision Tree
@@ -257,8 +272,9 @@ dtree_scores = cross_validate(
     X=imputer.fit_transform(train_x),
     y=data_y,
     groups=data_groups,
-    cv=logo,
+    cv=cv_method,
     n_jobs=-1,
+    error_score='raise',
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
@@ -266,6 +282,8 @@ print("Acc", np.median(dtree_scores['test_accuracy']))
 print("Precision", np.median(dtree_scores['test_precision']))
 print("Recall", np.median(dtree_scores['test_recall']))
 print("F1", np.median(dtree_scores['test_f1']))
+print("Largest 5 ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n)[:n])[::-1])
+print("Smallest 5 ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n)[:n]))
 
 # %% [markdown]
 # ### Random Forest Classifier
@@ -279,8 +297,9 @@ rfc_scores = cross_validate(
     X=imputer.fit_transform(train_x),
     y=data_y,
     groups=data_groups,
-    cv=logo,
+    cv=cv_method,
     n_jobs=-1,
+    error_score='raise',
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
@@ -288,6 +307,8 @@ print("Acc", np.median(rfc_scores['test_accuracy']))
 print("Precision", np.median(rfc_scores['test_precision']))
 print("Recall", np.median(rfc_scores['test_recall']))
 print("F1", np.median(rfc_scores['test_f1']))
+print("Largest 5 ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n)[:n])[::-1])
+print("Smallest 5 ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n)[:n]))
 
 # %% [markdown]
 # ### Gradient Boosting Classifier
@@ -301,8 +322,9 @@ gbc_scores = cross_validate(
     X=imputer.fit_transform(train_x),
     y=data_y,
     groups=data_groups,
-    cv=logo,
+    cv=cv_method,
     n_jobs=-1,
+    error_score='raise',
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
@@ -310,6 +332,8 @@ print("Acc", np.median(gbc_scores['test_accuracy']))
 print("Precision", np.median(gbc_scores['test_precision']))
 print("Recall", np.median(gbc_scores['test_recall']))
 print("F1", np.median(gbc_scores['test_f1']))
+print("Largest 5 ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n)[:n])[::-1])
+print("Smallest 5 ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n)[:n]))
 
 # %% [markdown]
 # ### LGBM Classifier
@@ -323,8 +347,9 @@ lgbm_scores = cross_validate(
     X=imputer.fit_transform(train_x),
     y=data_y,
     groups=data_groups,
-    cv=logo,
+    cv=cv_method,
     n_jobs=-1,
+    error_score='raise',
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
@@ -332,6 +357,8 @@ print("Acc", np.median(lgbm_scores['test_accuracy']))
 print("Precision", np.median(lgbm_scores['test_precision']))
 print("Recall", np.median(lgbm_scores['test_recall']))
 print("F1", np.median(lgbm_scores['test_f1']))
+print("Largest 5 ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n)[:n])[::-1])
+print("Smallest 5 ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n)[:n]))
 
 # %% [markdown]
 # ### XGBoost Classifier
@@ -345,8 +372,9 @@ xgb_classifier_scores = cross_validate(
     X=imputer.fit_transform(train_x),
     y=data_y,
     groups=data_groups,
-    cv=logo,
+    cv=cv_method,
     n_jobs=-1,
+    error_score='raise',
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
@@ -354,3 +382,5 @@ print("Acc", np.median(xgb_classifier_scores['test_accuracy']))
 print("Precision", np.median(xgb_classifier_scores['test_precision']))
 print("Recall", np.median(xgb_classifier_scores['test_recall']))
 print("F1", np.median(xgb_classifier_scores['test_f1']))
+print("Largest 5 ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n)[:n])[::-1])
+print("Smallest 5 ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n)[:n]))

From 7afef5582f2c5462c4a79c9dd7dacd15e661f5c3 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Tue, 22 Nov 2022 14:44:33 +0100
Subject: [PATCH 05/11] Add TEMP lime_survey cols

---
 exploration/ml_pipeline_classification.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
index fc2fb81..bde5c73 100644
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@@ -51,6 +51,13 @@ import machine_learning.model
 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
 
+lime_cols = [col for col in model_input if col.startswith('limesurvey_demand')]
+model_input['limesurvey_demand_control_ratio'].describe()
+lime_cols
+
+# TODO: prek lime_cols ustvari klastre, ki jih nato kasneje ločeno preveriš z modeli (npr. k=5). Potrebno bo trikrat ponoviti spodnji postopek. 
+# Pomisli, če gre kaj zavizi v for loop (npr. modeli v seznamu)
+
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input.set_index(index_columns, inplace=True)

From ddde80b4212875d321a91cd07b40ed781b9f09c7 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Thu, 24 Nov 2022 09:24:13 +0100
Subject: [PATCH 06/11] Add classification with clustering ml pipeline script.

---
 exploration/ml_pipeline_classification.py     |   7 -
 ...pipeline_classification_with_clustering.py | 178 ++++++++++++++++++
 2 files changed, 178 insertions(+), 7 deletions(-)
 create mode 100644 exploration/ml_pipeline_classification_with_clustering.py

diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
index bde5c73..fc2fb81 100644
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@@ -51,13 +51,6 @@ import machine_learning.model
 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
 
-lime_cols = [col for col in model_input if col.startswith('limesurvey_demand')]
-model_input['limesurvey_demand_control_ratio'].describe()
-lime_cols
-
-# TODO: prek lime_cols ustvari klastre, ki jih nato kasneje ločeno preveriš z modeli (npr. k=5). Potrebno bo trikrat ponoviti spodnji postopek. 
-# Pomisli, če gre kaj zavizi v for loop (npr. modeli v seznamu)
-
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input.set_index(index_columns, inplace=True)
diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py
new file mode 100644
index 0000000..9d81b21
--- /dev/null
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@@ -0,0 +1,178 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from scipy import stats
+
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.dummy import DummyClassifier
+from sklearn.impute import SimpleImputer
+from lightgbm import LGBMClassifier
+import xgboost as xg
+
+from sklearn.cluster import KMeans
+
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.labels
+import machine_learning.model
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## PANAS negative affect
+
+# %% jupyter={"source_hidden": true}
+model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+
+lime_cols = [col for col in model_input if col.startswith('limesurvey_demand')]
+lime_col = 'limesurvey_demand_control_ratio'
+model_input[lime_col].describe()
+
+# %% jupyter={"source_hidden": true}
+
+# Filter-out outlier rows by lime_col 
+model_input = model_input[(np.abs(stats.zscore(model_input[lime_col])) < 3)]
+
+uniq = model_input[[lime_col, 'pid']].drop_duplicates().reset_index(drop=True)
+plt.bar(uniq['pid'], uniq[lime_col])
+
+# %% jupyter={"source_hidden": true}
+# Get clusters by lime col & and merge the clusters to main df
+km = KMeans(n_clusters=5).fit_predict(uniq.set_index('pid'))
+np.unique(km, return_counts=True)
+uniq['cluster'] = km
+uniq
+
+model_input = model_input.merge(uniq[['pid', 'cluster']])   
+
+# %% jupyter={"source_hidden": true}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input.set_index(index_columns, inplace=True)
+
+# %% jupyter={"source_hidden": true}
+
+for k in range(5):
+    model_input_subset = model_input[model_input["cluster"] == k].copy()
+    bins = [-10, -1, 1, 10] # bins for z-scored targets
+    model_input_subset.loc[:, 'target'] = \
+        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']
+    model_input_subset['target'].value_counts()
+    model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
+    model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
+
+    model_input_subset['target'].value_counts()
+    
+
+    cv_method_str = 'logo' # logo, halflogo, 5kfold
+    if cv_method_str == 'halflogo':
+        model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
+        model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
+
+        model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
+        model_input_subset["pid_half"] = model_input_subset["pid"] + "_" +  model_input_subset["pid_index"].astype(int).astype(str)
+
+        data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"]
+    else:
+        data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]
+
+    # Treat categorical features
+    categorical_feature_colnames = ["gender", "startlanguage"]
+    additional_categorical_features = [] #[col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+    categorical_feature_colnames += additional_categorical_features
+
+    categorical_features = data_x[categorical_feature_colnames].copy()
+    mode_categorical_features = categorical_features.mode().iloc[0]
+
+    # fillna with mode
+    categorical_features = categorical_features.fillna(mode_categorical_features)
+
+    # one-hot encoding
+    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+    if not categorical_features.empty:
+        categorical_features = pd.get_dummies(categorical_features)
+
+    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+    train_x = pd.concat([numerical_features, categorical_features], axis=1)
+
+    # Establish cv method
+    cv_method = None # Defaults to 5 k-folds in cross_validate method
+    if cv_method_str == 'logo' or cv_method_str == 'half_logo':
+        cv_method = LeaveOneGroupOut()
+        cv_method.get_n_splits(
+            train_x,
+            data_y,
+            groups=data_groups,
+        )
+
+    n = 3
+
+    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+
+    # Create dict with classification ml models
+    cmodels = {
+        'dummy_classifier': DummyClassifier(strategy="most_frequent"),
+        'logistic_regression': linear_model.LogisticRegression(),
+        'support_vector_machine': svm.SVC(),
+        'gaussian_naive_bayes': naive_bayes.GaussianNB(),
+        'stochastic_gradient_descent_classifier': linear_model.SGDClassifier(),
+        'knn': neighbors.KNeighborsClassifier(),
+        'decision_tree': tree.DecisionTreeClassifier(),
+        'random_forest_classifier': ensemble.RandomForestClassifier(),
+        'gradient_boosting_classifier': ensemble.GradientBoostingClassifier(),
+        'lgbm_classifier': LGBMClassifier(),
+        'XGBoost_classifier': xg.sklearn.XGBClassifier()
+    }
+
+    for model_title, model in cmodels.items():
+        
+        classifier = cross_validate(
+            model,
+            X=imputer.fit_transform(train_x),
+            y=data_y,
+            groups=data_groups,
+            cv=cv_method,
+            n_jobs=-1,
+            error_score='raise',
+            scoring=('accuracy', 'average_precision', 'recall', 'f1')
+        )
+        
+        print("\n-------------------------------------\n")
+        print("Current cluster:", k, end="\n")
+        print("Current model:", model_title, end="\n")
+        print("Acc", np.median(classifier['test_accuracy']))
+        print("Precision", np.median(classifier['test_average_precision']))
+        print("Recall", np.median(classifier['test_recall']))
+        print("F1", np.median(classifier['test_f1']))
+        print("Largest 5 ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n)[:n])[::-1])
+        print("Smallest 5 ACC:", np.sort(np.partition(classifier['test_accuracy'], n)[:n]))
+# %%

From 218b6845149b5cbbffa3551cea1dd2d0018d53a5 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Thu, 24 Nov 2022 16:12:20 +0100
Subject: [PATCH 07/11] Automate clustering classification logic and add
 parameters at the beginning of the scripts. General changes and improvements.

---
 exploration/ml_pipeline_classification.py     | 143 +++++++++---------
 ...pipeline_classification_with_clustering.py | 134 +++++++++++-----
 2 files changed, 164 insertions(+), 113 deletions(-)

diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
index fc2fb81..ad460d3 100644
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@@ -46,7 +46,9 @@ import machine_learning.model
 # # RAPIDS models
 
 # %% [markdown]
-# ## PANAS negative affect
+# ## Set script's parameters
+cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
 
 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
@@ -57,14 +59,13 @@ model_input.set_index(index_columns, inplace=True)
 
 # %% jupyter={"source_hidden": true}
 bins = [-10, -1, 1, 10] # bins for z-scored targets
-model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'medium', 'high'], retbins=True, right=False) #['low', 'medium', 'high']
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'medium', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
 model_input['target'].value_counts(), edges
 model_input = model_input[model_input['target'] != "medium"]
 model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
 
 model_input['target'].value_counts()
 
-cv_method_str = 'logo' # logo, halflogo, 5kfold
 if cv_method_str == 'halflogo':
     model_input['pid_index'] = model_input.groupby('pid').cumcount()
     model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
@@ -106,10 +107,6 @@ if cv_method_str == 'logo' or cv_method_str == 'half_logo':
         data_y,
         groups=data_groups,
     )
-# %% jupyter={"source_hidden": true}
-# %% [markdown]
-# ### Set n for nlargest and nsmallest
-n = 5
 
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy='median')
@@ -130,12 +127,12 @@ dummy_classifier = cross_validate(
     scoring=('accuracy', 'average_precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.median(dummy_classifier['test_accuracy']))
-print("Precision", np.median(dummy_classifier['test_average_precision']))
-print("Recall", np.median(dummy_classifier['test_recall']))
-print("F1", np.median(dummy_classifier['test_f1']))
-print("Largest 5 ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n)[:n])[::-1])
-print("Smallest 5 ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n)[:n]))
+print("Acc", np.mean(dummy_classifier['test_accuracy']))
+print("Precision", np.mean(dummy_classifier['test_average_precision']))
+print("Recall", np.mean(dummy_classifier['test_recall']))
+print("F1", np.mean(dummy_classifier['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
 
 # %% [markdown]
 # ### Logistic Regression
@@ -154,12 +151,12 @@ log_reg_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.median(log_reg_scores['test_accuracy']))
-print("Precision", np.median(log_reg_scores['test_precision']))
-print("Recall", np.median(log_reg_scores['test_recall']))
-print("F1", np.median(log_reg_scores['test_f1']))
-print("Largest 5 ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n)[:n])[::-1])
-print("Smallest 5 ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n)[:n]))
+print("Acc", np.mean(log_reg_scores['test_accuracy']))
+print("Precision", np.mean(log_reg_scores['test_precision']))
+print("Recall", np.mean(log_reg_scores['test_recall']))
+print("F1", np.mean(log_reg_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
 
 # %% [markdown]
 # ### Support Vector Machine
@@ -178,12 +175,12 @@ svc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.median(svc_scores['test_accuracy']))
-print("Precision", np.median(svc_scores['test_precision']))
-print("Recall", np.median(svc_scores['test_recall']))
-print("F1", np.median(svc_scores['test_f1']))
-print("Largest 5 ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n)[:n])[::-1])
-print("Smallest 5 ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n)[:n]))
+print("Acc", np.mean(svc_scores['test_accuracy']))
+print("Precision", np.mean(svc_scores['test_precision']))
+print("Recall", np.mean(svc_scores['test_recall']))
+print("F1", np.mean(svc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
 
 # %% [markdown]
 # ### Gaussian Naive Bayes
@@ -203,12 +200,12 @@ gaussian_nb_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.median(gaussian_nb_scores['test_accuracy']))
-print("Precision", np.median(gaussian_nb_scores['test_precision']))
-print("Recall", np.median(gaussian_nb_scores['test_recall']))
-print("F1", np.median(gaussian_nb_scores['test_f1']))
-print("Largest 5 ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n)[:n])[::-1])
-print("Smallest 5 ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n)[:n]))
+print("Acc", np.mean(gaussian_nb_scores['test_accuracy']))
+print("Precision", np.mean(gaussian_nb_scores['test_precision']))
+print("Recall", np.mean(gaussian_nb_scores['test_recall']))
+print("F1", np.mean(gaussian_nb_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
 
 # %% [markdown]
 # ### Stochastic Gradient Descent Classifier
@@ -228,12 +225,12 @@ sgdc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.median(sgdc_scores['test_accuracy']))
-print("Precision", np.median(sgdc_scores['test_precision']))
-print("Recall", np.median(sgdc_scores['test_recall']))
-print("F1", np.median(sgdc_scores['test_f1']))
-print("Largest 5 ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n)[:n])[::-1])
-print("Smallest 5 ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n)[:n]))
+print("Acc", np.mean(sgdc_scores['test_accuracy']))
+print("Precision", np.mean(sgdc_scores['test_precision']))
+print("Recall", np.mean(sgdc_scores['test_recall']))
+print("F1", np.mean(sgdc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n_sl)[:n_sl]))
 
 # %% [markdown]
 # ### K-nearest neighbors
@@ -253,12 +250,12 @@ knn_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.median(knn_scores['test_accuracy']))
-print("Precision", np.median(knn_scores['test_precision']))
-print("Recall", np.median(knn_scores['test_recall']))
-print("F1", np.median(knn_scores['test_f1']))
-print("Largest 5 ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n)[:n])[::-1])
-print("Smallest 5 ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n)[:n]))
+print("Acc", np.mean(knn_scores['test_accuracy']))
+print("Precision", np.mean(knn_scores['test_precision']))
+print("Recall", np.mean(knn_scores['test_recall']))
+print("F1", np.mean(knn_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n_sl)[:n_sl]))
 
 # %% [markdown]
 # ### Decision Tree
@@ -278,12 +275,12 @@ dtree_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.median(dtree_scores['test_accuracy']))
-print("Precision", np.median(dtree_scores['test_precision']))
-print("Recall", np.median(dtree_scores['test_recall']))
-print("F1", np.median(dtree_scores['test_f1']))
-print("Largest 5 ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n)[:n])[::-1])
-print("Smallest 5 ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n)[:n]))
+print("Acc", np.mean(dtree_scores['test_accuracy']))
+print("Precision", np.mean(dtree_scores['test_precision']))
+print("Recall", np.mean(dtree_scores['test_recall']))
+print("F1", np.mean(dtree_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n_sl)[:n_sl]))
 
 # %% [markdown]
 # ### Random Forest Classifier
@@ -303,12 +300,12 @@ rfc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.median(rfc_scores['test_accuracy']))
-print("Precision", np.median(rfc_scores['test_precision']))
-print("Recall", np.median(rfc_scores['test_recall']))
-print("F1", np.median(rfc_scores['test_f1']))
-print("Largest 5 ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n)[:n])[::-1])
-print("Smallest 5 ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n)[:n]))
+print("Acc", np.mean(rfc_scores['test_accuracy']))
+print("Precision", np.mean(rfc_scores['test_precision']))
+print("Recall", np.mean(rfc_scores['test_recall']))
+print("F1", np.mean(rfc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
 
 # %% [markdown]
 # ### Gradient Boosting Classifier
@@ -328,12 +325,12 @@ gbc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.median(gbc_scores['test_accuracy']))
-print("Precision", np.median(gbc_scores['test_precision']))
-print("Recall", np.median(gbc_scores['test_recall']))
-print("F1", np.median(gbc_scores['test_f1']))
-print("Largest 5 ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n)[:n])[::-1])
-print("Smallest 5 ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n)[:n]))
+print("Acc", np.mean(gbc_scores['test_accuracy']))
+print("Precision", np.mean(gbc_scores['test_precision']))
+print("Recall", np.mean(gbc_scores['test_recall']))
+print("F1", np.mean(gbc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n_sl)[:n_sl]))
 
 # %% [markdown]
 # ### LGBM Classifier
@@ -353,12 +350,12 @@ lgbm_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.median(lgbm_scores['test_accuracy']))
-print("Precision", np.median(lgbm_scores['test_precision']))
-print("Recall", np.median(lgbm_scores['test_recall']))
-print("F1", np.median(lgbm_scores['test_f1']))
-print("Largest 5 ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n)[:n])[::-1])
-print("Smallest 5 ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n)[:n]))
+print("Acc", np.mean(lgbm_scores['test_accuracy']))
+print("Precision", np.mean(lgbm_scores['test_precision']))
+print("Recall", np.mean(lgbm_scores['test_recall']))
+print("F1", np.mean(lgbm_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n_sl)[:n_sl]))
 
 # %% [markdown]
 # ### XGBoost Classifier
@@ -378,9 +375,9 @@ xgb_classifier_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.median(xgb_classifier_scores['test_accuracy']))
-print("Precision", np.median(xgb_classifier_scores['test_precision']))
-print("Recall", np.median(xgb_classifier_scores['test_recall']))
-print("F1", np.median(xgb_classifier_scores['test_f1']))
-print("Largest 5 ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n)[:n])[::-1])
-print("Smallest 5 ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n)[:n]))
+print("Acc", np.mean(xgb_classifier_scores['test_accuracy']))
+print("Precision", np.mean(xgb_classifier_scores['test_precision']))
+print("Recall", np.mean(xgb_classifier_scores['test_recall']))
+print("F1", np.mean(xgb_classifier_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py
index 9d81b21..56edc90 100644
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@@ -49,26 +49,38 @@ import machine_learning.model
 # # RAPIDS models
 
 # %% [markdown]
-# ## PANAS negative affect
+# ## Set script's parameters
+n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter)
+cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
 
 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 
-lime_cols = [col for col in model_input if col.startswith('limesurvey_demand')]
+clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
+
+model_input.columns[list(model_input.columns).index('age'):-1]
+
+lime_cols = [col for col in model_input if col.startswith('limesurvey')]
+lime_cols
 lime_col = 'limesurvey_demand_control_ratio'
-model_input[lime_col].describe()
+clust_col = lime_col
+
+model_input[clust_col].describe()
+
 
 # %% jupyter={"source_hidden": true}
 
-# Filter-out outlier rows by lime_col 
-model_input = model_input[(np.abs(stats.zscore(model_input[lime_col])) < 3)]
+# Filter-out outlier rows by clust_col 
+model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
 
-uniq = model_input[[lime_col, 'pid']].drop_duplicates().reset_index(drop=True)
-plt.bar(uniq['pid'], uniq[lime_col])
+uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+plt.bar(uniq['pid'], uniq[clust_col])
 
 # %% jupyter={"source_hidden": true}
-# Get clusters by lime col & and merge the clusters to main df
-km = KMeans(n_clusters=5).fit_predict(uniq.set_index('pid'))
+# Get clusters by cluster col & and merge the clusters to main df
+km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
 np.unique(km, return_counts=True)
 uniq['cluster'] = km
 uniq
@@ -76,12 +88,59 @@ uniq
 model_input = model_input.merge(uniq[['pid', 'cluster']])   
 
 # %% jupyter={"source_hidden": true}
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input.set_index(index_columns, inplace=True)
 
 # %% jupyter={"source_hidden": true}
+# Create dict with classification ml models
+cmodels = {
+    'dummy_classifier': {
+        'model': DummyClassifier(strategy="most_frequent"),
+        'metrics': [0, 0, 0, 0]
+    },
+    'logistic_regression': {
+        'model': linear_model.LogisticRegression(),
+        'metrics': [0, 0, 0, 0]
+    },
+    'support_vector_machine': {
+        'model': svm.SVC(),
+        'metrics': [0, 0, 0, 0]
+    },
+    'gaussian_naive_bayes': {
+        'model': naive_bayes.GaussianNB(),
+        'metrics': [0, 0, 0, 0]
+    },
+    'stochastic_gradient_descent_classifier': {
+        'model': linear_model.SGDClassifier(),
+        'metrics': [0, 0, 0, 0]
+    },
+    'knn': {
+        'model': neighbors.KNeighborsClassifier(),
+        'metrics': [0, 0, 0, 0]
+    },
+    'decision_tree': {
+        'model': tree.DecisionTreeClassifier(),
+        'metrics': [0, 0, 0, 0]
+    },
+    'random_forest_classifier': {
+        'model': ensemble.RandomForestClassifier(),
+        'metrics': [0, 0, 0, 0]
+    },
+    'gradient_boosting_classifier': {
+        'model': ensemble.GradientBoostingClassifier(),
+        'metrics': [0, 0, 0, 0]
+    },
+    'lgbm_classifier': {
+        'model': LGBMClassifier(),
+        'metrics': [0, 0, 0, 0]
+    },
+    'XGBoost_classifier': {
+        'model': xg.sklearn.XGBClassifier(),
+        'metrics': [0, 0, 0, 0]
+    }
+}
 
-for k in range(5):
+# %% jupyter={"source_hidden": true}
+for k in range(n_clusters):
     model_input_subset = model_input[model_input["cluster"] == k].copy()
     bins = [-10, -1, 1, 10] # bins for z-scored targets
     model_input_subset.loc[:, 'target'] = \
@@ -92,8 +151,6 @@ for k in range(5):
 
     model_input_subset['target'].value_counts()
     
-
-    cv_method_str = 'logo' # logo, halflogo, 5kfold
     if cv_method_str == 'halflogo':
         model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
         model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
@@ -134,45 +191,42 @@ for k in range(5):
             groups=data_groups,
         )
 
-    n = 3
-
     imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 
-    # Create dict with classification ml models
-    cmodels = {
-        'dummy_classifier': DummyClassifier(strategy="most_frequent"),
-        'logistic_regression': linear_model.LogisticRegression(),
-        'support_vector_machine': svm.SVC(),
-        'gaussian_naive_bayes': naive_bayes.GaussianNB(),
-        'stochastic_gradient_descent_classifier': linear_model.SGDClassifier(),
-        'knn': neighbors.KNeighborsClassifier(),
-        'decision_tree': tree.DecisionTreeClassifier(),
-        'random_forest_classifier': ensemble.RandomForestClassifier(),
-        'gradient_boosting_classifier': ensemble.GradientBoostingClassifier(),
-        'lgbm_classifier': LGBMClassifier(),
-        'XGBoost_classifier': xg.sklearn.XGBClassifier()
-    }
-
     for model_title, model in cmodels.items():
-        
+
         classifier = cross_validate(
-            model,
+            model['model'],
             X=imputer.fit_transform(train_x),
             y=data_y,
             groups=data_groups,
             cv=cv_method,
             n_jobs=-1,
             error_score='raise',
-            scoring=('accuracy', 'average_precision', 'recall', 'f1')
+            scoring=('accuracy', 'precision', 'recall', 'f1')
         )
         
         print("\n-------------------------------------\n")
         print("Current cluster:", k, end="\n")
         print("Current model:", model_title, end="\n")
-        print("Acc", np.median(classifier['test_accuracy']))
-        print("Precision", np.median(classifier['test_average_precision']))
-        print("Recall", np.median(classifier['test_recall']))
-        print("F1", np.median(classifier['test_f1']))
-        print("Largest 5 ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n)[:n])[::-1])
-        print("Smallest 5 ACC:", np.sort(np.partition(classifier['test_accuracy'], n)[:n]))
-# %%
+        print("Acc", np.mean(classifier['test_accuracy']))
+        print("Precision", np.mean(classifier['test_precision']))
+        print("Recall", np.mean(classifier['test_recall']))
+        print("F1", np.mean(classifier['test_f1']))
+        print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
+        print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl]))
+        
+        cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
+        cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
+        cmodels[model_title]['metrics'][2] += np.mean(classifier['test_accuracy'])
+        cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])
+
+# %% jupyter={"source_hidden": true}
+# Get overall results
+for model_title, model in cmodels.items():
+    print("\n************************************\n")
+    print("Current model:", model_title, end="\n")
+    print("Acc", model['metrics'][0]/n_clusters)
+    print("Precision", model['metrics'][1]/n_clusters)
+    print("Recall", model['metrics'][2]/n_clusters)
+    print("F1", model['metrics'][3]/n_clusters)

From 98f78d72fc3312d88ef0693616e5e4f952648e22 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Fri, 25 Nov 2022 12:35:45 +0100
Subject: [PATCH 08/11] Create a classification models class and use it in the
 ml pipeline script.

---
 ...pipeline_classification_with_clustering.py | 64 +++--------------
 machine_learning/classification_models.py     | 71 +++++++++++++++++++
 2 files changed, 79 insertions(+), 56 deletions(-)
 create mode 100644 machine_learning/classification_models.py

diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py
index 56edc90..4b771b8 100644
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@@ -26,12 +26,13 @@ import pandas as pd
 import seaborn as sns
 from scipy import stats
 
-from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
 from sklearn.model_selection import LeaveOneGroupOut, cross_validate
-from sklearn.dummy import DummyClassifier
 from sklearn.impute import SimpleImputer
+
+from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
 from lightgbm import LGBMClassifier
-import xgboost as xg
+import xgboost as xg 
 
 from sklearn.cluster import KMeans
 
@@ -44,6 +45,7 @@ if nb_dir not in sys.path:
 
 import machine_learning.labels
 import machine_learning.model
+from machine_learning.classification_models import ClassificationModels
 
 # %% [markdown]
 # # RAPIDS models
@@ -92,52 +94,8 @@ model_input.set_index(index_columns, inplace=True)
 
 # %% jupyter={"source_hidden": true}
 # Create dict with classification ml models
-cmodels = {
-    'dummy_classifier': {
-        'model': DummyClassifier(strategy="most_frequent"),
-        'metrics': [0, 0, 0, 0]
-    },
-    'logistic_regression': {
-        'model': linear_model.LogisticRegression(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'support_vector_machine': {
-        'model': svm.SVC(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'gaussian_naive_bayes': {
-        'model': naive_bayes.GaussianNB(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'stochastic_gradient_descent_classifier': {
-        'model': linear_model.SGDClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'knn': {
-        'model': neighbors.KNeighborsClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'decision_tree': {
-        'model': tree.DecisionTreeClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'random_forest_classifier': {
-        'model': ensemble.RandomForestClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'gradient_boosting_classifier': {
-        'model': ensemble.GradientBoostingClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'lgbm_classifier': {
-        'model': LGBMClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'XGBoost_classifier': {
-        'model': xg.sklearn.XGBClassifier(),
-        'metrics': [0, 0, 0, 0]
-    }
-}
+cm = ClassificationModels()
+cmodels = cm.get_cmodels()
 
 # %% jupyter={"source_hidden": true}
 for k in range(n_clusters):
@@ -223,10 +181,4 @@ for k in range(n_clusters):
 
 # %% jupyter={"source_hidden": true}
 # Get overall results
-for model_title, model in cmodels.items():
-    print("\n************************************\n")
-    print("Current model:", model_title, end="\n")
-    print("Acc", model['metrics'][0]/n_clusters)
-    print("Precision", model['metrics'][1]/n_clusters)
-    print("Recall", model['metrics'][2]/n_clusters)
-    print("F1", model['metrics'][3]/n_clusters)
+cm.get_total_models_scores(n_clusters=n_clusters)
diff --git a/machine_learning/classification_models.py b/machine_learning/classification_models.py
new file mode 100644
index 0000000..094e280
--- /dev/null
+++ b/machine_learning/classification_models.py
@@ -0,0 +1,71 @@
+from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
+from lightgbm import LGBMClassifier
+import xgboost as xg 
+
+class ClassificationModels():
+    
+    def __init__(self):
+        self.cmodels = self.init_classification_models()
+        
+    def get_cmodels(self):
+        return self.cmodels
+
+    def init_classification_models(self):
+        cmodels = {
+            'dummy_classifier': {
+                'model': DummyClassifier(strategy="most_frequent"),
+                'metrics': [0, 0, 0, 0]
+            },
+            'logistic_regression': {
+                'model': linear_model.LogisticRegression(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'support_vector_machine': {
+                'model': svm.SVC(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'gaussian_naive_bayes': {
+                'model': naive_bayes.GaussianNB(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'stochastic_gradient_descent_classifier': {
+                'model': linear_model.SGDClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'knn': {
+                'model': neighbors.KNeighborsClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'decision_tree': {
+                'model': tree.DecisionTreeClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'random_forest_classifier': {
+                'model': ensemble.RandomForestClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'gradient_boosting_classifier': {
+                'model': ensemble.GradientBoostingClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'lgbm_classifier': {
+                'model': LGBMClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'XGBoost_classifier': {
+                'model': xg.sklearn.XGBClassifier(),
+                'metrics': [0, 0, 0, 0]
+            }
+        }
+        
+        return cmodels
+    
+    def get_total_models_scores(self, n_clusters=1):
+        for model_title, model in self.cmodels.items():
+            print("\n************************************\n")
+            print("Current model:", model_title, end="\n")
+            print("Acc:", model['metrics'][0]/n_clusters)
+            print("Precision:", model['metrics'][1]/n_clusters)
+            print("Recall:", model['metrics'][2]/n_clusters)
+            print("F1:", model['metrics'][3]/n_clusters)
\ No newline at end of file

From 9a218c8e2a3368526013be4aa1fa15e217a6a59d Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Fri, 25 Nov 2022 14:44:11 +0100
Subject: [PATCH 09/11] Add a script for two class train test split clustering
 classification.

---
 ...pipeline_classification_with_clustering.py |   2 +-
 ..._classification_with_clustering_2_class.py | 181 ++++++++++++++++++
 machine_learning/classification_models.py     |   2 +-
 3 files changed, 183 insertions(+), 2 deletions(-)
 create mode 100644 exploration/ml_pipeline_classification_with_clustering_2_class.py

diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py
index 4b771b8..8887b50 100644
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@@ -176,7 +176,7 @@ for k in range(n_clusters):
         
         cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
         cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
-        cmodels[model_title]['metrics'][2] += np.mean(classifier['test_accuracy'])
+        cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall'])
         cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])
 
 # %% jupyter={"source_hidden": true}
diff --git a/exploration/ml_pipeline_classification_with_clustering_2_class.py b/exploration/ml_pipeline_classification_with_clustering_2_class.py
new file mode 100644
index 0000000..026362f
--- /dev/null
+++ b/exploration/ml_pipeline_classification_with_clustering_2_class.py
@@ -0,0 +1,181 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from scipy import stats
+
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate, train_test_split
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+
+from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
+from lightgbm import LGBMClassifier
+import xgboost as xg 
+
+from sklearn.cluster import KMeans
+
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.labels
+import machine_learning.model
+from machine_learning.classification_models import ClassificationModels
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# # Useful method
+def treat_categorical_features(input_set):  # returns input_set's other columns plus one-hot encoded categorical columns
+    categorical_feature_colnames = ["gender", "startlanguage"]
+    additional_categorical_features = [col for col in input_set.columns if "mostcommonactivity" in col or "homelabel" in col]
+    categorical_feature_colnames += additional_categorical_features
+        
+    categorical_features = input_set[categorical_feature_colnames].copy()
+    mode_categorical_features = categorical_features.mode().iloc[0]
+
+    # fill missing categorical values with each column's mode (first row returned by .mode())
+    categorical_features = categorical_features.fillna(mode_categorical_features)
+
+    # one-hot encoding (get_dummies is skipped when the categorical frame is empty)
+    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+    if not categorical_features.empty:
+        categorical_features = pd.get_dummies(categorical_features)
+
+    numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
+    
+    return pd.concat([numerical_features, categorical_features], axis=1)
+
+# %% [markdown]
+# ## Set script's parameters
+n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
+n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %% jupyter={"source_hidden": true}
+model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+
+clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
+
+model_input.columns[list(model_input.columns).index('age'):-1]
+
+lime_cols = [col for col in model_input if col.startswith('limesurvey')]
+lime_cols
+lime_col = 'limesurvey_demand_control_ratio'
+clust_col = lime_col
+
+model_input[clust_col].describe()
+
+
+# %% jupyter={"source_hidden": true}
+
+# Filter-out outlier rows by clust_col 
+model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+
+uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+plt.bar(uniq['pid'], uniq[clust_col])
+
+# %% jupyter={"source_hidden": true}
+# Get clusters by cluster col and merge the clusters to main df (fixed seed keeps cluster labels reproducible across runs)
+km = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(uniq.set_index('pid'))
+np.unique(km, return_counts=True)
+uniq['cluster'] = km
+uniq
+
+model_input = model_input.merge(uniq[['pid', 'cluster']])   
+
+# %% jupyter={"source_hidden": true}
+model_input.set_index(index_columns, inplace=True)
+
+# %% jupyter={"source_hidden": true}
+# Create dict with classification ml models
+cm = ClassificationModels()
+cmodels = cm.get_cmodels()
+
+# %% jupyter={"source_hidden": true}
+for k in range(n_clusters):
+    model_input_subset = model_input[model_input["cluster"] == k].copy()
+    
+    # Takes rows at or below the 15th percentile and at or above the 85th percentile as the test set -> the rest for the training set. Only two classes, separated by z-score of 0.
+    model_input_subset['numerical_target'] = model_input_subset['target']
+    bins = [-10, 0, 10] # bins for z-scored targets
+    model_input_subset.loc[:, 'target'] = \
+        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
+        
+    p15 = np.percentile(model_input_subset['numerical_target'], 15)
+    p85 = np.percentile(model_input_subset['numerical_target'], 85)
+    
+    # Treat categorical features
+    model_input_subset = treat_categorical_features(model_input_subset)
+    
+    # Split to train, validate, and test subsets
+    train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
+    test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
+
+    train_set['target'].value_counts()
+    test_set['target'].value_counts()
+    
+    train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
+    
+    validate_x, test_x, validate_y, test_y = \
+        train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)
+    
+    # Impute missing values: fit medians on the training set only, then reuse them for validation/test (no leakage, same columns)
+    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+
+    train_x = imputer.fit_transform(train_x)
+    validate_x = imputer.transform(validate_x)
+    test_x = imputer.transform(test_x)
+
+    for model_title, model in cmodels.items():
+        model['model'].fit(train_x, train_y)
+        y_pred = model['model'].predict(validate_x)
+        
+        acc = accuracy_score(validate_y, y_pred)
+        prec = precision_score(validate_y, y_pred)
+        rec = recall_score(validate_y, y_pred)
+        f1 = f1_score(validate_y, y_pred)
+        
+        print("\n-------------------------------------\n")
+        print("Current cluster:", k, end="\n")
+        print("Current model:", model_title, end="\n")
+        print("Acc", acc)
+        print("Precision", prec)
+        print("Recall", rec)
+        print("F1", f1)
+        
+        cmodels[model_title]['metrics'][0] += acc
+        cmodels[model_title]['metrics'][1] += prec
+        cmodels[model_title]['metrics'][2] += rec
+        cmodels[model_title]['metrics'][3] += f1
+
+# %% jupyter={"source_hidden": true}
+# Get overall results
+cm.get_total_models_scores(n_clusters=n_clusters)
diff --git a/machine_learning/classification_models.py b/machine_learning/classification_models.py
index 094e280..82c26b8 100644
--- a/machine_learning/classification_models.py
+++ b/machine_learning/classification_models.py
@@ -18,7 +18,7 @@ class ClassificationModels():
                 'metrics': [0, 0, 0, 0]
             },
             'logistic_regression': {
-                'model': linear_model.LogisticRegression(),
+                'model': linear_model.LogisticRegression(max_iter=1000),
                 'metrics': [0, 0, 0, 0]
             },
             'support_vector_machine': {

From 7504aa34cffdc6d8c84293eb5c8a91e88a256927 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Mon, 28 Nov 2022 13:42:46 +0100
Subject: [PATCH 10/11] Add additional categorical features (uncomment).

---
 exploration/ml_pipeline_classification.py                     | 2 +-
 exploration/ml_pipeline_classification_with_clustering.py     | 2 +-
 .../ml_pipeline_classification_with_clustering_2_class.py     | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
index ad460d3..233dffc 100644
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@@ -80,7 +80,7 @@ else:
 
 # %% jupyter={"source_hidden": true}
 categorical_feature_colnames = ["gender", "startlanguage"]
-additional_categorical_features = [] #[col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
 categorical_feature_colnames += additional_categorical_features
 
 categorical_features = data_x[categorical_feature_colnames].copy()
diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py
index 8887b50..0bf4417 100644
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@@ -122,7 +122,7 @@ for k in range(n_clusters):
 
     # Treat categorical features
     categorical_feature_colnames = ["gender", "startlanguage"]
-    additional_categorical_features = [] #[col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+    additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
     categorical_feature_colnames += additional_categorical_features
 
     categorical_features = data_x[categorical_feature_colnames].copy()
diff --git a/exploration/ml_pipeline_classification_with_clustering_2_class.py b/exploration/ml_pipeline_classification_with_clustering_2_class.py
index 026362f..36468fa 100644
--- a/exploration/ml_pipeline_classification_with_clustering_2_class.py
+++ b/exploration/ml_pipeline_classification_with_clustering_2_class.py
@@ -75,8 +75,8 @@ def treat_categorical_features(input_set):
 
 # %% [markdown]
 # ## Set script's parameters
-n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
-n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
+n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
 
 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")

From cf0e4f89be4861e77ae21f64450aae35084f7864 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Tue, 29 Nov 2022 14:06:06 +0100
Subject: [PATCH 11/11] Test nonstandardized data with regular classification
 pipeline.

---
 exploration/ml_pipeline_classification.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
index 233dffc..3acefcb 100644
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@@ -51,17 +51,19 @@ cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could
 n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
 
 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+model_input = pd.read_csv("../data/stressfulness_event_nonstandardized/input_appraisal_stressfulness_event_mean.csv")
 
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input.set_index(index_columns, inplace=True)
+model_input['target'].value_counts()
 
 # %% jupyter={"source_hidden": true}
-bins = [-10, -1, 1, 10] # bins for z-scored targets
-model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'medium', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
+# bins = [-10, -1, 1, 10] # bins for z-scored targets
+bins = [0, 1, 4] # bins for stressfulness (1-4) target
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
 model_input['target'].value_counts(), edges
-model_input = model_input[model_input['target'] != "medium"]
+# model_input = model_input[model_input['target'] != "medium"]
 model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
 
 model_input['target'].value_counts()