From ac03b36c0f88b9e32ee644d5c6fa2afc6ad5e88a Mon Sep 17 00:00:00 2001 From: Primoz Date: Fri, 9 Dec 2022 13:44:20 +0100 Subject: [PATCH 1/8] Add files to .gitignore and add file path for stressfulness event. --- .gitignore | 1 + presentation/event_stressfulness.py | 1 + 2 files changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index d5b862d..d823d18 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ __pycache__/ /data/*input*.csv /data/daily* /data/intradaily* +/data/stressfulness_event* diff --git a/presentation/event_stressfulness.py b/presentation/event_stressfulness.py index 1f97c81..444baa1 100644 --- a/presentation/event_stressfulness.py +++ b/presentation/event_stressfulness.py @@ -47,6 +47,7 @@ import machine_learning.helper # %% tags=["active-ipynb"] # filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv") +# filename = Path('C:/Users/Primoz/VSCodeProjects/straw2analysis/data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv') # %% final_scores = machine_learning.helper.run_all_regression_models(filename) From 6507b053c54ca02f1ff095ea14706f16b2c41c70 Mon Sep 17 00:00:00 2001 From: Primoz Date: Fri, 9 Dec 2022 13:46:13 +0100 Subject: [PATCH 2/8] Add StratifiedKFold with shuffling as the default CV method. --- exploration/ml_pipeline_classification.py | 15 +++++++-------- .../ml_pipeline_classification_with_clustering.py | 10 +++++----- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py index 3acefcb..d1f7287 100644 --- a/exploration/ml_pipeline_classification.py +++ b/exploration/ml_pipeline_classification.py @@ -26,7 +26,7 @@ import pandas as pd import seaborn as sns from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble -from sklearn.model_selection import LeaveOneGroupOut, cross_validate +from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold from sklearn.dummy import DummyClassifier from sklearn.impute import SimpleImputer @@ -47,20 +47,19 @@ import machine_learning.model # %% [markdown] # ## Set script's parameters -cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) +cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs # %% jupyter={"source_hidden": true} -model_input = pd.read_csv("../data/stressfulness_event_nonstandardized/input_appraisal_stressfulness_event_mean.csv") +model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv") # %% jupyter={"source_hidden": true} index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] model_input.set_index(index_columns, inplace=True) -model_input['target'].value_counts() # %% jupyter={"source_hidden": true} -# bins = [-10, -1, 1, 10] # bins for z-scored targets -bins = [0, 1, 4] # bins for stressfulness (1-4) target +bins = [-10, 0, 10] # bins for z-scored targets +# bins = [1, 2.5, 4] # bins for stressfulness (1-4) target model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high'] model_input['target'].value_counts(), edges # model_input = model_input[model_input['target'] != "medium"] @@ -68,7 +67,7 @@
model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x model_input['target'].value_counts() -if cv_method_str == 'halflogo': +if cv_method_str == 'half_logo': model_input['pid_index'] = model_input.groupby('pid').cumcount() model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count') @@ -101,7 +100,7 @@ train_x = pd.concat([numerical_features, categorical_features], axis=1) train_x.dtypes # %% jupyter={"source_hidden": true} -cv_method = None # Defaults to 5 k-folds in cross_validate method +cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method if cv_method_str == 'logo' or cv_method_str == 'half_logo': cv_method = LeaveOneGroupOut() cv_method.get_n_splits( diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py index 0bf4417..4ccea22 100644 --- a/exploration/ml_pipeline_classification_with_clustering.py +++ b/exploration/ml_pipeline_classification_with_clustering.py @@ -26,7 +26,7 @@ import pandas as pd import seaborn as sns from scipy import stats -from sklearn.model_selection import LeaveOneGroupOut, cross_validate +from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold from sklearn.impute import SimpleImputer from sklearn.dummy import DummyClassifier @@ -52,8 +52,8 @@ from machine_learning.classification_models import ClassificationModels # %% [markdown] # ## Set script's parameters -n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter) -cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) +n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter) +cv_method_str = 'half_logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs # %% jupyter={"source_hidden": true} @@ -109,7 +109,7 @@ for k in range(n_clusters): model_input_subset['target'].value_counts() - if cv_method_str == 'halflogo': + if cv_method_str == 'half_logo': model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount() model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count') @@ -140,7 +140,7 @@ for k in range(n_clusters): train_x = pd.concat([numerical_features, categorical_features], axis=1) # Establish cv method - cv_method = None # Defaults to 5 k-folds in cross_validate method + cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method if cv_method_str == 'logo' or cv_method_str == 'half_logo': cv_method = LeaveOneGroupOut() cv_method.get_n_splits( From 78b6e7fa076c259d4b649d28a64724726b396cff Mon Sep 17 00:00:00 2001 From: Primoz Date: Fri, 9 Dec 2022 13:53:16 +0100 Subject: [PATCH 3/8] Remove unused imports from ML pipeline scripts.
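Before the import cleanup below, a note on the CV selection set up in the previous patch: cross_validate() receives either the new shuffled StratifiedKFold default or a LeaveOneGroupOut splitter whose folds are defined entirely by the groups passed along with it. A minimal sketch of how the three cv_method_str options could be wired together, assuming a pid participant column; the half_logo grouping shown here (labelling each participant's first and second half of rows as separate groups) is an assumption extrapolated from the pid_index/pid_count columns visible in the diff, not the script's exact formula.

    import pandas as pd
    from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold

    def pick_cv(model_input: pd.DataFrame, cv_method_str: str):
        """Return (cv_method, groups) for cross_validate(). Sketch only."""
        if cv_method_str == 'logo':
            # Each participant is one group: every fold leaves one participant out.
            return LeaveOneGroupOut(), model_input['pid']
        if cv_method_str == 'half_logo':
            # Assumed construction: split each participant's rows into two halves,
            # so every participant contributes two leave-out groups.
            # (Assumes 'pid' is a string column, e.g. "p01".)
            pid_index = model_input.groupby('pid').cumcount()
            pid_count = model_input.groupby('pid')['pid'].transform('count')
            half = (pid_index < pid_count / 2).map({True: '_1', False: '_2'})
            return LeaveOneGroupOut(), model_input['pid'] + half
        # '5kfold': the shuffled StratifiedKFold default introduced above.
        return StratifiedKFold(n_splits=5, shuffle=True), None

The returned pair would then feed cross_validate(clf, X=train_x, y=train_y, groups=groups, cv=cv_method, ...); when the splitter is StratifiedKFold, the groups argument is simply ignored.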
--- exploration/ml_pipeline_classification.py | 6 ------ .../ml_pipeline_classification_with_clustering.py | 8 -------- ...ipeline_classification_with_clustering_2_class.py | 12 +----------- 3 files changed, 1 insertion(+), 25 deletions(-) diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py index d1f7287..33d1125 100644 --- a/exploration/ml_pipeline_classification.py +++ b/exploration/ml_pipeline_classification.py @@ -15,15 +15,12 @@ # %% jupyter={"source_hidden": true} # %matplotlib inline -import datetime -import importlib import os import sys import numpy as np import matplotlib.pyplot as plt import pandas as pd -import seaborn as sns from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold @@ -39,9 +36,6 @@ nb_dir = os.path.split(os.getcwd())[0] if nb_dir not in sys.path: sys.path.append(nb_dir) -import machine_learning.labels -import machine_learning.model - # %% [markdown] # # RAPIDS models diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py index 4ccea22..04c35aa 100644 --- a/exploration/ml_pipeline_classification_with_clustering.py +++ b/exploration/ml_pipeline_classification_with_clustering.py @@ -15,8 +15,6 @@ # %% jupyter={"source_hidden": true} # %matplotlib inline -import datetime -import importlib import os import sys @@ -29,10 +27,6 @@ from scipy import stats from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold from sklearn.impute import SimpleImputer -from sklearn.dummy import DummyClassifier -from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble -from lightgbm import LGBMClassifier -import xgboost as xg from sklearn.cluster import KMeans @@ -43,8 +37,6 @@ nb_dir = os.path.split(os.getcwd())[0] if nb_dir not in sys.path: sys.path.append(nb_dir) -import machine_learning.labels -import machine_learning.model from machine_learning.classification_models import ClassificationModels # %% [markdown] diff --git a/exploration/ml_pipeline_classification_with_clustering_2_class.py b/exploration/ml_pipeline_classification_with_clustering_2_class.py index 36468fa..3442733 100644 --- a/exploration/ml_pipeline_classification_with_clustering_2_class.py +++ b/exploration/ml_pipeline_classification_with_clustering_2_class.py @@ -15,26 +15,18 @@ # %% jupyter={"source_hidden": true} # %matplotlib inline -import datetime -import importlib import os import sys import numpy as np import matplotlib.pyplot as plt import pandas as pd -import seaborn as sns from scipy import stats -from sklearn.model_selection import LeaveOneGroupOut, cross_validate, train_test_split +from sklearn.model_selection import train_test_split from sklearn.impute import SimpleImputer from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score -from sklearn.dummy import DummyClassifier -from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble -from lightgbm import LGBMClassifier -import xgboost as xg - from sklearn.cluster import KMeans from IPython.core.interactiveshell import InteractiveShell @@ -44,8 +36,6 @@ nb_dir = os.path.split(os.getcwd())[0] if nb_dir not in sys.path: sys.path.append(nb_dir) -import machine_learning.labels -import machine_learning.model from machine_learning.classification_models import ClassificationModels # %% [markdown] From 0a45e351646c8a68788bcca5e00a8bc6930f484b 
Mon Sep 17 00:00:00 2001 From: Primoz Date: Fri, 9 Dec 2022 13:56:42 +0100 Subject: [PATCH 4/8] Remove unused imports, part 2. --- exploration/ml_pipeline_classification_with_clustering.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py index 04c35aa..c25c17b 100644 --- a/exploration/ml_pipeline_classification_with_clustering.py +++ b/exploration/ml_pipeline_classification_with_clustering.py @@ -21,13 +21,11 @@ import sys import numpy as np import matplotlib.pyplot as plt import pandas as pd -import seaborn as sns from scipy import stats from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold from sklearn.impute import SimpleImputer - from sklearn.cluster import KMeans from IPython.core.interactiveshell import InteractiveShell From 164d12ed2f1d9f964bbaff95e2501d7ed6e62f31 Mon Sep 17 00:00:00 2001 From: Primoz Date: Tue, 13 Dec 2022 17:01:46 +0100 Subject: [PATCH 5/8] Add undersampling method (with on/off parameter). --- config/environment.yml | 1 + exploration/ml_pipeline_classification.py | 59 +++++++++++++++++------ 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/config/environment.yml b/config/environment.yml index 62cb210..42d947b 100644 --- a/config/environment.yml +++ b/config/environment.yml @@ -7,6 +7,7 @@ dependencies: - black - isort - flake8 + - imbalanced-learn=0.10.0 - jupyterlab - jupytext - mypy diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py index 33d1125..f539025 100644 --- a/exploration/ml_pipeline_classification.py +++ b/exploration/ml_pipeline_classification.py @@ -43,17 +43,19 @@ if nb_dir not in sys.path: # ## Set script's parameters cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs +under_sampling = True # (bool) Will train and test data on balanced dataset (using undersampling method) # %% jupyter={"source_hidden": true} -model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv") +model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv") # %% jupyter={"source_hidden": true} index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] model_input.set_index(index_columns, inplace=True) +model_input['target'].value_counts() # %% jupyter={"source_hidden": true} -bins = [-10, 0, 10] # bins for z-scored targets +# bins = [-10, 0, 10] # bins for z-scored targets bins = [-1, 0, 4] # bins for stressfulness (0-4) target model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high'] model_input['target'].value_counts(), edges # model_input = model_input[model_input['target'] != "medium"] @@ -61,6 +63,20 @@ model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x model_input['target'].value_counts() +# %% jupyter={"source_hidden": true} +# UnderSampling +if under_sampling: + model_input.groupby("pid").count() + no_stress = model_input[model_input['target'] == 0] + stress = model_input[model_input['target'] == 1] + + no_stress = no_stress.sample(n=len(stress)) + model_input = pd.concat([stress,no_stress],
axis=0) + + model_input["target"].value_counts() + + +# %% jupyter={"source_hidden": true} if cv_method_str == 'half_logo': model_input['pid_index'] = model_input.groupby('pid').cumcount() model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count') @@ -119,11 +135,12 @@ dummy_classifier = cross_validate( cv=cv_method, n_jobs=-1, error_score='raise', - scoring=('accuracy', 'average_precision', 'recall', 'f1') + scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(dummy_classifier['test_accuracy'])) -print("Precision", np.mean(dummy_classifier['test_average_precision'])) +print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy'])) +print("Acc (mean)", np.mean(dummy_classifier['test_accuracy'])) +print("Precision", np.mean(dummy_classifier['test_precision'])) print("Recall", np.mean(dummy_classifier['test_recall'])) print("F1", np.mean(dummy_classifier['test_f1'])) print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1]) @@ -146,7 +163,8 @@ log_reg_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(log_reg_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy'])) +print("Acc (mean)", np.mean(log_reg_scores['test_accuracy'])) print("Precision", np.mean(log_reg_scores['test_precision'])) print("Recall", np.mean(log_reg_scores['test_recall'])) print("F1", np.mean(log_reg_scores['test_f1'])) @@ -170,7 +188,8 @@ svc_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(svc_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(svc_scores['test_accuracy'])) +print("Acc (mean)", np.mean(svc_scores['test_accuracy'])) print("Precision", np.mean(svc_scores['test_precision'])) print("Recall", np.mean(svc_scores['test_recall'])) print("F1", np.mean(svc_scores['test_f1'])) @@ -195,7 +214,8 @@ gaussian_nb_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(gaussian_nb_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy'])) +print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy'])) print("Precision", np.mean(gaussian_nb_scores['test_precision'])) print("Recall", np.mean(gaussian_nb_scores['test_recall'])) print("F1", np.mean(gaussian_nb_scores['test_f1'])) @@ -220,7 +240,8 @@ sgdc_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(sgdc_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy'])) +print("Acc (mean)", np.mean(sgdc_scores['test_accuracy'])) print("Precision", np.mean(sgdc_scores['test_precision'])) print("Recall", np.mean(sgdc_scores['test_recall'])) print("F1", np.mean(sgdc_scores['test_f1'])) @@ -245,7 +266,8 @@ knn_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(knn_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(knn_scores['test_accuracy'])) +print("Acc (mean)", np.mean(knn_scores['test_accuracy'])) print("Precision", np.mean(knn_scores['test_precision'])) print("Recall", np.mean(knn_scores['test_recall'])) print("F1", np.mean(knn_scores['test_f1'])) @@ -270,7 +292,8 @@ dtree_scores = 
cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(dtree_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy'])) +print("Acc (mean)", np.mean(dtree_scores['test_accuracy'])) print("Precision", np.mean(dtree_scores['test_precision'])) print("Recall", np.mean(dtree_scores['test_recall'])) print("F1", np.mean(dtree_scores['test_f1'])) @@ -295,7 +318,8 @@ rfc_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(rfc_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy'])) +print("Acc (mean)", np.mean(rfc_scores['test_accuracy'])) print("Precision", np.mean(rfc_scores['test_precision'])) print("Recall", np.mean(rfc_scores['test_recall'])) print("F1", np.mean(rfc_scores['test_f1'])) @@ -320,7 +344,8 @@ gbc_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(gbc_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy'])) +print("Acc (mean)", np.mean(gbc_scores['test_accuracy'])) print("Precision", np.mean(gbc_scores['test_precision'])) print("Recall", np.mean(gbc_scores['test_recall'])) print("F1", np.mean(gbc_scores['test_f1'])) @@ -345,7 +370,8 @@ lgbm_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(lgbm_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy'])) +print("Acc (mean)", np.mean(lgbm_scores['test_accuracy'])) print("Precision", np.mean(lgbm_scores['test_precision'])) print("Recall", np.mean(lgbm_scores['test_recall'])) print("F1", np.mean(lgbm_scores['test_f1'])) @@ -370,7 +396,8 @@ xgb_classifier_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(xgb_classifier_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy'])) +print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy'])) print("Precision", np.mean(xgb_classifier_scores['test_precision'])) print("Recall", np.mean(xgb_classifier_scores['test_recall'])) print("F1", np.mean(xgb_classifier_scores['test_f1'])) From a61ab9ee518b1409c561c9ee37b1b08846a96f4e Mon Sep 17 00:00:00 2001 From: Primoz Date: Thu, 15 Dec 2022 16:43:13 +0100 Subject: [PATCH 6/8] Add feature importance check. 
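The feature-importance check added in this patch hinges on cross_validate(..., return_estimator=True), which makes scikit-learn hand back the fitted estimator of every CV fold, so the per-fold feature_importances_ of the random forest can be averaged. A compact sketch of the same idea follows; note that the loop in the diff below re-averages a running mean via repeated concat + groupby(level=0).mean(), whereas stacking all folds first and taking a single column-wise mean, as here, weights each fold equally.

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_validate

    def cv_feature_importance(train_x, train_y, cv_method, groups=None):
        """Mean RandomForest feature importances across CV folds (sketch)."""
        scores = cross_validate(
            RandomForestClassifier(),
            X=train_x, y=train_y,
            groups=groups, cv=cv_method, n_jobs=-1,
            return_estimator=True,  # keep the fitted estimator of each fold
        )
        # One row per fold, one column per feature.
        fold_importances = pd.DataFrame(
            [est.feature_importances_ for est in scores['estimator']],
            columns=train_x.columns,
        )
        return fold_importances.mean().sort_values(ascending=False)

The head of the returned Series corresponds to the head(10)/head(100) printouts in the diff below.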
--- exploration/ml_pipeline_classification.py | 34 ++++++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py index f539025..736e3db 100644 --- a/exploration/ml_pipeline_classification.py +++ b/exploration/ml_pipeline_classification.py @@ -42,11 +42,12 @@ if nb_dir not in sys.path: # %% [markdown] # ## Set script's parameters cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) -n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs -under_sampling = True # (bool) Will train and test data on balanced dataset (using undersampling method) +n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs +undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method) # %% jupyter={"source_hidden": true} model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv") +# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))] # %% jupyter={"source_hidden": true} index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] @@ -65,7 +66,7 @@ model_input['target'].value_counts() # %% jupyter={"source_hidden": true} # UnderSampling -if under_sampling: +if undersampling: model_input.groupby("pid").count() no_stress = model_input[model_input['target'] == 0] stress = model_input[model_input['target'] == 1] @@ -315,7 +316,8 @@ rfc_scores = cross_validate( cv=cv_method, n_jobs=-1, error_score='raise', - scoring=('accuracy', 'precision', 'recall', 'f1') + scoring=('accuracy', 'precision', 'recall', 'f1'), + return_estimator=True ) # %% jupyter={"source_hidden": true} print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy'])) @@ -326,6 +328,28 @@ print("F1", np.mean(rfc_scores['test_f1'])) print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1]) print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl])) +# %% [markdown] +# ### Feature importance (RFC) + +# %% jupyter={"source_hidden": true} +rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns)) +for idx, estimator in enumerate(rfc_scores['estimator']): + print("\nFeatures sorted by their score for estimator {}:".format(idx)) + feature_importances = pd.DataFrame(estimator.feature_importances_, + index = list(train_x.columns), + columns=['importance']) + print(feature_importances.sort_values('importance', ascending=False).head(10)) + rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean() + +pd.set_option('display.max_rows', 100) +print(rfc_es_fimp.sort_values('importance', ascending=False).head(100)) + +rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar() + +rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar() + +train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts() + # %% [markdown] # ### Gradient Boosting Classifier @@ -403,3 +427,5 @@ print("Recall", np.mean(xgb_classifier_scores['test_recall'])) print("F1", np.mean(xgb_classifier_scores['test_f1'])) print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1]) print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], 
n_sl)[:n_sl])) + +# %% From adcb823d3fefc3db40de3f0eb0d54331bf7e5ba8 Mon Sep 17 00:00:00 2001 From: Primoz Date: Thu, 15 Dec 2022 16:43:40 +0100 Subject: [PATCH 7/8] Add stress event duration exploration script. --- exploration/expl_stress_event.py | 158 +++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 exploration/expl_stress_event.py diff --git a/exploration/expl_stress_event.py b/exploration/expl_stress_event.py new file mode 100644 index 0000000..b2aaabc --- /dev/null +++ b/exploration/expl_stress_event.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.13.0 +# kernelspec: +# display_name: straw2analysis +# language: python +# name: straw2analysis +# --- + +# %% +import os +import sys +import datetime +import math + +import seaborn as sns + +nb_dir = os.path.split(os.getcwd())[0] +if nb_dir not in sys.path: + sys.path.append(nb_dir) +import participants.query_db +from features.esm import * +from features.esm_JCQ import * +from features.esm_SAM import * + +from IPython.core.interactiveshell import InteractiveShell +InteractiveShell.ast_node_interactivity = "all" + +# %% +participants_inactive_usernames = participants.query_db.get_usernames( + collection_start=datetime.date.fromisoformat("2020-08-01") +) +df_esm_inactive = get_esm_data(participants_inactive_usernames) + +# %% +df_esm_preprocessed = preprocess_esm(df_esm_inactive) + + +# %% [markdown] +# Investigate stressfulness events +# %% +extracted_ers = df_esm_preprocessed.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length +extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that answering the questionnaire took at most 15 min +session_start_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].min().to_frame().rename(columns={'timestamp': 'session_start_timestamp'}) # questionnaire start timestamp +session_end_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp +se_time = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'}) +se_duration = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'}) + +# Convert se_durations to the appropriate lengths + +# Extracted 3 targets that will be transferred via the csv file to the cleaning script.
+df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].columns +se_stressfulness_event_tg = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'appraisal_stressfulness_event'}) + +# All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count) +extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \ .join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \ .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \ .join(se_time, on=['device_id', 'esm_session'], how='left') \ .join(se_duration, on=['device_id', 'esm_session'], how='left') \ + +# Filter out the sessions that are not useful. Because of the ambiguity this excludes: +# (1) straw event times that are marked as "0 - I don't remember" +# (2) straw event durations that are marked as "0 - I don't remember" +extracted_ers = extracted_ers[(~extracted_ers.se_time.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("Removed "))] +extracted_ers.reset_index(drop=True, inplace=True) + +# Add default duration in case the participant answered that no stressful event occurred + +# Prepare data to fit the data structure in the CSV file ... +# Add the event time as the end of the questionnaire if no stress event occurred +extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp']) +# Type could be an int (timestamp [ms]) which stays the same, and datetime str which is converted to timestamp in milliseconds +extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64') +extracted_ers['shift_direction'] = -1 + +""">>>>> begin section (could be optimized) <<<<<""" + +# Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire +# is taken as end time of the segment. Otherwise the user input duration is taken. +extracted_ers['temp_duration'] = extracted_ers['se_duration'] +extracted_ers['se_duration'] = \ + np.where( + extracted_ers['se_duration'].astype(str).str.startswith("1 - "), + extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'], + extracted_ers['se_duration'] + ) + +# This converts the rows of timestamps in milliseconds and the rows with datetime... to timestamp in seconds. + +extracted_ers['se_duration'] = \ + extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else abs(pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + +# Check whether min se_duration is at least the same duration as the ioi. Filter out the rest.
+ """>>>>> end section <<<<<""" # %% # Count negative values of duration print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]) print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]) print("Count negative durations (invalid se_time user input):", extracted_ers[extracted_ers['se_duration'] < 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]) print("Count 0 durations:", extracted_ers[extracted_ers['se_duration'] == 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]) extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0] extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']] ax = extracted_ers[(extracted_ers['se_duration'] < 5000) & (extracted_ers['se_duration'] > -300)].hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9) extracted_ers[(extracted_ers['se_duration'] < 1000) & (extracted_ers['se_duration'] > -1000)]['se_duration'].value_counts() hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna()) hist bin_edges extracted_ers['se_duration'].describe() extracted_ers['se_duration'].median() # %% # bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos' ..... right=False bins = [-100000000, -0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more' extracted_ers['bins'], edges = pd.cut(extracted_ers.se_duration, bins=bins, labels=['neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high'] sns.displot( data=extracted_ers.dropna(), x="bins", binwidth=0.1, ) # %% # Here we are interested in how far the stress event times are from the end of the questionnaire. extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()] extracted_ers[['session_end_timestamp', 'event_timestamp']] extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp']) extracted_ers['diff_se_time_session_end'].dropna().value_counts() extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)] bins2 = [-0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more' extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high'] sns.displot( data=extracted_ers.dropna(), x="bins2", binwidth=0.1, ) extracted_ers.shape extracted_ers.dropna().shape # %% extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int) print("duration-target (corr):", extracted_ers['se_duration'].corr(extracted_ers['appraisal_stressfulness_event_num'])) # %% # Explore groupby participants?
\ No newline at end of file From 339142ff31d60ad74fb3bd1ac16cf78f7043dbc0 Mon Sep 17 00:00:00 2001 From: Primoz Date: Wed, 21 Dec 2022 15:02:25 +0100 Subject: [PATCH 8/8] Add expl stress event script and other changes. --- exploration/expl_stress_event.py | 34 ++++++++++++++--- exploration/ml_pipeline_classification.py | 10 +++---- rapids | 2 +- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/exploration/expl_stress_event.py b/exploration/expl_stress_event.py index b2aaabc..8fc5bf1 100644 --- a/exploration/expl_stress_event.py +++ b/exploration/expl_stress_event.py @@ -75,7 +75,7 @@ extracted_ers.reset_index(drop=True, inplace=True) # Add default duration in case the participant answered that no stressful event occurred # Prepare data to fit the data structure in the CSV file ... -# Add the event time as the end of the questionnaire if no stress event occurred +# Add the event time as the start of the questionnaire if no stress event occurred extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp']) # Type could be an int (timestamp [ms]) which stays the same, and datetime str which is converted to timestamp in milliseconds extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64') @@ -102,7 +102,7 @@ extracted_ers['se_duration'] = \ """>>>>> end section <<<<<""" -# %% +# %% [markdown] # Count negative values of duration print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]) print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]) @@ -111,14 +111,12 @@ print("Count 0 durations:", extracted_ers[extracted_ers['se_duration'] == 0][['s extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0] extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']] -ax = extracted_ers[(extracted_ers['se_duration'] < 5000) & (extracted_ers['se_duration'] > -300)].hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9) -extracted_ers[(extracted_ers['se_duration'] < 1000) & (extracted_ers['se_duration'] > -1000)]['se_duration'].value_counts() +ax = extracted_ers.hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9) hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna()) hist bin_edges -extracted_ers['se_duration'].describe() -extracted_ers['se_duration'].median() +extracted_ers = extracted_ers[extracted_ers['se_duration'] >= 0] # %% # bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos' ..... right=False @@ -131,15 +129,23 @@ sns.displot( binwidth=0.1, ) -# %% +# %% [markdown] +extracted_ers[extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'] >= 0] +extracted_ers['se_time'].value_counts() +pd.set_option('display.max_rows', 100) # Here we are interested in how far the stress event times are from the end of the questionnaire.
-extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()] -extracted_ers[['session_end_timestamp', 'event_timestamp']] +extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()] # Remove no stress events extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp']) -extracted_ers['diff_se_time_session_end'].dropna().value_counts() -extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)] -bins2 = [-0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more' -extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high'] + +print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0]) +print("Count negative durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] < 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']]) +print("Count 0 durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] == 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0]) + +extracted_ers[extracted_ers['diff_se_time_session_end'] < 0]['diff_se_time_session_end'] +# extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)] +bins2 = [-100000, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more' +extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high'] +extracted_ers['bins2'] sns.displot( data=extracted_ers.dropna(), x="bins2", @@ -149,6 +155,8 @@ sns.displot( extracted_ers.shape extracted_ers.dropna().shape +print() + # %% extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int) diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py index 736e3db..0b5dc15 100644 --- a/exploration/ml_pipeline_classification.py +++ b/exploration/ml_pipeline_classification.py @@ -41,12 +41,12 @@ if nb_dir not in sys.path: # %% [markdown] # ## Set script's parameters -cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) +cv_method_str = '5kfold' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method) # %% jupyter={"source_hidden": true} -model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv") +model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv") # model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))] # %% jupyter={"source_hidden": true} @@ -334,15 +334,15 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], # %% jupyter={"source_hidden": true} rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns)) for idx, estimator in 
enumerate(rfc_scores['estimator']): - print("\nFeatures sorted by their score for estimator {}:".format(idx)) feature_importances = pd.DataFrame(estimator.feature_importances_, index = list(train_x.columns), columns=['importance']) - print(feature_importances.sort_values('importance', ascending=False).head(10)) + # print("\nFeatures sorted by their score for estimator {}:".format(idx)) + # print(feature_importances.sort_values('importance', ascending=False).head(10)) rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean() pd.set_option('display.max_rows', 100) -print(rfc_es_fimp.sort_values('importance', ascending=False).head(100)) +print(rfc_es_fimp.sort_values('importance', ascending=False).head(30)) rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar() diff --git a/rapids b/rapids index 8a6b52a..7f5a4e6 160000 --- a/rapids +++ b/rapids @@ -1 +1 @@ -Subproject commit 8a6b52a97c95dcd8b70b980b4f46421b1a847905 +Subproject commit 7f5a4e6744e502d40dc38502e1e74bd2bf9fe786
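A closing note on exploration/expl_stress_event.py, which the last two patches introduce and revise: its trickiest step is normalizing se_duration, where a single column mixes integer millisecond values, datetime-like strings, and the "1 - It's still ongoing" sentinel. A standalone sketch of that conversion under the semantics described by the script's comments follows; the helper name and the sample values are hypothetical.

    import math
    import pandas as pd

    def se_duration_seconds(value, session_end_ms, event_ms):
        """Normalize one se_duration answer to seconds (sketch, assumed semantics)."""
        if isinstance(value, str) and value.startswith("1 - "):
            # "It's still ongoing": the event is taken to last until the
            # questionnaire session ends.
            return math.ceil((session_end_ms - event_ms) / 1000)
        if isinstance(value, int):
            # Already a duration in milliseconds.
            return math.ceil(value / 1000)
        # Otherwise a datetime-like string whose hour/minute part encodes the
        # duration, as in the apply() lambda of patch 7.
        t = pd.to_datetime(value)
        return (t.hour * 60 + t.minute) * 60

    # Hypothetical sample inputs: 90 s given in ms, an ongoing event, "01:30".
    print(se_duration_seconds(90_000, 0, 0))                                  # 90
    print(se_duration_seconds("1 - It's still ongoing", 1_000_000, 400_000))  # 600
    print(se_duration_seconds("01:30", 0, 0))                                 # 5400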