Merge branch 'ml_pipeline' of repo.ijs.si:junoslukan/straw2analysis into ml_pipeline

# Conflicts: # exploration/ml_pipeline_daily.py - deleted
2022-12-07 15:36:52 +01:00 · 2022-12-07 15:36:52 +01:00 · 12f2c927fa
parent 71e1fcf8ca cf0e4f89be
commit 12f2c927fa
8 changed files with 861 additions and 819 deletions
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@ -0,0 +1,385 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.dummy import DummyClassifier
+from sklearn.impute import SimpleImputer
+
+from lightgbm import LGBMClassifier
+import xgboost as xg
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.labels
+import machine_learning.model
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## Set script's parameters
+cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %% jupyter={"source_hidden": true}
+model_input = pd.read_csv("../data/stressfulness_event_nonstandardized/input_appraisal_stressfulness_event_mean.csv")
+
+# %% jupyter={"source_hidden": true}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input.set_index(index_columns, inplace=True)
+model_input['target'].value_counts()
+
+# %% jupyter={"source_hidden": true}
+# Binarise the target and build the feature matrix, labels and cross-validation groups.
+# bins = [-10, -1, 1, 10] # bins for z-scored targets
+bins = [0, 1, 4] # bins for stressfulness (1-4) target
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
+model_input['target'].value_counts(), edges
+# model_input = model_input[model_input['target'] != "medium"]
+# NOTE(review): everything that is not "low" is encoded as 1, including targets that fell outside the bins (NaN -> "nan").
+model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
+
+model_input['target'].value_counts()
+
+if cv_method_str == 'halflogo':
+    # Position of each row within its participant and the participant's total row count
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+
+    # Relative position in [0, 1) shifted by 1 and rounded: first half of the rows -> 1, second half -> 2
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+
+    # pid_count is only a helper for the split, so it is dropped together with the other helper columns
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
+else:
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+
+
+# %% jupyter={"source_hidden": true}
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+categorical_features = data_x[categorical_feature_colnames].copy()
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+train_x = pd.concat([numerical_features, categorical_features], axis=1)
+train_x.dtypes
+
+# %% jupyter={"source_hidden": true}
+# Choose the cross-validation splitter: leave-one-group-out for the group-based methods, otherwise the default.
+cv_method = None # Defaults to 5 k-folds in cross_validate method
+# 'halflogo' is the spelling used by the parameter cell and the grouping code; 'half_logo' is kept for backward compatibility
+if cv_method_str in ('logo', 'halflogo', 'half_logo'):
+    cv_method = LeaveOneGroupOut()
+    cv_method.get_n_splits(
+        train_x,
+        data_y,
+        groups=data_groups,
+    )
+
+# %% jupyter={"source_hidden": true}
+imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+
+# %% [markdown]
+# ### Baseline: Dummy Classifier (most frequent)
+dummy_class = DummyClassifier(strategy="most_frequent")
+
+# %% jupyter={"source_hidden": true}
+# Cross-validate the baseline classifier with the chosen splitter and groups.
+# NOTE(review): imputer.fit_transform(train_x) is evaluated once on the full feature matrix before any split,
+# so the imputed medians are computed with the held-out rows included.
+# NOTE(review): 'average_precision' is requested here whereas the other models in this file request 'precision';
+# the value printed under "Precision" below is therefore a different metric -- confirm this is intended.
+dummy_classifier = cross_validate(
+    dummy_class,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'average_precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+# Mean of every metric over the CV folds, followed by the n_sl best and n_sl worst fold accuracies.
+print("Acc", np.mean(dummy_classifier['test_accuracy']))
+print("Precision", np.mean(dummy_classifier['test_average_precision']))
+print("Recall", np.mean(dummy_classifier['test_recall']))
+print("F1", np.mean(dummy_classifier['test_f1']))
+# Partitioning the negated scores puts the n_sl largest accuracies first; they are printed in descending order
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
+# The n_sl smallest accuracies, printed in ascending order
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Logistic Regression
+
+# %% jupyter={"source_hidden": true}
+logistic_regression = linear_model.LogisticRegression()
+
+# %% jupyter={"source_hidden": true}
+log_reg_scores = cross_validate(
+    logistic_regression,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.mean(log_reg_scores['test_accuracy']))
+print("Precision", np.mean(log_reg_scores['test_precision']))
+print("Recall", np.mean(log_reg_scores['test_recall']))
+print("F1", np.mean(log_reg_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Support Vector Machine
+
+# %% jupyter={"source_hidden": true}
+svc = svm.SVC()
+
+# %% jupyter={"source_hidden": true}
+svc_scores = cross_validate(
+    svc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.mean(svc_scores['test_accuracy']))
+print("Precision", np.mean(svc_scores['test_precision']))
+print("Recall", np.mean(svc_scores['test_recall']))
+print("F1", np.mean(svc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Gaussian Naive Bayes
+
+# %% jupyter={"source_hidden": true}
+gaussian_nb = naive_bayes.GaussianNB()
+
+# %% jupyter={"source_hidden": true}
+gaussian_nb_scores = cross_validate(
+    gaussian_nb,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.mean(gaussian_nb_scores['test_accuracy']))
+print("Precision", np.mean(gaussian_nb_scores['test_precision']))
+print("Recall", np.mean(gaussian_nb_scores['test_recall']))
+print("F1", np.mean(gaussian_nb_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Stochastic Gradient Descent Classifier
+
+# %% jupyter={"source_hidden": true}
+sgdc = linear_model.SGDClassifier()
+
+# %% jupyter={"source_hidden": true}
+sgdc_scores = cross_validate(
+    sgdc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.mean(sgdc_scores['test_accuracy']))
+print("Precision", np.mean(sgdc_scores['test_precision']))
+print("Recall", np.mean(sgdc_scores['test_recall']))
+print("F1", np.mean(sgdc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### K-nearest neighbors
+
+# %% jupyter={"source_hidden": true}
+knn = neighbors.KNeighborsClassifier()
+
+# %% jupyter={"source_hidden": true}
+knn_scores = cross_validate(
+    knn,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.mean(knn_scores['test_accuracy']))
+print("Precision", np.mean(knn_scores['test_precision']))
+print("Recall", np.mean(knn_scores['test_recall']))
+print("F1", np.mean(knn_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Decision Tree
+
+# %% jupyter={"source_hidden": true}
+dtree = tree.DecisionTreeClassifier()
+
+# %% jupyter={"source_hidden": true}
+dtree_scores = cross_validate(
+    dtree,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.mean(dtree_scores['test_accuracy']))
+print("Precision", np.mean(dtree_scores['test_precision']))
+print("Recall", np.mean(dtree_scores['test_recall']))
+print("F1", np.mean(dtree_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Random Forest Classifier
+
+# %% jupyter={"source_hidden": true}
+rfc = ensemble.RandomForestClassifier()
+
+# %% jupyter={"source_hidden": true}
+rfc_scores = cross_validate(
+    rfc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.mean(rfc_scores['test_accuracy']))
+print("Precision", np.mean(rfc_scores['test_precision']))
+print("Recall", np.mean(rfc_scores['test_recall']))
+print("F1", np.mean(rfc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### Gradient Boosting Classifier
+
+# %% jupyter={"source_hidden": true}
+gbc = ensemble.GradientBoostingClassifier()
+
+# %% jupyter={"source_hidden": true}
+gbc_scores = cross_validate(
+    gbc,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.mean(gbc_scores['test_accuracy']))
+print("Precision", np.mean(gbc_scores['test_precision']))
+print("Recall", np.mean(gbc_scores['test_recall']))
+print("F1", np.mean(gbc_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### LGBM Classifier
+
+# %% jupyter={"source_hidden": true}
+lgbm = LGBMClassifier()
+
+# %% jupyter={"source_hidden": true}
+lgbm_scores = cross_validate(
+    lgbm,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.mean(lgbm_scores['test_accuracy']))
+print("Precision", np.mean(lgbm_scores['test_precision']))
+print("Recall", np.mean(lgbm_scores['test_recall']))
+print("F1", np.mean(lgbm_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %% [markdown]
+# ### XGBoost Classifier
+
+# %% jupyter={"source_hidden": true}
+xgb_classifier = xg.sklearn.XGBClassifier()
+
+# %% jupyter={"source_hidden": true}
+xgb_classifier_scores = cross_validate(
+    xgb_classifier,
+    X=imputer.fit_transform(train_x),
+    y=data_y,
+    groups=data_groups,
+    cv=cv_method,
+    n_jobs=-1,
+    error_score='raise',
+    scoring=('accuracy', 'precision', 'recall', 'f1')
+)
+# %% jupyter={"source_hidden": true}
+print("Acc", np.mean(xgb_classifier_scores['test_accuracy']))
+print("Precision", np.mean(xgb_classifier_scores['test_precision']))
+print("Recall", np.mean(xgb_classifier_scores['test_recall']))
+print("F1", np.mean(xgb_classifier_scores['test_f1']))
+print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
+print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@ -0,0 +1,184 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from scipy import stats
+
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from sklearn.impute import SimpleImputer
+
+from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
+from lightgbm import LGBMClassifier
+import xgboost as xg 
+
+from sklearn.cluster import KMeans
+
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.labels
+import machine_learning.model
+from machine_learning.classification_models import ClassificationModels
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# ## Set script's parameters
+n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter)
+cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %% jupyter={"source_hidden": true}
+# Load the model input and pick the column used for clustering participants.
+model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+
+# numeric_only=True skips string columns, for which the variance cannot be computed (recent pandas raises instead of dropping them)
+clust_col = model_input.set_index(index_columns).var(numeric_only=True).idxmax() # age is a col with the highest variance
+
+model_input.columns[list(model_input.columns).index('age'):-1]
+
+lime_cols = [col for col in model_input if col.startswith('limesurvey')]
+lime_cols
+lime_col = 'limesurvey_demand_control_ratio'
+# The highest-variance choice above is overridden: clustering is done on the selected limesurvey column
+clust_col = lime_col
+
+model_input[clust_col].describe()
+
+
+# %% jupyter={"source_hidden": true}
+
+# Filter-out outlier rows by clust_col 
+model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+
+uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+plt.bar(uniq['pid'], uniq[clust_col])
+
+# %% jupyter={"source_hidden": true}
+# Cluster the participants by the cluster column and merge the cluster labels into the main df
+# A fixed random_state makes the cluster assignment (and the per-cluster results below) reproducible
+km = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(uniq.set_index('pid'))
+np.unique(km, return_counts=True)
+uniq['cluster'] = km
+uniq
+
+# Join explicitly on the participant id
+# NOTE(review): assumes one clust_col value per pid; a pid with several distinct values would duplicate its rows here
+model_input = model_input.merge(uniq[['pid', 'cluster']], on='pid')
+
+# %% jupyter={"source_hidden": true}
+model_input.set_index(index_columns, inplace=True)
+
+# %% jupyter={"source_hidden": true}
+# Create dict with classification ml models
+cm = ClassificationModels()
+cmodels = cm.get_cmodels()
+
+# %% jupyter={"source_hidden": true}
+# For every cluster: binarise the target, build features and CV groups, then cross-validate each model.
+for k in range(n_clusters):
+    model_input_subset = model_input[model_input["cluster"] == k].copy()
+    bins = [-10, -1, 1, 10] # bins for z-scored targets
+    model_input_subset.loc[:, 'target'] = \
+        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']
+    model_input_subset['target'].value_counts()
+    # Drop the middle class and encode the rest: "low" -> 0, everything else -> 1
+    model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
+    model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
+
+    model_input_subset['target'].value_counts()
+
+    if cv_method_str == 'halflogo':
+        # Position of each row within its participant and the participant's total row count
+        model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
+        model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
+
+        # Relative position in [0, 1) shifted by 1 and rounded: first half of the rows -> 1, second half -> 2
+        model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
+        model_input_subset["pid_half"] = model_input_subset["pid"] + "_" +  model_input_subset["pid_index"].astype(int).astype(str)
+
+        # pid_count is only a helper for the split, so it is dropped together with the other helper columns
+        data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"]
+    else:
+        data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]
+
+    # Treat categorical features
+    categorical_feature_colnames = ["gender", "startlanguage"]
+    additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+    categorical_feature_colnames += additional_categorical_features
+
+    categorical_features = data_x[categorical_feature_colnames].copy()
+    mode_categorical_features = categorical_features.mode().iloc[0]
+
+    # fillna with mode
+    categorical_features = categorical_features.fillna(mode_categorical_features)
+
+    # one-hot encoding
+    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+    if not categorical_features.empty:
+        categorical_features = pd.get_dummies(categorical_features)
+
+    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
+    train_x = pd.concat([numerical_features, categorical_features], axis=1)
+
+    # Establish cv method
+    cv_method = None # Defaults to 5 k-folds in cross_validate method
+    # 'halflogo' is the spelling used by the parameter cell and the grouping code above; 'half_logo' is kept for backward compatibility
+    if cv_method_str in ('logo', 'halflogo', 'half_logo'):
+        cv_method = LeaveOneGroupOut()
+        cv_method.get_n_splits(
+            train_x,
+            data_y,
+            groups=data_groups,
+        )
+
+    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+    # The imputed matrix is the same for every model, so it is computed once per cluster
+    # NOTE(review): the imputer is fitted on all rows of the cluster before the CV split
+    imputed_x = imputer.fit_transform(train_x)
+
+    for model_title, model in cmodels.items():
+
+        classifier = cross_validate(
+            model['model'],
+            X=imputed_x,
+            y=data_y,
+            groups=data_groups,
+            cv=cv_method,
+            n_jobs=-1,
+            error_score='raise',
+            scoring=('accuracy', 'precision', 'recall', 'f1')
+        )
+
+        print("\n-------------------------------------\n")
+        print("Current cluster:", k, end="\n")
+        print("Current model:", model_title, end="\n")
+        print("Acc", np.mean(classifier['test_accuracy']))
+        print("Precision", np.mean(classifier['test_precision']))
+        print("Recall", np.mean(classifier['test_recall']))
+        print("F1", np.mean(classifier['test_f1']))
+        print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
+        print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl]))
+
+        # Add this cluster's mean scores to the running totals kept for every model
+        cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
+        cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
+        cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall'])
+        cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])
+
+# %% jupyter={"source_hidden": true}
+# Get overall results
+cm.get_total_models_scores(n_clusters=n_clusters)
--- a/exploration/ml_pipeline_classification_with_clustering_2_class.py
+++ b/exploration/ml_pipeline_classification_with_clustering_2_class.py
@ -0,0 +1,181 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": true}
+# %matplotlib inline
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from scipy import stats
+
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate, train_test_split
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+
+from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
+from lightgbm import LGBMClassifier
+import xgboost as xg 
+
+from sklearn.cluster import KMeans
+
+from IPython.core.interactiveshell import InteractiveShell
+InteractiveShell.ast_node_interactivity = "all"
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+import machine_learning.labels
+import machine_learning.model
+from machine_learning.classification_models import ClassificationModels
+
+# %% [markdown]
+# # RAPIDS models
+
+# %% [markdown]
+# # Useful method
+def treat_categorical_features(input_set):
+    """Return input_set with its categorical columns replaced by one-hot dummies.
+
+    The categorical columns are "gender" and "startlanguage" plus every column
+    whose name contains "mostcommonactivity" or "homelabel". Missing values in
+    these columns are filled with the column's mode before encoding.
+
+    Parameters
+    ----------
+    input_set : pandas.DataFrame
+        Frame that contains the "gender" and "startlanguage" columns.
+
+    Returns
+    -------
+    pandas.DataFrame
+        The remaining (non-categorical) columns of input_set followed by the
+        dummy columns, concatenated column-wise.
+    """
+    categorical_feature_colnames = ["gender", "startlanguage"]
+    additional_categorical_features = [col for col in input_set.columns if "mostcommonactivity" in col or "homelabel" in col]
+    categorical_feature_colnames += additional_categorical_features
+        
+    categorical_features = input_set[categorical_feature_colnames].copy()
+    # First row of the mode frame: one most frequent value per column
+    mode_categorical_features = categorical_features.mode().iloc[0]
+
+    # fillna with mode
+    categorical_features = categorical_features.fillna(mode_categorical_features)
+
+    # one-hot encoding
+    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+    if not categorical_features.empty:
+        categorical_features = pd.get_dummies(categorical_features)
+
+    # Everything that was not treated as categorical is passed through unchanged
+    numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
+    
+    return pd.concat([numerical_features, categorical_features], axis=1)
+
+# %% [markdown]
+# ## Set script's parameters
+n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
+n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
+
+# %% jupyter={"source_hidden": true}
+# Load the model input and pick the column used for clustering participants.
+model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+
+# numeric_only=True skips string columns, for which the variance cannot be computed (recent pandas raises instead of dropping them)
+clust_col = model_input.set_index(index_columns).var(numeric_only=True).idxmax() # age is a col with the highest variance
+
+model_input.columns[list(model_input.columns).index('age'):-1]
+
+lime_cols = [col for col in model_input if col.startswith('limesurvey')]
+lime_cols
+lime_col = 'limesurvey_demand_control_ratio'
+# The highest-variance choice above is overridden: clustering is done on the selected limesurvey column
+clust_col = lime_col
+
+model_input[clust_col].describe()
+
+
+# %% jupyter={"source_hidden": true}
+
+# Filter-out outlier rows by clust_col 
+model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
+
+uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
+plt.bar(uniq['pid'], uniq[clust_col])
+
+# %% jupyter={"source_hidden": true}
+# Cluster the participants by the cluster column and merge the cluster labels into the main df
+# A fixed random_state makes the cluster assignment (and the per-cluster results below) reproducible
+km = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(uniq.set_index('pid'))
+np.unique(km, return_counts=True)
+uniq['cluster'] = km
+uniq
+
+# Join explicitly on the participant id
+# NOTE(review): assumes one clust_col value per pid; a pid with several distinct values would duplicate its rows here
+model_input = model_input.merge(uniq[['pid', 'cluster']], on='pid')
+
+# %% jupyter={"source_hidden": true}
+model_input.set_index(index_columns, inplace=True)
+
+# %% jupyter={"source_hidden": true}
+# Create dict with classification ml models
+cm = ClassificationModels()
+cmodels = cm.get_cmodels()
+
+# %% jupyter={"source_hidden": true}
+# For every cluster: binarise the target, split by target percentiles, then fit and score each model.
+for k in range(n_clusters):
+    model_input_subset = model_input[model_input["cluster"] == k].copy()
+
+    # Rows at or below the 15th percentile or at or above the 85th percentile of the numerical target form the held-out set -> the rest is the training set. Only two classes, separated by z-score of 0.
+    model_input_subset['numerical_target'] = model_input_subset['target']
+    bins = [-10, 0, 10] # bins for z-scored targets
+    model_input_subset.loc[:, 'target'] = \
+        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
+
+    p15 = np.percentile(model_input_subset['numerical_target'], 15)
+    p85 = np.percentile(model_input_subset['numerical_target'], 85)
+
+    # Treat categorical features
+    model_input_subset = treat_categorical_features(model_input_subset)
+
+    # Split to train, validate, and test subsets
+    train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
+    test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
+
+    train_set['target'].value_counts()
+    test_set['target'].value_counts()
+
+    train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
+
+    # The held-out rows are halved into a validation and a test part
+    validate_x, test_x, validate_y, test_y = \
+        train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)
+
+    # Impute missing values
+    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+
+    # Medians are learned from the training rows only and reused for the held-out rows, so no
+    # validation/test statistics are used and all three matrices keep the same columns
+    train_x = imputer.fit_transform(train_x)
+    validate_x = imputer.transform(validate_x)
+    test_x = imputer.transform(test_x)
+
+    for model_title, model in cmodels.items():
+        model['model'].fit(train_x, train_y)
+        y_pred = model['model'].predict(validate_x)
+
+        acc = accuracy_score(validate_y, y_pred)
+        prec = precision_score(validate_y, y_pred)
+        rec = recall_score(validate_y, y_pred)
+        f1 = f1_score(validate_y, y_pred)
+
+        print("\n-------------------------------------\n")
+        print("Current cluster:", k, end="\n")
+        print("Current model:", model_title, end="\n")
+        print("Acc", acc)
+        print("Precision", prec)
+        print("Recall", rec)
+        print("F1", f1)
+
+        # Add this cluster's scores to the running totals kept for every model
+        cmodels[model_title]['metrics'][0] += acc
+        cmodels[model_title]['metrics'][1] += prec
+        cmodels[model_title]['metrics'][2] += rec
+        cmodels[model_title]['metrics'][3] += f1
+
+# %% jupyter={"source_hidden": true}
+# Get overall results
+cm.get_total_models_scores(n_clusters=n_clusters)
--- a/exploration/ml_pipeline_daily.py
+++ b/exploration/ml_pipeline_daily.py
@ -1,472 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.13.0
-#   kernelspec:
-#     display_name: straw2analysis
-#     language: python
-#     name: straw2analysis
-# ---
-
-# %% jupyter={"source_hidden": true}
-# %matplotlib inline
-import datetime
-import importlib
-import os
-import sys
-
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-import yaml
-from pyprojroot import here
-from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
-from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
-from sklearn.metrics import mean_squared_error, r2_score
-from sklearn.impute import SimpleImputer
-from xgboost import XGBRegressor
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
-
-import machine_learning.features_sensor
-import machine_learning.labels
-import machine_learning.model
-
-# %% [markdown]
-# # RAPIDS models
-
-# %% [markdown]
-# ## PANAS negative affect
-
-# %% jupyter={"source_hidden": true}
-# model_input = pd.read_csv("../data/input_PANAS_NA.csv") # Nestandardizirani podatki - pred temeljitim čiščenjem
-model_input = pd.read_csv("../data/z_input_PANAS_NA.csv") # Standardizirani podatki - pred temeljitim čiščenjem
-# %% [markdown]
-# ### NaNs before dropping cols and rows
-
-# %% jupyter={"source_hidden": true}
-sns.set(rc={"figure.figsize":(16, 8)})
-sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
-
-# %% jupyter={"source_hidden": true}
-nan_cols = list(model_input.loc[:, model_input.isna().all()].columns)
-nan_cols
-
-# %% jupyter={"source_hidden": true}
-model_input.dropna(axis=1, how="all", inplace=True)
-model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
-
-# %% [markdown]
-# ### NaNs after dropping NaN cols and rows where target is NaN
-
-# %% jupyter={"source_hidden": true}
-sns.set(rc={"figure.figsize":(16, 8)})
-sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
-
-# %% jupyter={"source_hidden": true}
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-#if "pid" in model_input.columns:
-#    index_columns.append("pid")
-model_input.set_index(index_columns, inplace=True)
-
-data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
-
-# %% jupyter={"source_hidden": true}
-categorical_feature_colnames = ["gender", "startlanguage"]
-
-# %% jupyter={"source_hidden": true}
-categorical_features = data_x[categorical_feature_colnames].copy()
-
-# %% jupyter={"source_hidden": true}
-mode_categorical_features = categorical_features.mode().iloc[0]
-
-# %% jupyter={"source_hidden": true}
-# fillna with mode
-categorical_features = categorical_features.fillna(mode_categorical_features)
-
-# %% jupyter={"source_hidden": true}
-# one-hot encoding
-categorical_features = categorical_features.apply(lambda col: col.astype("category"))
-if not categorical_features.empty:
-    categorical_features = pd.get_dummies(categorical_features)
-
-# %% jupyter={"source_hidden": true}
-numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x = pd.concat([numerical_features, categorical_features], axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x.dtypes
-
-# %% jupyter={"source_hidden": true}
-logo = LeaveOneGroupOut()
-logo.get_n_splits(
-    train_x,
-    data_y,
-    groups=data_groups,
-)
-
-# %% jupyter={"source_hidden": true}
-sum(data_y.isna())
-
-# %% [markdown]
-# ### Linear Regression
-
-# %% jupyter={"source_hidden": true}
-lin_reg_rapids = linear_model.LinearRegression()
-
-# %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
-
-# %% jupyter={"source_hidden": true}
-lin_reg_scores = cross_val_score(
-    lin_reg_rapids,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring='r2'
-)
-lin_reg_scores
-np.median(lin_reg_scores)
-
-# %% [markdown]
-# ### Ridge regression
-
-# %% jupyter={"source_hidden": true}
-ridge_reg = linear_model.Ridge(alpha=.5)
-
-# %% tags=[] jupyter={"source_hidden": true}
-ridge_reg_scores = cross_val_score(
-    ridge_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(ridge_reg_scores)
-
-# %% [markdown]
-# ### Lasso
-
-# %% jupyter={"source_hidden": true}
-lasso_reg = linear_model.Lasso(alpha=0.1)
-
-# %% jupyter={"source_hidden": true}
-lasso_reg_score = cross_val_score(
-    lasso_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(lasso_reg_score)
-
-# %% [markdown]
-# ### Bayesian Ridge
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg = linear_model.BayesianRidge()
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg_score = cross_val_score(
-    bayesian_ridge_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-)
-np.median(bayesian_ridge_reg_score)
-
-# %% [markdown]
-# ### RANSAC (outlier robust regression)
-
-# %% jupyter={"source_hidden": true}
-ransac_reg = linear_model.RANSACRegressor()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-    ransac_reg,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-    )
-)
-
-# %% [markdown]
-# ### Support vector regression
-
-# %% jupyter={"source_hidden": true}
-svr = svm.SVR()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-    svr,
-    X=imputer.fit_transform(train_x),
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring="r2"
-    )
-)
-
-# %% [markdown]
-# ### Kernel Ridge regression
-
-# %% jupyter={"source_hidden": true}
-kridge = kernel_ridge.KernelRidge()
-
-# %% jupyter={"source_hidden": true}
-np.median(
-    cross_val_score(
-        kridge,
-        X=imputer.fit_transform(train_x),
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-)
-# %% [markdown]
-# ### Gaussian Process Regression
-
-# %% jupyter={"source_hidden": true}
-gpr = gaussian_process.GaussianProcessRegressor()
-
-# %% jupyter={"source_hidden": true}
-
-np.median(
-    cross_val_score(
-        gpr,
-        X=imputer.fit_transform(train_x),
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-)
-# %%
-def insert_row(df, row):
-    return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
-
-# %%
-def run_all_models(input_csv):
-    # Prepare data
-    model_input = pd.read_csv(input_csv)
-    model_input.dropna(axis=1, how="all", inplace=True)
-    model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
-
-    index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-    model_input.set_index(index_columns, inplace=True)
-
-    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
-
-    categorical_feature_colnames = ["gender", "startlanguage"]
-    categorical_features = data_x[categorical_feature_colnames].copy()
-    mode_categorical_features = categorical_features.mode().iloc[0]
-    # fillna with mode
-    categorical_features = categorical_features.fillna(mode_categorical_features)
-    # one-hot encoding
-    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
-    if not categorical_features.empty:
-        categorical_features = pd.get_dummies(categorical_features)
-
-    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
-
-    train_x = pd.concat([numerical_features, categorical_features], axis=1)
-    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
-    train_x_imputed = imputer.fit_transform(train_x)
-
-    # Prepare cross validation
-    logo = LeaveOneGroupOut()
-    logo.get_n_splits(
-        train_x,
-        data_y,
-        groups=data_groups,
-    )
-    scores = pd.DataFrame(columns=["method", "median", "max"])
-
-    # Validate models
-    lin_reg_rapids = linear_model.LinearRegression()
-    lin_reg_scores = cross_val_score(
-        lin_reg_rapids,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring='r2'
-    )
-    print("Linear regression:")
-    print(np.median(lin_reg_scores))
-    scores = insert_row(scores, ["Linear regression",np.median(lin_reg_scores),np.max(lin_reg_scores)])
-
-    ridge_reg = linear_model.Ridge(alpha=.5)
-    ridge_reg_scores = cross_val_score(
-        ridge_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Ridge regression")
-    print(np.median(ridge_reg_scores))
-    scores = insert_row(scores, ["Ridge regression",np.median(ridge_reg_scores),np.max(ridge_reg_scores)])
-
-    lasso_reg = linear_model.Lasso(alpha=0.1)
-    lasso_reg_score = cross_val_score(
-        lasso_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Lasso regression")
-    print(np.median(lasso_reg_score))
-    scores = insert_row(scores, ["Lasso regression",np.median(lasso_reg_score),np.max(lasso_reg_score)])
-
-    bayesian_ridge_reg = linear_model.BayesianRidge()
-    bayesian_ridge_reg_score = cross_val_score(
-        bayesian_ridge_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Bayesian Ridge")
-    print(np.median(bayesian_ridge_reg_score))
-    scores = insert_row(scores, ["Bayesian Ridge",np.median(bayesian_ridge_reg_score),np.max(bayesian_ridge_reg_score)])
-
-    ransac_reg = linear_model.RANSACRegressor()
-    ransac_reg_score = cross_val_score(
-        ransac_reg,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("RANSAC (outlier robust regression)")
-    print(np.median(ransac_reg_score))
-    scores = insert_row(scores, ["RANSAC",np.median(ransac_reg_score),np.max(ransac_reg_score)])
-
-    svr = svm.SVR()
-    svr_score = cross_val_score(
-        svr,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Support vector regression")
-    print(np.median(svr_score))
-    scores = insert_row(scores, ["Support vector regression",np.median(svr_score),np.max(svr_score)])
-
-    kridge = kernel_ridge.KernelRidge()
-    kridge_score = cross_val_score(
-        kridge,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Kernel Ridge regression")
-    print(np.median(kridge_score))
-    scores = insert_row(scores, ["Kernel Ridge regression",np.median(kridge_score),np.max(kridge_score)])
-
-    gpr = gaussian_process.GaussianProcessRegressor()
-    gpr_score = cross_val_score(
-        gpr,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Gaussian Process Regression")
-    print(np.median(gpr_score))
-    scores = insert_row(scores, ["Gaussian Process Regression",np.median(gpr_score),np.max(gpr_score)])
-
-    rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
-    rfr_score = cross_val_score(
-        rfr,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("Random Forest Regression")
-    print(np.median(rfr_score))
-    scores = insert_row(scores, ["Random Forest Regression",np.median(rfr_score),np.max(rfr_score)])
-
-    xgb = XGBRegressor()
-    xgb_score = cross_val_score(
-        xgb,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("XGBoost Regressor")
-    print(np.median(xgb_score))
-    scores = insert_row(scores, ["XGBoost Regressor",np.median(xgb_score),np.max(xgb_score)])
-
-    ada = ensemble.AdaBoostRegressor()
-    ada_score = cross_val_score(
-        ada,
-        X=train_x_imputed,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring="r2"
-    )
-    print("ADA Boost Regressor")
-    print(np.median(ada_score))
-    scores = insert_row(scores, ["ADA Boost Regressor",np.median(ada_score),np.max(ada_score)])
-
-    return scores
-
-
-
-
--- a/exploration/ml_pipeline_daily_cleaned_daily.py
+++ b/exploration/ml_pipeline_daily_cleaned_daily.py
@ -1,332 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.13.0
-#   kernelspec:
-#     display_name: straw2analysis
-#     language: python
-#     name: straw2analysis
-# ---
-
-# %% jupyter={"source_hidden": true}
-# %matplotlib inline
-import datetime
-import importlib
-import os
-import sys
-
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-import yaml
-from pyprojroot import here
-from sklearn import linear_model, svm, kernel_ridge, gaussian_process
-from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
-from sklearn.metrics import mean_squared_error, r2_score
-from sklearn.impute import SimpleImputer
-from sklearn.dummy import DummyRegressor
-import xgboost as xg
-from IPython.core.interactiveshell import InteractiveShell
-InteractiveShell.ast_node_interactivity = "all"
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
-
-import machine_learning.features_sensor
-import machine_learning.labels
-import machine_learning.model
-
-# %% [markdown]
-# # RAPIDS models
-
-# %% [markdown]
-# ## PANAS negative affect
-
-# %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/daily_18_hours_all_targets/input_PANAS_negative_affect_mean.csv")
-
-# %% jupyter={"source_hidden": true}
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-#if "pid" in model_input.columns:
-#    index_columns.append("pid")
-model_input.set_index(index_columns, inplace=True)
-
-data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
-
-# %% jupyter={"source_hidden": true}
-categorical_feature_colnames = ["gender", "startlanguage"]
-additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
-categorical_feature_colnames += additional_categorical_features
-
-# %% jupyter={"source_hidden": true}
-categorical_features = data_x[categorical_feature_colnames].copy()
-
-# %% jupyter={"source_hidden": true}
-mode_categorical_features = categorical_features.mode().iloc[0]
-
-# %% jupyter={"source_hidden": true}
-# fillna with mode
-categorical_features = categorical_features.fillna(mode_categorical_features)
-
-# %% jupyter={"source_hidden": true}
-# one-hot encoding
-categorical_features = categorical_features.apply(lambda col: col.astype("category"))
-if not categorical_features.empty:
-    categorical_features = pd.get_dummies(categorical_features)
-
-# %% jupyter={"source_hidden": true}
-numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x = pd.concat([numerical_features, categorical_features], axis=1)
-
-# %% jupyter={"source_hidden": true}
-train_x.dtypes
-
-# %% jupyter={"source_hidden": true}
-logo = LeaveOneGroupOut()
-logo.get_n_splits(
-    train_x,
-    data_y,
-    groups=data_groups,
-)
-
-# %% jupyter={"source_hidden": true}
-sum(data_y.isna())
-
-# %% [markdown]
-# ### Baseline: Dummy Regression (mean)
-dummy_regr = DummyRegressor(strategy="mean")
-
-# %% jupyter={"source_hidden": true}
-lin_reg_scores = cross_validate(
-    dummy_regr,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(lin_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### Linear Regression
-
-# %% jupyter={"source_hidden": true}
-lin_reg_rapids = linear_model.LinearRegression()
-
-# %% jupyter={"source_hidden": true}
-lin_reg_scores = cross_validate(
-    lin_reg_rapids,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(lin_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### XGBRegressor Linear Regression
-# %% jupyter={"source_hidden": true}
-xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10)
-
-# %% jupyter={"source_hidden": true}
-xgb_reg_scores = cross_validate(
-    xgb_r,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(xgb_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(xgb_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(xgb_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(xgb_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### XGBRegressor Pseudo Huber Error Regression
-# %% jupyter={"source_hidden": true}
-xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10)
-
-# %% jupyter={"source_hidden": true}
-xgb_psuedo_huber_reg_scores = cross_validate(
-    xgb_psuedo_huber_r,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(xgb_psuedo_huber_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### Ridge regression
-
-# %% jupyter={"source_hidden": true}
-ridge_reg = linear_model.Ridge(alpha=.5)
-
-# %% tags=[] jupyter={"source_hidden": true}
-ridge_reg_scores = cross_validate(
-    ridge_reg,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(ridge_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(ridge_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(ridge_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(ridge_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### Lasso
-
-# %% jupyter={"source_hidden": true}
-lasso_reg = linear_model.Lasso(alpha=0.1)
-
-# %% jupyter={"source_hidden": true}
-lasso_reg_score = cross_validate(
-    lasso_reg,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(lasso_reg_score['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(lasso_reg_score['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(lasso_reg_score['test_neg_root_mean_squared_error']))
-print("R2", np.median(lasso_reg_score['test_r2']))
-
-# %% [markdown]
-# ### Bayesian Ridge
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg = linear_model.BayesianRidge()
-
-# %% jupyter={"source_hidden": true}
-bayesian_ridge_reg_score = cross_validate(
-    bayesian_ridge_reg,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
-print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
-
-# %% [markdown]
-# ### RANSAC (outlier robust regression)
-
-# %% jupyter={"source_hidden": true}
-ransac_reg = linear_model.RANSACRegressor()
-
-# %% jupyter={"source_hidden": true}
-ransac_reg_scores = cross_validate(
-    ransac_reg,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(ransac_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(ransac_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(ransac_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(ransac_reg_scores['test_r2']))
-
-# %% [markdown]
-# ### Support vector regression
-
-# %% jupyter={"source_hidden": true}
-svr = svm.SVR()
-
-# %% jupyter={"source_hidden": true}
-svr_scores = cross_validate(
-    svr,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(svr_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(svr_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(svr_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(svr_scores['test_r2']))
-
-# %% [markdown]
-# ### Kernel Ridge regression
-
-# %% jupyter={"source_hidden": true}
-kridge = kernel_ridge.KernelRidge()
-
-# %% jupyter={"source_hidden": true}
-kridge_scores = cross_validate(
-    kridge,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(kridge_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(kridge_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(kridge_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(kridge_scores['test_r2']))
-
-# %% [markdown]
-# ### Gaussian Process Regression
-
-# %% jupyter={"source_hidden": true}
-gpr = gaussian_process.GaussianProcessRegressor()
-
-# %% jupyter={"source_hidden": true}
-
-gpr_scores = cross_validate(
-    gpr,
-    X=train_x,
-    y=data_y,
-    groups=data_groups,
-    cv=logo,
-    n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
-)
-print("Negative Mean Squared Error", np.median(gpr_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(gpr_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(gpr_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(gpr_scores['test_r2']))
-
-# %%
--- a/exploration/ml_pipeline_daily_cleaned_intradaily.py
+++ b/exploration/ml_pipeline_daily_cleaned_intradaily.py
@ -50,7 +50,7 @@ import machine_learning.model
 # ## PANAS negative affect

 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_PANAS_negative_affect_mean.csv")
+model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")

 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
@ -58,7 +58,17 @@ index_columns = ["local_segment", "local_segment_label", "local_segment_start_da
 #    index_columns.append("pid")
 model_input.set_index(index_columns, inplace=True)

-data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+cv_method = 'half_logo' # logo, half_logo, 5kfold
+if cv_method == 'logo':
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+else:
+    # Position of each row within its participant and the participant's total row count.
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+    # Relative position in [1, 2) rounds to half 1 or 2; "<pid>_<half>" becomes the CV group.
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+    # pid_count is only a helper: drop it with the other helpers so it does not leak into the features.
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]

 # %% jupyter={"source_hidden": true}
 categorical_feature_colnames = ["gender", "startlanguage"]
@ -98,6 +108,10 @@ logo.get_n_splits(
    groups=data_groups,
 )

+# Any other method clears the splitter, so cross_validate defaults to 5 k folds
+if cv_method not in ('logo', 'half_logo'):
+    logo = None
+
 # %% jupyter={"source_hidden": true}
 sum(data_y.isna())

@ -109,7 +123,7 @@ dummy_regr = DummyRegressor(strategy="mean")
 imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

 # %% jupyter={"source_hidden": true}
-lin_reg_scores = cross_validate(
+dummy_regressor = cross_validate(
    dummy_regr,
    X=imputer.fit_transform(train_x),
    y=data_y,
@ -118,10 +132,10 @@ lin_reg_scores = cross_validate(
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
-print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(lin_reg_scores['test_r2']))
+print("Negative Mean Squared Error", np.median(dummy_regressor['test_neg_mean_squared_error']))
+print("Negative Mean Absolute Error", np.median(dummy_regressor['test_neg_mean_absolute_error']))
+print("Negative Root Mean Squared Error", np.median(dummy_regressor['test_neg_root_mean_squared_error']))
+print("R2", np.median(dummy_regressor['test_r2']))

 # %% [markdown]
 # ### Linear Regression
--- a/exploration/ml_pipeline_stress_event_cleaned.py
+++ b/exploration/ml_pipeline_stress_event_cleaned.py
@ -53,12 +53,25 @@ import machine_learning.model
 model_input = pd.read_csv("../data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")

 # %% jupyter={"source_hidden": true}
+
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-#if "pid" in model_input.columns:
-#    index_columns.append("pid")
+
 model_input.set_index(index_columns, inplace=True)

-data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+cv_method = 'half_logo'
+if cv_method == 'logo':
+    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+else:
+    # Ad-hoc look at three participants; the filtered frame is not stored anywhere.
+    model_input[(model_input['pid'] == "p037") | (model_input['pid'] == "p064") | (model_input['pid'] == "p092")]
+
+    model_input['pid_index'] = model_input.groupby('pid').cumcount()
+    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
+    # Relative position in [1, 2) rounds to half 1 or 2; "<pid>_<half>" becomes the CV group.
+    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
+    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
+    # pid_count is only a helper: drop it with the other helpers so it does not leak into the features.
+    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]

 # %% jupyter={"source_hidden": true}
 categorical_feature_colnames = ["gender", "startlanguage"]
@ -97,12 +110,10 @@ logo.get_n_splits(
    data_y,
    groups=data_groups,
 )
-logo.split(
-    train_x,
-    data_y,
-    groups=data_groups,
-)

+# Any other method clears the splitter, so cross_validate defaults to 5 k folds
+if cv_method not in ('logo', 'half_logo'):
+    logo = None

 # %% jupyter={"source_hidden": true}
 sum(data_y.isna())
--- a/machine_learning/classification_models.py
+++ b/machine_learning/classification_models.py
@ -0,0 +1,71 @@
+from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
+from lightgbm import LGBMClassifier
+import xgboost as xg 
+
+class ClassificationModels:
+    """Registry of the classifiers to compare, each with a list of four metric values.
+
+    ``cmodels`` maps a model title to a dict with two entries: ``'model'``
+    (an estimator instance) and ``'metrics'`` (a list of four numbers that
+    ``get_total_models_scores`` reports as Acc, Precision, Recall and F1, in
+    that order). The metrics start at zero; presumably callers add per-run
+    scores to them before asking for the totals.
+    """
+
+    def __init__(self):
+        self.cmodels = self.init_classification_models()
+
+    def get_cmodels(self):
+        """Return the model registry itself (the live dict, not a copy)."""
+        return self.cmodels
+
+    def init_classification_models(self):
+        """Build and return a new registry with zeroed metrics for every model."""
+        # Listed in reporting order; the resulting dict keeps this order.
+        estimators = [
+            ('dummy_classifier', DummyClassifier(strategy="most_frequent")),
+            ('logistic_regression', linear_model.LogisticRegression(max_iter=1000)),
+            ('support_vector_machine', svm.SVC()),
+            ('gaussian_naive_bayes', naive_bayes.GaussianNB()),
+            ('stochastic_gradient_descent_classifier', linear_model.SGDClassifier()),
+            ('knn', neighbors.KNeighborsClassifier()),
+            ('decision_tree', tree.DecisionTreeClassifier()),
+            ('random_forest_classifier', ensemble.RandomForestClassifier()),
+            ('gradient_boosting_classifier', ensemble.GradientBoostingClassifier()),
+            ('lgbm_classifier', LGBMClassifier()),
+            ('XGBoost_classifier', xg.sklearn.XGBClassifier()),
+        ]
+
+        # Every entry gets its own metrics list so values are never shared
+        # between models.
+        return {
+            title: {'model': estimator, 'metrics': [0, 0, 0, 0]}
+            for title, estimator in estimators
+        }
+
+    def get_total_models_scores(self, n_clusters=1):
+        """Print each model's four metric values divided by ``n_clusters``."""
+        metric_labels = ("Acc:", "Precision:", "Recall:", "F1:")
+        for model_title, entry in self.cmodels.items():
+            print("\n************************************\n")
+            print("Current model:", model_title, end="\n")
+            for position, label in enumerate(metric_labels):
+                print(label, entry['metrics'][position]/n_clusters)