From 0594993133c58f68494639098b17bb397696580b Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Thu, 20 Apr 2023 11:20:26 +0200
Subject: [PATCH] Add GroupKFold to feature selection CV. Start with generic
 metric calculation procedure.

---
 exploration/ml_pipeline.py            | 14 +++---
 machine_learning/cross_validation.py  | 13 ++++--
 machine_learning/feature_selection.py | 62 ++++++++++-----------------
 3 files changed, 40 insertions(+), 49 deletions(-)

diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py
index bec82b2..a794e66 100644
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@@ -34,8 +34,8 @@ index_columns = ["local_segment", "local_segment_label", "local_segment_start_da
 df.set_index(index_columns, inplace=True)
 
 # Create binary target 
-bins = [-1, 0, 4] # bins for stressfulness (0-4) target
-df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
+# bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+# df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
 
 
 nan_cols = df.columns[df.isna().any()].tolist()
@@ -58,10 +58,12 @@ for split in cv.get_splits():
     # Feature selection on train set
     # Morda se implementira GroupKfold namesto stratifiedKFold? >>
     # >> Tako se bo posamezen pid pojavil ali v test ali v train setu
-    fs = FeatureSelection(train_X, train_y) 
-    selected_features = fs.select_features(n_min=20, n_max=50, k=80,
-                                           ml_type="regression_", 
-                                           n_tolerance=20)
+    train_groups, test_groups = cv.get_groups_sets(split)
+
+    fs = FeatureSelection(train_X, train_y, train_groups) 
+    selected_features = fs.select_features(n_min=20, n_max=50, k=60,
+                                           ml_type="classification_multi", 
+                                           metric="f1", n_tolerance=20)
     print(selected_features)
     print(len(selected_features))
     
diff --git a/machine_learning/cross_validation.py b/machine_learning/cross_validation.py
index e030a8f..4f7b9ef 100644
--- a/machine_learning/cross_validation.py
+++ b/machine_learning/cross_validation.py
@@ -49,8 +49,8 @@ class CrossValidation():
 
             data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
            
-        elif self.cv_method == "5kfold":
-            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
+        elif self.cv_method == "Stratified5kfold":
+            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None
 
         self.X, self.y, self.groups = data_X, data_y, data_groups
 
@@ -71,7 +71,7 @@ class CrossValidation():
         
         if self.cv_method in ["logo", "half_logo"]:
             self.cv = LeaveOneGroupOut()
-        elif self.cv_method == "5kfold":
+        elif self.cv_method == "Stratified5kfold":
             self.cv = StratifiedKFold(n_splits=5, shuffle=True)
 
 
@@ -118,4 +118,11 @@ class CrossValidation():
         """
         return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
     
+    def get_groups_sets(self, split):
+        # Group labels at positions split[0] and split[1] (train/test per the caller); (None, None) if self.groups is None.
+        if self.groups is None:
+            return None, None
+        else:
+            return self.groups.iloc[split[0]], self.groups.iloc[split[1]]
+    
     
\ No newline at end of file
diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py
index 32abd1f..f2cfc95 100644
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@@ -7,7 +7,7 @@ import matplotlib.pyplot as plt
 import pandas as pd
 
 from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
-from sklearn.model_selection import cross_validate, StratifiedKFold
+from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso 
 
@@ -23,9 +23,10 @@ from sklearn.linear_model import Lasso
 
 class FeatureSelection:
 
-    def __init__(self, X, y):
+    def __init__(self, X, y, groups):
         self.X = X
         self.y = y
+        self.groups = groups  # None -> StratifiedKFold is used; otherwise passed as `groups` to a GroupKFold CV
 
     
     def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
@@ -65,55 +66,35 @@ class FeatureSelection:
             
             X  = self.X[pred_features].copy()
             
+            if self.groups is not None:
+                cv = GroupKFold(n_splits=5)
+            else:
+                cv = StratifiedKFold(n_splits=5, shuffle=True)
+                
+            # See the link below about scoring for multiclass classification
+            # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
             if ml_type == "classification":
                 nb = GaussianNB()
                 model_cv = cross_validate(
                     nb,
                     X=X,
                     y=self.y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    cv=cv,
+                    groups=self.groups,
                     n_jobs=-1,
-                    scoring=('accuracy', 'precision', 'recall', 'f1')
+                    scoring=(metric,)  # must be a tuple: a bare string makes cross_validate return 'test_score', not f'test_{metric}'
                 )
                 
                 with warnings.catch_warnings():
                     warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
 
-                    if metric == "accuracy":
-                        acc = np.mean(model_cv['test_accuracy'])
-                        acc_std = np.std(model_cv['test_accuracy'])
-                        
-                        if not best_feature or (acc > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = acc
-                            best_metric_score_std = acc_std
+                    metric_score = np.nanmean(model_cv[f'test_{metric}'])
+                    metric_score_std = np.nanstd(model_cv[f'test_{metric}'])
                     
-                    elif metric == "precision":
-                        prec = np.mean(model_cv['test_precision'])
-                        prec_std = np.std(model_cv['test_precision'])
-                        
-                        if not best_feature or (prec > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = prec
-                            best_metric_score_std = prec_std
-                    
-                    elif metric == "recall":
-                        rec = np.mean(model_cv['test_recall'])
-                        rec_std = np.std(model_cv['test_recall'])
-                        
-                        if not best_feature or (rec > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = rec
-                            best_metric_score_std = rec_std
-                    
-                    else:
-                        f1 = np.mean(model_cv['test_f1'])
-                        f1_std = np.std(model_cv['test_f1'])
-                        
-                        if not best_feature or (f1 > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = f1
-                            best_metric_score_std = f1_std 
+                    if not best_feature or (metric_score > best_metric_score):
+                        best_feature = feat
+                        best_metric_score = metric_score
+                        best_metric_score_std = metric_score_std
                                        
             elif ml_type == "regression":
                 lass = Lasso()
@@ -121,7 +102,8 @@ class FeatureSelection:
                     lass,
                     X=X,
                     y=y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    cv=cv,
+                    groups=self.groups,
                     n_jobs=-1,
                     scoring=('r2')
                 )
@@ -214,7 +196,7 @@ class FeatureSelection:
                     break
                 
                 best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(features, method=method, ml_type=ml_type[0], metric="recall")
+                    self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric)
                     
                 feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))