From ce13a9e13bb8164240362fa86afac34944cfef39 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Wed, 19 Apr 2023 15:56:34 +0200
Subject: [PATCH 1/6] Implement feature selection method which is used in ML
 pipeline.

---
 exploration/ml_pipeline.py            |  25 +++++-
 machine_learning/feature_selection.py | 122 ++++++++++++++++----------
 2 files changed, 100 insertions(+), 47 deletions(-)

diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py
index eeaa9b3..6d75385 100644
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@@ -26,24 +26,45 @@ if nb_dir not in sys.path:
 
 from machine_learning.cross_validation import CrossValidation
 from machine_learning.preprocessing import Preprocessing
+from machine_learning.feature_selection import FeatureSelection
 
 # %% 
 df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 df.set_index(index_columns, inplace=True)
 
+# Create binary target 
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
+
+
+nan_cols = df.columns[df.isna().any()].tolist()
+df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
+
 cv = CrossValidation(data=df, cv_method="logo")
 
 categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
 interval_feature_list, other_feature_list = [], []
 
-print(df.columns.tolist())
-
+# %%
 for split in cv.get_splits():
     train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
     pre = Preprocessing(train_X, train_y, test_X, test_y)
     pre.one_hot_encode_train_and_test_sets(categorical_columns)
     train_X, train_y, test_X, test_y = pre.get_train_test_sets()
+    
+    # train_X = train_X[train_X.columns[:30]]
+    
+    # Feature selection on train set
+    # Morda se implementira GroupKfold namesto stratifiedKFold? >>
+    # >> Tako se bo posamezen pid pojavil ali v test ali v train setu
+    fs = FeatureSelection(train_X, train_y) 
+    selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
+    print(selected_features)
+    print(len(selected_features))
+    
+    
+    
     break
 
 # %%
diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py
index 0080839..31a5e92 100644
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@@ -1,11 +1,13 @@
 import os
 import sys
+import warnings
 
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 
 from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.model_selection import cross_validate, StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso 
 
@@ -21,11 +23,12 @@ from sklearn.linear_model import Lasso
 
 class FeatureSelection:
 
-    def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
-        pass # TODO.... 
+    def __init__(self, X, y):
+        self.X = X
+        self.y = y
 
     
-    def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
+    def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
         """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
         The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat 
         feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
@@ -56,18 +59,18 @@ class FeatureSelection:
 
         for feat in features:
             if method == "remove":
-                pred_features = [col for col in df.columns if feat != col] # All but feat
+                pred_features = [col for col in self.X.columns if feat != col] # All but feat
             elif method == "add":
                 pred_features = [feat] + stored_features # Feat with stored features
             
-            X, y  = df.drop(columns=['target', 'pid'])[pred_features], df['target']
+            X  = self.X[pred_features].copy()
             
             if ml_type == "classification":
                 nb = GaussianNB()
                 model_cv = cross_validate(
                     nb,
                     X=X,
-                    y=y,
+                    y=self.y,
                     cv=StratifiedKFold(n_splits=5, shuffle=True),
                     n_jobs=-1,
                     scoring=('accuracy', 'precision', 'recall', 'f1')
@@ -137,85 +140,114 @@ class FeatureSelection:
         return best_feature, best_metric_score, best_metric_score_std
     
     
-    def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
+    def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
+        """This method selects a set of features and returns them as a list. It returns number of features 
+        determined in the interval of [n_min, n_max]. The best score is detected using a removal procedure.
+        The procedure sequentially removes the features that attribute the least to the choosen evaluation metric.
+        If in this sequence the score ML score is improved the next feature is remove otherwise there is a 
+        tolerance criteria (n_not_improve) with which the next n remove features are inspected whether 
+        currently best score is improved. The features are returned in specified interval as a list.     
+
+        Args:
+            n_min (int): Minimal amount of features returned.
+            n_max (int): Maximal amount of features returned.
+            method (str, optional): "remove" or "add" features.  Defaults to "remove".
+            n_not_improve (int): If the best score is not improved in n that is specified by this parameter
+                the method returns index of feature with current best score as a tipping point feature.
+            
+        Returns:
+            list: list of selected features
+        """        
         
-        n_features = df.shape[1] - 2 # -2 beacause pid and target are not considered
-        if n_max > n_features:
-            n_max = n_features
+        n_features = self.X.shape[1]
+        if n_max >= n_features:
+            n_max = n_features-1 # The algorithm removes at least one feature
         
         if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
+            raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.")
         
         if n_max < n_min:
             raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
         
-        features = df.columns.tolist()
-        features.remove("pid")
-        features.remove("target")
+        features = self.X.columns.tolist()
         feature_importance = []
         if method == "remove":
+            best_score = 0
+            best_feature_indx = None
+            i_worse = 0
             for i in reversed(range(n_features)):
                 
+                if i+1 == n_min:
+                    break
+                
                 best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
-                feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
+                    self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
+                    
+                feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
                 
                 features.remove(best_feature)
                 
+                if i <= n_max:
+                    if best_metric_score >= best_score:
+                        best_score = best_metric_score
+                        best_feature_indx = i+1
+                        i_worse = 0
+                    else:
+                        i_worse += 1
+                    
+                    if i_worse == n_not_improve: 
+                        break  
+                
             feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
+
+            print(feature_importance_df)
+            print("best_feature_indx", best_feature_indx)
+            print("best_score", best_score)
+
+            features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
+            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]    
             
+            return selected_features
+            
+            """
             # Selekcijski kriterij značilk v rangu max-min
             # Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
             
             # Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
             
             # "Tipping point" značilka mora biti v rangu max-min
-            
             selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
             selection_area.set_index(["i", "name"], inplace=True)
+            print(selection_area)
             diffrences = selection_area.diff()
             diffrences.dropna(how='any', inplace=True)
+            print(diffrences)
             
             # Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo 
             cumulative_sumation = diffrences.cumsum()
+            print(cumulative_sumation)
             tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
+            print(tipping_feature_indx_1)
 
-            # Zelo konzervativna metoda, ki ob prvem neizboljšanjem rezultata preneha z iskanjem boljše alternative 
-            tipping_feature_indx_2 = None
-            for indx, row in diffrences.iterrows():
-                if row["metric"] > 0:
-                    tipping_feature_indx_2 = indx
-                else: 
-                    break
                 
             # Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score     
-            tipping_feature_indx_3 = None
-            cum_sum_score = 0
+            tipping_feature_indx_2 = None
+            best_score = 0
             i_worse = 0
-            # TODO: morda bi bilo smisleno združiti diff, cumsum in scores stolpce ...
             for indx, row in selection_area.iterrows():
-                if row["metric"] > 0:
-                    tipping_feature_indx_3 = indx
-                    cum_sum_score += row["metric"]
+                if row["metric"] > best_score:
+                    tipping_feature_indx_2 = indx
+                    best_score = row["metric"]
                     i_worse = 0
                 else:
                     i_worse += 1
                 
                 if i_worse == n_not_improve:
-                    break
-                    
-                    
-                
-                
+                    break            
             
+            print(tipping_feature_indx_2)
+            selection_area.reset_index(inplace=True)
+            features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
             
-        
-
-    def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
-        pass
-
-    def vizualize_feature_selection_process():
-        pass
-
-    def execute_feature_selection_step():
-        pass
\ No newline at end of file
+            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
+            """
\ No newline at end of file

From 1cbc743cf76544a1d5b4b2904d886d7374733110 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Thu, 20 Apr 2023 10:12:16 +0200
Subject: [PATCH 2/6] Add kBest method to initially filter out the worst
 performing features. Update comments.

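---
Review notes (this area is ignored by git am):
 * feature_selection.py, select_features: the guard `if k < n_max:` rejects
   a k that is smaller than n_max, but the error message says "The k
   parameter needs to be lower than the n_max parameter." The message
   should state that k must be greater than or equal to n_max.
 * feature_selection.py, select_features: `self.X = self.X.iloc[:,cols_idxs]`
   overwrites the instance's feature matrix, so a second call on the same
   object starts from the already filtered columns.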
 exploration/ml_pipeline.py            |  6 +--
 machine_learning/feature_selection.py | 70 ++++++++++++++++++++-------
 2 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py
index 6d75385..bec82b2 100644
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@@ -59,12 +59,12 @@ for split in cv.get_splits():
     # Morda se implementira GroupKfold namesto stratifiedKFold? >>
     # >> Tako se bo posamezen pid pojavil ali v test ali v train setu
     fs = FeatureSelection(train_X, train_y) 
-    selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
+    selected_features = fs.select_features(n_min=20, n_max=50, k=80,
+                                           ml_type="regression_", 
+                                           n_tolerance=20)
     print(selected_features)
     print(len(selected_features))
     
-    
-    
     break
 
 # %%
diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py
index 31a5e92..32abd1f 100644
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@@ -6,7 +6,7 @@ import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 
-from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
 from sklearn.model_selection import cross_validate, StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso 
@@ -140,31 +140,62 @@ class FeatureSelection:
         return best_feature, best_metric_score, best_metric_score_std
     
     
-    def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
+    def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
         """This method selects a set of features and returns them as a list. It returns number of features 
-        determined in the interval of [n_min, n_max]. The best score is detected using a removal procedure.
-        The procedure sequentially removes the features that attribute the least to the choosen evaluation metric.
-        If in this sequence the score ML score is improved the next feature is remove otherwise there is a 
-        tolerance criteria (n_not_improve) with which the next n remove features are inspected whether 
-        currently best score is improved. The features are returned in specified interval as a list.     
+        determined in the interval of [n_min, n_max]. 
+        
+        The method consists of two steps: 
+        (1) The method uses sklearn kBest method which selects k best features dependent on the ml_type parameter.
+        (2) The sequential features removal procedure is executed. Using the remaing features from (1).
+            The best score is detected using a removal procedure. The procedure sequentially removes the features 
+            that attribute the least to the choosen evaluation metric. If in this sequence the score ML score is 
+            improved the next feature is remove otherwise there is a tolerance criteria (n_tolerance) 
+            with which the next n removed features are inspected whether currently best score is improved.     
 
         Args:
-            n_min (int): Minimal amount of features returned.
-            n_max (int): Maximal amount of features returned.
+            n_min (int, optional): Minimal amount of features returned.
+            n_max (int, optional): Maximal amount of features returned.
+            k (int, optional): Determines the k in the k-best features method.
+            ml_type(str, optional): Type of ML problem. Currently implemented options: 
+                classification_bin, classification_multi, and regression_
             method (str, optional): "remove" or "add" features.  Defaults to "remove".
-            n_not_improve (int): If the best score is not improved in n that is specified by this parameter
+            n_tolerance (int, optional): If the best score is not improved in n that is specified by this parameter
                 the method returns index of feature with current best score as a tipping point feature.
             
         Returns:
             list: list of selected features
         """        
-        
         n_features = self.X.shape[1]
         if n_max >= n_features:
             n_max = n_features-1 # The algorithm removes at least one feature
+        if k < n_max:
+            raise ValueError("The k parameter needs to be lower than the n_max parameter.")
+        
+        # Select k-best feature dependent on the type of ML task
+        ml_type = ml_type.split("_")
+        if ml_type[0] == "classification":
+            if ml_type[1] == "bin":
+                selector = SelectKBest(mutual_info_classif, k=k)
+            elif ml_type[1] == "multi":
+                selector = SelectKBest(f_classif, k=k)
+            else:
+                raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
+        elif ml_type[0] == "regression":
+            selector = SelectKBest(f_regression, k=k)
+        else:
+            raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
+        
+        selector.fit(self.X, self.y)
+        cols_idxs = selector.get_support(indices=True)
+        self.X = self.X.iloc[:,cols_idxs]
+        
+        print(self.X.columns)
+        
+        # Sequential feature addition / removal
+        n_features = self.X.shape[1]
         
         if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.")
+            raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")
         
         if n_max < n_min:
             raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
@@ -177,11 +208,13 @@ class FeatureSelection:
             i_worse = 0
             for i in reversed(range(n_features)):
                 
+                print("Iteration:", i+1)
+                
                 if i+1 == n_min:
                     break
                 
                 best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
+                    self.select_best_feature(features, method=method, ml_type=ml_type[0], metric="recall")
                     
                 feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
                 
@@ -195,7 +228,7 @@ class FeatureSelection:
                     else:
                         i_worse += 1
                     
-                    if i_worse == n_not_improve: 
+                    if i_worse == n_tolerance: 
                         break  
                 
             feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
@@ -230,7 +263,7 @@ class FeatureSelection:
             print(tipping_feature_indx_1)
 
                 
-            # Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score     
+            # Metoda, ki pusti n_tolerance značilkam, da premagajo dosedajno najboljši score     
             tipping_feature_indx_2 = None
             best_score = 0
             i_worse = 0
@@ -242,7 +275,7 @@ class FeatureSelection:
                 else:
                     i_worse += 1
                 
-                if i_worse == n_not_improve:
+                if i_worse == n_tolerance:
                     break            
             
             print(tipping_feature_indx_2)
@@ -250,4 +283,7 @@ class FeatureSelection:
             features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
             
             selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
-            """
\ No newline at end of file
+            """
+        
+        else:
+            raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
\ No newline at end of file

From 0594993133c58f68494639098b17bb397696580b Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Thu, 20 Apr 2023 11:20:26 +0200
Subject: [PATCH 3/6] Add GroupKFold to feature selection CV. Start with
 generic metric calculation procedure.

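---
Review notes (this area is ignored by git am):
 * feature_selection.py, select_best_feature: `scoring=(metric)` is a plain
   string, not a one-element tuple. For a single string scorer,
   sklearn's cross_validate stores the results under `test_score`, so
   `model_cv[f'test_{metric}']` raises KeyError. Use `scoring=(metric,)`
   or read `model_cv['test_score']`.
 * feature_selection.py, select_best_feature: the regression branch still
   passes `y=y`. No local `y` is visible since patch 1 replaced
   `X, y = ...` with `X = self.X[...]`; presumably this should be
   `self.y` (confirm against the full file).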
 exploration/ml_pipeline.py            | 14 +++---
 machine_learning/cross_validation.py  | 13 ++++--
 machine_learning/feature_selection.py | 62 ++++++++++-----------------
 3 files changed, 40 insertions(+), 49 deletions(-)

diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py
index bec82b2..a794e66 100644
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@@ -34,8 +34,8 @@ index_columns = ["local_segment", "local_segment_label", "local_segment_start_da
 df.set_index(index_columns, inplace=True)
 
 # Create binary target 
-bins = [-1, 0, 4] # bins for stressfulness (0-4) target
-df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
+# bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+# df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
 
 
 nan_cols = df.columns[df.isna().any()].tolist()
@@ -58,10 +58,12 @@ for split in cv.get_splits():
     # Feature selection on train set
     # Morda se implementira GroupKfold namesto stratifiedKFold? >>
     # >> Tako se bo posamezen pid pojavil ali v test ali v train setu
-    fs = FeatureSelection(train_X, train_y) 
-    selected_features = fs.select_features(n_min=20, n_max=50, k=80,
-                                           ml_type="regression_", 
-                                           n_tolerance=20)
+    train_groups, test_groups = cv.get_groups_sets(split)
+
+    fs = FeatureSelection(train_X, train_y, train_groups) 
+    selected_features = fs.select_features(n_min=20, n_max=50, k=60,
+                                           ml_type="classification_multi", 
+                                           metric="f1", n_tolerance=20)
     print(selected_features)
     print(len(selected_features))
     
diff --git a/machine_learning/cross_validation.py b/machine_learning/cross_validation.py
index e030a8f..4f7b9ef 100644
--- a/machine_learning/cross_validation.py
+++ b/machine_learning/cross_validation.py
@@ -49,8 +49,8 @@ class CrossValidation():
 
             data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
            
-        elif self.cv_method == "5kfold":
-            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
+        elif self.cv_method == "Stratified5kfold":
+            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None
 
         self.X, self.y, self.groups = data_X, data_y, data_groups
 
@@ -71,7 +71,7 @@ class CrossValidation():
         
         if self.cv_method in ["logo", "half_logo"]:
             self.cv = LeaveOneGroupOut()
-        elif self.cv_method == "5kfold":
+        elif self.cv_method == "Stratified5kfold":
             self.cv = StratifiedKFold(n_splits=5, shuffle=True)
 
 
@@ -118,4 +118,11 @@ class CrossValidation():
         """
         return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
     
+    def get_groups_sets(self, split):
+        
+        if self.groups is None:
+            return None, None
+        else:
+            return self.groups.iloc[split[0]], self.groups.iloc[split[1]]
+    
     
\ No newline at end of file
diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py
index 32abd1f..f2cfc95 100644
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@@ -7,7 +7,7 @@ import matplotlib.pyplot as plt
 import pandas as pd
 
 from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
-from sklearn.model_selection import cross_validate, StratifiedKFold
+from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso 
 
@@ -23,9 +23,10 @@ from sklearn.linear_model import Lasso
 
 class FeatureSelection:
 
-    def __init__(self, X, y):
+    def __init__(self, X, y, groups):
         self.X = X
         self.y = y
+        self.groups = groups
 
     
     def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
@@ -65,55 +66,35 @@ class FeatureSelection:
             
             X  = self.X[pred_features].copy()
             
+            if self.groups is not None:
+                cv = GroupKFold(n_splits=5)
+            else:
+                cv = StratifiedKFold(n_splits=5, shuffle=True)
+                
+            # See link about scoring for multiclassfication
+            # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
             if ml_type == "classification":
                 nb = GaussianNB()
                 model_cv = cross_validate(
                     nb,
                     X=X,
                     y=self.y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    cv=cv,
+                    groups=self.groups,
                     n_jobs=-1,
-                    scoring=('accuracy', 'precision', 'recall', 'f1')
+                    scoring=(metric)
                 )
                 
                 with warnings.catch_warnings():
                     warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
 
-                    if metric == "accuracy":
-                        acc = np.mean(model_cv['test_accuracy'])
-                        acc_std = np.std(model_cv['test_accuracy'])
-                        
-                        if not best_feature or (acc > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = acc
-                            best_metric_score_std = acc_std
+                    metric_score = np.nanmean(model_cv[f'test_{metric}'])
+                    metric_score_std = np.nanstd(model_cv[f'test_{metric}'])
                     
-                    elif metric == "precision":
-                        prec = np.mean(model_cv['test_precision'])
-                        prec_std = np.std(model_cv['test_precision'])
-                        
-                        if not best_feature or (prec > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = prec
-                            best_metric_score_std = prec_std
-                    
-                    elif metric == "recall":
-                        rec = np.mean(model_cv['test_recall'])
-                        rec_std = np.std(model_cv['test_recall'])
-                        
-                        if not best_feature or (rec > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = rec
-                            best_metric_score_std = rec_std
-                    
-                    else:
-                        f1 = np.mean(model_cv['test_f1'])
-                        f1_std = np.std(model_cv['test_f1'])
-                        
-                        if not best_feature or (f1 > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = f1
-                            best_metric_score_std = f1_std 
+                    if not best_feature or (metric_score > best_metric_score):
+                        best_feature = feat
+                        best_metric_score = metric_score
+                        best_metric_score_std = metric_score_std
                                        
             elif ml_type == "regression":
                 lass = Lasso()
@@ -121,7 +102,8 @@ class FeatureSelection:
                     lass,
                     X=X,
                     y=y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    cv=cv,
+                    groups=self.groups,
                     n_jobs=-1,
                     scoring=('r2')
                 )
@@ -214,7 +196,7 @@ class FeatureSelection:
                     break
                 
                 best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(features, method=method, ml_type=ml_type[0], metric="recall")
+                    self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric)
                     
                 feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
                 

From 259be708aa8031fe3f5ef1cc6f60e5adf8ebef26 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Thu, 20 Apr 2023 13:26:20 +0200
Subject: [PATCH 4/6] Improve the feature selection method with validations
 etc.

---
 machine_learning/feature_selection.py | 164 +++++++++++---------------
 1 file changed, 69 insertions(+), 95 deletions(-)

diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py
index f2cfc95..8d7b950 100644
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@@ -29,9 +29,9 @@ class FeatureSelection:
         self.groups = groups
 
     
-    def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
+    def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
         """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
-        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat 
+        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular 
         feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
         specified as a parameter.
 
@@ -39,7 +39,11 @@ class FeatureSelection:
             df (DataFrame): Input data on which the predictions will be made.
             features (list): List of features to select the best/worst from
             method (str, optional): remove or add features.  Defaults to "remove".
-            ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and  metric. Defaults to "classification".
+            ml_category (str, optional): Either "classification" or "regression"; selects the ML algorithm and
+                the set of valid metrics. Defaults to "classification".
+            ml_subcategory (str, optional): For classification, 'bin' for binary classification
+                and 'multi' for multiclass classification. For regression an empty string '' is sufficient.
+                Defaults to "bin".
             metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
             stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to [].
 
@@ -53,9 +57,25 @@ class FeatureSelection:
         
         best_feature = None
         
-        if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
-            raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
-        elif ml_type == "regression" and metric not in ['r2']:
+        # Validate the requested metric against the ML category and subcategory
+        if ml_category == "classification":
+            if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
+                raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
+            elif ml_subcategory == "multi":
+                ml_subcategory_error = False
+                if metric != "accuracy" and "_" in metric:          
+                    metric_s, metric_t = metric.split("_")
+                    if metric_s not in ['accuracy', 'precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
+                        ml_subcategory_error = True
+                else:
+                    ml_subcategory_error = True
+                    
+                if ml_subcategory_error:
+                    raise ValueError(""""Classification metric for multi-class classification must be specified precisely.
+                                     Available metric are: 'accuracy', 'precision', 'recall' and 'f1'.
+                                     Only accuracy must be specified as 'accuracy'.
+                                     For others please add appropriate suffixes: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'""")
+        elif ml_category == "regression" and metric not in ['r2']:
             raise ValueError("Regression metric not recognized. Please choose 'r2'")
 
         for feat in features:
@@ -73,7 +93,7 @@ class FeatureSelection:
                 
             # See link about scoring for multiclassfication
             # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
-            if ml_type == "classification":
+            if ml_category == "classification":
                 nb = GaussianNB()
                 model_cv = cross_validate(
                     nb,
@@ -85,18 +105,8 @@ class FeatureSelection:
                     scoring=(metric)
                 )
                 
-                with warnings.catch_warnings():
-                    warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
-
-                    metric_score = np.nanmean(model_cv[f'test_{metric}'])
-                    metric_score_std = np.nanstd(model_cv[f'test_{metric}'])
-                    
-                    if not best_feature or (metric_score > best_metric_score):
-                        best_feature = feat
-                        best_metric_score = metric_score
-                        best_metric_score_std = metric_score_std
                                        
-            elif ml_type == "regression":
+            elif ml_category == "regression":
                 lass = Lasso()
                 model_cv = cross_validate(
                     lass,
@@ -108,16 +118,20 @@ class FeatureSelection:
                     scoring=('r2')
                 )
 
-                if metric == "r2":
-                    r2 = np.mean(model_cv['test_r2'])
-                    r2_std = np.std(model_cv['test_r2'])
-                    
-                    if not best_feature or (r2 > best_metric_score):
-                        best_feature = feat
-                        best_metric_score = r2
-                        best_metric_score_std = r2_std
             else:
                 raise ValueError("ML type not yet implemented!")
+            
+            # Compare this feature's cross-validated score with the best score so far.
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
+
+                metric_score = np.nanmean(model_cv["test_score"])
+                metric_score_std = np.nanstd(model_cv["test_score"])
+                
+                if not best_feature or (metric_score > best_metric_score):
+                    best_feature = feat
+                    best_metric_score = metric_score
+                    best_metric_score_std = metric_score_std
                     
         return best_feature, best_metric_score, best_metric_score_std
     
@@ -137,9 +151,10 @@ class FeatureSelection:
         Args:
             n_min (int, optional): Minimal amount of features returned.
             n_max (int, optional): Maximal amount of features returned.
-            k (int, optional): Determines the k in the k-best features method.
+            k (int, optional): Determines the k in the k-best features method. 
+                If None, SelectKBest feature selection does not execute.
             ml_type(str, optional): Type of ML problem. Currently implemented options: 
-                classification_bin, classification_multi, and regression_
+                'classification_bin', 'classification_multi', and 'regression_'
             method (str, optional): "remove" or "add" features.  Defaults to "remove".
             n_tolerance (int, optional): If the best score is not improved in n that is specified by this parameter
                 the method returns index of feature with current best score as a tipping point feature.
@@ -147,35 +162,38 @@ class FeatureSelection:
         Returns:
             list: list of selected features
         """        
-        n_features = self.X.shape[1]
-        if n_max >= n_features:
-            n_max = n_features-1 # The algorithm removes at least one feature
-        if k < n_max:
-            raise ValueError("The k parameter needs to be lower than the n_max parameter.")
+
+        if k is not None and k <= n_max:
+            raise ValueError("The k parameter needs to be greater than the n_max parameter.")
         
         # Select k-best feature dependent on the type of ML task
-        ml_type = ml_type.split("_")
-        if ml_type[0] == "classification":
-            if ml_type[1] == "bin":
-                selector = SelectKBest(mutual_info_classif, k=k)
-            elif ml_type[1] == "multi":
-                selector = SelectKBest(f_classif, k=k)
+        ml_category, ml_subcategory = ml_type.split("_")
+
+        if k is not None:
+            if ml_category == "classification":
+                if ml_subcategory== "bin":
+                    selector = SelectKBest(mutual_info_classif, k=k)
+                elif ml_subcategory== "multi":
+                    selector = SelectKBest(f_classif, k=k)
+                else:
+                    raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
+            elif ml_category == "regression":
+                selector = SelectKBest(f_regression, k=k)
             else:
-                raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
-        elif ml_type[0] == "regression":
-            selector = SelectKBest(f_regression, k=k)
-        else:
-            raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
-        
-        selector.fit(self.X, self.y)
-        cols_idxs = selector.get_support(indices=True)
-        self.X = self.X.iloc[:,cols_idxs]
+                raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
+            
+            selector.fit(self.X, self.y)
+            cols_idxs = selector.get_support(indices=True)
+            self.X = self.X.iloc[:,cols_idxs]
         
+        print("All columns (after SelectKBest method):")
         print(self.X.columns)
         
         # Sequential feature addition / removal
         n_features = self.X.shape[1]
-        
+        if n_max >= n_features:
+            n_max = n_features-1 # The algorithm removes at least one feature
+            
         if n_min > n_features:
             raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")
         
@@ -190,17 +208,16 @@ class FeatureSelection:
             i_worse = 0
             for i in reversed(range(n_features)):
                 
-                print("Iteration:", i+1)
-                
                 if i+1 == n_min:
                     break
                 
                 best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric)
+                    self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric)
                     
                 feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
                 
                 features.remove(best_feature)
+                print("Features left:", i) 
                 
                 if i <= n_max:
                     if best_metric_score >= best_score:
@@ -223,49 +240,6 @@ class FeatureSelection:
             selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]    
             
             return selected_features
-            
-            """
-            # Selekcijski kriterij značilk v rangu max-min
-            # Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
-            
-            # Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
-            
-            # "Tipping point" značilka mora biti v rangu max-min
-            selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
-            selection_area.set_index(["i", "name"], inplace=True)
-            print(selection_area)
-            diffrences = selection_area.diff()
-            diffrences.dropna(how='any', inplace=True)
-            print(diffrences)
-            
-            # Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo 
-            cumulative_sumation = diffrences.cumsum()
-            print(cumulative_sumation)
-            tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
-            print(tipping_feature_indx_1)
-
-                
-            # Metoda, ki pusti n_tolerance značilkam, da premagajo dosedajno najboljši score     
-            tipping_feature_indx_2 = None
-            best_score = 0
-            i_worse = 0
-            for indx, row in selection_area.iterrows():
-                if row["metric"] > best_score:
-                    tipping_feature_indx_2 = indx
-                    best_score = row["metric"]
-                    i_worse = 0
-                else:
-                    i_worse += 1
-                
-                if i_worse == n_tolerance:
-                    break            
-            
-            print(tipping_feature_indx_2)
-            selection_area.reset_index(inplace=True)
-            features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
-            
-            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
-            """
         
         else:
             raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
\ No newline at end of file

From 865225994b29b34dbbdeef71416cdd911bc6919f Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Thu, 20 Apr 2023 13:29:14 +0200
Subject: [PATCH 5/6] Added testing section after feature selection.

---
 exploration/ml_pipeline.py | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py
index a794e66..b6b3bb6 100644
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@@ -20,6 +20,9 @@ import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import recall_score, f1_score
+
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
     sys.path.append(nb_dir)
@@ -34,8 +37,8 @@ index_columns = ["local_segment", "local_segment_label", "local_segment_start_da
 df.set_index(index_columns, inplace=True)
 
 # Create binary target 
-# bins = [-1, 0, 4] # bins for stressfulness (0-4) target
-# df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
 
 
 nan_cols = df.columns[df.isna().any()].tolist()
@@ -53,20 +56,38 @@ for split in cv.get_splits():
     pre.one_hot_encode_train_and_test_sets(categorical_columns)
     train_X, train_y, test_X, test_y = pre.get_train_test_sets()
     
-    # train_X = train_X[train_X.columns[:30]]
+    
+    print(train_X.shape, test_X.shape)
+    # Predict before feature selection
+    rfc = RandomForestClassifier(n_estimators=10)
+    rfc.fit(train_X, train_y)
+    predictions = rfc.predict(test_X)
+    
+    print("Recall:", recall_score(test_y, predictions))
+    print("F1:", f1_score(test_y, predictions))
     
     # Feature selection on train set
-    # Morda se implementira GroupKfold namesto stratifiedKFold? >>
-    # >> Tako se bo posamezen pid pojavil ali v test ali v train setu
     train_groups, test_groups = cv.get_groups_sets(split)
 
     fs = FeatureSelection(train_X, train_y, train_groups) 
-    selected_features = fs.select_features(n_min=20, n_max=50, k=60,
-                                           ml_type="classification_multi", 
-                                           metric="f1", n_tolerance=20)
+    selected_features = fs.select_features(n_min=20, n_max=29, k=40,
+                                           ml_type="classification_bin", 
+                                           metric="recall", n_tolerance=20)
+    
+    train_X = train_X[selected_features]
+    test_X = test_X[selected_features]
+    
     print(selected_features)
     print(len(selected_features))
     
+    # Predict after feature selection    
+    rfc = RandomForestClassifier(n_estimators=500)
+    rfc.fit(train_X, train_y)
+    predictions = rfc.predict(test_X)
+    
+    print("Recall:", recall_score(test_y, predictions))
+    print("F1:", f1_score(test_y, predictions))
+    
     break
 
 # %%

From 26804cf8ea2e6582464ab482baf8c50dcefc7114 Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Fri, 21 Apr 2023 13:24:31 +0200
Subject: [PATCH 6/6] Repair preprocessing one hot encoding of test set.

---
 machine_learning/preprocessing.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/machine_learning/preprocessing.py b/machine_learning/preprocessing.py
index a11558c..1f55482 100644
--- a/machine_learning/preprocessing.py
+++ b/machine_learning/preprocessing.py
@@ -33,7 +33,7 @@ class Preprocessing:
         Args:
             categorical_features (DataFrame): DataFrame including only categorical columns.
             numerical_features (_type_): DataFrame including only numerical columns.
-            mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
+            mode (int): Mode of the column with which DataFrame is filled.
 
         Returns:
             DataFrame: Hot-One Encoded DataFrame.
@@ -46,7 +46,7 @@ class Preprocessing:
         if not categorical_features.empty:
             categorical_features = pd.get_dummies(categorical_features)
 
-        return pd.concat([numerical_features, categorical_features], axis=1)
+        return pd.concat([numerical_features, categorical_features], axis=1), categorical_features.columns.tolist()
 
 
     def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
@@ -68,20 +68,27 @@ class Preprocessing:
         categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
 
         # For train set
-        
         train_X_categorical_features = self.train_X[categorical_columns].copy()
         train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
         mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
         
-        self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
+        self.train_X, train_cat_col_names = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
+        encoded_categorical_features = [col for col in self.train_X.columns if col.startswith(tuple(categorical_columns))]
         
         # For test set
-        
         test_X_categorical_features = self.test_X[categorical_columns].copy()
         test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
         
-        self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
+        self.test_X, test_cat_col_names = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
 
+        # Create categorical columns that were not found in test set and fill them with 0        
+        missing_cols = [col for col in train_cat_col_names if col not in test_cat_col_names]
+        self.test_X[missing_cols] = 0
+        
+        # Sort column names alphabetically        
+        self.train_X = self.train_X.reindex(sorted(self.train_X.columns), axis=1)
+        self.test_X = self.test_X.reindex(sorted(self.test_X.columns), axis=1)
+        
 
     def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):