From 259be708aa8031fe3f5ef1cc6f60e5adf8ebef26 Mon Sep 17 00:00:00 2001
From: Primoz
Date: Thu, 20 Apr 2023 13:26:20 +0200
Subject: [PATCH] Improve the feature selection method with validations etc.

---
 machine_learning/feature_selection.py | 164 +++++++++++---------------
 1 file changed, 69 insertions(+), 95 deletions(-)

diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py
index f2cfc95..8d7b950 100644
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@@ -29,9 +29,9 @@ class FeatureSelection:
 
         self.groups = groups
 
-    def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
+    def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
         """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
-        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat
+        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular
         feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
         specified as a parameter.
 
@@ -39,7 +39,11 @@ class FeatureSelection:
             df (DataFrame): Input data on which the predictions will be made.
             features (list): List of features to select the best/worst from.
             method (str, optional): Remove or add features. Defaults to "remove".
-            ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and metric. Defaults to "classification".
+            ml_category (str, optional): Either a classification or a regression ML problem; controls the ML algorithm and metric.
+                Defaults to "classification".
+            ml_subcategory (str, optional): For classification, 'bin' for binary classification
+                and 'multi' for multiclass classification. For regression an empty string '' is sufficient.
+                Defaults to "bin".
             metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
             stored_features (list, optional): If method is 'add', stored features refer to the features
                 that have previously been added. Defaults to [].
@@ -53,9 +57,25 @@ class FeatureSelection:
 
         best_feature = None
 
-        if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
-            raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
-        elif ml_type == "regression" and metric not in ['r2']:
+        # Validate the ML category and subcategory against the specified metric.
+        if ml_category == "classification":
+            if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
+                raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
+            elif ml_subcategory == "multi":
+                ml_subcategory_error = False
+                if metric != "accuracy":
+                    if "_" in metric:
+                        metric_s, metric_t = metric.split("_")
+                        if metric_s not in ['precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
+                            ml_subcategory_error = True
+                    else:
+                        ml_subcategory_error = True
+
+                if ml_subcategory_error:
+                    raise ValueError("""Classification metric for multi-class classification must be specified precisely.
+                        Available metrics are: 'accuracy', 'precision', 'recall' and 'f1'.
+                        Accuracy must be specified simply as 'accuracy'.
+                        For the others, please add the appropriate suffix: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'.""")
+        elif ml_category == "regression" and metric not in ['r2']:
             raise ValueError("Regression metric not recognized. Please choose 'r2'")
 
         for feat in features:
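A minimal sketch (not part of the patch, with hypothetical toy data) of the scorer-string convention the validation above enforces: suffixed names such as 'f1_macro' or 'recall_weighted' are valid scikit-learn scoring strings for multi-class problems, while plain 'accuracy' takes no suffix.

    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_validate
    from sklearn.naive_bayes import GaussianNB

    # Hypothetical three-class toy data, used only to exercise the metric names.
    X, y = make_classification(n_samples=200, n_classes=3, n_informative=4, random_state=0)

    # 'f1_macro' passes the patch's multi-class validation and is a real
    # scikit-learn scorer string; bare 'f1' would be rejected for multi-class.
    scores = cross_validate(GaussianNB(), X, y, cv=3, scoring="f1_macro")
    print(scores["test_score"].mean())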
@@ -73,7 +93,7 @@
 
             # See the link below about scoring for multiclass classification:
             # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
-            if ml_type == "classification":
+            if ml_category == "classification":
                 nb = GaussianNB()
                 model_cv = cross_validate(
                     nb,
@@ -85,18 +105,8 @@
                     scoring=(metric)
                 )
 
-                with warnings.catch_warnings():
-                    warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
-
-                    metric_score = np.nanmean(model_cv[f'test_{metric}'])
-                    metric_score_std = np.nanstd(model_cv[f'test_{metric}'])
-
-                    if not best_feature or (metric_score > best_metric_score):
-                        best_feature = feat
-                        best_metric_score = metric_score
-                        best_metric_score_std = metric_score_std
-            elif ml_type == "regression":
+            elif ml_category == "regression":
                 lass = Lasso()
                 model_cv = cross_validate(
                     lass,
@@ -108,16 +118,20 @@
                     scoring=('r2')
                 )
 
-                if metric == "r2":
-                    r2 = np.mean(model_cv['test_r2'])
-                    r2_std = np.std(model_cv['test_r2'])
-
-                    if not best_feature or (r2 > best_metric_score):
-                        best_feature = feat
-                        best_metric_score = r2
-                        best_metric_score_std = r2_std
             else:
                 raise ValueError("ML type not yet implemented!")
+
+            # Compare the metric scores across features.
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
+
+                metric_score = np.nanmean(model_cv["test_score"])
+                metric_score_std = np.nanstd(model_cv["test_score"])
+
+                if not best_feature or (metric_score > best_metric_score):
+                    best_feature = feat
+                    best_metric_score = metric_score
+                    best_metric_score_std = metric_score_std
 
         return best_feature, best_metric_score, best_metric_score_std
 
@@ -137,9 +151,10 @@
         Args:
             n_min (int, optional): Minimal amount of features returned.
             n_max (int, optional): Maximal amount of features returned.
-            k (int, optional): Determines the k in the k-best features method.
+            k (int, optional): Determines the k in the k-best features method.
+                If None, the SelectKBest feature selection step is skipped.
             ml_type (str, optional): Type of ML problem. Currently implemented options:
-                classification_bin, classification_multi, and regression_
+                'classification_bin', 'classification_multi', and 'regression_'
             method (str, optional): "remove" or "add" features. Defaults to "remove".
             n_tolerance (int, optional): If the best score does not improve over n consecutive iterations, where n is given
                 by this parameter, the method stops and treats the feature with the current best score as the tipping point.
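A short sketch (not part of the patch, toy data hypothetical) of the cross_validate behavior the unified comparison block above relies on: when scoring is a single string, the scores are keyed 'test_score', which is why the patch replaces model_cv[f'test_{metric}'] with model_cv["test_score"]; a list of scorers would instead yield 'test_<metric>' keys.

    from sklearn.datasets import make_regression
    from sklearn.linear_model import Lasso
    from sklearn.model_selection import cross_validate

    # Hypothetical toy regression data, used only to show the result keys.
    X, y = make_regression(n_samples=100, n_features=5, noise=0.5, random_state=0)

    single = cross_validate(Lasso(), X, y, cv=3, scoring="r2")
    multi = cross_validate(Lasso(), X, y, cv=3, scoring=["r2"])
    print(sorted(single))  # ['fit_time', 'score_time', 'test_score']
    print(sorted(multi))   # ['fit_time', 'score_time', 'test_r2']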
@@ -147,35 +162,38 @@
 
         Returns:
             list: list of selected features
         """
-        n_features = self.X.shape[1]
-        if n_max >= n_features:
-            n_max = n_features-1  # The algorithm removes at least one feature
 
-        if k < n_max:
-            raise ValueError("The k parameter needs to be lower than the n_max parameter.")
+
+        if k is not None and k <= n_max:
+            raise ValueError("The k parameter needs to be greater than the n_max parameter.")
 
         # Select the k best features, depending on the type of ML task
-        ml_type = ml_type.split("_")
-        if ml_type[0] == "classification":
-            if ml_type[1] == "bin":
-                selector = SelectKBest(mutual_info_classif, k=k)
-            elif ml_type[1] == "multi":
-                selector = SelectKBest(f_classif, k=k)
+        ml_category, ml_subcategory = ml_type.split("_")
+
+        if k is not None:
+            if ml_category == "classification":
+                if ml_subcategory == "bin":
+                    selector = SelectKBest(mutual_info_classif, k=k)
+                elif ml_subcategory == "multi":
+                    selector = SelectKBest(f_classif, k=k)
+                else:
+                    raise ValueError("Unknown ML type: cannot recognize the ML classification subtype.")
+            elif ml_category == "regression":
+                selector = SelectKBest(f_regression, k=k)
             else:
-                raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
-        elif ml_type[0] == "regression":
-            selector = SelectKBest(f_regression, k=k)
-        else:
-            raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
-
-        selector.fit(self.X, self.y)
-        cols_idxs = selector.get_support(indices=True)
-        self.X = self.X.iloc[:,cols_idxs]
+                raise ValueError("Unknown ML type: must be either classification or regression.")
+
+            selector.fit(self.X, self.y)
+            cols_idxs = selector.get_support(indices=True)
+            self.X = self.X.iloc[:, cols_idxs]
+
         print("All columns (after SelectKBest method):")
         print(self.X.columns)
 
         # Sequential feature addition / removal
         n_features = self.X.shape[1]
-
+        if n_max >= n_features:
+            n_max = n_features-1  # The algorithm removes at least one feature
+
         if n_min > n_features:
             raise ValueError("The number of remaining features in the dataframe must be at least n_min.")
 
@@ -190,17 +208,16 @@
         i_worse = 0
 
         for i in reversed(range(n_features)):
-            print("Iteration:", i+1)
-
             if i+1 == n_min:
                 break
 
             best_feature, best_metric_score, best_metric_score_std = \
-                self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric)
+                self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric)
 
             feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
             features.remove(best_feature)
+            print("Features left:", i)
 
             if i <= n_max:
                 if best_metric_score >= best_score:
@@ -223,49 +240,6 @@
             selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
 
             return selected_features
-
-            """
-            # Feature selection criterion within the max-min range
-            # E.g., choosing the best score in this range. Or: as long as the score in this range keeps increasing by 0.0X; once it stops, select that feature set.
-
-            # The feature set will be selected from i=1 to i=index_of_the_selected_feature
-
-            # The "tipping point" feature must lie within the max-min range
-            selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
-            selection_area.set_index(["i", "name"], inplace=True)
-            print(selection_area)
-            diffrences = selection_area.diff()
-            diffrences.dropna(how='any', inplace=True)
-            print(diffrences)
-
-            # Maybe also a cumulative summation, where the index with the highest value is simply taken?
-            cumulative_sumation = diffrences.cumsum()
-            print(cumulative_sumation)
-            tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
-            print(tipping_feature_indx_1)
-
-
-            # A method that gives n_tolerance features a chance to beat the best score so far
-            tipping_feature_indx_2 = None
-            best_score = 0
-            i_worse = 0
-            for indx, row in selection_area.iterrows():
-                if row["metric"] > best_score:
-                    tipping_feature_indx_2 = indx
-                    best_score = row["metric"]
-                    i_worse = 0
-                else:
-                    i_worse += 1
-
-                if i_worse == n_tolerance:
-                    break
-
-            print(tipping_feature_indx_2)
-            selection_area.reset_index(inplace=True)
-            features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
-
-            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
-            """
         else:
             raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
\ No newline at end of file
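A hedged usage sketch of the patched class. The constructor signature is an assumption (the patch only shows its tail setting self.groups), the metric keyword on select_features is inferred from its internal use, and the input file and column names are hypothetical.

    import pandas as pd
    from machine_learning.feature_selection import FeatureSelection

    df = pd.read_csv("features.csv")         # hypothetical input file
    X = df.drop(columns=["target", "pid"])   # hypothetical column names
    y = df["target"]
    groups = df["pid"]

    fs = FeatureSelection(X, y, groups)      # assumed constructor signature
    # k=40 triggers the SelectKBest pre-filter (k must exceed n_max);
    # 'classification_multi' routes to f_classif and the suffixed metrics.
    selected = fs.select_features(
        n_min=5, n_max=20, k=40,
        ml_type="classification_multi",
        metric="f1_macro", method="remove", n_tolerance=10,
    )
    print(selected)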