Add kBest method to initially filter out the worst performing features. Update comments.

2023-04-20 10:12:16 +02:00 · 2023-04-20 10:12:16 +02:00 · 1cbc743cf7
parent 2a8f1ee613
commit 1cbc743cf7
2 changed files with 56 additions and 20 deletions
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@ -59,12 +59,12 @@ for split in cv.get_splits():
    # Morda se implementira GroupKfold namesto stratifiedKFold? >>
    # >> Tako se bo posamezen pid pojavil ali v test ali v train setu
    fs = FeatureSelection(train_X, train_y) 
-    selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
+    selected_features = fs.select_features(n_min=20, n_max=50, k=80,
+                                           ml_type="regression_", 
+                                           n_tolerance=20)
    print(selected_features)
    print(len(selected_features))
    
-    
-    
    break

 # %%
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@ -6,7 +6,7 @@ import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd

-from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
 from sklearn.model_selection import cross_validate, StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso 
@ -140,31 +140,62 @@ class FeatureSelection:
        return best_feature, best_metric_score, best_metric_score_std
    
    
-    def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
+    def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
        """This method selects a set of features and returns them as a list. It returns number of features 
-        determined in the interval of [n_min, n_max]. The best score is detected using a removal procedure.
-        The procedure sequentially removes the features that attribute the least to the choosen evaluation metric.
-        If in this sequence the score ML score is improved the next feature is remove otherwise there is a 
-        tolerance criteria (n_not_improve) with which the next n remove features are inspected whether 
-        currently best score is improved. The features are returned in specified interval as a list.     
+        determined in the interval of [n_min, n_max]. 
+        
+        The method consists of two steps: 
+        (1) The method uses sklearn kBest method which selects k best features dependent on the ml_type parameter.
+        (2) The sequential feature removal procedure is executed using the remaining features from (1).
+            The best score is detected using a removal procedure. The procedure sequentially removes the features
+            that contribute the least to the chosen evaluation metric. If in this sequence the ML score is
+            improved, the next feature is removed; otherwise there is a tolerance criterion (n_tolerance):
+            the next n removed features are inspected to see whether the current best score is improved.

        Args:
-            n_min (int): Minimal amount of features returned.
-            n_max (int): Maximal amount of features returned.
+            n_min (int, optional): Minimal amount of features returned.
+            n_max (int, optional): Maximal amount of features returned.
+            k (int, optional): Determines the k in the k-best features method.
+            ml_type (str, optional): Type of ML problem. Currently implemented options:
+                "classification_bin", "classification_multi", and "regression_".
            method (str, optional): "remove" or "add" features.  Defaults to "remove".
-            n_not_improve (int): If the best score is not improved in n that is specified by this parameter
+            n_tolerance (int, optional): If the best score is not improved in n steps, where n is specified by this parameter,
                the method returns index of feature with current best score as a tipping point feature.
            
        Returns:
            list: list of selected features
        """        
-        
        n_features = self.X.shape[1]
        if n_max >= n_features:
            n_max = n_features-1 # The algorithm removes at least one feature
+        if k < n_max:
+            raise ValueError("The k parameter needs to be lower than the n_max parameter.")
+        
+        # Select k-best feature dependent on the type of ML task
+        ml_type = ml_type.split("_")
+        if ml_type[0] == "classification":
+            if ml_type[1] == "bin":
+                selector = SelectKBest(mutual_info_classif, k=k)
+            elif ml_type[1] == "multi":
+                selector = SelectKBest(f_classif, k=k)
+            else:
+                raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
+        elif ml_type[0] == "regression":
+            selector = SelectKBest(f_regression, k=k)
+        else:
+            raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
+        
+        selector.fit(self.X, self.y)
+        cols_idxs = selector.get_support(indices=True)
+        self.X = self.X.iloc[:,cols_idxs]
+        
+        print(self.X.columns)
+        
+        # Sequential feature addition / removal
+        n_features = self.X.shape[1]
        
        if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.")
+            raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")
        
        if n_max < n_min:
            raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
@ -177,11 +208,13 @@ class FeatureSelection:
            i_worse = 0
            for i in reversed(range(n_features)):
                
+                print("Iteration:", i+1)
+                
                if i+1 == n_min:
                    break
                
                best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
+                    self.select_best_feature(features, method=method, ml_type=ml_type[0], metric="recall")
                    
                feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
                
@ -195,7 +228,7 @@ class FeatureSelection:
                    else:
                        i_worse += 1
                    
-                    if i_worse == n_not_improve: 
+                    if i_worse == n_tolerance: 
                        break  
                
            feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
@ -230,7 +263,7 @@ class FeatureSelection:
            print(tipping_feature_indx_1)

                
-            # Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score     
+            # Metoda, ki pusti n_tolerance značilkam, da premagajo dosedajno najboljši score     
            tipping_feature_indx_2 = None
            best_score = 0
            i_worse = 0
@ -242,7 +275,7 @@ class FeatureSelection:
                else:
                    i_worse += 1
                
-                if i_worse == n_not_improve:
+                if i_worse == n_tolerance:
                    break            
            
            print(tipping_feature_indx_2)
@ -250,4 +283,7 @@ class FeatureSelection:
            features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
            
            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
-            """
+            """
+        
+        else:
+            raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")