Implement the feature selection method that is used in the ML pipeline.

2023-04-19 15:56:34 +02:00 · 2023-04-19 15:56:34 +02:00 · ce13a9e13b
parent 10ca47583c
commit ce13a9e13b
2 changed files with 100 additions and 47 deletions
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@ -26,24 +26,45 @@ if nb_dir not in sys.path:

 from machine_learning.cross_validation import CrossValidation
 from machine_learning.preprocessing import Preprocessing
+from machine_learning.feature_selection import FeatureSelection

 # %% 
 df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 df.set_index(index_columns, inplace=True)

+# Create binary target 
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
+
+
+nan_cols = df.columns[df.isna().any()].tolist()
+df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
+
 cv = CrossValidation(data=df, cv_method="logo")

 categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
 interval_feature_list, other_feature_list = [], []

-print(df.columns.tolist())
-
+# %%
 for split in cv.get_splits():
    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
    pre = Preprocessing(train_X, train_y, test_X, test_y)
    pre.one_hot_encode_train_and_test_sets(categorical_columns)
    train_X, train_y, test_X, test_y = pre.get_train_test_sets()
+    
+    # train_X = train_X[train_X.columns[:30]]
+    
+    # Feature selection on train set
+    # Maybe implement GroupKFold instead of StratifiedKFold? >>
+    # >> That way each individual pid would appear either in the test set or in the train set
+    fs = FeatureSelection(train_X, train_y) 
+    selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
+    print(selected_features)
+    print(len(selected_features))
+    
+    
+    
    break

 # %%
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@ -1,11 +1,13 @@
 import os
 import sys
+import warnings

 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd

 from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.model_selection import cross_validate, StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso 

@ -21,11 +23,12 @@ from sklearn.linear_model import Lasso

 class FeatureSelection:

-    def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
-        pass # TODO.... 
+    def __init__(self, X, y):
+        self.X = X
+        self.y = y

    
-    def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
+    def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
        """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
         The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular
        feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
@ -56,18 +59,18 @@ class FeatureSelection:

        for feat in features:
            if method == "remove":
-                pred_features = [col for col in df.columns if feat != col] # All but feat
+                pred_features = [col for col in self.X.columns if feat != col] # All but feat
            elif method == "add":
                pred_features = [feat] + stored_features # Feat with stored features
            
-            X, y  = df.drop(columns=['target', 'pid'])[pred_features], df['target']
+            X  = self.X[pred_features].copy()
            
            if ml_type == "classification":
                nb = GaussianNB()
                model_cv = cross_validate(
                    nb,
                    X=X,
-                    y=y,
+                    y=self.y,
                    cv=StratifiedKFold(n_splits=5, shuffle=True),
                    n_jobs=-1,
                    scoring=('accuracy', 'precision', 'recall', 'f1')
@ -137,66 +140,57 @@ class FeatureSelection:
        return best_feature, best_metric_score, best_metric_score_std
    
    
-    def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
+    def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
+        """This method selects a set of features and returns them as a list. The number of returned features
+        lies in the interval [n_min, n_max]. The best score is detected using a removal procedure.
+        The procedure sequentially removes the features that contribute the least to the chosen evaluation metric.
+        If the ML score improves during this sequence, the next feature is removed; otherwise a tolerance
+        criterion (n_not_improve) applies: the next n_not_improve removals are inspected to see whether
+        the current best score is improved. The selected features are returned as a list.

-        n_features = df.shape[1] - 2 # -2 beacause pid and target are not considered
-        if n_max > n_features:
-            n_max = n_features
+        Args:
+            n_min (int): Minimum number of features returned.
+            n_max (int): Maximum number of features returned.
+            method (str, optional): "remove" or "add" features.  Defaults to "remove".
+            n_not_improve (int): If the best score is not improved within this many consecutive removals,
+                the search stops and the feature index with the current best score is used as the tipping point.
+            
+        Returns:
+            list: list of selected features
+        """        
+        
+        n_features = self.X.shape[1]
+        if n_max >= n_features:
+            n_max = n_features-1 # The algorithm removes at least one feature
        
        if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
+            raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.")
        
        if n_max < n_min:
            raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
        
-        features = df.columns.tolist()
-        features.remove("pid")
-        features.remove("target")
+        features = self.X.columns.tolist()
        feature_importance = []
        if method == "remove":
+            best_score = 0
+            best_feature_indx = None
+            i_worse = 0
            for i in reversed(range(n_features)):
                
+                if i+1 == n_min:
+                    break
+                
                best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
-                feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
+                    self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
+                    
+                feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
                
                features.remove(best_feature)
                
-            feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
-            
-            # Selekcijski kriterij značilk v rangu max-min
-            # Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
-            
-            # Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
-            
-            # "Tipping point" značilka mora biti v rangu max-min
-            
-            selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
-            selection_area.set_index(["i", "name"], inplace=True)
-            diffrences = selection_area.diff()
-            diffrences.dropna(how='any', inplace=True)
-            
-            # Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo 
-            cumulative_sumation = diffrences.cumsum()
-            tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
-
-            # Zelo konzervativna metoda, ki ob prvem neizboljšanjem rezultata preneha z iskanjem boljše alternative 
-            tipping_feature_indx_2 = None
-            for indx, row in diffrences.iterrows():
-                if row["metric"] > 0:
-                    tipping_feature_indx_2 = indx
-                else: 
-                    break
-                
-            # Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score     
-            tipping_feature_indx_3 = None
-            cum_sum_score = 0
-            i_worse = 0
-            # TODO: morda bi bilo smisleno združiti diff, cumsum in scores stolpce ...
-            for indx, row in selection_area.iterrows():
-                if row["metric"] > 0:
-                    tipping_feature_indx_3 = indx
-                    cum_sum_score += row["metric"]
+                if i <= n_max:
+                    if best_metric_score >= best_score:
+                        best_score = best_metric_score
+                        best_feature_indx = i+1
                        i_worse = 0
                    else:
                        i_worse += 1
@ -204,18 +198,56 @@ class FeatureSelection:
                    if i_worse == n_not_improve: 
                        break  
                
+            feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
+
+            print(feature_importance_df)
+            print("best_feature_indx", best_feature_indx)
+            print("best_score", best_score)
+
+            features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
+            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]    
+            
+            return selected_features
+            
+            """
+            # Feature selection criterion within the max-min range
+            # E.g. pick the best score in this range. Or continue while the score in this range keeps increasing by 0.0X; once it no longer does, select that feature set.
+            
+            # The feature set will be selected from i=1 to i=index_of_selected_feature
+            
+            # The "tipping point" feature must lie within the max-min range
+            selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
+            selection_area.set_index(["i", "name"], inplace=True)
+            print(selection_area)
+            diffrences = selection_area.diff()
+            diffrences.dropna(how='any', inplace=True)
+            print(diffrences)
+            
+            # Maybe a cumulative sum as well? Where the index with the highest value is simply taken
+            cumulative_sumation = diffrences.cumsum()
+            print(cumulative_sumation)
+            tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
+            print(tipping_feature_indx_1)

                
+            # Method that gives n_not_improve features a chance to beat the best score so far
+            tipping_feature_indx_2 = None
+            best_score = 0
+            i_worse = 0
+            for indx, row in selection_area.iterrows():
+                if row["metric"] > best_score:
+                    tipping_feature_indx_2 = indx
+                    best_score = row["metric"]
+                    i_worse = 0
+                else:
+                    i_worse += 1
                
+                if i_worse == n_not_improve:
+                    break            
            
+            print(tipping_feature_indx_2)
+            selection_area.reset_index(inplace=True)
+            features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
            
-        
-
-    def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
-        pass
-
-    def vizualize_feature_selection_process():
-        pass
-
-    def execute_feature_selection_step():
-        pass
+            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
+            """