Add kBest method to initially filter out the worst-performing features. Update comments.

ml_pipeline
Primoz 2023-04-20 10:12:16 +02:00
parent 2a8f1ee613
commit 1cbc743cf7
2 changed files with 56 additions and 20 deletions
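In essence, the commit wires a SelectKBest pre-filter in front of the existing sequential feature removal, so the expensive removal loop only ever sees the k highest-scoring features. A minimal standalone sketch of that pre-filtering idea on toy regression data (all names below are illustrative, not taken from the commit):

import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression

# Toy data: 100 samples, 30 candidate features
X_arr, y = make_regression(n_samples=100, n_features=30, n_informative=10, random_state=0)
X = pd.DataFrame(X_arr, columns=[f"feat_{i}" for i in range(30)])

# Keep only the 15 highest-scoring features; the worst performers are dropped
# before any expensive sequential search runs
selector = SelectKBest(f_regression, k=15)
selector.fit(X, y)
X_kbest = X.iloc[:, selector.get_support(indices=True)]
print(X_kbest.columns.tolist())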


@@ -59,12 +59,14 @@ for split in cv.get_splits():

     # Maybe GroupKFold should be implemented instead of StratifiedKFold? >>
     # >> That way each pid would appear in either the test set or the train set
     fs = FeatureSelection(train_X, train_y)
-    selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
+    selected_features = fs.select_features(n_min=20, n_max=50, k=80,
+                                           ml_type="regression_",
+                                           n_tolerance=20)
     print(selected_features)
     print(len(selected_features))
     break

 # %%
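The call site above now drives both steps: k=80 bounds the SelectKBest pre-filter, [n_min, n_max] bounds the final feature set, and n_tolerance replaces the old n_not_improve. The second changed file, which implements the FeatureSelection class, follows. For comparison, the same call in a binary-classification pipeline would look roughly like this (parameter values illustrative, signature taken from the new code below):

fs = FeatureSelection(train_X, train_y)
selected_features = fs.select_features(n_min=20, n_max=50, k=80,
                                       ml_type="classification_bin",
                                       n_tolerance=20)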


@@ -6,7 +6,7 @@ import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
-from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
 from sklearn.model_selection import cross_validate, StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso
@@ -140,31 +140,62 @@ class FeatureSelection:

         return best_feature, best_metric_score, best_metric_score_std

-    def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
+    def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
         """This method selects a set of features and returns them as a list. It returns a number of features
-        determined in the interval of [n_min, n_max]. The best score is detected using a removal procedure.
-        The procedure sequentially removes the features that attribute the least to the choosen evaluation metric.
-        If in this sequence the score ML score is improved the next feature is remove otherwise there is a
-        tolerance criteria (n_not_improve) with which the next n remove features are inspected whether
-        currently best score is improved. The features are returned in specified interval as a list.
+        determined in the interval of [n_min, n_max].
+
+        The method consists of two steps:
+        (1) sklearn's SelectKBest selects the k best features, depending on the ml_type parameter.
+        (2) A sequential feature-removal procedure is executed on the features remaining after (1).
+
+        The best score is detected by the removal procedure, which sequentially removes the features
+        that contribute the least to the chosen evaluation metric. If a removal improves the score,
+        the next feature is removed; otherwise a tolerance criterion (n_tolerance) inspects the next
+        n removed features to see whether the currently best score is improved.

         Args:
-            n_min (int): Minimal amount of features returned.
-            n_max (int): Maximal amount of features returned.
+            n_min (int, optional): Minimal amount of features returned.
+            n_max (int, optional): Maximal amount of features returned.
+            k (int, optional): Determines the k in the k-best features method.
+            ml_type (str, optional): Type of ML problem. Currently implemented options:
+                classification_bin, classification_multi, and regression_.
             method (str, optional): "remove" or "add" features. Defaults to "remove".
+            metric (str, optional): Evaluation metric passed to select_best_feature. Defaults to "recall".
-            n_not_improve (int): If the best score is not improved in n that is specified by this parameter
+            n_tolerance (int, optional): If the best score is not improved within the number of removals specified by this parameter,
                 the method returns the index of the feature with the current best score as the tipping-point feature.

         Returns:
             list: list of selected features
         """
         n_features = self.X.shape[1]
         if n_max >= n_features:
             n_max = n_features-1  # The algorithm removes at least one feature
+        if k < n_max:
+            raise ValueError("The k parameter needs to be greater than or equal to the n_max parameter.")
+
+        # Select the k best features, depending on the type of ML task
+        ml_type = ml_type.split("_")
+        if ml_type[0] == "classification":
+            if ml_type[1] == "bin":
+                selector = SelectKBest(mutual_info_classif, k=k)
+            elif ml_type[1] == "multi":
+                selector = SelectKBest(f_classif, k=k)
+            else:
+                raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
+        elif ml_type[0] == "regression":
+            selector = SelectKBest(f_regression, k=k)
+        else:
+            raise ValueError("Unknown ML type: must be either classification or regression.")
+
+        selector.fit(self.X, self.y)
+        cols_idxs = selector.get_support(indices=True)
+        self.X = self.X.iloc[:, cols_idxs]
+        print(self.X.columns)
+
+        # Sequential feature addition / removal
+        n_features = self.X.shape[1]
         if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.")
+            raise ValueError("The number of remaining features in the dataframe must be at least n_min+1.")
         if n_max < n_min:
             raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
@@ -177,11 +208,13 @@ class FeatureSelection:
         i_worse = 0
         for i in reversed(range(n_features)):
+            print("Iteration:", i+1)
+
             if i+1 == n_min:
                 break

             best_feature, best_metric_score, best_metric_score_std = \
-                self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
+                self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric)

             feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
@@ -195,7 +228,7 @@ class FeatureSelection:
             else:
                 i_worse += 1
-            if i_worse == n_not_improve:
+            if i_worse == n_tolerance:
                 break

         feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
@@ -230,7 +263,7 @@ class FeatureSelection:
         print(tipping_feature_indx_1)

-        # Method that gives n_not_improve features the chance to beat the best score so far
+        # Method that gives n_tolerance features the chance to beat the best score so far
         tipping_feature_indx_2 = None
         best_score = 0
         i_worse = 0
@@ -242,7 +275,7 @@ class FeatureSelection:
             else:
                 i_worse += 1
-            if i_worse == n_not_improve:
+            if i_worse == n_tolerance:
                 break

         print(tipping_feature_indx_2)
@@ -250,4 +283,7 @@ class FeatureSelection:
             features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
             selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
             """
+        else:
+            raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
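Taken together, the removal loop with its n_tolerance check reduces to the following pattern; a simplified sketch of the idea, not the class itself (score_without is a hypothetical stand-in for the class's cross-validated scoring after dropping one feature):

def sequential_removal(features, score_without, n_min, n_tolerance):
    best_score = 0
    i_worse = 0
    selected = list(features)
    while len(selected) > n_min:
        # Score each candidate removal; drop the feature whose absence hurts the metric least
        scores = {f: score_without(selected, f) for f in selected}
        best_feature = max(scores, key=scores.get)
        current_score = scores[best_feature]
        selected.remove(best_feature)
        if current_score > best_score:
            best_score = current_score
            i_worse = 0
        else:
            i_worse += 1
        if i_worse == n_tolerance:
            # n_tolerance consecutive removals failed to beat the best score
            break
    return selected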