Add kBest method to initially filter out the worst performing features. Update comments.

parent 2a8f1ee613
commit 1cbc743cf7
@@ -59,12 +59,12 @@ for split in cv.get_splits():
     # Maybe implement GroupKFold instead of StratifiedKFold? >>
     # >> That way each pid will appear in either the test or the train set
     fs = FeatureSelection(train_X, train_y)
-    selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
+    selected_features = fs.select_features(n_min=20, n_max=50, k=80,
+                                           ml_type="regression_",
+                                           n_tolerance=20)
     print(selected_features)
     print(len(selected_features))


     break

 # %%
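
Note: a minimal sketch of the GroupKFold switch suggested in the comment above, on made-up data (the pids array stands in for the per-row participant ids the comment assumes):

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

# Made-up data: 8 rows belonging to 4 pids, two rows per pid.
X = pd.DataFrame({"f1": np.arange(8), "f2": np.arange(8) ** 2})
y = pd.Series([0, 1] * 4)
pids = np.array([1, 1, 2, 2, 3, 3, 4, 4])

# GroupKFold keeps all rows of a pid inside one fold, so each pid
# ends up either in the train split or in the test split, never both.
cv = GroupKFold(n_splits=4)
for train_idx, test_idx in cv.split(X, y, groups=pids):
    assert set(pids[train_idx]).isdisjoint(pids[test_idx])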
@@ -6,7 +6,7 @@ import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd

 from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
 from sklearn.model_selection import cross_validate, StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso
@@ -140,31 +140,62 @@ class FeatureSelection:
         return best_feature, best_metric_score, best_metric_score_std


-    def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
+    def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
         """This method selects a set of features and returns them as a list. It returns a number of features
-        determined in the interval [n_min, n_max]. The best score is detected using a removal procedure.
-        The procedure sequentially removes the features that contribute the least to the chosen evaluation metric.
-        If a removal improves the ML score, the next feature is removed; otherwise a tolerance criterion
-        (n_not_improve) controls how many of the next removed features are inspected to see whether the
-        currently best score is improved. The features are returned within the specified interval as a list.
+        determined in the interval [n_min, n_max].
+
+        The method consists of two steps:
+        (1) The sklearn SelectKBest method selects the k best features, with the score function chosen
+            by the ml_type parameter.
+        (2) The sequential feature removal procedure is executed, using the remaining features from (1).
+            The procedure sequentially removes the features that contribute the least to the chosen
+            evaluation metric. If a removal improves the ML score, the next feature is removed; otherwise
+            a tolerance criterion (n_tolerance) controls how many of the next removed features are
+            inspected to see whether the currently best score is improved.

         Args:
-            n_min (int): Minimal number of features returned.
-            n_max (int): Maximal number of features returned.
+            n_min (int, optional): Minimal number of features returned.
+            n_max (int, optional): Maximal number of features returned.
+            k (int, optional): Determines the k in the k-best features method.
+            ml_type (str, optional): Type of ML problem. Currently implemented options:
+                classification_bin, classification_multi, and regression_
             method (str, optional): "remove" or "add" features. Defaults to "remove".
-            n_not_improve (int): If the best score is not improved within this many removals,
+            n_tolerance (int, optional): If the best score is not improved within this many removals,
                 the method returns the index of the feature with the current best score as a tipping-point feature.

         Returns:
             list: list of selected features
         """

         n_features = self.X.shape[1]
         if n_max >= n_features:
             n_max = n_features - 1  # The algorithm removes at least one feature
+        if k < n_max:
+            raise ValueError("The k parameter needs to be greater than or equal to the n_max parameter.")
+
+        # Select the k best features, depending on the type of ML task
+        ml_type = ml_type.split("_")
+        if ml_type[0] == "classification":
+            if ml_type[1] == "bin":
+                selector = SelectKBest(mutual_info_classif, k=k)
+            elif ml_type[1] == "multi":
+                selector = SelectKBest(f_classif, k=k)
+            else:
+                raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
+        elif ml_type[0] == "regression":
+            selector = SelectKBest(f_regression, k=k)
+        else:
+            raise ValueError("Unknown ML type: must be either classification or regression.")
+
+        selector.fit(self.X, self.y)
+        cols_idxs = selector.get_support(indices=True)
+        self.X = self.X.iloc[:, cols_idxs]
+
+        print(self.X.columns)

+        # Sequential feature addition / removal
+        n_features = self.X.shape[1]

         if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least n_min+1.")
+            raise ValueError("The number of remaining features in the dataframe must be at least n_min+1.")

         if n_max < n_min:
             raise ValueError("The n_max parameter needs to be greater than or equal to the n_min parameter.")
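
Note: step (1) of the new select_features boils down to SelectKBest followed by get_support(indices=True) to keep the surviving DataFrame columns; a standalone sketch on generated data (all names illustrative, not from this repo):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X_arr, y = make_classification(n_samples=200, n_features=10, random_state=0)
X = pd.DataFrame(X_arr, columns=[f"feat_{i}" for i in range(10)])

selector = SelectKBest(mutual_info_classif, k=4).fit(X, y)
cols_idxs = selector.get_support(indices=True)  # integer positions of the k kept columns
X = X.iloc[:, cols_idxs]                        # same column filtering as in the commit
print(X.columns.tolist())                       # the 4 surviving feature names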
@@ -177,11 +177,13 @@ class FeatureSelection:
         i_worse = 0
         for i in reversed(range(n_features)):

             print("Iteration:", i+1)

             if i+1 == n_min:
                 break

             best_feature, best_metric_score, best_metric_score_std = \
-                self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
+                self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric)

             feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
@@ -195,7 +228,7 @@ class FeatureSelection:
             else:
                 i_worse += 1

-            if i_worse == n_not_improve:
+            if i_worse == n_tolerance:
                 break

         feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
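
Note: the i_worse / n_tolerance pair above is a simple patience rule: stop after n_tolerance consecutive non-improving steps. Isolated, the pattern reads like this (a sketch with made-up scores, not commit code):

scores = [0.60, 0.64, 0.63, 0.63, 0.62, 0.61, 0.65]  # made-up metric trace
n_tolerance = 3
best_score, i_worse = 0, 0
for score in scores:
    if score > best_score:
        best_score, i_worse = score, 0  # an improvement resets the counter
    else:
        i_worse += 1
    if i_worse == n_tolerance:
        break  # n_tolerance non-improving steps in a row: stop early
print(best_score)  # 0.64 -- the later 0.65 is never reached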
@@ -230,7 +263,7 @@ class FeatureSelection:
         print(tipping_feature_indx_1)


-        # Method that lets n_not_improve features try to beat the best score so far
+        # Method that lets n_tolerance features try to beat the best score so far
         tipping_feature_indx_2 = None
         best_score = 0
         i_worse = 0
|
|||
else:
|
||||
i_worse += 1
|
||||
|
||||
if i_worse == n_not_improve:
|
||||
if i_worse == n_tolerance:
|
||||
break
|
||||
|
||||
print(tipping_feature_indx_2)
|
||||
|
@@ -250,4 +283,7 @@ class FeatureSelection:
             features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()

             selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
+
+        else:
+            raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")