Add kBest method to initially filter out the worst performing features. Update comments.
parent
2a8f1ee613
commit
1cbc743cf7
|
@ -59,12 +59,12 @@ for split in cv.get_splits():
|
||||||
# Morda se implementira GroupKfold namesto stratifiedKFold? >>
|
# Morda se implementira GroupKfold namesto stratifiedKFold? >>
|
||||||
# >> Tako se bo posamezen pid pojavil ali v test ali v train setu
|
# >> Tako se bo posamezen pid pojavil ali v test ali v train setu
|
||||||
fs = FeatureSelection(train_X, train_y)
|
fs = FeatureSelection(train_X, train_y)
|
||||||
selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
|
selected_features = fs.select_features(n_min=20, n_max=50, k=80,
|
||||||
|
ml_type="regression_",
|
||||||
|
n_tolerance=20)
|
||||||
print(selected_features)
|
print(selected_features)
|
||||||
print(len(selected_features))
|
print(len(selected_features))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
break
|
break
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
|
|
@ -6,7 +6,7 @@ import numpy as np
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from sklearn.feature_selection import SequentialFeatureSelector
|
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
|
||||||
from sklearn.model_selection import cross_validate, StratifiedKFold
|
from sklearn.model_selection import cross_validate, StratifiedKFold
|
||||||
from sklearn.naive_bayes import GaussianNB
|
from sklearn.naive_bayes import GaussianNB
|
||||||
from sklearn.linear_model import Lasso
|
from sklearn.linear_model import Lasso
|
||||||
|
@ -140,31 +140,62 @@ class FeatureSelection:
|
||||||
return best_feature, best_metric_score, best_metric_score_std
|
return best_feature, best_metric_score, best_metric_score_std
|
||||||
|
|
||||||
|
|
||||||
def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
|
def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
|
||||||
"""This method selects a set of features and returns them as a list. It returns number of features
|
"""This method selects a set of features and returns them as a list. It returns number of features
|
||||||
determined in the interval of [n_min, n_max]. The best score is detected using a removal procedure.
|
determined in the interval of [n_min, n_max].
|
||||||
The procedure sequentially removes the features that attribute the least to the choosen evaluation metric.
|
|
||||||
If in this sequence the score ML score is improved the next feature is remove otherwise there is a
|
The method consists of two steps:
|
||||||
tolerance criteria (n_not_improve) with which the next n remove features are inspected whether
|
(1) The method uses sklearn's SelectKBest, which selects the k best features depending on the ml_type parameter.
|
||||||
currently best score is improved. The features are returned in specified interval as a list.
|
(2) The sequential feature removal procedure is executed using the remaining features from (1).
|
||||||
|
The best score is detected using a removal procedure. The procedure sequentially removes the features
|
||||||
|
that contribute the least to the chosen evaluation metric. If in this sequence the ML score is
|
||||||
|
improved, the next feature is removed; otherwise there is a tolerance criterion (n_tolerance)
|
||||||
|
with which the next n removed features are inspected to see whether the current best score is improved.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
n_min (int): Minimal amount of features returned.
|
n_min (int, optional): Minimal amount of features returned.
|
||||||
n_max (int): Maximal amount of features returned.
|
n_max (int, optional): Maximal amount of features returned.
|
||||||
|
k (int, optional): Determines the k in the k-best features method.
|
||||||
|
ml_type(str, optional): Type of ML problem. Currently implemented options:
|
||||||
|
classification_bin, classification_multi, and regression_
|
||||||
method (str, optional): "remove" or "add" features. Defaults to "remove".
|
method (str, optional): "remove" or "add" features. Defaults to "remove".
|
||||||
n_not_improve (int): If the best score is not improved in n that is specified by this parameter
|
n_tolerance (int, optional): If the best score is not improved for the number of steps specified by this parameter,
|
||||||
the method returns index of feature with current best score as a tipping point feature.
|
the method returns index of feature with current best score as a tipping point feature.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list: list of selected features
|
list: list of selected features
|
||||||
"""
|
"""
|
||||||
|
|
||||||
n_features = self.X.shape[1]
|
n_features = self.X.shape[1]
|
||||||
if n_max >= n_features:
|
if n_max >= n_features:
|
||||||
n_max = n_features-1 # The algorithm removes at least one feature
|
n_max = n_features-1 # The algorithm removes at least one feature
|
||||||
|
if k < n_max:
|
||||||
|
raise ValueError("The k parameter needs to be lower than the n_max parameter.")
|
||||||
|
|
||||||
|
# Select k-best feature dependent on the type of ML task
|
||||||
|
ml_type = ml_type.split("_")
|
||||||
|
if ml_type[0] == "classification":
|
||||||
|
if ml_type[1] == "bin":
|
||||||
|
selector = SelectKBest(mutual_info_classif, k=k)
|
||||||
|
elif ml_type[1] == "multi":
|
||||||
|
selector = SelectKBest(f_classif, k=k)
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
|
||||||
|
elif ml_type[0] == "regression":
|
||||||
|
selector = SelectKBest(f_regression, k=k)
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
|
||||||
|
|
||||||
|
selector.fit(self.X, self.y)
|
||||||
|
cols_idxs = selector.get_support(indices=True)
|
||||||
|
self.X = self.X.iloc[:,cols_idxs]
|
||||||
|
|
||||||
|
print(self.X.columns)
|
||||||
|
|
||||||
|
# Sequential feature addition / removal
|
||||||
|
n_features = self.X.shape[1]
|
||||||
|
|
||||||
if n_min > n_features:
|
if n_min > n_features:
|
||||||
raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.")
|
raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")
|
||||||
|
|
||||||
if n_max < n_min:
|
if n_max < n_min:
|
||||||
raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
|
raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
|
||||||
|
@ -177,11 +208,13 @@ class FeatureSelection:
|
||||||
i_worse = 0
|
i_worse = 0
|
||||||
for i in reversed(range(n_features)):
|
for i in reversed(range(n_features)):
|
||||||
|
|
||||||
|
print("Iteration:", i+1)
|
||||||
|
|
||||||
if i+1 == n_min:
|
if i+1 == n_min:
|
||||||
break
|
break
|
||||||
|
|
||||||
best_feature, best_metric_score, best_metric_score_std = \
|
best_feature, best_metric_score, best_metric_score_std = \
|
||||||
self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
|
self.select_best_feature(features, method=method, ml_type=ml_type[0], metric="recall")
|
||||||
|
|
||||||
feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
|
feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
|
||||||
|
|
||||||
|
@ -195,7 +228,7 @@ class FeatureSelection:
|
||||||
else:
|
else:
|
||||||
i_worse += 1
|
i_worse += 1
|
||||||
|
|
||||||
if i_worse == n_not_improve:
|
if i_worse == n_tolerance:
|
||||||
break
|
break
|
||||||
|
|
||||||
feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
|
feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
|
||||||
|
@ -230,7 +263,7 @@ class FeatureSelection:
|
||||||
print(tipping_feature_indx_1)
|
print(tipping_feature_indx_1)
|
||||||
|
|
||||||
|
|
||||||
# Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score
|
# Metoda, ki pusti n_tolerance značilkam, da premagajo dosedajno najboljši score
|
||||||
tipping_feature_indx_2 = None
|
tipping_feature_indx_2 = None
|
||||||
best_score = 0
|
best_score = 0
|
||||||
i_worse = 0
|
i_worse = 0
|
||||||
|
@ -242,7 +275,7 @@ class FeatureSelection:
|
||||||
else:
|
else:
|
||||||
i_worse += 1
|
i_worse += 1
|
||||||
|
|
||||||
if i_worse == n_not_improve:
|
if i_worse == n_tolerance:
|
||||||
break
|
break
|
||||||
|
|
||||||
print(tipping_feature_indx_2)
|
print(tipping_feature_indx_2)
|
||||||
|
@ -250,4 +283,7 @@ class FeatureSelection:
|
||||||
features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
|
features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
|
||||||
|
|
||||||
selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
|
selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
|
Loading…
Reference in New Issue