Improve the feature selection method with validations etc.
parent 0594993133
commit 259be708aa

@@ -29,9 +29,9 @@ class FeatureSelection:
         self.groups = groups
 
-    def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
+    def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
         """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
-        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat
+        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular
         feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
         specified as a parameter.
 
@@ -39,7 +39,11 @@ class FeatureSelection:
             df (DataFrame): Input data on which the predictions will be made.
             features (list): List of features to select the best/worst from.
             method (str, optional): Remove or add features. Defaults to "remove".
-            ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and metric. Defaults to "classification".
+            ml_category (str, optional): Either classification or regression ml problem; controls the ML algorithm and metric.
+                Defaults to "classification".
+            ml_subcategory (str, optional): In case of classification, 'bin' for binary classification
+                and 'multi' for multiclass classification. For regression an empty string '' is sufficient.
+                Defaults to "bin".
             metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
             stored_features (list, optional): If the method is 'add', stored_features are the features that have been previously added. Defaults to [].
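A hedged usage sketch of the updated signature (the `fs` instance and the
feature names are assumptions for illustration, not part of the commit):

    # fs is an already constructed FeatureSelection instance (hypothetical data).
    best_feat, score, score_std = fs.select_best_feature(
        ["age", "height", "weight"],
        method="remove",
        ml_category="classification",
        ml_subcategory="multi",
        metric="f1_macro",  # multi-class metrics need an averaging suffix
    )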
@@ -53,9 +57,25 @@ class FeatureSelection:
 
         best_feature = None
 
-        if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
-            raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
-        elif ml_type == "regression" and metric not in ['r2']:
+        # Validation of the ML type against the specified metric
+        if ml_category == "classification":
+            if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
+                raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
+            elif ml_subcategory == "multi":
+                ml_subcategory_error = False
+                if metric != "accuracy" and "_" in metric:
+                    metric_s, metric_t = metric.split("_")
+                    if metric_s not in ['precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
+                        ml_subcategory_error = True
+                elif metric != "accuracy":
+                    ml_subcategory_error = True
+
+                if ml_subcategory_error:
+                    raise ValueError("""Classification metric for multi-class classification must be specified precisely.
+                        Available metrics are: 'accuracy', 'precision', 'recall' and 'f1'.
+                        Only accuracy must be specified as 'accuracy'.
+                        For the others please add the appropriate suffix: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'""")
+        elif ml_category == "regression" and metric not in ['r2']:
             raise ValueError("Regression metric not recognized. Please choose 'r2'")
 
         for feat in features:
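The multi-class branch above effectively accepts only scikit-learn style
scoring strings. A minimal standalone sketch of the accepted grammar (the
helper name is hypothetical, not part of the commit):

    def is_valid_multiclass_metric(metric: str) -> bool:
        if metric == "accuracy":        # plain accuracy needs no suffix
            return True
        if "_" not in metric:
            return False
        base, average = metric.split("_", 1)
        return base in ("precision", "recall", "f1") and average in ("micro", "macro", "weighted")

    assert is_valid_multiclass_metric("f1_macro")
    assert not is_valid_multiclass_metric("recall")  # rejected: no averaging suffix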
@@ -73,7 +93,7 @@ class FeatureSelection:
 
                 # See link about scoring for multiclass classification:
                 # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
-                if ml_type == "classification":
+                if ml_category == "classification":
                     nb = GaussianNB()
                     model_cv = cross_validate(
                         nb,
@@ -85,18 +105,8 @@ class FeatureSelection:
                         scoring=(metric)
                     )
 
-                    with warnings.catch_warnings():
-                        warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
-
-                        metric_score = np.nanmean(model_cv[f'test_{metric}'])
-                        metric_score_std = np.nanstd(model_cv[f'test_{metric}'])
-
-                        if not best_feature or (metric_score > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = metric_score
-                            best_metric_score_std = metric_score_std
-
-                elif ml_type == "regression":
+                elif ml_category == "regression":
                     lass = Lasso()
                     model_cv = cross_validate(
                         lass,
@@ -108,16 +118,20 @@ class FeatureSelection:
                         scoring=('r2')
                     )
-
-                    if metric == "r2":
-                        r2 = np.mean(model_cv['test_r2'])
-                        r2_std = np.std(model_cv['test_r2'])
-
-                        if not best_feature or (r2 > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = r2
-                            best_metric_score_std = r2_std
                 else:
                     raise ValueError("ML type not yet implemented!")
 
+                # Comparison of the metric scores (shared by both ML categories).
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
+
+                    metric_score = np.nanmean(model_cv["test_score"])
+                    metric_score_std = np.nanstd(model_cv["test_score"])
+
+                    if not best_feature or (metric_score > best_metric_score):
+                        best_feature = feat
+                        best_metric_score = metric_score
+                        best_metric_score_std = metric_score_std
 
         return best_feature, best_metric_score, best_metric_score_std
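Side note on the model_cv[f'test_{metric}'] -> model_cv["test_score"] change:
when `scoring` is a single string, scikit-learn's cross_validate stores the
fold results under the generic "test_score" key, so one comparison block can
now serve both branches. A runnable sketch on toy data (names assumed):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_validate
    from sklearn.naive_bayes import GaussianNB

    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    cv_res = cross_validate(GaussianNB(), X, y, cv=5, scoring="recall")
    print(sorted(cv_res))                    # ['fit_time', 'score_time', 'test_score']
    print(np.nanmean(cv_res["test_score"]))  # mean recall over the 5 folds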
@@ -137,9 +151,10 @@ class FeatureSelection:
         Args:
             n_min (int, optional): Minimal amount of features returned.
             n_max (int, optional): Maximal amount of features returned.
             k (int, optional): Determines the k in the k-best features method.
+                If None, SelectKBest feature selection does not execute.
             ml_type (str, optional): Type of ML problem. Currently implemented options:
-                classification_bin, classification_multi, and regression_
+                'classification_bin', 'classification_multi', and 'regression_'
             method (str, optional): "remove" or "add" features. Defaults to "remove".
             n_tolerance (int, optional): If the best score is not improved within the number of iterations specified by this parameter,
                 the method returns the index of the feature with the current best score as the tipping-point feature.
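The trailing underscore in 'regression_' is not a typo: the method later
unpacks ml_category, ml_subcategory = ml_type.split("_"), which requires
exactly one underscore even when the subcategory is empty. For illustration:

    print("classification_bin".split("_"))  # ['classification', 'bin']
    print("regression_".split("_"))         # ['regression', '']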
@@ -147,35 +162,38 @@ class FeatureSelection:
         Returns:
             list: list of selected features
         """
-        n_features = self.X.shape[1]
-        if n_max >= n_features:
-            n_max = n_features-1 # The algorithm removes at least one feature
-        if k < n_max:
-            raise ValueError("The k parameter needs to be lower than the n_max parameter.")
+        if k is not None and k <= n_max:
+            raise ValueError("The k parameter needs to be greater than the n_max parameter.")
 
         # Select the k best features dependent on the type of ML task
-        ml_type = ml_type.split("_")
-        if ml_type[0] == "classification":
-            if ml_type[1] == "bin":
-                selector = SelectKBest(mutual_info_classif, k=k)
-            elif ml_type[1] == "multi":
-                selector = SelectKBest(f_classif, k=k)
-            else:
-                raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
-        elif ml_type[0] == "regression":
-            selector = SelectKBest(f_regression, k=k)
-        else:
-            raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
-
-        selector.fit(self.X, self.y)
-        cols_idxs = selector.get_support(indices=True)
-        self.X = self.X.iloc[:,cols_idxs]
-
-        print(self.X.columns)
+        ml_category, ml_subcategory = ml_type.split("_")
+        if k is not None:
+            if ml_category == "classification":
+                if ml_subcategory == "bin":
+                    selector = SelectKBest(mutual_info_classif, k=k)
+                elif ml_subcategory == "multi":
+                    selector = SelectKBest(f_classif, k=k)
+                else:
+                    raise ValueError("Unknown ML type: cannot recognize the ML classification subtype.")
+            elif ml_category == "regression":
+                selector = SelectKBest(f_regression, k=k)
+            else:
+                raise ValueError("Unknown ML type: must be either classification or regression.")
+
+            selector.fit(self.X, self.y)
+            cols_idxs = selector.get_support(indices=True)
+            self.X = self.X.iloc[:, cols_idxs]
+
+            print("All columns (after SelectKBest method):")
+            print(self.X.columns)
 
         # Sequential feature addition / removal
         n_features = self.X.shape[1]
+        if n_max >= n_features:
+            n_max = n_features-1  # The algorithm removes at least one feature
 
         if n_min > n_features:
             raise ValueError("The number of remaining features in the dataframe must be at least n_min+1.")
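A runnable sketch of the SelectKBest pre-filter that the commit now guards
behind `k is not None` (toy data; names are assumptions, not part of the
commit):

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.feature_selection import SelectKBest, mutual_info_classif

    X_arr, y = make_classification(n_samples=200, n_features=8, random_state=0)
    X = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(8)])

    selector = SelectKBest(mutual_info_classif, k=4).fit(X, y)
    cols_idxs = selector.get_support(indices=True)  # indices of the 4 kept columns
    X = X.iloc[:, cols_idxs]
    print(X.columns)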
@@ -190,17 +208,16 @@ class FeatureSelection:
             i_worse = 0
             for i in reversed(range(n_features)):
 
-                print("Iteration:", i+1)
-
                 if i+1 == n_min:
                     break
 
                 best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric)
+                    self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric)
 
                 feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
 
                 features.remove(best_feature)
+                print("Features left:", i)
 
                 if i <= n_max:
                     if best_metric_score >= best_score:
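For orientation, the loop above implements plain backward elimination. A
schematic, self-contained sketch of the same control flow (the
`score_without` helper is hypothetical):

    def backward_eliminate(features, score_without, n_min):
        importance = []  # (n features before the drop, dropped feature, score)
        while len(features) > n_min:
            best = max(features, key=lambda f: score_without(features, f))
            importance.append((len(features), best, score_without(features, best)))
            features = [f for f in features if f != best]
        return features, importance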
@@ -223,49 +240,6 @@ class FeatureSelection:
             selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
 
             return selected_features
-
-            """
-            # Feature selection criterion within the max-min range.
-            # E.g., pick the best score in this range. Or, keep going while the score in this range improves by 0.0X; once it stops, select that feature set.
-
-            # The feature set will be selected from i=1 up to i=<index of the chosen feature>
-
-            # The "tipping point" feature must lie within the max-min range
-            selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
-            selection_area.set_index(["i", "name"], inplace=True)
-            print(selection_area)
-            diffrences = selection_area.diff()
-            diffrences.dropna(how='any', inplace=True)
-            print(diffrences)
-
-            # Perhaps also a cumulative summation, where the index with the highest value is simply taken?
-            cumulative_sumation = diffrences.cumsum()
-            print(cumulative_sumation)
-            tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
-            print(tipping_feature_indx_1)
-
-
-            # A method that gives n_tolerance features the chance to beat the best score so far
-            tipping_feature_indx_2 = None
-            best_score = 0
-            i_worse = 0
-            for indx, row in selection_area.iterrows():
-                if row["metric"] > best_score:
-                    tipping_feature_indx_2 = indx
-                    best_score = row["metric"]
-                    i_worse = 0
-                else:
-                    i_worse += 1
-
-                if i_worse == n_tolerance:
-                    break
-
-            print(tipping_feature_indx_2)
-            selection_area.reset_index(inplace=True)
-            features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
-
-            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
-            """
 
         else:
             raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
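The deleted scratch block above experimented with two "tipping point"
criteria; the n_tolerance variant appears to survive in the kept loop (the
i_worse counter). A standalone sketch of that idea on made-up toy scores:

    scores = [0.61, 0.64, 0.71, 0.70, 0.69, 0.68]  # hypothetical per-step metric
    n_tolerance = 2

    best_idx, best_score, i_worse = None, float("-inf"), 0
    for idx, score in enumerate(scores):
        if score > best_score:
            best_idx, best_score, i_worse = idx, score, 0
        else:
            i_worse += 1
        if i_worse == n_tolerance:
            break

    print(best_idx, best_score)  # -> 2 0.71: step 2 is the tipping point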