Improve the feature selection method with validations etc.

ml_pipeline
Primoz 2023-04-20 13:26:20 +02:00
parent 0594993133
commit 259be708aa
1 changed files with 69 additions and 95 deletions

View File

@ -29,9 +29,9 @@ class FeatureSelection:
self.groups = groups self.groups = groups
def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]): def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
"""The method selects the best feature by testing the prediction on the feature set with or without the current feature. """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular
feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
specified as a parameter. specified as a parameter.
@ -39,7 +39,11 @@ class FeatureSelection:
df (DataFrame): Input data on which the predictions will be made. df (DataFrame): Input data on which the predictions will be made.
features (list): List of features to select the best/worst from features (list): List of features to select the best/worst from
method (str, optional): remove or add features. Defaults to "remove". method (str, optional): remove or add features. Defaults to "remove".
ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and metric. Defaults to "classification". ml_category (str, optional): Either classification or regression ml problem controls the ML algorithm and metric.
Defaults to "classification".
ml_subcategory (str, optional): In case of classification 'bin' for binary classification
and 'multi' for multiclass classification. For regression an empty string '' is sufficient.
Defaults to "bin".
metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall". metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
stored_features (list, optional): If method is 'add', stored features refer to the features that have been previously added. Defaults to []. stored_features (list, optional): If method is 'add', stored features refer to the features that have been previously added. Defaults to [].
@ -53,9 +57,25 @@ class FeatureSelection:
best_feature = None best_feature = None
if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']: # Validate the ML type against the specified metric
raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'") if ml_category == "classification":
elif ml_type == "regression" and metric not in ['r2']: if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
elif ml_subcategory == "multi":
ml_subcategory_error = False
if metric != "accuracy" and "_" in metric:
metric_s, metric_t = metric.split("_")
if metric_s not in ['accuracy', 'precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
ml_subcategory_error = True
else:
ml_subcategory_error = True
if ml_subcategory_error:
raise ValueError(""""Classification metric for multi-class classification must be specified precisely.
Available metric are: 'accuracy', 'precision', 'recall' and 'f1'.
Only accuracy must be specified as 'accuracy'.
For others please add appropriate suffixes: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'""")
elif ml_category == "regression" and metric not in ['r2']:
raise ValueError("Regression metric not recognized. Please choose 'r2'") raise ValueError("Regression metric not recognized. Please choose 'r2'")
for feat in features: for feat in features:
@ -73,7 +93,7 @@ class FeatureSelection:
# See link about scoring for multiclass classification # See link about scoring for multiclass classification
# http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/ # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
if ml_type == "classification": if ml_category == "classification":
nb = GaussianNB() nb = GaussianNB()
model_cv = cross_validate( model_cv = cross_validate(
nb, nb,
@ -85,18 +105,8 @@ class FeatureSelection:
scoring=(metric) scoring=(metric)
) )
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
metric_score = np.nanmean(model_cv[f'test_{metric}']) elif ml_category == "regression":
metric_score_std = np.nanstd(model_cv[f'test_{metric}'])
if not best_feature or (metric_score > best_metric_score):
best_feature = feat
best_metric_score = metric_score
best_metric_score_std = metric_score_std
elif ml_type == "regression":
lass = Lasso() lass = Lasso()
model_cv = cross_validate( model_cv = cross_validate(
lass, lass,
@ -108,17 +118,21 @@ class FeatureSelection:
scoring=('r2') scoring=('r2')
) )
if metric == "r2":
r2 = np.mean(model_cv['test_r2'])
r2_std = np.std(model_cv['test_r2'])
if not best_feature or (r2 > best_metric_score):
best_feature = feat
best_metric_score = r2
best_metric_score_std = r2_std
else: else:
raise ValueError("ML type not yet implemented!") raise ValueError("ML type not yet implemented!")
# Section of metrics' scores comparison.
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
metric_score = np.nanmean(model_cv["test_score"])
metric_score_std = np.nanstd(model_cv["test_score"])
if not best_feature or (metric_score > best_metric_score):
best_feature = feat
best_metric_score = metric_score
best_metric_score_std = metric_score_std
return best_feature, best_metric_score, best_metric_score_std return best_feature, best_metric_score, best_metric_score_std
@ -138,8 +152,9 @@ class FeatureSelection:
n_min (int, optional): Minimal amount of features returned. n_min (int, optional): Minimal amount of features returned.
n_max (int, optional): Maximal amount of features returned. n_max (int, optional): Maximal amount of features returned.
k (int, optional): Determines the k in the k-best features method. k (int, optional): Determines the k in the k-best features method.
If None, SelectKBest feature selection does not execute.
ml_type(str, optional): Type of ML problem. Currently implemented options: ml_type(str, optional): Type of ML problem. Currently implemented options:
classification_bin, classification_multi, and regression_ 'classification_bin', 'classification_multi', and 'regression_'
method (str, optional): "remove" or "add" features. Defaults to "remove". method (str, optional): "remove" or "add" features. Defaults to "remove".
n_tolerance (int, optional): If the best score is not improved within the number of iterations specified by this parameter, n_tolerance (int, optional): If the best score is not improved within the number of iterations specified by this parameter,
the method returns the index of the feature with the current best score as the tipping-point feature. the method returns the index of the feature with the current best score as the tipping-point feature.
@ -147,34 +162,37 @@ class FeatureSelection:
Returns: Returns:
list: list of selected features list: list of selected features
""" """
n_features = self.X.shape[1]
if n_max >= n_features: if k is not None and k <= n_max:
n_max = n_features-1 # The algorithm removes at least one feature raise ValueError("The k parameter needs to be greater than the n_max parameter.")
if k < n_max:
raise ValueError("The k parameter needs to be lower than the n_max parameter.")
# Select k-best feature dependent on the type of ML task # Select k-best feature dependent on the type of ML task
ml_type = ml_type.split("_") ml_category, ml_subcategory = ml_type.split("_")
if ml_type[0] == "classification":
if ml_type[1] == "bin": if k is not None:
selector = SelectKBest(mutual_info_classif, k=k) if ml_category == "classification":
elif ml_type[1] == "multi": if ml_subcategory== "bin":
selector = SelectKBest(f_classif, k=k) selector = SelectKBest(mutual_info_classif, k=k)
elif ml_subcategory== "multi":
selector = SelectKBest(f_classif, k=k)
else:
raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
elif ml_category == "regression":
selector = SelectKBest(f_regression, k=k)
else: else:
raise ValueError("Unknown ML type: cannot recognize ML classification subtype.") raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
elif ml_type[0] == "regression":
selector = SelectKBest(f_regression, k=k)
else:
raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
selector.fit(self.X, self.y) selector.fit(self.X, self.y)
cols_idxs = selector.get_support(indices=True) cols_idxs = selector.get_support(indices=True)
self.X = self.X.iloc[:,cols_idxs] self.X = self.X.iloc[:,cols_idxs]
print("All columns (after SelectKBest method):")
print(self.X.columns) print(self.X.columns)
# Sequential feature addition / removal # Sequential feature addition / removal
n_features = self.X.shape[1] n_features = self.X.shape[1]
if n_max >= n_features:
n_max = n_features-1 # The algorithm removes at least one feature
if n_min > n_features: if n_min > n_features:
raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.") raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")
@ -190,17 +208,16 @@ class FeatureSelection:
i_worse = 0 i_worse = 0
for i in reversed(range(n_features)): for i in reversed(range(n_features)):
print("Iteration:", i+1)
if i+1 == n_min: if i+1 == n_min:
break break
best_feature, best_metric_score, best_metric_score_std = \ best_feature, best_metric_score, best_metric_score_std = \
self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric) self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric)
feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std)) feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
features.remove(best_feature) features.remove(best_feature)
print("Features left:", i)
if i <= n_max: if i <= n_max:
if best_metric_score >= best_score: if best_metric_score >= best_score:
@ -224,48 +241,5 @@ class FeatureSelection:
return selected_features return selected_features
"""
# Selekcijski kriterij značilk v rangu max-min
# Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
# Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
# "Tipping point" značilka mora biti v rangu max-min
selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
selection_area.set_index(["i", "name"], inplace=True)
print(selection_area)
diffrences = selection_area.diff()
diffrences.dropna(how='any', inplace=True)
print(diffrences)
# Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo
cumulative_sumation = diffrences.cumsum()
print(cumulative_sumation)
tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
print(tipping_feature_indx_1)
# Metoda, ki pusti n_tolerance značilkam, da premagajo dosedajno najboljši score
tipping_feature_indx_2 = None
best_score = 0
i_worse = 0
for indx, row in selection_area.iterrows():
if row["metric"] > best_score:
tipping_feature_indx_2 = indx
best_score = row["metric"]
i_worse = 0
else:
i_worse += 1
if i_worse == n_tolerance:
break
print(tipping_feature_indx_2)
selection_area.reset_index(inplace=True)
features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
"""
else: else:
raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.") raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")