From 259be708aa8031fe3f5ef1cc6f60e5adf8ebef26 Mon Sep 17 00:00:00 2001
From: Primoz
Date: Thu, 20 Apr 2023 13:26:20 +0200
Subject: [PATCH] Improve the feature selection method with validations etc.

---
 machine_learning/feature_selection.py | 164 +++++++++++---------------
 1 file changed, 69 insertions(+), 95 deletions(-)

diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py
index f2cfc95..8d7b950 100644
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@@ -29,9 +29,9 @@ class FeatureSelection:
 
         self.groups = groups
 
-    def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
+    def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
         """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
-        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat
+        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular
         feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
         specified as a parameter.
 
@@ -39,7 +39,11 @@ class FeatureSelection:
             df (DataFrame): Input data on which the predictions will be made.
             features (list): List of features to select the best/worst from.
             method (str, optional): Remove or add features. Defaults to "remove".
-            ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and metric. Defaults to "classification".
+            ml_category (str, optional): Either a classification or a regression ML problem; controls the ML algorithm and metric.
+                Defaults to "classification".
+            ml_subcategory (str, optional): For classification, 'bin' for binary classification
+                and 'multi' for multiclass classification. For regression an empty string '' is sufficient.
+                Defaults to "bin".
             metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
             stored_features (list, optional): If method is 'add', stored features refer to the features
                 that have previously been added. Defaults to [].
@@ -53,9 +57,25 @@ class FeatureSelection:
 
         best_feature = None
 
-        if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
-            raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
-        elif ml_type == "regression" and metric not in ['r2']:
+        # Validate the ML category and subcategory against the specified metric.
+        if ml_category == "classification":
+            if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
+                raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
+            elif ml_subcategory == "multi":
+                ml_subcategory_error = False
+                if metric != "accuracy":
+                    if "_" in metric:
+                        metric_s, metric_t = metric.split("_")
+                        if metric_s not in ['precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
+                            ml_subcategory_error = True
+                    else:
+                        ml_subcategory_error = True
+
+                if ml_subcategory_error:
+                    raise ValueError("""Classification metric for multi-class classification must be specified precisely.
+                        Available metrics are: 'accuracy', 'precision', 'recall' and 'f1'.
+                        Accuracy must be specified simply as 'accuracy'.
+                        For the others, please add the appropriate suffix: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'.""")
+        elif ml_category == "regression" and metric not in ['r2']:
             raise ValueError("Regression metric not recognized. Please choose 'r2'")
 
         for feat in features:
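A minimal sketch (not part of the patch, with hypothetical toy data) of the scorer-string convention the validation above enforces: suffixed names such as 'f1_macro' or 'recall_weighted' are valid scikit-learn scoring strings for multi-class problems, while plain 'accuracy' takes no suffix.

    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_validate
    from sklearn.naive_bayes import GaussianNB

    # Hypothetical three-class toy data, used only to exercise the metric names.
    X, y = make_classification(n_samples=200, n_classes=3, n_informative=4, random_state=0)

    # 'f1_macro' passes the patch's multi-class validation and is a real
    # scikit-learn scorer string; bare 'f1' would be rejected for multi-class.
    scores = cross_validate(GaussianNB(), X, y, cv=3, scoring="f1_macro")
    print(scores["test_score"].mean())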
@@ -73,7 +93,7 @@
 
             # See the link below about scoring for multiclass classification:
             # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
-            if ml_type == "classification":
+            if ml_category == "classification":
                 nb = GaussianNB()
                 model_cv = cross_validate(
                     nb,
@@ -85,18 +105,8 @@
                     scoring=(metric)
                 )
 
-                with warnings.catch_warnings():
-                    warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
-
-                    metric_score = np.nanmean(model_cv[f'test_{metric}'])
-                    metric_score_std = np.nanstd(model_cv[f'test_{metric}'])
-
-                    if not best_feature or (metric_score > best_metric_score):
-                        best_feature = feat
-                        best_metric_score = metric_score
-                        best_metric_score_std = metric_score_std
-            elif ml_type == "regression":
+            elif ml_category == "regression":
                 lass = Lasso()
                 model_cv = cross_validate(
                     lass,
@@ -108,16 +118,20 @@
                     scoring=('r2')
                 )
 
-                if metric == "r2":
-                    r2 = np.mean(model_cv['test_r2'])
-                    r2_std = np.std(model_cv['test_r2'])
-
-                    if not best_feature or (r2 > best_metric_score):
-                        best_feature = feat
-                        best_metric_score = r2
-                        best_metric_score_std = r2_std
             else:
                 raise ValueError("ML type not yet implemented!")
+
+            # Compare the metric scores across features.
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
+
+                metric_score = np.nanmean(model_cv["test_score"])
+                metric_score_std = np.nanstd(model_cv["test_score"])
+
+                if not best_feature or (metric_score > best_metric_score):
+                    best_feature = feat
+                    best_metric_score = metric_score
+                    best_metric_score_std = metric_score_std
 
         return best_feature, best_metric_score, best_metric_score_std
 
@@ -137,9 +151,10 @@
         Args:
             n_min (int, optional): Minimal amount of features returned.
             n_max (int, optional): Maximal amount of features returned.
-            k (int, optional): Determines the k in the k-best features method.
+            k (int, optional): Determines the k in the k-best features method.
+                If None, the SelectKBest feature selection step is skipped.
             ml_type (str, optional): Type of ML problem. Currently implemented options:
-                classification_bin, classification_multi, and regression_
+                'classification_bin', 'classification_multi', and 'regression_'
             method (str, optional): "remove" or "add" features. Defaults to "remove".
             n_tolerance (int, optional): If the best score does not improve over n consecutive iterations, where n is given
                 by this parameter, the method stops and treats the feature with the current best score as the tipping point.
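A short sketch (not part of the patch, toy data hypothetical) of the cross_validate behavior the unified comparison block above relies on: when scoring is a single string, the scores are keyed 'test_score', which is why the patch replaces model_cv[f'test_{metric}'] with model_cv["test_score"]; a list of scorers would instead yield 'test_<metric>' keys.

    from sklearn.datasets import make_regression
    from sklearn.linear_model import Lasso
    from sklearn.model_selection import cross_validate

    # Hypothetical toy regression data, used only to show the result keys.
    X, y = make_regression(n_samples=100, n_features=5, noise=0.5, random_state=0)

    single = cross_validate(Lasso(), X, y, cv=3, scoring="r2")
    multi = cross_validate(Lasso(), X, y, cv=3, scoring=["r2"])
    print(sorted(single))  # ['fit_time', 'score_time', 'test_score']
    print(sorted(multi))   # ['fit_time', 'score_time', 'test_r2']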
@@ -147,35 +162,38 @@
 
         Returns:
             list: list of selected features
         """
-        n_features = self.X.shape[1]
-        if n_max >= n_features:
-            n_max = n_features-1  # The algorithm removes at least one feature
 
-        if k < n_max:
-            raise ValueError("The k parameter needs to be lower than the n_max parameter.")
+
+        if k is not None and k <= n_max:
+            raise ValueError("The k parameter needs to be greater than the n_max parameter.")
 
         # Select the k best features, depending on the type of ML task
-        ml_type = ml_type.split("_")
-        if ml_type[0] == "classification":
-            if ml_type[1] == "bin":
-                selector = SelectKBest(mutual_info_classif, k=k)
-            elif ml_type[1] == "multi":
-                selector = SelectKBest(f_classif, k=k)
+        ml_category, ml_subcategory = ml_type.split("_")
+
+        if k is not None:
+            if ml_category == "classification":
+                if ml_subcategory == "bin":
+                    selector = SelectKBest(mutual_info_classif, k=k)
+                elif ml_subcategory == "multi":
+                    selector = SelectKBest(f_classif, k=k)
+                else:
+                    raise ValueError("Unknown ML type: cannot recognize the ML classification subtype.")
+            elif ml_category == "regression":
+                selector = SelectKBest(f_regression, k=k)
             else:
-                raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
-        elif ml_type[0] == "regression":
-            selector = SelectKBest(f_regression, k=k)
-        else:
-            raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
-
-        selector.fit(self.X, self.y)
-        cols_idxs = selector.get_support(indices=True)
-        self.X = self.X.iloc[:,cols_idxs]
+                raise ValueError("Unknown ML type: must be either classification or regression.")
+
+            selector.fit(self.X, self.y)
+            cols_idxs = selector.get_support(indices=True)
+            self.X = self.X.iloc[:, cols_idxs]
+
         print("All columns (after SelectKBest method):")
         print(self.X.columns)
 
         # Sequential feature addition / removal
         n_features = self.X.shape[1]
-
+        if n_max >= n_features:
+            n_max = n_features-1  # The algorithm removes at least one feature
+
         if n_min > n_features:
             raise ValueError("The number of remaining features in the dataframe must be at least n_min.")
 
@@ -190,17 +208,16 @@
         i_worse = 0
 
         for i in reversed(range(n_features)):
-            print("Iteration:", i+1)
-
             if i+1 == n_min:
                 break
 
             best_feature, best_metric_score, best_metric_score_std = \
-                self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric)
+                self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric)
 
             feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
             features.remove(best_feature)
+            print("Features left:", i)
 
             if i <= n_max:
                 if best_metric_score >= best_score:
@@ -223,49 +240,6 @@
             selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
 
             return selected_features
-
-            """
-            # Feature selection criterion within the max-min range
-            # E.g., choosing the best score in this range. Or: as long as the score in this range keeps increasing by 0.0X; once it stops, select that feature set.
-
-            # The feature set will be selected from i=1 to i=index_of_the_selected_feature
-
-            # The "tipping point" feature must lie within the max-min range
-            selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
-            selection_area.set_index(["i", "name"], inplace=True)
-            print(selection_area)
-            diffrences = selection_area.diff()
-            diffrences.dropna(how='any', inplace=True)
-            print(diffrences)
-
-            # Maybe also a cumulative summation, where the index with the highest value is simply taken?
-            cumulative_sumation = diffrences.cumsum()
-            print(cumulative_sumation)
-            tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
-            print(tipping_feature_indx_1)
-
-
-            # A method that gives n_tolerance features a chance to beat the best score so far
-            tipping_feature_indx_2 = None
-            best_score = 0
-            i_worse = 0
-            for indx, row in selection_area.iterrows():
-                if row["metric"] > best_score:
-                    tipping_feature_indx_2 = indx
-                    best_score = row["metric"]
-                    i_worse = 0
-                else:
-                    i_worse += 1
-
-                if i_worse == n_tolerance:
-                    break
-
-            print(tipping_feature_indx_2)
-            selection_area.reset_index(inplace=True)
-            features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
-
-            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
-            """
         else:
             raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
\ No newline at end of file
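A hedged usage sketch of the patched class. The constructor signature is an assumption (the patch only shows its tail setting self.groups), the metric keyword on select_features is inferred from its internal use, and the input file and column names are hypothetical.

    import pandas as pd
    from machine_learning.feature_selection import FeatureSelection

    df = pd.read_csv("features.csv")         # hypothetical input file
    X = df.drop(columns=["target", "pid"])   # hypothetical column names
    y = df["target"]
    groups = df["pid"]

    fs = FeatureSelection(X, y, groups)      # assumed constructor signature
    # k=40 triggers the SelectKBest pre-filter (k must exceed n_max);
    # 'classification_multi' routes to f_classif and the suffixed metrics.
    selected = fs.select_features(
        n_min=5, n_max=20, k=40,
        ml_type="classification_multi",
        metric="f1_macro", method="remove", n_tolerance=10,
    )
    print(selected)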