diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py
index 6d75385..bec82b2 100644
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@@ -59,12 +59,12 @@ for split in cv.get_splits():
 
     # Maybe GroupKFold should be implemented instead of StratifiedKFold? >>
     #   >> That way each pid would appear in either the test set or the train set
 
     fs = FeatureSelection(train_X, train_y)
-    selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
+    selected_features = fs.select_features(n_min=20, n_max=50, k=80,
+                                           ml_type="regression_",
+                                           n_tolerance=20)
     print(selected_features)
     print(len(selected_features))
-
-    break
 
 # %%
diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py
index 31a5e92..32abd1f 100644
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@@ -6,7 +6,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
-from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
 from sklearn.model_selection import cross_validate, StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso
@@ -140,31 +140,62 @@ class FeatureSelection:
 
         return best_feature, best_metric_score, best_metric_score_std
 
-    def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
+    def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
         """This method selects a set of features and returns them as a list. It returns number of features
-        determined in the interval of [n_min, n_max]. The best score is detected using a removal procedure.
-        The procedure sequentially removes the features that attribute the least to the choosen evaluation metric.
-        If in this sequence the score ML score is improved the next feature is remove otherwise there is a
-        tolerance criteria (n_not_improve) with which the next n remove features are inspected whether
-        currently best score is improved. The features are returned in specified interval as a list.
+        determined in the interval of [n_min, n_max].
+
+        The method consists of two steps:
+        (1) The sklearn SelectKBest method selects the k best features, with the score function chosen by the ml_type parameter.
+        (2) A sequential feature-removal procedure is executed on the features remaining after (1).
+            The procedure sequentially removes the features that contribute the least to the chosen
+            evaluation metric. If removing a feature improves the ML score, the next feature is removed;
+            otherwise a tolerance criterion (n_tolerance) allows the next n removed features to be
+            inspected to check whether the current best score is improved.
 
         Args:
-            n_min (int): Minimal amount of features returned.
-            n_max (int): Maximal amount of features returned.
+            n_min (int, optional): Minimal number of features returned.
+            n_max (int, optional): Maximal number of features returned.
+            k (int, optional): Determines the k in the k-best features method.
+            ml_type (str, optional): Type of ML problem. Currently implemented options:
+                                     classification_bin, classification_multi, and regression_
             method (str, optional): "remove" or "add" features. Defaults to "remove".
-            n_not_improve (int): If the best score is not improved in n that is specified by this parameter
+            n_tolerance (int, optional): If the best score is not improved within the number of removals given by this parameter,
                 the method returns index of feature with current best score as a tipping point feature.
 
         Returns:
             list: list of selected features
         """
-        n_features = self.X.shape[1]
-
-        if n_max >= n_features:
-            n_max = n_features-1 # The algorithm removes at least one feature
-
+        if k < n_max:
+            raise ValueError("The k parameter needs to be greater than or equal to the n_max parameter.")
+
+        # Select the k best features, depending on the type of ML task
+        ml_type = ml_type.split("_")
+        if ml_type[0] == "classification":
+            if ml_type[1] == "bin":
+                selector = SelectKBest(mutual_info_classif, k=k)
+            elif ml_type[1] == "multi":
+                selector = SelectKBest(f_classif, k=k)
+            else:
+                raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
+        elif ml_type[0] == "regression":
+            selector = SelectKBest(f_regression, k=k)
+        else:
+            raise ValueError("Unknown ML type: must be either classification or regression.")
+
+        selector.fit(self.X, self.y)
+        cols_idxs = selector.get_support(indices=True)
+        self.X = self.X.iloc[:, cols_idxs]
+
+        print(self.X.columns)
+
+        # Sequential feature addition / removal
+        n_features = self.X.shape[1]
+
+        if n_max >= n_features:
+            n_max = n_features-1  # The algorithm removes at least one feature
+
         if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.")
+            raise ValueError("The number of remaining features in the dataframe must be at least n_min + 1.")
 
         if n_max < n_min:
             raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
@@ -177,11 +208,13 @@ class FeatureSelection:
         i_worse = 0
 
         for i in reversed(range(n_features)):
+            print("Iteration:", i+1)
+
             if i+1 == n_min:
                 break
 
             best_feature, best_metric_score, best_metric_score_std = \
-                self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
+                self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric)
 
             feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
 
@@ -195,7 +228,7 @@ class FeatureSelection:
             else:
                 i_worse += 1
 
-            if i_worse == n_not_improve:
+            if i_worse == n_tolerance:
                 break
 
         feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
@@ -230,7 +263,7 @@ class FeatureSelection:
 
         print(tipping_feature_indx_1)
 
-        # A method that gives n_not_improve features a chance to beat the best score so far
+        # A method that gives n_tolerance features a chance to beat the best score so far
        tipping_feature_indx_2 = None
        best_score = 0
        i_worse = 0
@@ -242,7 +275,7 @@ class FeatureSelection:
            else:
                i_worse += 1
 
-            if i_worse == n_not_improve:
+            if i_worse == n_tolerance:
                break
 
        print(tipping_feature_indx_2)
@@ -250,4 +283,7 @@ class FeatureSelection:
            features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
 
            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
-        """
\ No newline at end of file
+        """
+
+        else:
+            raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
\ No newline at end of file
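
Reviewer note (not part of the diff): a minimal standalone sketch of the k-best pre-filtering step that this commit introduces, using synthetic data and illustrative column names rather than anything from the project; the sequential removal loop of step (2) would then operate on the reduced frame.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Synthetic stand-in data; "feat_i" column names are illustrative only.
X_arr, y = make_classification(n_samples=200, n_features=30, n_informative=8, random_state=0)
X = pd.DataFrame(X_arr, columns=[f"feat_{i}" for i in range(X_arr.shape[1])])

# Step (1): keep the k best features (the classification_bin branch uses mutual_info_classif).
selector = SelectKBest(mutual_info_classif, k=15)
selector.fit(X, y)
X_reduced = X.iloc[:, selector.get_support(indices=True)]

# X_reduced is what step (2), the sequential removal loop, would iterate over.
print(X_reduced.columns.tolist())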