From 0594993133c58f68494639098b17bb397696580b Mon Sep 17 00:00:00 2001 From: Primoz Date: Thu, 20 Apr 2023 11:20:26 +0200 Subject: [PATCH] Add GroupKFold to feature selection CV. Start with generic metric calculation procedure. --- exploration/ml_pipeline.py | 14 +++--- machine_learning/cross_validation.py | 13 ++++-- machine_learning/feature_selection.py | 62 ++++++++++----------------- 3 files changed, 40 insertions(+), 49 deletions(-) diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py index bec82b2..a794e66 100644 --- a/exploration/ml_pipeline.py +++ b/exploration/ml_pipeline.py @@ -34,8 +34,8 @@ index_columns = ["local_segment", "local_segment_label", "local_segment_start_da df.set_index(index_columns, inplace=True) # Create binary target -bins = [-1, 0, 4] # bins for stressfulness (0-4) target -df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high'] +# bins = [-1, 0, 4] # bins for stressfulness (0-4) target +# df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high'] nan_cols = df.columns[df.isna().any()].tolist() @@ -58,10 +58,12 @@ for split in cv.get_splits(): # Feature selection on train set # Morda se implementira GroupKfold namesto stratifiedKFold? >> # >> Tako se bo posamezen pid pojavil ali v test ali v train setu - fs = FeatureSelection(train_X, train_y) - selected_features = fs.select_features(n_min=20, n_max=50, k=80, - ml_type="regression_", - n_tolerance=20) + train_groups, test_groups = cv.get_groups_sets(split) + + fs = FeatureSelection(train_X, train_y, train_groups) + selected_features = fs.select_features(n_min=20, n_max=50, k=60, + ml_type="classification_multi", + metric="f1", n_tolerance=20) print(selected_features) print(len(selected_features)) diff --git a/machine_learning/cross_validation.py b/machine_learning/cross_validation.py index e030a8f..4f7b9ef 100644 --- a/machine_learning/cross_validation.py +++ b/machine_learning/cross_validation.py @@ -49,8 +49,8 @@ class CrossValidation(): data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"] - elif self.cv_method == "5kfold": - data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"] + elif self.cv_method == "Stratified5kfold": + data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None self.X, self.y, self.groups = data_X, data_y, data_groups @@ -71,7 +71,7 @@ class CrossValidation(): if self.cv_method in ["logo", "half_logo"]: self.cv = LeaveOneGroupOut() - elif self.cv_method == "5kfold": + elif self.cv_method == "Stratified5kfold": self.cv = StratifiedKFold(n_splits=5, shuffle=True) @@ -118,4 +118,11 @@ class CrossValidation(): """ return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]] + def get_groups_sets(self, split): + + if self.groups is None: + return None, None + else: + return self.groups.iloc[split[0]], self.groups.iloc[split[1]] + \ No newline at end of file diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py index 32abd1f..f2cfc95 100644 --- a/machine_learning/feature_selection.py +++ b/machine_learning/feature_selection.py @@ -7,7 +7,7 @@ import matplotlib.pyplot as plt import pandas as pd from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression -from sklearn.model_selection import cross_validate, StratifiedKFold +from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import Lasso @@ -23,9 +23,10 @@ from sklearn.linear_model import Lasso class FeatureSelection: - def __init__(self, X, y): + def __init__(self, X, y, groups): self.X = X self.y = y + self.groups = groups def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]): @@ -65,55 +66,35 @@ class FeatureSelection: X = self.X[pred_features].copy() + if self.groups is not None: + cv = GroupKFold(n_splits=5) + else: + cv = StratifiedKFold(n_splits=5, shuffle=True) + + # See link about scoring for multiclassfication + # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/ if ml_type == "classification": nb = GaussianNB() model_cv = cross_validate( nb, X=X, y=self.y, - cv=StratifiedKFold(n_splits=5, shuffle=True), + cv=cv, + groups=self.groups, n_jobs=-1, - scoring=('accuracy', 'precision', 'recall', 'f1') + scoring=(metric) ) with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.") - if metric == "accuracy": - acc = np.mean(model_cv['test_accuracy']) - acc_std = np.std(model_cv['test_accuracy']) - - if not best_feature or (acc > best_metric_score): - best_feature = feat - best_metric_score = acc - best_metric_score_std = acc_std + metric_score = np.nanmean(model_cv[f'test_{metric}']) + metric_score_std = np.nanstd(model_cv[f'test_{metric}']) - elif metric == "precision": - prec = np.mean(model_cv['test_precision']) - prec_std = np.std(model_cv['test_precision']) - - if not best_feature or (prec > best_metric_score): - best_feature = feat - best_metric_score = prec - best_metric_score_std = prec_std - - elif metric == "recall": - rec = np.mean(model_cv['test_recall']) - rec_std = np.std(model_cv['test_recall']) - - if not best_feature or (rec > best_metric_score): - best_feature = feat - best_metric_score = rec - best_metric_score_std = rec_std - - else: - f1 = np.mean(model_cv['test_f1']) - f1_std = np.std(model_cv['test_f1']) - - if not best_feature or (f1 > best_metric_score): - best_feature = feat - best_metric_score = f1 - best_metric_score_std = f1_std + if not best_feature or (metric_score > best_metric_score): + best_feature = feat + best_metric_score = metric_score + best_metric_score_std = metric_score_std elif ml_type == "regression": lass = Lasso() @@ -121,7 +102,8 @@ class FeatureSelection: lass, X=X, y=y, - cv=StratifiedKFold(n_splits=5, shuffle=True), + cv=cv, + groups=self.groups, n_jobs=-1, scoring=('r2') ) @@ -214,7 +196,7 @@ class FeatureSelection: break best_feature, best_metric_score, best_metric_score_std = \ - self.select_best_feature(features, method=method, ml_type=ml_type[0], metric="recall") + self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric) feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))