From ce13a9e13bb8164240362fa86afac34944cfef39 Mon Sep 17 00:00:00 2001 From: Primoz Date: Wed, 19 Apr 2023 15:56:34 +0200 Subject: [PATCH 1/6] Implement feature selection method which is used in ML pipeline. --- exploration/ml_pipeline.py | 25 +++++- machine_learning/feature_selection.py | 122 ++++++++++++++++---------- 2 files changed, 100 insertions(+), 47 deletions(-) diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py index eeaa9b3..6d75385 100644 --- a/exploration/ml_pipeline.py +++ b/exploration/ml_pipeline.py @@ -26,24 +26,45 @@ if nb_dir not in sys.path: from machine_learning.cross_validation import CrossValidation from machine_learning.preprocessing import Preprocessing +from machine_learning.feature_selection import FeatureSelection # %% df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv") index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] df.set_index(index_columns, inplace=True) +# Create binary target +bins = [-1, 0, 4] # bins for stressfulness (0-4) target +df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high'] + + +nan_cols = df.columns[df.isna().any()].tolist() +df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0)) + cv = CrossValidation(data=df, cv_method="logo") categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"] interval_feature_list, other_feature_list = [], [] -print(df.columns.tolist()) - +# %% for split in cv.get_splits(): train_X, train_y, test_X, test_y = cv.get_train_test_sets(split) pre = Preprocessing(train_X, train_y, test_X, test_y) pre.one_hot_encode_train_and_test_sets(categorical_columns) train_X, train_y, test_X, test_y = pre.get_train_test_sets() + + # train_X = train_X[train_X.columns[:30]] + + # Feature selection on train set + # Morda se implementira GroupKfold namesto stratifiedKFold? >> + # >> Tako se bo posamezen pid pojavil ali v test ali v train setu + fs = FeatureSelection(train_X, train_y) + selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3) + print(selected_features) + print(len(selected_features)) + + + break # %% diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py index 0080839..31a5e92 100644 --- a/machine_learning/feature_selection.py +++ b/machine_learning/feature_selection.py @@ -1,11 +1,13 @@ import os import sys +import warnings import numpy as np import matplotlib.pyplot as plt import pandas as pd from sklearn.feature_selection import SequentialFeatureSelector +from sklearn.model_selection import cross_validate, StratifiedKFold from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import Lasso @@ -21,11 +23,12 @@ from sklearn.linear_model import Lasso class FeatureSelection: - def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV? - pass # TODO.... + def __init__(self, X, y): + self.X = X + self.y = y - def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]): + def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]): """The method selects the best feature by testing the prediction on the feature set with or without the current feature. The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric @@ -56,18 +59,18 @@ class FeatureSelection: for feat in features: if method == "remove": - pred_features = [col for col in df.columns if feat != col] # All but feat + pred_features = [col for col in self.X.columns if feat != col] # All but feat elif method == "add": pred_features = [feat] + stored_features # Feat with stored features - X, y = df.drop(columns=['target', 'pid'])[pred_features], df['target'] + X = self.X[pred_features].copy() if ml_type == "classification": nb = GaussianNB() model_cv = cross_validate( nb, X=X, - y=y, + y=self.y, cv=StratifiedKFold(n_splits=5, shuffle=True), n_jobs=-1, scoring=('accuracy', 'precision', 'recall', 'f1') @@ -137,85 +140,114 @@ class FeatureSelection: return best_feature, best_metric_score, best_metric_score_std - def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10): + def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10): + """This method selects a set of features and returns them as a list. It returns number of features + determined in the interval of [n_min, n_max]. The best score is detected using a removal procedure. + The procedure sequentially removes the features that attribute the least to the choosen evaluation metric. + If in this sequence the score ML score is improved the next feature is remove otherwise there is a + tolerance criteria (n_not_improve) with which the next n remove features are inspected whether + currently best score is improved. The features are returned in specified interval as a list. + + Args: + n_min (int): Minimal amount of features returned. + n_max (int): Maximal amount of features returned. + method (str, optional): "remove" or "add" features. Defaults to "remove". + n_not_improve (int): If the best score is not improved in n that is specified by this parameter + the method returns index of feature with current best score as a tipping point feature. + + Returns: + list: list of selected features + """ - n_features = df.shape[1] - 2 # -2 beacause pid and target are not considered - if n_max > n_features: - n_max = n_features + n_features = self.X.shape[1] + if n_max >= n_features: + n_max = n_features-1 # The algorithm removes at least one feature if n_min > n_features: - raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.") + raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.") if n_max < n_min: raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.") - features = df.columns.tolist() - features.remove("pid") - features.remove("target") + features = self.X.columns.tolist() feature_importance = [] if method == "remove": + best_score = 0 + best_feature_indx = None + i_worse = 0 for i in reversed(range(n_features)): + if i+1 == n_min: + break + best_feature, best_metric_score, best_metric_score_std = \ - self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall") - feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std)) + self.select_best_feature(features, method=method, ml_type="classification", metric="recall") + + feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std)) features.remove(best_feature) + if i <= n_max: + if best_metric_score >= best_score: + best_score = best_metric_score + best_feature_indx = i+1 + i_worse = 0 + else: + i_worse += 1 + + if i_worse == n_not_improve: + break + feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd']) + + print(feature_importance_df) + print("best_feature_indx", best_feature_indx) + print("best_score", best_score) + + features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist() + selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove] + return selected_features + + """ # Selekcijski kriterij značilk v rangu max-min # Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk. # Set značilk se bo izbral od i=1 do i=index_izbrane_značilke # "Tipping point" značilka mora biti v rangu max-min - selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)] selection_area.set_index(["i", "name"], inplace=True) + print(selection_area) diffrences = selection_area.diff() diffrences.dropna(how='any', inplace=True) + print(diffrences) # Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo cumulative_sumation = diffrences.cumsum() + print(cumulative_sumation) tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"] + print(tipping_feature_indx_1) - # Zelo konzervativna metoda, ki ob prvem neizboljšanjem rezultata preneha z iskanjem boljše alternative - tipping_feature_indx_2 = None - for indx, row in diffrences.iterrows(): - if row["metric"] > 0: - tipping_feature_indx_2 = indx - else: - break # Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score - tipping_feature_indx_3 = None - cum_sum_score = 0 + tipping_feature_indx_2 = None + best_score = 0 i_worse = 0 - # TODO: morda bi bilo smisleno združiti diff, cumsum in scores stolpce ... for indx, row in selection_area.iterrows(): - if row["metric"] > 0: - tipping_feature_indx_3 = indx - cum_sum_score += row["metric"] + if row["metric"] > best_score: + tipping_feature_indx_2 = indx + best_score = row["metric"] i_worse = 0 else: i_worse += 1 if i_worse == n_not_improve: - break - - - - + break + print(tipping_feature_indx_2) + selection_area.reset_index(inplace=True) + features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist() - - - def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False): - pass - - def vizualize_feature_selection_process(): - pass - - def execute_feature_selection_step(): - pass \ No newline at end of file + selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove] + """ \ No newline at end of file From 1cbc743cf76544a1d5b4b2904d886d7374733110 Mon Sep 17 00:00:00 2001 From: Primoz Date: Thu, 20 Apr 2023 10:12:16 +0200 Subject: [PATCH 2/6] Add kBest method to initially filter out the worst performing features. Update comments. --- exploration/ml_pipeline.py | 6 +-- machine_learning/feature_selection.py | 70 ++++++++++++++++++++------- 2 files changed, 56 insertions(+), 20 deletions(-) diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py index 6d75385..bec82b2 100644 --- a/exploration/ml_pipeline.py +++ b/exploration/ml_pipeline.py @@ -59,12 +59,12 @@ for split in cv.get_splits(): # Morda se implementira GroupKfold namesto stratifiedKFold? >> # >> Tako se bo posamezen pid pojavil ali v test ali v train setu fs = FeatureSelection(train_X, train_y) - selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3) + selected_features = fs.select_features(n_min=20, n_max=50, k=80, + ml_type="regression_", + n_tolerance=20) print(selected_features) print(len(selected_features)) - - break # %% diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py index 31a5e92..32abd1f 100644 --- a/machine_learning/feature_selection.py +++ b/machine_learning/feature_selection.py @@ -6,7 +6,7 @@ import numpy as np import matplotlib.pyplot as plt import pandas as pd -from sklearn.feature_selection import SequentialFeatureSelector +from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression from sklearn.model_selection import cross_validate, StratifiedKFold from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import Lasso @@ -140,31 +140,62 @@ class FeatureSelection: return best_feature, best_metric_score, best_metric_score_std - def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10): + def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10): """This method selects a set of features and returns them as a list. It returns number of features - determined in the interval of [n_min, n_max]. The best score is detected using a removal procedure. - The procedure sequentially removes the features that attribute the least to the choosen evaluation metric. - If in this sequence the score ML score is improved the next feature is remove otherwise there is a - tolerance criteria (n_not_improve) with which the next n remove features are inspected whether - currently best score is improved. The features are returned in specified interval as a list. + determined in the interval of [n_min, n_max]. + + The method consists of two steps: + (1) The method uses sklearn kBest method which selects k best features dependent on the ml_type parameter. + (2) The sequential features removal procedure is executed. Using the remaing features from (1). + The best score is detected using a removal procedure. The procedure sequentially removes the features + that attribute the least to the choosen evaluation metric. If in this sequence the score ML score is + improved the next feature is remove otherwise there is a tolerance criteria (n_tolerance) + with which the next n removed features are inspected whether currently best score is improved. Args: - n_min (int): Minimal amount of features returned. - n_max (int): Maximal amount of features returned. + n_min (int, optional): Minimal amount of features returned. + n_max (int, optional): Maximal amount of features returned. + k (int, optional): Determines the k in the k-best features method. + ml_type(str, optional): Type of ML problem. Currently implemented options: + classification_bin, classification_multi, and regression_ method (str, optional): "remove" or "add" features. Defaults to "remove". - n_not_improve (int): If the best score is not improved in n that is specified by this parameter + n_tolerance (int, optional): If the best score is not improved in n that is specified by this parameter the method returns index of feature with current best score as a tipping point feature. Returns: list: list of selected features """ - n_features = self.X.shape[1] if n_max >= n_features: n_max = n_features-1 # The algorithm removes at least one feature + if k < n_max: + raise ValueError("The k parameter needs to be lower than the n_max parameter.") + + # Select k-best feature dependent on the type of ML task + ml_type = ml_type.split("_") + if ml_type[0] == "classification": + if ml_type[1] == "bin": + selector = SelectKBest(mutual_info_classif, k=k) + elif ml_type[1] == "multi": + selector = SelectKBest(f_classif, k=k) + else: + raise ValueError("Unknown ML type: cannot recognize ML classification subtype.") + elif ml_type[0] == "regression": + selector = SelectKBest(f_regression, k=k) + else: + raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.") + + selector.fit(self.X, self.y) + cols_idxs = selector.get_support(indices=True) + self.X = self.X.iloc[:,cols_idxs] + + print(self.X.columns) + + # Sequential feature addition / removal + n_features = self.X.shape[1] if n_min > n_features: - raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.") + raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.") if n_max < n_min: raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.") @@ -177,11 +208,13 @@ class FeatureSelection: i_worse = 0 for i in reversed(range(n_features)): + print("Iteration:", i+1) + if i+1 == n_min: break best_feature, best_metric_score, best_metric_score_std = \ - self.select_best_feature(features, method=method, ml_type="classification", metric="recall") + self.select_best_feature(features, method=method, ml_type=ml_type[0], metric="recall") feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std)) @@ -195,7 +228,7 @@ class FeatureSelection: else: i_worse += 1 - if i_worse == n_not_improve: + if i_worse == n_tolerance: break feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd']) @@ -230,7 +263,7 @@ class FeatureSelection: print(tipping_feature_indx_1) - # Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score + # Metoda, ki pusti n_tolerance značilkam, da premagajo dosedajno najboljši score tipping_feature_indx_2 = None best_score = 0 i_worse = 0 @@ -242,7 +275,7 @@ class FeatureSelection: else: i_worse += 1 - if i_worse == n_not_improve: + if i_worse == n_tolerance: break print(tipping_feature_indx_2) @@ -250,4 +283,7 @@ class FeatureSelection: features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist() selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove] - """ \ No newline at end of file + """ + + else: + raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.") \ No newline at end of file From 0594993133c58f68494639098b17bb397696580b Mon Sep 17 00:00:00 2001 From: Primoz Date: Thu, 20 Apr 2023 11:20:26 +0200 Subject: [PATCH 3/6] Add GroupKFold to feature selection CV. Start with generic metric calculation procedure. --- exploration/ml_pipeline.py | 14 +++--- machine_learning/cross_validation.py | 13 ++++-- machine_learning/feature_selection.py | 62 ++++++++++----------------- 3 files changed, 40 insertions(+), 49 deletions(-) diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py index bec82b2..a794e66 100644 --- a/exploration/ml_pipeline.py +++ b/exploration/ml_pipeline.py @@ -34,8 +34,8 @@ index_columns = ["local_segment", "local_segment_label", "local_segment_start_da df.set_index(index_columns, inplace=True) # Create binary target -bins = [-1, 0, 4] # bins for stressfulness (0-4) target -df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high'] +# bins = [-1, 0, 4] # bins for stressfulness (0-4) target +# df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high'] nan_cols = df.columns[df.isna().any()].tolist() @@ -58,10 +58,12 @@ for split in cv.get_splits(): # Feature selection on train set # Morda se implementira GroupKfold namesto stratifiedKFold? >> # >> Tako se bo posamezen pid pojavil ali v test ali v train setu - fs = FeatureSelection(train_X, train_y) - selected_features = fs.select_features(n_min=20, n_max=50, k=80, - ml_type="regression_", - n_tolerance=20) + train_groups, test_groups = cv.get_groups_sets(split) + + fs = FeatureSelection(train_X, train_y, train_groups) + selected_features = fs.select_features(n_min=20, n_max=50, k=60, + ml_type="classification_multi", + metric="f1", n_tolerance=20) print(selected_features) print(len(selected_features)) diff --git a/machine_learning/cross_validation.py b/machine_learning/cross_validation.py index e030a8f..4f7b9ef 100644 --- a/machine_learning/cross_validation.py +++ b/machine_learning/cross_validation.py @@ -49,8 +49,8 @@ class CrossValidation(): data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"] - elif self.cv_method == "5kfold": - data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"] + elif self.cv_method == "Stratified5kfold": + data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None self.X, self.y, self.groups = data_X, data_y, data_groups @@ -71,7 +71,7 @@ class CrossValidation(): if self.cv_method in ["logo", "half_logo"]: self.cv = LeaveOneGroupOut() - elif self.cv_method == "5kfold": + elif self.cv_method == "Stratified5kfold": self.cv = StratifiedKFold(n_splits=5, shuffle=True) @@ -118,4 +118,11 @@ class CrossValidation(): """ return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]] + def get_groups_sets(self, split): + + if self.groups is None: + return None, None + else: + return self.groups.iloc[split[0]], self.groups.iloc[split[1]] + \ No newline at end of file diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py index 32abd1f..f2cfc95 100644 --- a/machine_learning/feature_selection.py +++ b/machine_learning/feature_selection.py @@ -7,7 +7,7 @@ import matplotlib.pyplot as plt import pandas as pd from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression -from sklearn.model_selection import cross_validate, StratifiedKFold +from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import Lasso @@ -23,9 +23,10 @@ from sklearn.linear_model import Lasso class FeatureSelection: - def __init__(self, X, y): + def __init__(self, X, y, groups): self.X = X self.y = y + self.groups = groups def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]): @@ -65,55 +66,35 @@ class FeatureSelection: X = self.X[pred_features].copy() + if self.groups is not None: + cv = GroupKFold(n_splits=5) + else: + cv = StratifiedKFold(n_splits=5, shuffle=True) + + # See link about scoring for multiclassfication + # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/ if ml_type == "classification": nb = GaussianNB() model_cv = cross_validate( nb, X=X, y=self.y, - cv=StratifiedKFold(n_splits=5, shuffle=True), + cv=cv, + groups=self.groups, n_jobs=-1, - scoring=('accuracy', 'precision', 'recall', 'f1') + scoring=(metric) ) with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.") - if metric == "accuracy": - acc = np.mean(model_cv['test_accuracy']) - acc_std = np.std(model_cv['test_accuracy']) - - if not best_feature or (acc > best_metric_score): - best_feature = feat - best_metric_score = acc - best_metric_score_std = acc_std + metric_score = np.nanmean(model_cv[f'test_{metric}']) + metric_score_std = np.nanstd(model_cv[f'test_{metric}']) - elif metric == "precision": - prec = np.mean(model_cv['test_precision']) - prec_std = np.std(model_cv['test_precision']) - - if not best_feature or (prec > best_metric_score): - best_feature = feat - best_metric_score = prec - best_metric_score_std = prec_std - - elif metric == "recall": - rec = np.mean(model_cv['test_recall']) - rec_std = np.std(model_cv['test_recall']) - - if not best_feature or (rec > best_metric_score): - best_feature = feat - best_metric_score = rec - best_metric_score_std = rec_std - - else: - f1 = np.mean(model_cv['test_f1']) - f1_std = np.std(model_cv['test_f1']) - - if not best_feature or (f1 > best_metric_score): - best_feature = feat - best_metric_score = f1 - best_metric_score_std = f1_std + if not best_feature or (metric_score > best_metric_score): + best_feature = feat + best_metric_score = metric_score + best_metric_score_std = metric_score_std elif ml_type == "regression": lass = Lasso() @@ -121,7 +102,8 @@ class FeatureSelection: lass, X=X, y=y, - cv=StratifiedKFold(n_splits=5, shuffle=True), + cv=cv, + groups=self.groups, n_jobs=-1, scoring=('r2') ) @@ -214,7 +196,7 @@ class FeatureSelection: break best_feature, best_metric_score, best_metric_score_std = \ - self.select_best_feature(features, method=method, ml_type=ml_type[0], metric="recall") + self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric) feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std)) From 259be708aa8031fe3f5ef1cc6f60e5adf8ebef26 Mon Sep 17 00:00:00 2001 From: Primoz Date: Thu, 20 Apr 2023 13:26:20 +0200 Subject: [PATCH 4/6] Improve the feature selection method with validations etc. --- machine_learning/feature_selection.py | 164 +++++++++++--------------- 1 file changed, 69 insertions(+), 95 deletions(-) diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py index f2cfc95..8d7b950 100644 --- a/machine_learning/feature_selection.py +++ b/machine_learning/feature_selection.py @@ -29,9 +29,9 @@ class FeatureSelection: self.groups = groups - def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]): + def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]): """The method selects the best feature by testing the prediction on the feature set with or without the current feature. - The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat + The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric specified as a parameter. @@ -39,7 +39,11 @@ class FeatureSelection: df (DataFrame): Input data on which the predictions will be made. features (list): List of features to select the best/worst from method (str, optional): remove or add features. Defaults to "remove". - ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and metric. Defaults to "classification". + ml_category (str, optional): Either classification or regression ml problem controls the ML algorithm and metric. + Defaults to "classification". + ml_subcategory (str, optional): In case of classification '_bin' for binary classification + and 'multi' for multiclass classification. For regression an empty string '' is sufficient. + Defaults to "bin". metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall". stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to []. @@ -53,9 +57,25 @@ class FeatureSelection: best_feature = None - if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']: - raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'") - elif ml_type == "regression" and metric not in ['r2']: + # Validacije tipov ML in specificiranimi metrikami + if ml_category == "classification": + if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']: + raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'") + elif ml_subcategory == "multi": + ml_subcategory_error = False + if metric != "accuracy" and "_" in metric: + metric_s, metric_t = metric.split("_") + if metric_s not in ['accuracy', 'precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']: + ml_subcategory_error = True + else: + ml_subcategory_error = True + + if ml_subcategory_error: + raise ValueError(""""Classification metric for multi-class classification must be specified precisely. + Available metric are: 'accuracy', 'precision', 'recall' and 'f1'. + Only accuracy must be specified as 'accuracy'. + For others please add appropriate suffixes: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'""") + elif ml_category == "regression" and metric not in ['r2']: raise ValueError("Regression metric not recognized. Please choose 'r2'") for feat in features: @@ -73,7 +93,7 @@ class FeatureSelection: # See link about scoring for multiclassfication # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/ - if ml_type == "classification": + if ml_category == "classification": nb = GaussianNB() model_cv = cross_validate( nb, @@ -85,18 +105,8 @@ class FeatureSelection: scoring=(metric) ) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.") - - metric_score = np.nanmean(model_cv[f'test_{metric}']) - metric_score_std = np.nanstd(model_cv[f'test_{metric}']) - - if not best_feature or (metric_score > best_metric_score): - best_feature = feat - best_metric_score = metric_score - best_metric_score_std = metric_score_std - elif ml_type == "regression": + elif ml_category == "regression": lass = Lasso() model_cv = cross_validate( lass, @@ -108,16 +118,20 @@ class FeatureSelection: scoring=('r2') ) - if metric == "r2": - r2 = np.mean(model_cv['test_r2']) - r2_std = np.std(model_cv['test_r2']) - - if not best_feature or (r2 > best_metric_score): - best_feature = feat - best_metric_score = r2 - best_metric_score_std = r2_std else: raise ValueError("ML type not yet implemented!") + + # Section of metrics' scores comparison. + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.") + + metric_score = np.nanmean(model_cv["test_score"]) + metric_score_std = np.nanstd(model_cv["test_score"]) + + if not best_feature or (metric_score > best_metric_score): + best_feature = feat + best_metric_score = metric_score + best_metric_score_std = metric_score_std return best_feature, best_metric_score, best_metric_score_std @@ -137,9 +151,10 @@ class FeatureSelection: Args: n_min (int, optional): Minimal amount of features returned. n_max (int, optional): Maximal amount of features returned. - k (int, optional): Determines the k in the k-best features method. + k (int, optional): Determines the k in the k-best features method. + If None, SelectKBest feature selection does not execute. ml_type(str, optional): Type of ML problem. Currently implemented options: - classification_bin, classification_multi, and regression_ + 'classification_bin', 'classification_multi', and 'regression_' method (str, optional): "remove" or "add" features. Defaults to "remove". n_tolerance (int, optional): If the best score is not improved in n that is specified by this parameter the method returns index of feature with current best score as a tipping point feature. @@ -147,35 +162,38 @@ class FeatureSelection: Returns: list: list of selected features """ - n_features = self.X.shape[1] - if n_max >= n_features: - n_max = n_features-1 # The algorithm removes at least one feature - if k < n_max: - raise ValueError("The k parameter needs to be lower than the n_max parameter.") + + if k is not None and k <= n_max: + raise ValueError("The k parameter needs to be greater than the n_max parameter.") # Select k-best feature dependent on the type of ML task - ml_type = ml_type.split("_") - if ml_type[0] == "classification": - if ml_type[1] == "bin": - selector = SelectKBest(mutual_info_classif, k=k) - elif ml_type[1] == "multi": - selector = SelectKBest(f_classif, k=k) + ml_category, ml_subcategory = ml_type.split("_") + + if k is not None: + if ml_category == "classification": + if ml_subcategory== "bin": + selector = SelectKBest(mutual_info_classif, k=k) + elif ml_subcategory== "multi": + selector = SelectKBest(f_classif, k=k) + else: + raise ValueError("Unknown ML type: cannot recognize ML classification subtype.") + elif ml_category == "regression": + selector = SelectKBest(f_regression, k=k) else: - raise ValueError("Unknown ML type: cannot recognize ML classification subtype.") - elif ml_type[0] == "regression": - selector = SelectKBest(f_regression, k=k) - else: - raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.") - - selector.fit(self.X, self.y) - cols_idxs = selector.get_support(indices=True) - self.X = self.X.iloc[:,cols_idxs] + raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.") + + selector.fit(self.X, self.y) + cols_idxs = selector.get_support(indices=True) + self.X = self.X.iloc[:,cols_idxs] + print("All columns (after SelectKBest method):") print(self.X.columns) # Sequential feature addition / removal n_features = self.X.shape[1] - + if n_max >= n_features: + n_max = n_features-1 # The algorithm removes at least one feature + if n_min > n_features: raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.") @@ -190,17 +208,16 @@ class FeatureSelection: i_worse = 0 for i in reversed(range(n_features)): - print("Iteration:", i+1) - if i+1 == n_min: break best_feature, best_metric_score, best_metric_score_std = \ - self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric) + self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric) feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std)) features.remove(best_feature) + print("Features left:", i) if i <= n_max: if best_metric_score >= best_score: @@ -223,49 +240,6 @@ class FeatureSelection: selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove] return selected_features - - """ - # Selekcijski kriterij značilk v rangu max-min - # Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk. - - # Set značilk se bo izbral od i=1 do i=index_izbrane_značilke - - # "Tipping point" značilka mora biti v rangu max-min - selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)] - selection_area.set_index(["i", "name"], inplace=True) - print(selection_area) - diffrences = selection_area.diff() - diffrences.dropna(how='any', inplace=True) - print(diffrences) - - # Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo - cumulative_sumation = diffrences.cumsum() - print(cumulative_sumation) - tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"] - print(tipping_feature_indx_1) - - - # Metoda, ki pusti n_tolerance značilkam, da premagajo dosedajno najboljši score - tipping_feature_indx_2 = None - best_score = 0 - i_worse = 0 - for indx, row in selection_area.iterrows(): - if row["metric"] > best_score: - tipping_feature_indx_2 = indx - best_score = row["metric"] - i_worse = 0 - else: - i_worse += 1 - - if i_worse == n_tolerance: - break - - print(tipping_feature_indx_2) - selection_area.reset_index(inplace=True) - features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist() - - selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove] - """ else: raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.") \ No newline at end of file From 865225994b29b34dbbdeef71416cdd911bc6919f Mon Sep 17 00:00:00 2001 From: Primoz Date: Thu, 20 Apr 2023 13:29:14 +0200 Subject: [PATCH 5/6] Added testing section after feature selection. --- exploration/ml_pipeline.py | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py index a794e66..b6b3bb6 100644 --- a/exploration/ml_pipeline.py +++ b/exploration/ml_pipeline.py @@ -20,6 +20,9 @@ import numpy as np import matplotlib.pyplot as plt import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import recall_score, f1_score + nb_dir = os.path.split(os.getcwd())[0] if nb_dir not in sys.path: sys.path.append(nb_dir) @@ -34,8 +37,8 @@ index_columns = ["local_segment", "local_segment_label", "local_segment_start_da df.set_index(index_columns, inplace=True) # Create binary target -# bins = [-1, 0, 4] # bins for stressfulness (0-4) target -# df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high'] +bins = [-1, 0, 4] # bins for stressfulness (0-4) target +df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high'] nan_cols = df.columns[df.isna().any()].tolist() @@ -53,20 +56,38 @@ for split in cv.get_splits(): pre.one_hot_encode_train_and_test_sets(categorical_columns) train_X, train_y, test_X, test_y = pre.get_train_test_sets() - # train_X = train_X[train_X.columns[:30]] + + print(train_X.shape, test_X.shape) + # Predict before feature selection + rfc = RandomForestClassifier(n_estimators=10) + rfc.fit(train_X, train_y) + predictions = rfc.predict(test_X) + + print("Recall:", recall_score(test_y, predictions)) + print("F1:", f1_score(test_y, predictions)) # Feature selection on train set - # Morda se implementira GroupKfold namesto stratifiedKFold? >> - # >> Tako se bo posamezen pid pojavil ali v test ali v train setu train_groups, test_groups = cv.get_groups_sets(split) fs = FeatureSelection(train_X, train_y, train_groups) - selected_features = fs.select_features(n_min=20, n_max=50, k=60, - ml_type="classification_multi", - metric="f1", n_tolerance=20) + selected_features = fs.select_features(n_min=20, n_max=29, k=40, + ml_type="classification_bin", + metric="recall", n_tolerance=20) + + train_X = train_X[selected_features] + test_X = test_X[selected_features] + print(selected_features) print(len(selected_features)) + # Predict after feature selection + rfc = RandomForestClassifier(n_estimators=500) + rfc.fit(train_X, train_y) + predictions = rfc.predict(test_X) + + print("Recall:", recall_score(test_y, predictions)) + print("F1:", f1_score(test_y, predictions)) + break # %% From 26804cf8ea2e6582464ab482baf8c50dcefc7114 Mon Sep 17 00:00:00 2001 From: Primoz Date: Fri, 21 Apr 2023 13:24:31 +0200 Subject: [PATCH 6/6] Repair preprocessing one hot encoding of test set. --- machine_learning/preprocessing.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/machine_learning/preprocessing.py b/machine_learning/preprocessing.py index a11558c..1f55482 100644 --- a/machine_learning/preprocessing.py +++ b/machine_learning/preprocessing.py @@ -33,7 +33,7 @@ class Preprocessing: Args: categorical_features (DataFrame): DataFrame including only categorical columns. numerical_features (_type_): DataFrame including only numerical columns. - mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results + mode (int): Mode of the column with which DataFrame is filled. Returns: DataFrame: Hot-One Encoded DataFrame. @@ -46,7 +46,7 @@ class Preprocessing: if not categorical_features.empty: categorical_features = pd.get_dummies(categorical_features) - return pd.concat([numerical_features, categorical_features], axis=1) + return pd.concat([numerical_features, categorical_features], axis=1), categorical_features.columns.tolist() def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]): @@ -68,20 +68,27 @@ class Preprocessing: categorical_columns = [col for col in self.train_X.columns if col in categorical_columns] # For train set - train_X_categorical_features = self.train_X[categorical_columns].copy() train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1) mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0] - self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features) + self.train_X, train_cat_col_names = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features) + encoded_categorical_features = [col for col in self.train_X.columns if col.startswith(tuple(categorical_columns))] # For test set - test_X_categorical_features = self.test_X[categorical_columns].copy() test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1) - self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features) + self.test_X, test_cat_col_names = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features) + # Create categorical columns that were not found in test set and fill them with 0 + missing_cols = [col for col in train_cat_col_names if col not in test_cat_col_names] + self.test_X[missing_cols] = 0 + + # Sort column names alphabetically + self.train_X = self.train_X.reindex(sorted(self.train_X.columns), axis=1) + self.test_X = self.test_X.reindex(sorted(self.test_X.columns), axis=1) + def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):