From ce13a9e13bb8164240362fa86afac34944cfef39 Mon Sep 17 00:00:00 2001
From: Primoz
Date: Wed, 19 Apr 2023 15:56:34 +0200
Subject: [PATCH] Implement feature selection method which is used in ML pipeline.

---
 exploration/ml_pipeline.py            |  25 +++++-
 machine_learning/feature_selection.py | 122 ++++++++++++++++----------
 2 files changed, 100 insertions(+), 47 deletions(-)

diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py
index eeaa9b3..6d75385 100644
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@@ -26,24 +26,45 @@ if nb_dir not in sys.path:
 
 from machine_learning.cross_validation import CrossValidation
 from machine_learning.preprocessing import Preprocessing
+from machine_learning.feature_selection import FeatureSelection
 
 # %%
 df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 df.set_index(index_columns, inplace=True)
 
+# Create binary target
+bins = [-1, 0, 4] # bins for the stressfulness (0-4) target
+df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
+
+
+nan_cols = df.columns[df.isna().any()].tolist()
+df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
+
 cv = CrossValidation(data=df, cv_method="logo")
 
 categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
 interval_feature_list, other_feature_list = [], []
 
-print(df.columns.tolist())
-
+# %%
 for split in cv.get_splits():
     train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
     pre = Preprocessing(train_X, train_y, test_X, test_y)
     pre.one_hot_encode_train_and_test_sets(categorical_columns)
     train_X, train_y, test_X, test_y = pre.get_train_test_sets()
+
+    # train_X = train_X[train_X.columns[:30]]
+
+    # Feature selection on the train set
+    # Perhaps GroupKFold should be implemented instead of StratifiedKFold? >>
+    # >> That way each pid would end up in either the test set or the train set, never in both.
+    fs = FeatureSelection(train_X, train_y)
+    selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
+    print(selected_features)
+    print(len(selected_features))
+
+
+    break
 
 # %%
 
diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py
index 0080839..31a5e92 100644
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@@ -1,11 +1,13 @@
 import os
 import sys
+import warnings
 
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 
 from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.model_selection import cross_validate, StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso
 
@@ -21,11 +23,12 @@ from sklearn.linear_model import Lasso
 
 class FeatureSelection:
 
-    def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
-        pass # TODO....
+    def __init__(self, X, y):
+        self.X = X
+        self.y = y
 
-    def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
+    def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
         """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
         The "remove" method removes a particular feature and predicts on the test set without it.
The "add" method adds a particulat feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric @@ -56,18 +59,18 @@ class FeatureSelection: for feat in features: if method == "remove": - pred_features = [col for col in df.columns if feat != col] # All but feat + pred_features = [col for col in self.X.columns if feat != col] # All but feat elif method == "add": pred_features = [feat] + stored_features # Feat with stored features - X, y = df.drop(columns=['target', 'pid'])[pred_features], df['target'] + X = self.X[pred_features].copy() if ml_type == "classification": nb = GaussianNB() model_cv = cross_validate( nb, X=X, - y=y, + y=self.y, cv=StratifiedKFold(n_splits=5, shuffle=True), n_jobs=-1, scoring=('accuracy', 'precision', 'recall', 'f1') @@ -137,85 +140,114 @@ class FeatureSelection: return best_feature, best_metric_score, best_metric_score_std - def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10): + def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10): + """This method selects a set of features and returns them as a list. It returns number of features + determined in the interval of [n_min, n_max]. The best score is detected using a removal procedure. + The procedure sequentially removes the features that attribute the least to the choosen evaluation metric. + If in this sequence the score ML score is improved the next feature is remove otherwise there is a + tolerance criteria (n_not_improve) with which the next n remove features are inspected whether + currently best score is improved. The features are returned in specified interval as a list. + + Args: + n_min (int): Minimal amount of features returned. + n_max (int): Maximal amount of features returned. + method (str, optional): "remove" or "add" features. Defaults to "remove". + n_not_improve (int): If the best score is not improved in n that is specified by this parameter + the method returns index of feature with current best score as a tipping point feature. 
+
+        Returns:
+            list: list of selected features
+        """
 
-        n_features = df.shape[1] - 2 # -2 beacause pid and target are not considered
-        if n_max > n_features:
-            n_max = n_features
+        n_features = self.X.shape[1]
+        if n_max >= n_features:
+            n_max = n_features-1 # The algorithm removes at least one feature
 
         if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
+            raise ValueError("The number of features in the dataframe must be at least n_min + 1.")
         if n_max < n_min:
             raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
 
-        features = df.columns.tolist()
-        features.remove("pid")
-        features.remove("target")
+        features = self.X.columns.tolist()
         feature_importance = []
 
         if method == "remove":
+            best_score = 0
+            best_feature_indx = None
+            i_worse = 0
             for i in reversed(range(n_features)):
+                if i+1 == n_min:
+                    break
+
                 best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
-                feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
+                    self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
+
+                feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
 
                 features.remove(best_feature)
 
+                if i <= n_max:
+                    if best_metric_score >= best_score:
+                        best_score = best_metric_score
+                        best_feature_indx = i+1
+                        i_worse = 0
+                    else:
+                        i_worse += 1
+
+                    if i_worse == n_not_improve:
+                        break
+
         feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
+
+        print(feature_importance_df)
+        print("best_feature_indx", best_feature_indx)
+        print("best_score", best_score)
+
+        features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
+        selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
+        return selected_features
+
+        """
         # Feature selection criterion within the max-min range
         # E.g. pick the best score in this range. Or keep going while the score in this range improves by 0.0X; once it stops, take that feature set.
        # The feature set will be selected from i=1 up to i=index_of_the_selected_feature
         # The "tipping point" feature must lie within the max-min range
-
         selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
         selection_area.set_index(["i", "name"], inplace=True)
+        print(selection_area)
 
         diffrences = selection_area.diff()
         diffrences.dropna(how='any', inplace=True)
+        print(diffrences)
 
         # Perhaps also a cumulative summation? Where simply the index with the highest value is taken
         cumulative_sumation = diffrences.cumsum()
+        print(cumulative_sumation)
         tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
+        print(tipping_feature_indx_1)
 
-        # A very conservative method that stops looking for a better alternative at the first non-improvement of the result
-        tipping_feature_indx_2 = None
-        for indx, row in diffrences.iterrows():
-            if row["metric"] > 0:
-                tipping_feature_indx_2 = indx
-            else:
-                break
 
         # A method that gives n_not_improve features a chance to beat the best score so far
-        tipping_feature_indx_3 = None
-        cum_sum_score = 0
+        tipping_feature_indx_2 = None
+        best_score = 0
         i_worse = 0
-        # TODO: it might make sense to merge the diff, cumsum and scores columns ...
         for indx, row in selection_area.iterrows():
-            if row["metric"] > 0:
-                tipping_feature_indx_3 = indx
-                cum_sum_score += row["metric"]
+            if row["metric"] > best_score:
+                tipping_feature_indx_2 = indx
+                best_score = row["metric"]
                 i_worse = 0
             else:
                 i_worse += 1
 
             if i_worse == n_not_improve:
-                break
-
-
-
-
+                break
+        print(tipping_feature_indx_2)
+        selection_area.reset_index(inplace=True)
+        features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
-
-
-    def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
-        pass
-    def vizualize_feature_selection_process():
-        pass
-    def execute_feature_selection_step():
-        pass
\ No newline at end of file
+        selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
+        """
\ No newline at end of file
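
Below is a minimal usage sketch of the FeatureSelection class introduced in this patch, together with the GroupKFold variant that the pipeline comment raises. The synthetic dataframe, feature names, and participant ids are illustrative assumptions, not part of the diff; only the FeatureSelection import and the select_features signature come from the patch.

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold, cross_validate
from sklearn.naive_bayes import GaussianNB

from machine_learning.feature_selection import FeatureSelection

# Synthetic stand-in for the preprocessed train set (illustrative only).
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 40)),
                 columns=[f"feat_{i}" for i in range(40)])
y = pd.Series(rng.integers(0, 2, size=200), name="target")

# Sequential removal: keep between 20 and 30 features and stop once
# 3 consecutive removals fail to improve the recall score.
fs = FeatureSelection(X, y)
selected = fs.select_features(n_min=20, n_max=30, n_not_improve=3)
print(len(selected), selected[:5])

# GroupKFold alternative to the StratifiedKFold used inside select_best_feature:
# grouping by pid keeps each participant entirely in either the train or the test fold.
pids = rng.integers(0, 10, size=200)  # hypothetical participant ids
scores = cross_validate(GaussianNB(), X[selected], y,
                        cv=GroupKFold(n_splits=5), groups=pids,
                        scoring=("accuracy", "recall"))
print(scores["test_recall"].mean())

Grouping by pid in the outer cross_validate mirrors the leave-one-group-out idea already used by CrossValidation(cv_method="logo"), so no participant contributes rows to both the train and the test fold.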