Implement the feature selection method used in the ML pipeline.

ml_pipeline
Primoz 2023-04-19 15:56:34 +02:00
parent 10ca47583c
commit ce13a9e13b
2 changed files with 100 additions and 47 deletions

View File

@@ -26,24 +26,45 @@ if nb_dir not in sys.path:
from machine_learning.cross_validation import CrossValidation
from machine_learning.preprocessing import Preprocessing
from machine_learning.feature_selection import FeatureSelection
# %%
df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
df.set_index(index_columns, inplace=True)
# Create binary target
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True)  # alternative multi-class labels: ['low', 'medium', 'high']
nan_cols = df.columns[df.isna().any()].tolist()
df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
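# A minimal standalone sketch (synthetic values, not part of the pipeline) of
# what the pd.cut call above does: stressfulness 0 falls into the (-1, 0] bin
# and maps to class 0, while values 1-4 fall into (0, 4] and map to class 1.
demo = pd.Series([0, 1, 2, 4])
demo_binary = pd.cut(demo, bins=[-1, 0, 4], labels=[0, 1], right=True)
print(demo_binary.tolist())  # [0, 1, 1, 1]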
cv = CrossValidation(data=df, cv_method="logo")
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
interval_feature_list, other_feature_list = [], []
print(df.columns.tolist())
# %%
for split in cv.get_splits():
train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
pre = Preprocessing(train_X, train_y, test_X, test_y)
pre.one_hot_encode_train_and_test_sets(categorical_columns)
train_X, train_y, test_X, test_y = pre.get_train_test_sets()
# train_X = train_X[train_X.columns[:30]]
# Feature selection on train set
# Maybe implement GroupKFold instead of StratifiedKFold? >>
# >> That way each pid would appear in either the test or the train set, never both (see the sketch in the last cell below)
fs = FeatureSelection(train_X, train_y)
selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
print(selected_features)
print(len(selected_features))
break  # inspect only the first split for now
# %%
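# A minimal sketch of the GroupKFold idea raised in the comment above, assuming
# a per-row participant column (hypothetically named "pid" here) were still
# available: each participant lands entirely in either the train or the test fold.
from sklearn.model_selection import GroupKFold

gkf = GroupKFold(n_splits=5)
# Hypothetical usage -- df_pid stands for a frame that still carries a "pid" column:
# for tr_idx, te_idx in gkf.split(df_pid, df_pid["target"], groups=df_pid["pid"]):
#     assert set(df_pid["pid"].iloc[tr_idx]).isdisjoint(df_pid["pid"].iloc[te_idx])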

View File

@@ -1,11 +1,13 @@
import os
import sys
import warnings
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Lasso
@@ -21,11 +23,12 @@ from sklearn.linear_model import Lasso
class FeatureSelection:
def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
pass # TODO....
def __init__(self, X, y):
self.X = X
self.y = y
def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
"""The method selects the best feature by testing the prediction on the feature set with or without the current feature.
The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat
feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
@@ -56,18 +59,18 @@
for feat in features:
if method == "remove":
pred_features = [col for col in df.columns if feat != col] # All but feat
pred_features = [col for col in self.X.columns if feat != col] # All but feat
elif method == "add":
pred_features = [feat] + stored_features # Feat with stored features
X, y = df.drop(columns=['target', 'pid'])[pred_features], df['target']
X = self.X[pred_features].copy()
if ml_type == "classification":
nb = GaussianNB()
model_cv = cross_validate(
nb,
X=X,
y=y,
y=self.y,
cv=StratifiedKFold(n_splits=5, shuffle=True),
n_jobs=-1,
scoring=('accuracy', 'precision', 'recall', 'f1')
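# Note: with a tuple of scorers, cross_validate returns a dict whose keys
# include 'test_accuracy', 'test_precision', 'test_recall' and 'test_f1',
# each an array with one score per fold; the candidate feature is scored
# by the mean and std of the chosen metric.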
@@ -137,66 +140,104 @@ class FeatureSelection:
return best_feature, best_metric_score, best_metric_score_std
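# Illustrative call (train_X / train_y as produced by Preprocessing in the
# pipeline script; all names taken from the surrounding code):
# fs = FeatureSelection(train_X, train_y)
# best_feat, score, score_sd = fs.select_best_feature(
#     train_X.columns.tolist(), method="remove", ml_type="classification", metric="recall")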
def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
"""This method selects a set of features and returns them as a list. It returns number of features
determined in the interval of [n_min, n_max]. The best score is detected using a removal procedure.
The procedure sequentially removes the features that attribute the least to the choosen evaluation metric.
If in this sequence the score ML score is improved the next feature is remove otherwise there is a
tolerance criteria (n_not_improve) with which the next n remove features are inspected whether
currently best score is improved. The features are returned in specified interval as a list.
n_features = df.shape[1] - 2 # -2 because pid and target are not considered
if n_max > n_features:
n_max = n_features
Args:
n_min (int): Minimal amount of features returned.
n_max (int): Maximal amount of features returned.
method (str, optional): "remove" or "add" features. Defaults to "remove".
n_not_improve (int): If the best score is not improved within this number of removals,
the method takes the index of the feature with the current best score as the tipping-point feature.
Returns:
list: list of selected features
"""
n_features = self.X.shape[1]
if n_max >= n_features:
n_max = n_features-1 # The algorithm removes at least one feature
if n_min > n_features:
raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.")
if n_max < n_min:
raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
features = df.columns.tolist()
features.remove("pid")
features.remove("target")
features = self.X.columns.tolist()
feature_importance = []
if method == "remove":
best_score = 0
best_feature_indx = None
i_worse = 0
for i in reversed(range(n_features)):
if i+1 == n_min:
break
best_feature, best_metric_score, best_metric_score_std = \
self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
features.remove(best_feature)
if i <= n_max:
if best_metric_score >= best_score:
best_score = best_metric_score
best_feature_indx = i+1
i_worse = 0
else:
i_worse += 1
if i_worse == n_not_improve:
break
feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
print(feature_importance_df)
print("best_feature_indx", best_feature_indx)
print("best_score", best_score)
features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
return selected_features
"""
# Selekcijski kriterij značilk v rangu max-min
# Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
# Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
# "Tipping point" značilka mora biti v rangu max-min
selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
selection_area.set_index(["i", "name"], inplace=True)
print(selection_area)
differences = selection_area.diff()
differences.dropna(how='any', inplace=True)
print(differences)
# Perhaps also a cumulative sum, where the index with the highest value is simply taken?
cumulative_summation = differences.cumsum()
print(cumulative_summation)
tipping_feature_indx_1 = cumulative_summation.idxmax()["metric"]
print(tipping_feature_indx_1)
# A very conservative method that stops searching for a better alternative at the first non-improving result
tipping_feature_indx_2 = None
for indx, row in differences.iterrows():
if row["metric"] > 0:
tipping_feature_indx_2 = indx
else:
break
# A method that lets n_not_improve features try to beat the best score so far
tipping_feature_indx_3 = None
cum_sum_score = 0
tipping_feature_indx_2 = None
best_score = 0
i_worse = 0
# TODO: it might make sense to merge the diff, cumsum and scores columns ...
for indx, row in selection_area.iterrows():
if row["metric"] > 0:
tipping_feature_indx_3 = indx
cum_sum_score += row["metric"]
if row["metric"] > best_score:
tipping_feature_indx_2 = indx
best_score = row["metric"]
i_worse = 0
else:
i_worse += 1
@@ -204,18 +245,9 @@ class FeatureSelection:
if i_worse == n_not_improve:
break
print(tipping_feature_indx_2)
selection_area.reset_index(inplace=True)
features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
pass
def vizualize_feature_selection_process():
pass
def execute_feature_selection_step():
pass
selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
"""