Implement feature selection method which is used in ML pipeline.
parent
10ca47583c
commit
ce13a9e13b
|
@ -26,24 +26,45 @@ if nb_dir not in sys.path:
|
|||
|
||||
from machine_learning.cross_validation import CrossValidation
|
||||
from machine_learning.preprocessing import Preprocessing
|
||||
from machine_learning.feature_selection import FeatureSelection
|
||||
|
||||
# %%
|
||||
df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
df.set_index(index_columns, inplace=True)
|
||||
|
||||
# Create binary target
|
||||
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||
df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
|
||||
|
||||
|
||||
nan_cols = df.columns[df.isna().any()].tolist()
|
||||
df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
|
||||
|
||||
cv = CrossValidation(data=df, cv_method="logo")
|
||||
|
||||
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
|
||||
interval_feature_list, other_feature_list = [], []
|
||||
|
||||
print(df.columns.tolist())
|
||||
|
||||
# %%
|
||||
for split in cv.get_splits():
|
||||
train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
|
||||
pre = Preprocessing(train_X, train_y, test_X, test_y)
|
||||
pre.one_hot_encode_train_and_test_sets(categorical_columns)
|
||||
train_X, train_y, test_X, test_y = pre.get_train_test_sets()
|
||||
|
||||
# train_X = train_X[train_X.columns[:30]]
|
||||
|
||||
# Feature selection on train set
|
||||
# Maybe implement GroupKFold instead of StratifiedKFold? >>
|
||||
# >> That way each individual pid will appear in either the test set or the train set
|
||||
fs = FeatureSelection(train_X, train_y)
|
||||
selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
|
||||
print(selected_features)
|
||||
print(len(selected_features))
|
||||
|
||||
|
||||
|
||||
break
|
||||
|
||||
# %%
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
import os
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
from sklearn.feature_selection import SequentialFeatureSelector
|
||||
from sklearn.model_selection import cross_validate, StratifiedKFold
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.linear_model import Lasso
|
||||
|
||||
|
@ -21,11 +23,12 @@ from sklearn.linear_model import Lasso
|
|||
|
||||
class FeatureSelection:
|
||||
|
||||
def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
|
||||
pass # TODO....
|
||||
def __init__(self, X, y):
|
||||
self.X = X
|
||||
self.y = y
|
||||
|
||||
|
||||
def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
|
||||
def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
|
||||
"""The method selects the best feature by testing the prediction on the feature set with or without the current feature.
|
||||
The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular
|
||||
feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
|
||||
|
@ -56,18 +59,18 @@ class FeatureSelection:
|
|||
|
||||
for feat in features:
|
||||
if method == "remove":
|
||||
pred_features = [col for col in df.columns if feat != col] # All but feat
|
||||
pred_features = [col for col in self.X.columns if feat != col] # All but feat
|
||||
elif method == "add":
|
||||
pred_features = [feat] + stored_features # Feat with stored features
|
||||
|
||||
X, y = df.drop(columns=['target', 'pid'])[pred_features], df['target']
|
||||
X = self.X[pred_features].copy()
|
||||
|
||||
if ml_type == "classification":
|
||||
nb = GaussianNB()
|
||||
model_cv = cross_validate(
|
||||
nb,
|
||||
X=X,
|
||||
y=y,
|
||||
y=self.y,
|
||||
cv=StratifiedKFold(n_splits=5, shuffle=True),
|
||||
n_jobs=-1,
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
|
@ -137,66 +140,57 @@ class FeatureSelection:
|
|||
return best_feature, best_metric_score, best_metric_score_std
|
||||
|
||||
|
||||
def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
|
||||
def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
|
||||
"""This method selects a set of features and returns them as a list. It returns number of features
|
||||
determined in the interval of [n_min, n_max]. The best score is detected using a removal procedure.
|
||||
The procedure sequentially removes the features that contribute the least to the chosen evaluation metric.
|
||||
If in this sequence the ML score is improved, the next feature is removed; otherwise there is a
|
||||
tolerance criterion (n_not_improve) with which the next n removed features are inspected to see whether the
|
||||
current best score is improved. The features are returned in the specified interval as a list.
|
||||
|
||||
n_features = df.shape[1] - 2 # -2 beacause pid and target are not considered
|
||||
if n_max > n_features:
|
||||
n_max = n_features
|
||||
Args:
|
||||
n_min (int): Minimal amount of features returned.
|
||||
n_max (int): Maximal amount of features returned.
|
||||
method (str, optional): "remove" or "add" features. Defaults to "remove".
|
||||
n_not_improve (int): If the best score is not improved in n that is specified by this parameter
|
||||
the method returns index of feature with current best score as a tipping point feature.
|
||||
|
||||
Returns:
|
||||
list: list of selected features
|
||||
"""
|
||||
|
||||
n_features = self.X.shape[1]
|
||||
if n_max >= n_features:
|
||||
n_max = n_features-1 # The algorithm removes at least one feature
|
||||
|
||||
if n_min > n_features:
|
||||
raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
|
||||
raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.")
|
||||
|
||||
if n_max < n_min:
|
||||
raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
|
||||
|
||||
features = df.columns.tolist()
|
||||
features.remove("pid")
|
||||
features.remove("target")
|
||||
features = self.X.columns.tolist()
|
||||
feature_importance = []
|
||||
if method == "remove":
|
||||
best_score = 0
|
||||
best_feature_indx = None
|
||||
i_worse = 0
|
||||
for i in reversed(range(n_features)):
|
||||
|
||||
if i+1 == n_min:
|
||||
break
|
||||
|
||||
best_feature, best_metric_score, best_metric_score_std = \
|
||||
self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
|
||||
feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
|
||||
self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
|
||||
|
||||
feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
|
||||
|
||||
features.remove(best_feature)
|
||||
|
||||
feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
|
||||
|
||||
# Selekcijski kriterij značilk v rangu max-min
|
||||
# Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
|
||||
|
||||
# Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
|
||||
|
||||
# "Tipping point" značilka mora biti v rangu max-min
|
||||
|
||||
selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
|
||||
selection_area.set_index(["i", "name"], inplace=True)
|
||||
diffrences = selection_area.diff()
|
||||
diffrences.dropna(how='any', inplace=True)
|
||||
|
||||
# Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo
|
||||
cumulative_sumation = diffrences.cumsum()
|
||||
tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
|
||||
|
||||
# Zelo konzervativna metoda, ki ob prvem neizboljšanjem rezultata preneha z iskanjem boljše alternative
|
||||
tipping_feature_indx_2 = None
|
||||
for indx, row in diffrences.iterrows():
|
||||
if row["metric"] > 0:
|
||||
tipping_feature_indx_2 = indx
|
||||
else:
|
||||
break
|
||||
|
||||
# Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score
|
||||
tipping_feature_indx_3 = None
|
||||
cum_sum_score = 0
|
||||
i_worse = 0
|
||||
# TODO: morda bi bilo smisleno združiti diff, cumsum in scores stolpce ...
|
||||
for indx, row in selection_area.iterrows():
|
||||
if row["metric"] > 0:
|
||||
tipping_feature_indx_3 = indx
|
||||
cum_sum_score += row["metric"]
|
||||
if i <= n_max:
|
||||
if best_metric_score >= best_score:
|
||||
best_score = best_metric_score
|
||||
best_feature_indx = i+1
|
||||
i_worse = 0
|
||||
else:
|
||||
i_worse += 1
|
||||
|
@ -204,18 +198,56 @@ class FeatureSelection:
|
|||
if i_worse == n_not_improve:
|
||||
break
|
||||
|
||||
feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
|
||||
|
||||
print(feature_importance_df)
|
||||
print("best_feature_indx", best_feature_indx)
|
||||
print("best_score", best_score)
|
||||
|
||||
features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
|
||||
selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
|
||||
|
||||
return selected_features
|
||||
|
||||
"""
|
||||
# Selekcijski kriterij značilk v rangu max-min
|
||||
# Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
|
||||
|
||||
# Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
|
||||
|
||||
# "Tipping point" značilka mora biti v rangu max-min
|
||||
selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
|
||||
selection_area.set_index(["i", "name"], inplace=True)
|
||||
print(selection_area)
|
||||
diffrences = selection_area.diff()
|
||||
diffrences.dropna(how='any', inplace=True)
|
||||
print(diffrences)
|
||||
|
||||
# Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo
|
||||
cumulative_sumation = diffrences.cumsum()
|
||||
print(cumulative_sumation)
|
||||
tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
|
||||
print(tipping_feature_indx_1)
|
||||
|
||||
|
||||
# Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score
|
||||
tipping_feature_indx_2 = None
|
||||
best_score = 0
|
||||
i_worse = 0
|
||||
for indx, row in selection_area.iterrows():
|
||||
if row["metric"] > best_score:
|
||||
tipping_feature_indx_2 = indx
|
||||
best_score = row["metric"]
|
||||
i_worse = 0
|
||||
else:
|
||||
i_worse += 1
|
||||
|
||||
if i_worse == n_not_improve:
|
||||
break
|
||||
|
||||
print(tipping_feature_indx_2)
|
||||
selection_area.reset_index(inplace=True)
|
||||
features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
|
||||
|
||||
|
||||
|
||||
def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
|
||||
pass
|
||||
|
||||
def vizualize_feature_selection_process():
|
||||
pass
|
||||
|
||||
def execute_feature_selection_step():
|
||||
pass
|
||||
selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
|
||||
"""
|
Loading…
Reference in New Issue