Implement feature selection method which is used in ML pipeline.
parent 10ca47583c
commit ce13a9e13b
@@ -26,24 +26,45 @@ if nb_dir not in sys.path:
 
 from machine_learning.cross_validation import CrossValidation
 from machine_learning.preprocessing import Preprocessing
+from machine_learning.feature_selection import FeatureSelection
 
 # %%
 df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 df.set_index(index_columns, inplace=True)
 
+# Create binary target
+bins = [-1, 0, 4]  # bins for stressfulness (0-4) target
+df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True)  # ['low', 'medium', 'high']
 
+nan_cols = df.columns[df.isna().any()].tolist()
+df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
 
 cv = CrossValidation(data=df, cv_method="logo")
 
 categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
 interval_feature_list, other_feature_list = [], []
 
-print(df.columns.tolist())
+# %%
 
 for split in cv.get_splits():
     train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
     pre = Preprocessing(train_X, train_y, test_X, test_y)
     pre.one_hot_encode_train_and_test_sets(categorical_columns)
     train_X, train_y, test_X, test_y = pre.get_train_test_sets()
 
+    # train_X = train_X[train_X.columns[:30]]
 
+    # Feature selection on train set
+    # Maybe implement GroupKFold instead of StratifiedKFold? >>
+    # >> That way each pid will appear in either the test set or the train set
+    fs = FeatureSelection(train_X, train_y)
+    selected_features = fs.select_features(n_min=20, n_max=60, n_not_improve=3)
+    print(selected_features)
+    print(len(selected_features))
 
     break
 
 # %%
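A quick standalone illustration of the binning above (the ratings below are invented, not taken from the dataset): with bins=[-1, 0, 4] and right=True, pd.cut maps a stressfulness rating of 0 to class 0 and ratings 1-4 to class 1.

import pandas as pd

ratings = pd.Series([0, 1, 3, 0, 4])  # hypothetical stressfulness ratings on the 0-4 scale
binary, edges = pd.cut(ratings, bins=[-1, 0, 4], labels=[0, 1], retbins=True, right=True)
print(binary.tolist())  # [0, 1, 1, 0, 1]
print(edges)            # [-1.  0.  4.]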
@@ -1,11 +1,13 @@
 import os
 import sys
+import warnings
 
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 
 from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.model_selection import cross_validate, StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso
 
@@ -21,11 +23,12 @@ from sklearn.linear_model import Lasso
 
 class FeatureSelection:
 
-    def __init__(self, X_train, X_test, y_train, y_test):  # TODO: what about leave-one-subject-out CV?
-        pass  # TODO....
+    def __init__(self, X, y):
+        self.X = X
+        self.y = y
 
 
-    def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
+    def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
         """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
         The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular
         feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
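A minimal usage sketch of the two modes described in this docstring, following the notebook at the top; the feature names below are invented and only serve as placeholders.

# fs = FeatureSelection(train_X, train_y), as constructed in the notebook above

# "remove" mode: score the classifier with each candidate feature left out in turn
best_feat, best_recall, best_recall_std = fs.select_best_feature(
    train_X.columns.tolist(), method="remove", ml_type="classification", metric="recall")

# "add" mode: score each candidate feature on top of an already accepted set
best_feat, best_recall, best_recall_std = fs.select_best_feature(
    ["acc_mean", "gps_entropy"], method="add", ml_type="classification", metric="recall",
    stored_features=["hr_mean", "steps_sum"])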
@@ -56,18 +59,18 @@ class FeatureSelection:
 
         for feat in features:
             if method == "remove":
-                pred_features = [col for col in df.columns if feat != col]  # All but feat
+                pred_features = [col for col in self.X.columns if feat != col]  # All but feat
             elif method == "add":
                 pred_features = [feat] + stored_features  # Feat with stored features
 
-            X, y = df.drop(columns=['target', 'pid'])[pred_features], df['target']
+            X = self.X[pred_features].copy()
 
             if ml_type == "classification":
                 nb = GaussianNB()
                 model_cv = cross_validate(
                     nb,
                     X=X,
-                    y=y,
+                    y=self.y,
                     cv=StratifiedKFold(n_splits=5, shuffle=True),
                     n_jobs=-1,
                     scoring=('accuracy', 'precision', 'recall', 'f1')
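The notebook comment above suggests swapping StratifiedKFold for GroupKFold so that each participant (pid) ends up in either the train folds or the test fold, never both. A minimal sketch of that variant, assuming a groups vector of pids aligned with X is available; this is not part of the commit.

from sklearn.model_selection import GroupKFold, cross_validate
from sklearn.naive_bayes import GaussianNB

def score_with_groups(X, y, groups):
    # groups holds one pid per row of X; GroupKFold keeps all rows of a pid in a single fold
    return cross_validate(
        GaussianNB(),
        X=X,
        y=y,
        groups=groups,
        cv=GroupKFold(n_splits=5),
        n_jobs=-1,
        scoring=('accuracy', 'precision', 'recall', 'f1'),
    )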
@@ -137,66 +140,57 @@ class FeatureSelection:
         return best_feature, best_metric_score, best_metric_score_std
 
 
-    def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
+    def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
+        """This method selects a set of features and returns them as a list. The number of features returned
+        lies in the interval [n_min, n_max]. The best score is detected using a removal procedure.
+        The procedure sequentially removes the features that contribute the least to the chosen evaluation metric.
+        If the ML score improves in this sequence, the next feature is removed; otherwise a
+        tolerance criterion (n_not_improve) controls for how many further removed features it is checked
+        whether the currently best score improves. The selected features are returned as a list.
 
-        n_features = df.shape[1] - 2  # -2 because pid and target are not considered
-        if n_max > n_features:
-            n_max = n_features
+        Args:
+            n_min (int): Minimal amount of features returned.
+            n_max (int): Maximal amount of features returned.
+            method (str, optional): "remove" or "add" features. Defaults to "remove".
+            n_not_improve (int): If the best score is not improved within this many further removals,
+                the method keeps the index of the feature with the current best score as a tipping point feature.
 
+        Returns:
+            list: list of selected features
+        """
 
+        n_features = self.X.shape[1]
+        if n_max >= n_features:
+            n_max = n_features-1  # The algorithm removes at least one feature
 
         if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
+            raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.")
 
         if n_max < n_min:
             raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
 
-        features = df.columns.tolist()
-        features.remove("pid")
-        features.remove("target")
+        features = self.X.columns.tolist()
         feature_importance = []
         if method == "remove":
+            best_score = 0
+            best_feature_indx = None
+            i_worse = 0
             for i in reversed(range(n_features)):
 
+                if i+1 == n_min:
+                    break
 
                 best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
-                feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
+                    self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
+                feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
 
                 features.remove(best_feature)
 
-                feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
+                if i <= n_max:
+                    if best_metric_score >= best_score:
+                        best_score = best_metric_score
+                        best_feature_indx = i+1
 
-        # Selection criterion for features in the max-min range
-        # E.g. choose the best score in this range. Or, as long as the score in this range keeps increasing by 0.0X; when it no longer does, choose that feature set.
 
-        # The feature set will be selected from i=1 to i=<index of the chosen feature>
 
-        # The "tipping point" feature must be in the max-min range
 
-        selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
-        selection_area.set_index(["i", "name"], inplace=True)
-        diffrences = selection_area.diff()
-        diffrences.dropna(how='any', inplace=True)
 
-        # Maybe also a cumulative summation? Where simply the index with the highest value
-        cumulative_sumation = diffrences.cumsum()
-        tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
 
-        # A very conservative method which stops searching for a better alternative at the first non-improvement of the result
-        tipping_feature_indx_2 = None
-        for indx, row in diffrences.iterrows():
-            if row["metric"] > 0:
-                tipping_feature_indx_2 = indx
-            else:
-                break
 
-        # A method that lets n_not_improve features try to beat the best score so far
-        tipping_feature_indx_3 = None
-        cum_sum_score = 0
-        i_worse = 0
-        # TODO: it might make sense to merge the diff, cumsum and scores columns ...
-        for indx, row in selection_area.iterrows():
-            if row["metric"] > 0:
-                tipping_feature_indx_3 = indx
-                cum_sum_score += row["metric"]
                         i_worse = 0
                     else:
                         i_worse += 1
 
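To make the stopping rule described in the docstring concrete, a small hand-worked trace with invented recall scores (not real results), assuming the steps shown already fall inside the [n_min, n_max] window:

# Recall recorded at successive removal steps (invented numbers):
#   step:        1     2     3     4     5     6
#   best recall: 0.61  0.63  0.64  0.62  0.62  0.61
# The best score (0.64 at step 3) is not beaten in the next three steps, so with
# n_not_improve=3 the loop stops after step 6. The features removed up to and
# including step 3 end up in features_to_remove, and the feature set that remained
# at that point (the one that scored 0.64) is returned as selected_features.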
@@ -204,18 +198,56 @@ class FeatureSelection:
                     if i_worse == n_not_improve:
                         break
 
+            feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
 
+            print(feature_importance_df)
+            print("best_feature_indx", best_feature_indx)
+            print("best_score", best_score)
 
+            features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
+            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
 
+            return selected_features
 
+        """
+        # Selection criterion for features in the max-min range
+        # E.g. choose the best score in this range. Or, as long as the score in this range keeps increasing by 0.0X; when it no longer does, choose that feature set.
 
+        # The feature set will be selected from i=1 to i=<index of the chosen feature>
 
+        # The "tipping point" feature must be in the max-min range
+        selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
+        selection_area.set_index(["i", "name"], inplace=True)
+        print(selection_area)
+        diffrences = selection_area.diff()
+        diffrences.dropna(how='any', inplace=True)
+        print(diffrences)
 
+        # Maybe also a cumulative summation? Where simply the index with the highest value
+        cumulative_sumation = diffrences.cumsum()
+        print(cumulative_sumation)
+        tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
+        print(tipping_feature_indx_1)
 
 
+        # A method that lets n_not_improve features try to beat the best score so far
+        tipping_feature_indx_2 = None
+        best_score = 0
+        i_worse = 0
+        for indx, row in selection_area.iterrows():
+            if row["metric"] > best_score:
+                tipping_feature_indx_2 = indx
+                best_score = row["metric"]
+                i_worse = 0
+            else:
+                i_worse += 1
 
+            if i_worse == n_not_improve:
+                break
 
+        print(tipping_feature_indx_2)
+        selection_area.reset_index(inplace=True)
+        features_to_remove = feature_importance_df[feature_importance_df["i"] >= tipping_feature_indx_2[0]]["name"].values.tolist()
 
+        selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
+        """
-    def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
-        pass
 
-    def vizualize_feature_selection_process():
-        pass
 
-    def execute_feature_selection_step():
-        pass