Add GroupKFold to feature selection CV. Start with generic metric calculation procedure.
parent 1cbc743cf7
commit 0594993133
```diff
@@ -34,8 +34,8 @@ index_columns = ["local_segment", "local_segment_label", "local_segment_start_da
 df.set_index(index_columns, inplace=True)
 
 # Create binary target
-bins = [-1, 0, 4] # bins for stressfulness (0-4) target
-df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
+# bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+# df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
 
 
 nan_cols = df.columns[df.isna().any()].tolist()
```
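For context on the lines being disabled: with `bins=[-1, 0, 4]` and `right=True`, `pd.cut` forms the intervals (-1, 0] and (0, 4], so a stressfulness score of 0 became class 0 and scores 1-4 became class 1. A minimal sketch of what the disabled lines computed, with made-up toy scores:

```python
import pandas as pd

# Toy stressfulness scores on the 0-4 scale used above.
target = pd.Series([0, 1, 2, 4, 0, 3])

# Intervals are (-1, 0] and (0, 4]: score 0 -> label 0, scores 1-4 -> label 1.
binary, edges = pd.cut(target, bins=[-1, 0, 4], labels=[0, 1], retbins=True, right=True)

print(binary.tolist())  # [0, 1, 1, 1, 0, 1]
print(edges)            # [-1.  0.  4.]
```

The commit keeps the raw 0-4 target instead and switches the downstream call to `ml_type="classification_multi"`.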
```diff
@@ -58,10 +58,12 @@ for split in cv.get_splits():
 
     # Feature selection on train set
    # Maybe implement GroupKFold instead of StratifiedKFold? >>
    # >> That way each pid appears either in the test set or in the train set
-    fs = FeatureSelection(train_X, train_y)
-    selected_features = fs.select_features(n_min=20, n_max=50, k=80,
-                                           ml_type="regression_",
-                                           n_tolerance=20)
+    train_groups, test_groups = cv.get_groups_sets(split)
+
+    fs = FeatureSelection(train_X, train_y, train_groups)
+    selected_features = fs.select_features(n_min=20, n_max=50, k=60,
+                                           ml_type="classification_multi",
+                                           metric="f1", n_tolerance=20)
    print(selected_features)
    print(len(selected_features))
```
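The comment above (translated from Slovenian) states the motivation for the commit: `StratifiedKFold` balances class proportions but ignores participants, so rows from one `pid` can land in both train and test folds, while `GroupKFold` keeps each group entirely on one side of every split. A minimal sketch with hypothetical toy data (the `pid` values are made up for illustration):

```python
import numpy as np
from sklearn.model_selection import GroupKFold, StratifiedKFold

X = np.arange(12).reshape(6, 2)
y = np.array([0, 1, 0, 1, 0, 1])
pids = np.array(["p1", "p1", "p2", "p2", "p3", "p3"])

# GroupKFold: a pid is never split across train and test.
for train_idx, test_idx in GroupKFold(n_splits=3).split(X, y, groups=pids):
    assert set(pids[train_idx]).isdisjoint(pids[test_idx])
    print("test pids:", sorted(set(pids[test_idx])))

# StratifiedKFold balances y but ignores groups, so rows of the
# same pid can appear on both sides of a split.
for train_idx, test_idx in StratifiedKFold(n_splits=2, shuffle=True).split(X, y):
    print("pids on both sides:", set(pids[train_idx]) & set(pids[test_idx]))
```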
```diff
@@ -49,8 +49,8 @@ class CrossValidation():
 
             data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
 
-        elif self.cv_method == "5kfold":
-            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
+        elif self.cv_method == "Stratified5kfold":
+            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None
 
         self.X, self.y, self.groups = data_X, data_y, data_groups
```
```diff
@@ -71,7 +71,7 @@ class CrossValidation():
 
         if self.cv_method in ["logo", "half_logo"]:
             self.cv = LeaveOneGroupOut()
-        elif self.cv_method == "5kfold":
+        elif self.cv_method == "Stratified5kfold":
             self.cv = StratifiedKFold(n_splits=5, shuffle=True)
 
 
```
```diff
@@ -118,4 +118,11 @@ class CrossValidation():
         """
         return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
 
+    def get_groups_sets(self, split):
+
+        if self.groups is None:
+            return None, None
+        else:
+            return self.groups.iloc[split[0]], self.groups.iloc[split[1]]
+
```
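Like `get_train_test_sets` above it, the new `get_groups_sets` indexes with `split[0]`/`split[1]`, which suggests `get_splits` yields `(train_indices, test_indices)` pairs. Returning `(None, None)` keeps the `Stratified5kfold` path working, since that branch now stores no group labels at all. A minimal sketch of the group-aware case under that assumption (the `pid_half` values are hypothetical):

```python
import pandas as pd

groups = pd.Series(["p1", "p1", "p2", "p2", "p3", "p3"], name="pid_half")
split = ([0, 1, 2, 3], [4, 5])  # (train_indices, test_indices), as used above

# What get_groups_sets returns for a group-aware CV method:
train_groups, test_groups = groups.iloc[split[0]], groups.iloc[split[1]]
print(train_groups.tolist())  # ['p1', 'p1', 'p2', 'p2']
print(test_groups.tolist())   # ['p3', 'p3']

# For "Stratified5kfold", self.groups is None, so callers receive
# (None, None) and can pass that straight into FeatureSelection.
```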
```diff
@@ -7,7 +7,7 @@ import matplotlib.pyplot as plt
 import pandas as pd
 
 from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
-from sklearn.model_selection import cross_validate, StratifiedKFold
+from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso
 
```
```diff
@@ -23,9 +23,10 @@ from sklearn.linear_model import Lasso
 
 class FeatureSelection:
 
-    def __init__(self, X, y):
+    def __init__(self, X, y, groups):
         self.X = X
         self.y = y
+        self.groups = groups
 
 
     def select_best_feature(self, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
```
```diff
@@ -65,55 +66,35 @@ class FeatureSelection:
 
             X = self.X[pred_features].copy()
 
+            if self.groups is not None:
+                cv = GroupKFold(n_splits=5)
+            else:
+                cv = StratifiedKFold(n_splits=5, shuffle=True)
+
+            # See link about scoring for multiclassification
+            # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
             if ml_type == "classification":
                 nb = GaussianNB()
                 model_cv = cross_validate(
                     nb,
                     X=X,
                     y=self.y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    cv=cv,
+                    groups=self.groups,
                     n_jobs=-1,
-                    scoring=('accuracy', 'precision', 'recall', 'f1')
+                    scoring=[metric]
                 )
 
                 with warnings.catch_warnings():
                     warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
 
-                    if metric == "accuracy":
-                        acc = np.mean(model_cv['test_accuracy'])
-                        acc_std = np.std(model_cv['test_accuracy'])
-
-                        if not best_feature or (acc > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = acc
-                            best_metric_score_std = acc_std
-
-                    elif metric == "precision":
-                        prec = np.mean(model_cv['test_precision'])
-                        prec_std = np.std(model_cv['test_precision'])
-
-                        if not best_feature or (prec > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = prec
-                            best_metric_score_std = prec_std
-
-                    elif metric == "recall":
-                        rec = np.mean(model_cv['test_recall'])
-                        rec_std = np.std(model_cv['test_recall'])
-
-                        if not best_feature or (rec > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = rec
-                            best_metric_score_std = rec_std
-
-                    else:
-                        f1 = np.mean(model_cv['test_f1'])
-                        f1_std = np.std(model_cv['test_f1'])
-
-                        if not best_feature or (f1 > best_metric_score):
-                            best_feature = feat
-                            best_metric_score = f1
-                            best_metric_score_std = f1_std
+                    metric_score = np.nanmean(model_cv[f'test_{metric}'])
+                    metric_score_std = np.nanstd(model_cv[f'test_{metric}'])
+
+                    if not best_feature or (metric_score > best_metric_score):
+                        best_feature = feat
+                        best_metric_score = metric_score
+                        best_metric_score_std = metric_score_std
 
             elif ml_type == "regression":
                 lass = Lasso()
```
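One sklearn detail matters for the generic lookup above: `cross_validate` only names its outputs `test_<metric>` when `scoring` is a list or tuple of scorer names; a bare string yields a single `test_score` key, so the metric has to be wrapped in a list for the `f'test_{metric}'` access to succeed. Also worth noting: plain `"f1"` is a binary scorer, so a multiclass target (as in the `classification_multi` call in the script) would need an averaged variant such as `"f1_macro"`. A minimal sketch of the pattern with hypothetical toy data:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GroupKFold, cross_validate
from sklearn.naive_bayes import GaussianNB

# Hypothetical toy data: 6 groups of 10 rows each.
X, y = make_classification(n_samples=60, n_features=8, random_state=0)
groups = np.repeat(np.arange(6), 10)

metric = "f1"  # binary target here; use e.g. "f1_macro" for multiclass
model_cv = cross_validate(
    GaussianNB(),
    X=X,
    y=y,
    cv=GroupKFold(n_splits=5),
    groups=groups,      # consumed by GroupKFold when it builds the folds
    scoring=[metric],   # list form => results keyed 'test_f1', not 'test_score'
)

print(np.nanmean(model_cv[f"test_{metric}"]), np.nanstd(model_cv[f"test_{metric}"]))
```

Passing `groups=self.groups` unconditionally is safe in both branches, since `StratifiedKFold.split` simply ignores its `groups` argument.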
```diff
@@ -121,7 +102,8 @@ class FeatureSelection:
                     lass,
                     X=X,
                     y=y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    cv=cv,
+                    groups=self.groups,
                     n_jobs=-1,
                     scoring=('r2')
                 )
```
```diff
@@ -214,7 +196,7 @@ class FeatureSelection:
                 break
 
             best_feature, best_metric_score, best_metric_score_std = \
-                self.select_best_feature(features, method=method, ml_type=ml_type[0], metric="recall")
+                self.select_best_feature(features, method=method, ml_type=ml_type[0], metric=metric)
 
             feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
```