Merge branch 'ml_pipeline'
commit
3e38b64b45
|
@ -20,30 +20,74 @@ import numpy as np
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
from sklearn.metrics import recall_score, f1_score
|
||||||
|
|
||||||
nb_dir = os.path.split(os.getcwd())[0]
|
nb_dir = os.path.split(os.getcwd())[0]
|
||||||
if nb_dir not in sys.path:
|
if nb_dir not in sys.path:
|
||||||
sys.path.append(nb_dir)
|
sys.path.append(nb_dir)
|
||||||
|
|
||||||
from machine_learning.cross_validation import CrossValidation
|
from machine_learning.cross_validation import CrossValidation
|
||||||
from machine_learning.preprocessing import Preprocessing
|
from machine_learning.preprocessing import Preprocessing
|
||||||
|
from machine_learning.feature_selection import FeatureSelection
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
|
df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
|
||||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||||
df.set_index(index_columns, inplace=True)
|
df.set_index(index_columns, inplace=True)
|
||||||
|
|
||||||
|
# Create binary target
|
||||||
|
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||||
|
df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
|
||||||
|
|
||||||
|
|
||||||
|
nan_cols = df.columns[df.isna().any()].tolist()
|
||||||
|
df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
|
||||||
|
|
||||||
cv = CrossValidation(data=df, cv_method="logo")
|
cv = CrossValidation(data=df, cv_method="logo")
|
||||||
|
|
||||||
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
|
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
|
||||||
interval_feature_list, other_feature_list = [], []
|
interval_feature_list, other_feature_list = [], []
|
||||||
|
|
||||||
print(df.columns.tolist())
|
# %%
|
||||||
|
|
||||||
for split in cv.get_splits():
|
for split in cv.get_splits():
|
||||||
train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
|
train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
|
||||||
pre = Preprocessing(train_X, train_y, test_X, test_y)
|
pre = Preprocessing(train_X, train_y, test_X, test_y)
|
||||||
pre.one_hot_encode_train_and_test_sets(categorical_columns)
|
pre.one_hot_encode_train_and_test_sets(categorical_columns)
|
||||||
train_X, train_y, test_X, test_y = pre.get_train_test_sets()
|
train_X, train_y, test_X, test_y = pre.get_train_test_sets()
|
||||||
|
|
||||||
|
|
||||||
|
print(train_X.shape, test_X.shape)
|
||||||
|
# Predict before feature selection
|
||||||
|
rfc = RandomForestClassifier(n_estimators=10)
|
||||||
|
rfc.fit(train_X, train_y)
|
||||||
|
predictions = rfc.predict(test_X)
|
||||||
|
|
||||||
|
print("Recall:", recall_score(test_y, predictions))
|
||||||
|
print("F1:", f1_score(test_y, predictions))
|
||||||
|
|
||||||
|
# Feature selection on train set
|
||||||
|
train_groups, test_groups = cv.get_groups_sets(split)
|
||||||
|
|
||||||
|
fs = FeatureSelection(train_X, train_y, train_groups)
|
||||||
|
selected_features = fs.select_features(n_min=20, n_max=29, k=40,
|
||||||
|
ml_type="classification_bin",
|
||||||
|
metric="recall", n_tolerance=20)
|
||||||
|
|
||||||
|
train_X = train_X[selected_features]
|
||||||
|
test_X = test_X[selected_features]
|
||||||
|
|
||||||
|
print(selected_features)
|
||||||
|
print(len(selected_features))
|
||||||
|
|
||||||
|
# Predict after feature selection
|
||||||
|
rfc = RandomForestClassifier(n_estimators=500)
|
||||||
|
rfc.fit(train_X, train_y)
|
||||||
|
predictions = rfc.predict(test_X)
|
||||||
|
|
||||||
|
print("Recall:", recall_score(test_y, predictions))
|
||||||
|
print("F1:", f1_score(test_y, predictions))
|
||||||
|
|
||||||
break
|
break
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
|
|
@ -49,8 +49,8 @@ class CrossValidation():
|
||||||
|
|
||||||
data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
|
data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
|
||||||
|
|
||||||
elif self.cv_method == "5kfold":
|
elif self.cv_method == "Stratified5kfold":
|
||||||
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
|
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None
|
||||||
|
|
||||||
self.X, self.y, self.groups = data_X, data_y, data_groups
|
self.X, self.y, self.groups = data_X, data_y, data_groups
|
||||||
|
|
||||||
|
@ -71,7 +71,7 @@ class CrossValidation():
|
||||||
|
|
||||||
if self.cv_method in ["logo", "half_logo"]:
|
if self.cv_method in ["logo", "half_logo"]:
|
||||||
self.cv = LeaveOneGroupOut()
|
self.cv = LeaveOneGroupOut()
|
||||||
elif self.cv_method == "5kfold":
|
elif self.cv_method == "Stratified5kfold":
|
||||||
self.cv = StratifiedKFold(n_splits=5, shuffle=True)
|
self.cv = StratifiedKFold(n_splits=5, shuffle=True)
|
||||||
|
|
||||||
|
|
||||||
|
@ -118,4 +118,11 @@ class CrossValidation():
|
||||||
"""
|
"""
|
||||||
return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
|
return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
|
||||||
|
|
||||||
|
def get_groups_sets(self, split):
    """Return the group labels belonging to the train and test rows of one split.

    Args:
        split (tuple): Pair of positional row indexers; ``split[0]`` selects the
            train rows and ``split[1]`` selects the test rows.

    Returns:
        tuple: ``(train_groups, test_groups)`` taken from ``self.groups`` with
            ``iloc``, or ``(None, None)`` when ``self.groups`` is None.
    """
    # No grouping information is stored, so there is nothing to slice.
    if self.groups is None:
        return None, None

    return self.groups.iloc[split[0]], self.groups.iloc[split[1]]
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,13 @@
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import warnings
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from sklearn.feature_selection import SequentialFeatureSelector
|
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
|
||||||
|
from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold
|
||||||
from sklearn.naive_bayes import GaussianNB
|
from sklearn.naive_bayes import GaussianNB
|
||||||
from sklearn.linear_model import Lasso
|
from sklearn.linear_model import Lasso
|
||||||
|
|
||||||
|
@ -21,13 +23,15 @@ from sklearn.linear_model import Lasso
|
||||||
|
|
||||||
class FeatureSelection:
|
class FeatureSelection:
|
||||||
|
|
||||||
def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
|
def __init__(self, X, y, groups):
|
||||||
pass # TODO....
|
self.X = X
|
||||||
|
self.y = y
|
||||||
|
self.groups = groups
|
||||||
|
|
||||||
|
|
||||||
def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
|
def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
|
||||||
"""The method selects the best feature by testing the prediction on the feature set with or without the current feature.
|
"""The method selects the best feature by testing the prediction on the feature set with or without the current feature.
|
||||||
The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat
|
The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular
|
||||||
feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
|
feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
|
||||||
specified as a parameter.
|
specified as a parameter.
|
||||||
|
|
||||||
|
@ -35,7 +39,11 @@ class FeatureSelection:
|
||||||
df (DataFrame): Input data on which the predictions will be made.
|
df (DataFrame): Input data on which the predictions will be made.
|
||||||
features (list): List of features to select the best/worst from
|
features (list): List of features to select the best/worst from
|
||||||
method (str, optional): remove or add features. Defaults to "remove".
|
method (str, optional): remove or add features. Defaults to "remove".
|
||||||
ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and metric. Defaults to "classification".
|
ml_category (str, optional): Either classification or regression ml problem controls the ML algorithm and metric.
|
||||||
|
Defaults to "classification".
|
||||||
|
ml_subcategory (str, optional): In case of classification '_bin' for binary classification
|
||||||
|
and 'multi' for multiclass classification. For regression an empty string '' is sufficient.
|
||||||
|
Defaults to "bin".
|
||||||
metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
|
metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
|
||||||
stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to [].
|
stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to [].
|
||||||
|
|
||||||
|
@ -49,173 +57,189 @@ class FeatureSelection:
|
||||||
|
|
||||||
best_feature = None
|
best_feature = None
|
||||||
|
|
||||||
if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
|
# Validacije tipov ML in specificiranimi metrikami
|
||||||
raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
|
if ml_category == "classification":
|
||||||
elif ml_type == "regression" and metric not in ['r2']:
|
if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
|
||||||
|
raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
|
||||||
|
elif ml_subcategory == "multi":
|
||||||
|
ml_subcategory_error = False
|
||||||
|
if metric != "accuracy" and "_" in metric:
|
||||||
|
metric_s, metric_t = metric.split("_")
|
||||||
|
if metric_s not in ['accuracy', 'precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
|
||||||
|
ml_subcategory_error = True
|
||||||
|
else:
|
||||||
|
ml_subcategory_error = True
|
||||||
|
|
||||||
|
if ml_subcategory_error:
|
||||||
|
raise ValueError(""""Classification metric for multi-class classification must be specified precisely.
|
||||||
|
Available metric are: 'accuracy', 'precision', 'recall' and 'f1'.
|
||||||
|
Only accuracy must be specified as 'accuracy'.
|
||||||
|
For others please add appropriate suffixes: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'""")
|
||||||
|
elif ml_category == "regression" and metric not in ['r2']:
|
||||||
raise ValueError("Regression metric not recognized. Please choose 'r2'")
|
raise ValueError("Regression metric not recognized. Please choose 'r2'")
|
||||||
|
|
||||||
for feat in features:
|
for feat in features:
|
||||||
if method == "remove":
|
if method == "remove":
|
||||||
pred_features = [col for col in df.columns if feat != col] # All but feat
|
pred_features = [col for col in self.X.columns if feat != col] # All but feat
|
||||||
elif method == "add":
|
elif method == "add":
|
||||||
pred_features = [feat] + stored_features # Feat with stored features
|
pred_features = [feat] + stored_features # Feat with stored features
|
||||||
|
|
||||||
X, y = df.drop(columns=['target', 'pid'])[pred_features], df['target']
|
X = self.X[pred_features].copy()
|
||||||
|
|
||||||
if ml_type == "classification":
|
if self.groups is not None:
|
||||||
|
cv = GroupKFold(n_splits=5)
|
||||||
|
else:
|
||||||
|
cv = StratifiedKFold(n_splits=5, shuffle=True)
|
||||||
|
|
||||||
|
# See link about scoring for multiclassfication
|
||||||
|
# http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
|
||||||
|
if ml_category == "classification":
|
||||||
nb = GaussianNB()
|
nb = GaussianNB()
|
||||||
model_cv = cross_validate(
|
model_cv = cross_validate(
|
||||||
nb,
|
nb,
|
||||||
X=X,
|
X=X,
|
||||||
y=y,
|
y=self.y,
|
||||||
cv=StratifiedKFold(n_splits=5, shuffle=True),
|
cv=cv,
|
||||||
|
groups=self.groups,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
scoring=(metric)
|
||||||
)
|
)
|
||||||
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
|
|
||||||
|
|
||||||
if metric == "accuracy":
|
elif ml_category == "regression":
|
||||||
acc = np.mean(model_cv['test_accuracy'])
|
|
||||||
acc_std = np.std(model_cv['test_accuracy'])
|
|
||||||
|
|
||||||
if not best_feature or (acc > best_metric_score):
|
|
||||||
best_feature = feat
|
|
||||||
best_metric_score = acc
|
|
||||||
best_metric_score_std = acc_std
|
|
||||||
|
|
||||||
elif metric == "precision":
|
|
||||||
prec = np.mean(model_cv['test_precision'])
|
|
||||||
prec_std = np.std(model_cv['test_precision'])
|
|
||||||
|
|
||||||
if not best_feature or (prec > best_metric_score):
|
|
||||||
best_feature = feat
|
|
||||||
best_metric_score = prec
|
|
||||||
best_metric_score_std = prec_std
|
|
||||||
|
|
||||||
elif metric == "recall":
|
|
||||||
rec = np.mean(model_cv['test_recall'])
|
|
||||||
rec_std = np.std(model_cv['test_recall'])
|
|
||||||
|
|
||||||
if not best_feature or (rec > best_metric_score):
|
|
||||||
best_feature = feat
|
|
||||||
best_metric_score = rec
|
|
||||||
best_metric_score_std = rec_std
|
|
||||||
|
|
||||||
else:
|
|
||||||
f1 = np.mean(model_cv['test_f1'])
|
|
||||||
f1_std = np.std(model_cv['test_f1'])
|
|
||||||
|
|
||||||
if not best_feature or (f1 > best_metric_score):
|
|
||||||
best_feature = feat
|
|
||||||
best_metric_score = f1
|
|
||||||
best_metric_score_std = f1_std
|
|
||||||
|
|
||||||
elif ml_type == "regression":
|
|
||||||
lass = Lasso()
|
lass = Lasso()
|
||||||
model_cv = cross_validate(
|
model_cv = cross_validate(
|
||||||
lass,
|
lass,
|
||||||
X=X,
|
X=X,
|
||||||
y=y,
|
y=y,
|
||||||
cv=StratifiedKFold(n_splits=5, shuffle=True),
|
cv=cv,
|
||||||
|
groups=self.groups,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=('r2')
|
scoring=('r2')
|
||||||
)
|
)
|
||||||
|
|
||||||
if metric == "r2":
|
|
||||||
r2 = np.mean(model_cv['test_r2'])
|
|
||||||
r2_std = np.std(model_cv['test_r2'])
|
|
||||||
|
|
||||||
if not best_feature or (r2 > best_metric_score):
|
|
||||||
best_feature = feat
|
|
||||||
best_metric_score = r2
|
|
||||||
best_metric_score_std = r2_std
|
|
||||||
else:
|
else:
|
||||||
raise ValueError("ML type not yet implemented!")
|
raise ValueError("ML type not yet implemented!")
|
||||||
|
|
||||||
|
# Section of metrics' scores comparison.
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
|
||||||
|
|
||||||
|
metric_score = np.nanmean(model_cv["test_score"])
|
||||||
|
metric_score_std = np.nanstd(model_cv["test_score"])
|
||||||
|
|
||||||
|
if not best_feature or (metric_score > best_metric_score):
|
||||||
|
best_feature = feat
|
||||||
|
best_metric_score = metric_score
|
||||||
|
best_metric_score_std = metric_score_std
|
||||||
|
|
||||||
return best_feature, best_metric_score, best_metric_score_std
|
return best_feature, best_metric_score, best_metric_score_std
|
||||||
|
|
||||||
|
|
||||||
def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
    """Select a set of features and return their names as a list.

    The number of returned features lies in the interval [n_min, n_max].

    The method consists of two steps:
        (1) Optional pre-filtering with sklearn's SelectKBest, whose scoring
            function depends on ``ml_type``. ``self.X`` is narrowed to the k
            selected columns.
        (2) Sequential feature removal on the remaining columns. In every step
            ``self.select_best_feature`` is asked which feature to drop next.
            Once at most ``n_max`` features are left, the best score seen so far
            is tracked; if it is not matched or improved for ``n_tolerance``
            consecutive removals, the search stops.

    Args:
        n_min (int, optional): Minimal amount of features returned. Defaults to 20.
        n_max (int, optional): Maximal amount of features returned. Defaults to 50.
        k (int, optional): The k of the k-best features method; must be greater
            than n_max. If None, SelectKBest is not executed. Defaults to 100.
        method (str, optional): Only "remove" is implemented. Defaults to "remove".
        ml_type (str, optional): Type of ML problem: 'classification_bin',
            'classification_multi' or 'regression_' (a plain 'regression' is
            accepted as well). Defaults to "classification_bin".
        metric (str, optional): Metric name forwarded to ``select_best_feature``.
            Defaults to "recall".
        n_tolerance (int, optional): Number of consecutive non-improving removals
            after which the search stops. Defaults to 10.

    Raises:
        ValueError: If the parameters are inconsistent (k <= n_max, n_min larger
            than the number of available features, n_max < n_min), if the method
            or the ML type is not recognized, or if no removal step produced a
            comparable score.

    Returns:
        list: list of selected features
    """
    if k is not None and k <= n_max:
        raise ValueError("The k parameter needs to be greater than the n_max parameter.")

    # Reject unsupported methods before self.X is modified by the k-best step.
    if method != "remove":
        raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")

    # partition() also accepts an ml_type without underscore (e.g. "regression"),
    # in which case the subcategory is the empty string.
    ml_category, _, ml_subcategory = ml_type.partition("_")

    # Select k-best features dependent on the type of ML task
    if k is not None:
        if ml_category == "classification":
            if ml_subcategory == "bin":
                score_func = mutual_info_classif
            elif ml_subcategory == "multi":
                score_func = f_classif
            else:
                raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
        elif ml_category == "regression":
            score_func = f_regression
        else:
            raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")

        selector = SelectKBest(score_func, k=k)
        selector.fit(self.X, self.y)
        cols_idxs = selector.get_support(indices=True)
        # Intentionally narrows the stored frame: the sequential step below and
        # select_best_feature both work on self.X.
        self.X = self.X.iloc[:, cols_idxs]

        # NOTE(review): these prints are assumed to belong to the k-best branch
        # (the label is meaningless when k is None) - confirm.
        print("All columns (after SelectKBest method):")
        print(self.X.columns)

    # Sequential feature removal
    n_features = self.X.shape[1]
    if n_max >= n_features:
        n_max = n_features - 1  # The algorithm removes at least one feature

    if n_min > n_features:
        raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")

    if n_max < n_min:
        raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")

    features = self.X.columns.tolist()
    feature_importance = []

    # Start below every possible score so that negative scores (e.g. r2) can
    # still be registered as the current best.
    best_score = float("-inf")
    best_feature_indx = None
    i_worse = 0

    for i in reversed(range(n_features)):
        # Stop before the feature count would drop below n_min.
        if i + 1 == n_min:
            break

        best_feature, best_metric_score, best_metric_score_std = self.select_best_feature(
            features,
            method=method,
            ml_category=ml_category,
            ml_subcategory=ml_subcategory,
            metric=metric,
        )
        # "i + 1" is the number of features that were present before this removal.
        feature_importance.append((i + 1, best_feature, best_metric_score, best_metric_score_std))

        features.remove(best_feature)
        print("Features left:", i)

        # Scores only count once the feature set is small enough to be returned.
        if i <= n_max:
            if best_metric_score >= best_score:
                best_score = best_metric_score
                best_feature_indx = i + 1
                i_worse = 0
            else:
                i_worse += 1

            if i_worse == n_tolerance:
                break

    feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])

    print(feature_importance_df)
    print("best_feature_indx", best_feature_indx)
    print("best_score", best_score)

    if best_feature_indx is None:
        # Only reachable when no step yielded a comparable score (e.g. all NaN).
        raise ValueError("Feature selection failed: no removal step produced a valid metric score.")

    # Every feature removed up to (and including) the best step is discarded.
    removed_mask = feature_importance_df["i"] >= best_feature_indx
    features_to_remove = set(feature_importance_df.loc[removed_mask, "name"].values.tolist())
    selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]

    return selected_features
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def vizualize_feature_selection_process():
|
|
||||||
pass
|
|
||||||
|
|
||||||
def execute_feature_selection_step():
|
|
||||||
pass
|
|
|
@ -33,7 +33,7 @@ class Preprocessing:
|
||||||
Args:
|
Args:
|
||||||
categorical_features (DataFrame): DataFrame including only categorical columns.
|
categorical_features (DataFrame): DataFrame including only categorical columns.
|
||||||
numerical_features (_type_): DataFrame including only numerical columns.
|
numerical_features (_type_): DataFrame including only numerical columns.
|
||||||
mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
|
mode (int): Mode of the column with which DataFrame is filled.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
DataFrame: Hot-One Encoded DataFrame.
|
DataFrame: Hot-One Encoded DataFrame.
|
||||||
|
@ -46,7 +46,7 @@ class Preprocessing:
|
||||||
if not categorical_features.empty:
|
if not categorical_features.empty:
|
||||||
categorical_features = pd.get_dummies(categorical_features)
|
categorical_features = pd.get_dummies(categorical_features)
|
||||||
|
|
||||||
return pd.concat([numerical_features, categorical_features], axis=1)
|
return pd.concat([numerical_features, categorical_features], axis=1), categorical_features.columns.tolist()
|
||||||
|
|
||||||
|
|
||||||
def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
|
def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
|
||||||
|
@ -68,19 +68,26 @@ class Preprocessing:
|
||||||
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
|
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
|
||||||
|
|
||||||
# For train set
|
# For train set
|
||||||
|
|
||||||
train_X_categorical_features = self.train_X[categorical_columns].copy()
|
train_X_categorical_features = self.train_X[categorical_columns].copy()
|
||||||
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
|
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
|
||||||
mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
|
mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
|
||||||
|
|
||||||
self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
|
self.train_X, train_cat_col_names = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
|
||||||
|
encoded_categorical_features = [col for col in self.train_X.columns if col.startswith(tuple(categorical_columns))]
|
||||||
|
|
||||||
# For test set
|
# For test set
|
||||||
|
|
||||||
test_X_categorical_features = self.test_X[categorical_columns].copy()
|
test_X_categorical_features = self.test_X[categorical_columns].copy()
|
||||||
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
|
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
|
||||||
|
|
||||||
self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
|
self.test_X, test_cat_col_names = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
|
||||||
|
|
||||||
|
# Create categorical columns that were not found in test set and fill them with 0
|
||||||
|
missing_cols = [col for col in train_cat_col_names if col not in test_cat_col_names]
|
||||||
|
self.test_X[missing_cols] = 0
|
||||||
|
|
||||||
|
# Sort column names alphabetically
|
||||||
|
self.train_X = self.train_X.reindex(sorted(self.train_X.columns), axis=1)
|
||||||
|
self.test_X = self.test_X.reindex(sorted(self.test_X.columns), axis=1)
|
||||||
|
|
||||||
|
|
||||||
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
|
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
|
||||||
|
|
Loading…
Reference in New Issue