Define classification models method.

ml_pipeline
junos 2022-12-08 10:00:14 +01:00
parent 509707855e
commit 852e17afbe
1 changed file with 141 additions and 3 deletions

View File

@ -1,10 +1,12 @@
from pathlib import Path from pathlib import Path
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
from sklearn.metrics import mean_squared_error, r2_score from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor from sklearn.dummy import DummyRegressor, DummyClassifier
from xgboost import XGBRegressor
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xg
import pandas as pd import pandas as pd
import numpy as np import numpy as np
@ -319,3 +321,139 @@ def run_all_regression_models(input_csv):
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
return scores return scores
def run_all_classification_models(data_x, data_y, data_groups, cv_method):
    """Cross-validate a fixed set of classifiers and summarise their scores.

    Each model is evaluated with ``cross_validate`` on the same data, groups
    and splitter, using the ``accuracy``, ``average_precision``, ``recall``
    and ``f1`` scorers.  The per-fold test scores of every model are then
    reduced to their maximum and mean.

    The models are, in order: a most-frequent-class dummy baseline, logistic
    regression, a support vector classifier, Gaussian naive Bayes, an SGD
    classifier, a random forest and an XGBoost classifier, all with their
    default hyperparameters.  The name of each model is printed once its
    cross-validation has finished.

    NOTE(review): the ``recall``, ``f1`` and ``average_precision`` scorers
    presumably require a binary target here -- confirm against the caller.

    Args:
        data_x: Features, passed to ``cross_validate`` as ``X``.
        data_y: Target labels, passed to ``cross_validate`` as ``y``.
        data_groups: Group labels, passed to ``cross_validate`` as ``groups``.
        cv_method: Splitter or strategy passed to ``cross_validate`` as ``cv``.

    Returns:
        A DataFrame with the columns ``method``, ``max`` and ``mean`` and one
        row per (model, metric) pair.  The index holds the metric names
        (``test_accuracy``, ``test_average_precision``, ...), repeated for
        every model.
    """
    metrics = ['accuracy', 'average_precision', 'recall', 'f1']
    test_metrics = ["test_" + metric for metric in metrics]

    # One entry per model: (estimator, name printed on completion,
    # value stored in the "method" column, extra cross_validate arguments).
    # Only the dummy baseline asks cross_validate to raise on a failed fit;
    # the other models keep cross_validate's default error handling.
    models = [
        (
            DummyClassifier(strategy="most_frequent"),
            "Dummy",
            "Dummy",
            {"error_score": 'raise'},
        ),
        (
            linear_model.LogisticRegression(),
            "Logistic regression",
            "logistic_reg",
            {},
        ),
        (svm.SVC(), "Support Vector Machine", "svc", {}),
        (
            naive_bayes.GaussianNB(),
            "Gaussian Naive Bayes",
            "gaussian_naive_bayes",
            {},
        ),
        (
            linear_model.SGDClassifier(),
            "Stochastic Gradient Descent",
            "stochastic_gradient_descent",
            {},
        ),
        (ensemble.RandomForestClassifier(), "Random Forest", "random_forest", {}),
        (XGBClassifier(), "XGBoost", "xgboost", {}),
    ]

    summaries = []
    for estimator, display_name, method_label, extra_kwargs in models:
        cv_scores = cross_validate(
            estimator,
            X=data_x,
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
            scoring=metrics,
            **extra_kwargs,
        )
        print(display_name)

        # Rows become the metrics, columns the "max" and "mean" over folds.
        summary = pd.DataFrame(cv_scores)[test_metrics]
        summary = summary.agg(['max', 'mean']).transpose()
        summary["method"] = method_label
        summaries.append(summary)

    # Concatenate once instead of growing a frame from an empty seed, then
    # restore the column order the empty seed frame used to dictate.
    return pd.concat(summaries)[["method", "max", "mean"]]