# Provenance: reconstructed from patch 852e17afbead64a5e91f6ff4065fe07d9aafd448
# ("Define classification models method.", author: junos, 2022-12-08), which
# targets machine_learning/helper.py.  Only the import block and the function
# added by that patch are reproduced here; the rest of helper.py (including
# run_all_regression_models) is not part of this view.

from pathlib import Path

from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor, DummyClassifier

from xgboost import XGBRegressor, XGBClassifier
import xgboost as xg

import pandas as pd
import numpy as np


def run_all_classification_models(data_x, data_y, data_groups, cv_method):
    """Cross-validate a fixed set of classifiers and summarise their scores.

    Each classifier is evaluated with ``sklearn.model_selection.cross_validate``
    using the same data, groups, CV splitter and metric list.  For every
    classifier the per-fold test scores are reduced to their maximum and mean.

    Parameters
    ----------
    data_x
        Feature matrix, forwarded to ``cross_validate`` as ``X``.
    data_y
        Target labels, forwarded as ``y``.
    data_groups
        Group labels for the samples, forwarded as ``groups``.
    cv_method
        Cross-validation strategy, forwarded as ``cv``.
        NOTE(review): presumably a group-aware splitter such as
        ``LeaveOneGroupOut`` (imported by this module) -- confirm with callers.

    Returns
    -------
    pandas.DataFrame
        Columns ``method``, ``max`` and ``mean``.  There is one row per
        (classifier, metric) pair; the index holds the metric name prefixed
        with ``test_`` and therefore repeats for every classifier.

    Notes
    -----
    Only the dummy baseline runs with ``error_score='raise'``; the other
    models keep scikit-learn's default handling of failed fits.
    """
    # NOTE(review): 'average_precision' is scikit-learn's area-under-PR-curve
    # scorer, not plain 'precision' -- confirm this is the intended metric.
    metrics = ['accuracy', 'average_precision', 'recall', 'f1']
    test_metrics = ["test_" + metric for metric in metrics]

    # One entry per model:
    #   (progress label printed after CV, value of the "method" column,
    #    estimator instance, extra keyword arguments for cross_validate)
    models = [
        ("Dummy", "Dummy",
         DummyClassifier(strategy="most_frequent"), {"error_score": 'raise'}),
        ("Logistic regression", "logistic_reg",
         linear_model.LogisticRegression(), {}),
        ("Support Vector Machine", "svc",
         svm.SVC(), {}),
        ("Gaussian Naive Bayes", "gaussian_naive_bayes",
         naive_bayes.GaussianNB(), {}),
        ("Stochastic Gradient Descent", "stochastic_gradient_descent",
         linear_model.SGDClassifier(), {}),
        ("Random Forest", "random_forest",
         ensemble.RandomForestClassifier(), {}),
        ("XGBoost", "xgboost",
         XGBClassifier(), {}),
    ]

    # Collect the per-model summaries and concatenate once at the end instead
    # of growing a DataFrame (seeded with an empty frame) inside the loop.
    summaries = []
    for label, method_name, estimator, extra_kwargs in models:
        cv_result = cross_validate(
            estimator,
            X=data_x,
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
            scoring=metrics,
            **extra_kwargs,
        )
        print(label)

        # Rows become metrics, columns become the 'max'/'mean' over folds.
        summary = pd.DataFrame(cv_result)[test_metrics]
        summary = summary.agg(['max', 'mean']).transpose()
        summary["method"] = method_name
        summaries.append(summary)

    scores = pd.concat(summaries)
    # Keep the column order the accumulating version produced.
    return scores[["method", "max", "mean"]]