From 91e73524807ba41223c8b52205d3c8d74db32d4d Mon Sep 17 00:00:00 2001 From: junos Date: Wed, 10 May 2023 22:50:00 +0200 Subject: [PATCH] Thoroughly refactor classification runner. --- machine_learning/helper.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/machine_learning/helper.py b/machine_learning/helper.py index c1776da..2fd0f25 100644 --- a/machine_learning/helper.py +++ b/machine_learning/helper.py @@ -407,7 +407,12 @@ def run_all_regression_models( return scores -def run_all_classification_models(data_x, data_y, data_groups, cv_method): +def run_all_classification_models( + data_x: pd.DataFrame, + data_y: pd.DataFrame, + data_groups: pd.DataFrame, + cross_validator: BaseCrossValidator, +): metrics = ["accuracy", "average_precision", "recall", "f1"] test_metrics = ["test_" + metric for metric in metrics] @@ -420,7 +425,7 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): X=data_x, y=data_y, groups=data_groups, - cv=cv_method, + cv=cross_validator, n_jobs=-1, error_score="raise", scoring=metrics, @@ -431,6 +436,8 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df["method"] = "Dummy" scores = pd.concat([scores, scores_df]) + del dummy_class + del dummy_score logistic_regression = linear_model.LogisticRegression() @@ -439,7 +446,7 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): X=data_x, y=data_y, groups=data_groups, - cv=cv_method, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -449,6 +456,8 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df["method"] = "logistic_reg" scores = pd.concat([scores, scores_df]) + del logistic_regression + del log_reg_scores svc = svm.SVC() @@ -457,7 +466,7 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): X=data_x, y=data_y, groups=data_groups, - cv=cv_method, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -467,6 +476,8 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df["method"] = "svc" scores = pd.concat([scores, scores_df]) + del svc + del svc_scores gaussian_nb = naive_bayes.GaussianNB() @@ -475,7 +486,7 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): X=data_x, y=data_y, groups=data_groups, - cv=cv_method, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -485,6 +496,8 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df["method"] = "gaussian_naive_bayes" scores = pd.concat([scores, scores_df]) + del gaussian_nb + del gaussian_nb_scores sgdc = linear_model.SGDClassifier() @@ -493,7 +506,7 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): X=data_x, y=data_y, groups=data_groups, - cv=cv_method, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -503,6 +516,8 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df["method"] = "stochastic_gradient_descent" scores = pd.concat([scores, scores_df]) + del sgdc + del sgdc_scores rfc = ensemble.RandomForestClassifier() @@ -511,7 +526,7 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): X=data_x, y=data_y, groups=data_groups, - cv=cv_method, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -521,6 +536,8 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df["method"] = "random_forest" scores = pd.concat([scores, scores_df]) + del rfc + del rfc_scores xgb_classifier = XGBClassifier() @@ -529,7 +546,7 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): X=data_x, y=data_y, groups=data_groups, - cv=cv_method, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -539,5 +556,7 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method): scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df["method"] = "xgboost" scores = pd.concat([scores, scores_df]) + del xgb_classifier + del xgb_scores return scores