From 2336edffb654dce18a82f2585250930128732dc7 Mon Sep 17 00:00:00 2001
From: junos
Date: Thu, 18 May 2023 18:40:06 +0200
Subject: [PATCH] Retain metric names in final scores.

---
 machine_learning/helper.py | 53 +++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/machine_learning/helper.py b/machine_learning/helper.py
index 2fd0f25..0b7574c 100644
--- a/machine_learning/helper.py
+++ b/machine_learning/helper.py
@@ -163,6 +163,17 @@ def prepare_cross_validator(
     return cv
 
 
+def aggregate_and_transpose(df: pd.DataFrame, statistics=None) -> pd.DataFrame:
+    if statistics is None:
+        statistics = ["max", "mean"]
+    return (
+        df.agg(statistics)
+        .transpose()
+        .reset_index()
+        .rename(columns={"index": "test_metric"})
+    )
+
+
 def run_all_regression_models(
     data_x: pd.DataFrame,
     data_y: pd.DataFrame,
@@ -171,7 +182,7 @@ def run_all_regression_models(
 ) -> pd.DataFrame:
     metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
     test_metrics = ["test_" + metric for metric in metrics]
-    scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
+    scores = pd.DataFrame(columns=["method", "metric", "max", "nanmedian"])
 
     # Validate models
     dummy_regr = DummyRegressor(strategy="mean")
@@ -188,7 +199,7 @@ def run_all_regression_models(
         print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
 
     scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "dummy"
     scores = pd.concat([scores, scores_df])
     del dummy_regr
@@ -208,7 +219,7 @@ def run_all_regression_models(
         print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
 
     scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "linear_reg"
     scores = pd.concat([scores, scores_df])
     del lin_reg
@@ -227,7 +238,7 @@ def run_all_regression_models(
         print("Ridge regression")
 
     scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "ridge_reg"
     scores = pd.concat([scores, scores_df])
     del ridge_reg
@@ -246,7 +257,7 @@ def run_all_regression_models(
         print("Lasso regression")
 
     scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "lasso_reg"
     scores = pd.concat([scores, scores_df])
     del lasso_reg
@@ -265,7 +276,7 @@ def run_all_regression_models(
         print("Bayesian Ridge")
 
     scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "bayesian_ridge"
     scores = pd.concat([scores, scores_df])
     del bayesian_ridge_reg
@@ -284,7 +295,7 @@ def run_all_regression_models(
         print("RANSAC (outlier robust regression)")
 
     scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "RANSAC"
     scores = pd.concat([scores, scores_df])
     del ransac_reg
@@ -303,7 +314,7 @@ def run_all_regression_models(
         print("Support vector regression")
 
     scores_df = pd.DataFrame(svr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "SVR"
     scores = pd.concat([scores, scores_df])
     del svr
@@ -322,7 +333,7 @@ def run_all_regression_models(
         print("Kernel Ridge regression")
 
     scores_df = pd.DataFrame(kridge_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "kernel_ridge"
     scores = pd.concat([scores, scores_df])
     del kridge
@@ -341,7 +352,7 @@ def run_all_regression_models(
         print("Gaussian Process Regression")
 
     scores_df = pd.DataFrame(gpr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "gaussian_proc"
     scores = pd.concat([scores, scores_df])
     del gpr
@@ -360,7 +371,7 @@ def run_all_regression_models(
         print("Random Forest Regression")
 
     scores_df = pd.DataFrame(rfr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "random_forest"
     scores = pd.concat([scores, scores_df])
     del rfr
@@ -379,7 +390,7 @@ def run_all_regression_models(
         print("XGBoost Regressor")
 
     scores_df = pd.DataFrame(xgb_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "XGBoost"
     scores = pd.concat([scores, scores_df])
     del xgb
@@ -398,7 +409,7 @@ def run_all_regression_models(
         print("ADA Boost Regressor")
 
     scores_df = pd.DataFrame(ada_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "ADA_boost"
     scores = pd.concat([scores, scores_df])
     del ada
@@ -416,7 +427,7 @@ def run_all_classification_models(
 
     metrics = ["accuracy", "average_precision", "recall", "f1"]
     test_metrics = ["test_" + metric for metric in metrics]
-    scores = pd.DataFrame(columns=["method", "max", "mean"])
+    scores = pd.DataFrame(columns=["method", "metric", "max", "mean"])
 
     dummy_class = DummyClassifier(strategy="most_frequent")
 
@@ -433,7 +444,7 @@ def run_all_classification_models(
         print("Dummy")
 
     scores_df = pd.DataFrame(dummy_score)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "Dummy"
     scores = pd.concat([scores, scores_df])
     del dummy_class
@@ -453,7 +464,7 @@ def run_all_classification_models(
         print("Logistic regression")
 
     scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "logistic_reg"
     scores = pd.concat([scores, scores_df])
     del logistic_regression
@@ -473,7 +484,7 @@ def run_all_classification_models(
         print("Support Vector Machine")
 
     scores_df = pd.DataFrame(svc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "svc"
     scores = pd.concat([scores, scores_df])
     del svc
@@ -493,7 +504,7 @@ def run_all_classification_models(
         print("Gaussian Naive Bayes")
 
     scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "gaussian_naive_bayes"
     scores = pd.concat([scores, scores_df])
     del gaussian_nb
@@ -513,7 +524,7 @@ def run_all_classification_models(
         print("Stochastic Gradient Descent")
 
     scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "stochastic_gradient_descent"
     scores = pd.concat([scores, scores_df])
     del sgdc
@@ -533,7 +544,7 @@ def run_all_classification_models(
         print("Random Forest")
 
     scores_df = pd.DataFrame(rfc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "random_forest"
     scores = pd.concat([scores, scores_df])
     del rfc
@@ -553,7 +564,7 @@ def run_all_classification_models(
         print("XGBoost")
 
     scores_df = pd.DataFrame(xgb_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "xgboost"
     scores = pd.concat([scores, scores_df])
     del xgb_classifier