From 2336edffb654dce18a82f2585250930128732dc7 Mon Sep 17 00:00:00 2001
From: junos
Date: Thu, 18 May 2023 18:40:06 +0200
Subject: [PATCH] Retain metric names in final scores.

---
 machine_learning/helper.py | 53 +++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/machine_learning/helper.py b/machine_learning/helper.py
index 2fd0f25..0b7574c 100644
--- a/machine_learning/helper.py
+++ b/machine_learning/helper.py
@@ -163,6 +163,17 @@ def prepare_cross_validator(
     return cv
 
 
+def aggregate_and_transpose(df: pd.DataFrame, statistics=None) -> pd.DataFrame:
+    if statistics is None:
+        statistics = ["max", "mean"]
+    return (
+        df.agg(statistics)
+        .transpose()
+        .reset_index()
+        .rename(columns={"index": "test_metric"})
+    )
+
+
 def run_all_regression_models(
     data_x: pd.DataFrame,
     data_y: pd.DataFrame,
@@ -171,7 +182,7 @@ def run_all_regression_models(
 ) -> pd.DataFrame:
     metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
     test_metrics = ["test_" + metric for metric in metrics]
-    scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
+    scores = pd.DataFrame(columns=["method", "metric", "max", "nanmedian"])
 
     # Validate models
     dummy_regr = DummyRegressor(strategy="mean")
@@ -188,7 +199,7 @@ def run_all_regression_models(
         print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
 
     scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "dummy"
     scores = pd.concat([scores, scores_df])
     del dummy_regr
@@ -208,7 +219,7 @@ def run_all_regression_models(
         print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
 
     scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "linear_reg"
     scores = pd.concat([scores, scores_df])
     del lin_reg
@@ -227,7 +238,7 @@ def run_all_regression_models(
         print("Ridge regression")
 
     scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "ridge_reg"
     scores = pd.concat([scores, scores_df])
     del ridge_reg
@@ -246,7 +257,7 @@ def run_all_regression_models(
         print("Lasso regression")
 
     scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "lasso_reg"
     scores = pd.concat([scores, scores_df])
     del lasso_reg
@@ -265,7 +276,7 @@ def run_all_regression_models(
         print("Bayesian Ridge")
 
     scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "bayesian_ridge"
     scores = pd.concat([scores, scores_df])
     del bayesian_ridge_reg
@@ -284,7 +295,7 @@ def run_all_regression_models(
         print("RANSAC (outlier robust regression)")
 
     scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "RANSAC"
     scores = pd.concat([scores, scores_df])
     del ransac_reg
@@ -303,7 +314,7 @@ def run_all_regression_models(
         print("Support vector regression")
 
     scores_df = pd.DataFrame(svr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "SVR"
     scores = pd.concat([scores, scores_df])
     del svr
@@ -322,7 +333,7 @@ def run_all_regression_models(
         print("Kernel Ridge regression")
 
     scores_df = pd.DataFrame(kridge_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "kernel_ridge"
     scores = pd.concat([scores, scores_df])
     del kridge
@@ -341,7 +352,7 @@ def run_all_regression_models(
         print("Gaussian Process Regression")
 
     scores_df = pd.DataFrame(gpr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "gaussian_proc"
     scores = pd.concat([scores, scores_df])
     del gpr
@@ -360,7 +371,7 @@ def run_all_regression_models(
         print("Random Forest Regression")
 
     scores_df = pd.DataFrame(rfr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "random_forest"
     scores = pd.concat([scores, scores_df])
     del rfr
@@ -379,7 +390,7 @@ def run_all_regression_models(
         print("XGBoost Regressor")
 
     scores_df = pd.DataFrame(xgb_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "XGBoost"
     scores = pd.concat([scores, scores_df])
     del xgb
@@ -398,7 +409,7 @@ def run_all_regression_models(
         print("ADA Boost Regressor")
 
     scores_df = pd.DataFrame(ada_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
     scores_df["method"] = "ADA_boost"
     scores = pd.concat([scores, scores_df])
     del ada
@@ -416,7 +427,7 @@ def run_all_classification_models(
 
     metrics = ["accuracy", "average_precision", "recall", "f1"]
     test_metrics = ["test_" + metric for metric in metrics]
-    scores = pd.DataFrame(columns=["method", "max", "mean"])
+    scores = pd.DataFrame(columns=["method", "metric", "max", "mean"])
 
     dummy_class = DummyClassifier(strategy="most_frequent")
 
@@ -433,7 +444,7 @@ def run_all_classification_models(
         print("Dummy")
 
     scores_df = pd.DataFrame(dummy_score)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "Dummy"
     scores = pd.concat([scores, scores_df])
     del dummy_class
@@ -453,7 +464,7 @@ def run_all_classification_models(
         print("Logistic regression")
 
     scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "logistic_reg"
     scores = pd.concat([scores, scores_df])
     del logistic_regression
@@ -473,7 +484,7 @@ def run_all_classification_models(
         print("Support Vector Machine")
 
     scores_df = pd.DataFrame(svc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "svc"
     scores = pd.concat([scores, scores_df])
     del svc
@@ -493,7 +504,7 @@ def run_all_classification_models(
         print("Gaussian Naive Bayes")
 
     scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "gaussian_naive_bayes"
     scores = pd.concat([scores, scores_df])
     del gaussian_nb
@@ -513,7 +524,7 @@ def run_all_classification_models(
         print("Stochastic Gradient Descent")
 
     scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "stochastic_gradient_descent"
     scores = pd.concat([scores, scores_df])
     del sgdc
@@ -533,7 +544,7 @@ def run_all_classification_models(
         print("Random Forest")
 
     scores_df = pd.DataFrame(rfc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "random_forest"
     scores = pd.concat([scores, scores_df])
     del rfc
@@ -553,7 +564,7 @@ def run_all_classification_models(
         print("XGBoost")
 
     scores_df = pd.DataFrame(xgb_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
     scores_df["method"] = "xgboost"
     scores = pd.concat([scores, scores_df])
     del xgb_classifier