Retain metric names in final scores.

2023-05-18 18:40:06 +02:00 · 2023-05-18 18:40:06 +02:00 · 2336edffb6
parent b756ed5feb
commit 2336edffb6
1 changed file with 32 additions and 21 deletions
--- a/machine_learning/helper.py
+++ b/machine_learning/helper.py
@ -163,6 +163,17 @@ def prepare_cross_validator(
    return cv


+def aggregate_and_transpose(
+    df: pd.DataFrame, statistics=None, *, metric_column: str = "metric"
+) -> pd.DataFrame:
+    """Aggregate every column of ``df`` and return one row per column.
+
+    Args:
+        df: Frame whose columns are the quantities to summarise (the callers
+            visible in this change pass the ``test_*`` score columns).
+        statistics: Aggregations forwarded to ``DataFrame.agg``; names or
+            callables. Defaults to ``["max", "mean"]`` when ``None``.
+        metric_column: Name of the output column that holds the original
+            column labels. Defaults to ``"metric"`` so the rows land in the
+            ``metric`` column the callers pre-declare on their score tables
+            instead of creating a second, differently named column when the
+            frames are concatenated.
+
+    Returns:
+        A frame with ``metric_column`` first, followed by one column per
+        statistic, and a fresh integer index.
+    """
+    if statistics is None:
+        # Built per call so no list object is shared between invocations.
+        statistics = ["max", "mean"]
+    return (
+        df.agg(statistics)
+        .transpose()
+        # Name the index explicitly instead of renaming an "index" column
+        # afterwards: reset_index() only emits "index" for an unnamed axis,
+        # so the old rename silently did nothing when df.columns had a name.
+        .rename_axis(index=metric_column)
+        .reset_index()
+    )
+
+
 def run_all_regression_models(
    data_x: pd.DataFrame,
    data_y: pd.DataFrame,
@ -171,7 +182,7 @@ def run_all_regression_models(
 ) -> pd.DataFrame:
    metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
    test_metrics = ["test_" + metric for metric in metrics]
-    scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
+    scores = pd.DataFrame(columns=["method", "metric", "max", "nanmedian"])

    # Validate models
    dummy_regr = DummyRegressor(strategy="mean")
@ -188,7 +199,7 @@ def run_all_regression_models(
    print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))

    scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "dummy"
    scores = pd.concat([scores, scores_df])
    del dummy_regr
@ -208,7 +219,7 @@ def run_all_regression_models(
    print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))

    scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "linear_reg"
    scores = pd.concat([scores, scores_df])
    del lin_reg
@ -227,7 +238,7 @@ def run_all_regression_models(
    print("Ridge regression")

    scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "ridge_reg"
    scores = pd.concat([scores, scores_df])
    del ridge_reg
@ -246,7 +257,7 @@ def run_all_regression_models(
    print("Lasso regression")

    scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "lasso_reg"
    scores = pd.concat([scores, scores_df])
    del lasso_reg
@ -265,7 +276,7 @@ def run_all_regression_models(
    print("Bayesian Ridge")

    scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "bayesian_ridge"
    scores = pd.concat([scores, scores_df])
    del bayesian_ridge_reg
@ -284,7 +295,7 @@ def run_all_regression_models(
    print("RANSAC (outlier robust regression)")

    scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "RANSAC"
    scores = pd.concat([scores, scores_df])
    del ransac_reg
@ -303,7 +314,7 @@ def run_all_regression_models(
    print("Support vector regression")

    scores_df = pd.DataFrame(svr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "SVR"
    scores = pd.concat([scores, scores_df])
    del svr
@ -322,7 +333,7 @@ def run_all_regression_models(
    print("Kernel Ridge regression")

    scores_df = pd.DataFrame(kridge_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "kernel_ridge"
    scores = pd.concat([scores, scores_df])
    del kridge
@ -341,7 +352,7 @@ def run_all_regression_models(
    print("Gaussian Process Regression")

    scores_df = pd.DataFrame(gpr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "gaussian_proc"
    scores = pd.concat([scores, scores_df])
    del gpr
@ -360,7 +371,7 @@ def run_all_regression_models(
    print("Random Forest Regression")

    scores_df = pd.DataFrame(rfr_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "random_forest"
    scores = pd.concat([scores, scores_df])
    del rfr
@ -379,7 +390,7 @@ def run_all_regression_models(
    print("XGBoost Regressor")

    scores_df = pd.DataFrame(xgb_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "XGBoost"
    scores = pd.concat([scores, scores_df])
    del xgb
@ -398,7 +409,7 @@ def run_all_regression_models(
    print("ADA Boost Regressor")

    scores_df = pd.DataFrame(ada_score)[test_metrics]
-    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
    scores_df["method"] = "ADA_boost"
    scores = pd.concat([scores, scores_df])
    del ada
@ -416,7 +427,7 @@ def run_all_classification_models(
    metrics = ["accuracy", "average_precision", "recall", "f1"]
    test_metrics = ["test_" + metric for metric in metrics]

-    scores = pd.DataFrame(columns=["method", "max", "mean"])
+    scores = pd.DataFrame(columns=["method", "metric", "max", "mean"])

    dummy_class = DummyClassifier(strategy="most_frequent")

@ -433,7 +444,7 @@ def run_all_classification_models(
    print("Dummy")

    scores_df = pd.DataFrame(dummy_score)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
    scores_df["method"] = "Dummy"
    scores = pd.concat([scores, scores_df])
    del dummy_class
@ -453,7 +464,7 @@ def run_all_classification_models(
    print("Logistic regression")

    scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
    scores_df["method"] = "logistic_reg"
    scores = pd.concat([scores, scores_df])
    del logistic_regression
@ -473,7 +484,7 @@ def run_all_classification_models(
    print("Support Vector Machine")

    scores_df = pd.DataFrame(svc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
    scores_df["method"] = "svc"
    scores = pd.concat([scores, scores_df])
    del svc
@ -493,7 +504,7 @@ def run_all_classification_models(
    print("Gaussian Naive Bayes")

    scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
    scores_df["method"] = "gaussian_naive_bayes"
    scores = pd.concat([scores, scores_df])
    del gaussian_nb
@ -513,7 +524,7 @@ def run_all_classification_models(
    print("Stochastic Gradient Descent")

    scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
    scores_df["method"] = "stochastic_gradient_descent"
    scores = pd.concat([scores, scores_df])
    del sgdc
@ -533,7 +544,7 @@ def run_all_classification_models(
    print("Random Forest")

    scores_df = pd.DataFrame(rfc_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
    scores_df["method"] = "random_forest"
    scores = pd.concat([scores, scores_df])
    del rfc
@ -553,7 +564,7 @@ def run_all_classification_models(
    print("XGBoost")

    scores_df = pd.DataFrame(xgb_scores)[test_metrics]
-    scores_df = scores_df.agg(["max", "mean"]).transpose()
+    scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
    scores_df["method"] = "xgboost"
    scores = pd.concat([scores, scores_df])
    del xgb_classifier