Retain metric names in final scores.

master
junos 2023-05-18 18:40:06 +02:00
parent b756ed5feb
commit 2336edffb6
1 changed file with 32 additions and 21 deletions

View File

@ -163,6 +163,17 @@ def prepare_cross_validator(
return cv return cv
def aggregate_and_transpose(df: pd.DataFrame, statistics=None) -> pd.DataFrame:
    """Aggregate every column of *df* and return one row per original column.

    Args:
        df: Frame whose columns are aggregated independently.
        statistics: Aggregations forwarded to ``DataFrame.agg`` (names or
            callables). Defaults to ``["max", "mean"]`` when ``None``.

    Returns:
        A frame with a fresh integer index, a ``test_metric`` column holding
        the original column labels of *df*, and one column per statistic.
    """
    if statistics is None:
        statistics = ["max", "mean"]

    summary = df.agg(statistics).transpose()
    if summary.index.nlevels > 1:
        # Multi-level column labels cannot take a single axis name; keep the
        # plain reset-and-rename path for them.
        return summary.reset_index().rename(columns={"index": "test_metric"})

    # Name the axis explicitly instead of renaming an "index" column after the
    # reset: reset_index() only emits "index" when the axis is unnamed, so a
    # named column axis on *df* would otherwise slip through un-renamed.
    # NOTE(review): the score accumulators in this file are created with a
    # "metric" column, while this helper emits "test_metric" - confirm which
    # name is intended before concatenating.
    return summary.rename_axis("test_metric").reset_index()
def run_all_regression_models( def run_all_regression_models(
data_x: pd.DataFrame, data_x: pd.DataFrame,
data_y: pd.DataFrame, data_y: pd.DataFrame,
@ -171,7 +182,7 @@ def run_all_regression_models(
) -> pd.DataFrame: ) -> pd.DataFrame:
metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"] metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
test_metrics = ["test_" + metric for metric in metrics] test_metrics = ["test_" + metric for metric in metrics]
scores = pd.DataFrame(columns=["method", "max", "nanmedian"]) scores = pd.DataFrame(columns=["method", "metric", "max", "nanmedian"])
# Validate models # Validate models
dummy_regr = DummyRegressor(strategy="mean") dummy_regr = DummyRegressor(strategy="mean")
@ -188,7 +199,7 @@ def run_all_regression_models(
print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"])) print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics] scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "dummy" scores_df["method"] = "dummy"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del dummy_regr del dummy_regr
@ -208,7 +219,7 @@ def run_all_regression_models(
print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"])) print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
scores_df = pd.DataFrame(lin_reg_scores)[test_metrics] scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "linear_reg" scores_df["method"] = "linear_reg"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del lin_reg del lin_reg
@ -227,7 +238,7 @@ def run_all_regression_models(
print("Ridge regression") print("Ridge regression")
scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics] scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "ridge_reg" scores_df["method"] = "ridge_reg"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del ridge_reg del ridge_reg
@ -246,7 +257,7 @@ def run_all_regression_models(
print("Lasso regression") print("Lasso regression")
scores_df = pd.DataFrame(lasso_reg_score)[test_metrics] scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "lasso_reg" scores_df["method"] = "lasso_reg"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del lasso_reg del lasso_reg
@ -265,7 +276,7 @@ def run_all_regression_models(
print("Bayesian Ridge") print("Bayesian Ridge")
scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics] scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "bayesian_ridge" scores_df["method"] = "bayesian_ridge"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del bayesian_ridge_reg del bayesian_ridge_reg
@ -284,7 +295,7 @@ def run_all_regression_models(
print("RANSAC (outlier robust regression)") print("RANSAC (outlier robust regression)")
scores_df = pd.DataFrame(ransac_reg_score)[test_metrics] scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "RANSAC" scores_df["method"] = "RANSAC"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del ransac_reg del ransac_reg
@ -303,7 +314,7 @@ def run_all_regression_models(
print("Support vector regression") print("Support vector regression")
scores_df = pd.DataFrame(svr_score)[test_metrics] scores_df = pd.DataFrame(svr_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "SVR" scores_df["method"] = "SVR"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del svr del svr
@ -322,7 +333,7 @@ def run_all_regression_models(
print("Kernel Ridge regression") print("Kernel Ridge regression")
scores_df = pd.DataFrame(kridge_score)[test_metrics] scores_df = pd.DataFrame(kridge_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "kernel_ridge" scores_df["method"] = "kernel_ridge"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del kridge del kridge
@ -341,7 +352,7 @@ def run_all_regression_models(
print("Gaussian Process Regression") print("Gaussian Process Regression")
scores_df = pd.DataFrame(gpr_score)[test_metrics] scores_df = pd.DataFrame(gpr_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "gaussian_proc" scores_df["method"] = "gaussian_proc"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del gpr del gpr
@ -360,7 +371,7 @@ def run_all_regression_models(
print("Random Forest Regression") print("Random Forest Regression")
scores_df = pd.DataFrame(rfr_score)[test_metrics] scores_df = pd.DataFrame(rfr_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "random_forest" scores_df["method"] = "random_forest"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del rfr del rfr
@ -379,7 +390,7 @@ def run_all_regression_models(
print("XGBoost Regressor") print("XGBoost Regressor")
scores_df = pd.DataFrame(xgb_score)[test_metrics] scores_df = pd.DataFrame(xgb_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "XGBoost" scores_df["method"] = "XGBoost"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del xgb del xgb
@ -398,7 +409,7 @@ def run_all_regression_models(
print("ADA Boost Regressor") print("ADA Boost Regressor")
scores_df = pd.DataFrame(ada_score)[test_metrics] scores_df = pd.DataFrame(ada_score)[test_metrics]
scores_df = scores_df.agg(["max", np.nanmedian]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
scores_df["method"] = "ADA_boost" scores_df["method"] = "ADA_boost"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del ada del ada
@ -416,7 +427,7 @@ def run_all_classification_models(
metrics = ["accuracy", "average_precision", "recall", "f1"] metrics = ["accuracy", "average_precision", "recall", "f1"]
test_metrics = ["test_" + metric for metric in metrics] test_metrics = ["test_" + metric for metric in metrics]
scores = pd.DataFrame(columns=["method", "max", "mean"]) scores = pd.DataFrame(columns=["method", "metric", "max", "mean"])
dummy_class = DummyClassifier(strategy="most_frequent") dummy_class = DummyClassifier(strategy="most_frequent")
@ -433,7 +444,7 @@ def run_all_classification_models(
print("Dummy") print("Dummy")
scores_df = pd.DataFrame(dummy_score)[test_metrics] scores_df = pd.DataFrame(dummy_score)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
scores_df["method"] = "Dummy" scores_df["method"] = "Dummy"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del dummy_class del dummy_class
@ -453,7 +464,7 @@ def run_all_classification_models(
print("Logistic regression") print("Logistic regression")
scores_df = pd.DataFrame(log_reg_scores)[test_metrics] scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
scores_df["method"] = "logistic_reg" scores_df["method"] = "logistic_reg"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del logistic_regression del logistic_regression
@ -473,7 +484,7 @@ def run_all_classification_models(
print("Support Vector Machine") print("Support Vector Machine")
scores_df = pd.DataFrame(svc_scores)[test_metrics] scores_df = pd.DataFrame(svc_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
scores_df["method"] = "svc" scores_df["method"] = "svc"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del svc del svc
@ -493,7 +504,7 @@ def run_all_classification_models(
print("Gaussian Naive Bayes") print("Gaussian Naive Bayes")
scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics] scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
scores_df["method"] = "gaussian_naive_bayes" scores_df["method"] = "gaussian_naive_bayes"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del gaussian_nb del gaussian_nb
@ -513,7 +524,7 @@ def run_all_classification_models(
print("Stochastic Gradient Descent") print("Stochastic Gradient Descent")
scores_df = pd.DataFrame(sgdc_scores)[test_metrics] scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
scores_df["method"] = "stochastic_gradient_descent" scores_df["method"] = "stochastic_gradient_descent"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del sgdc del sgdc
@ -533,7 +544,7 @@ def run_all_classification_models(
print("Random Forest") print("Random Forest")
scores_df = pd.DataFrame(rfc_scores)[test_metrics] scores_df = pd.DataFrame(rfc_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
scores_df["method"] = "random_forest" scores_df["method"] = "random_forest"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del rfc del rfc
@ -553,7 +564,7 @@ def run_all_classification_models(
print("XGBoost") print("XGBoost")
scores_df = pd.DataFrame(xgb_scores)[test_metrics] scores_df = pd.DataFrame(xgb_scores)[test_metrics]
scores_df = scores_df.agg(["max", "mean"]).transpose() scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
scores_df["method"] = "xgboost" scores_df["method"] = "xgboost"
scores = pd.concat([scores, scores_df]) scores = pd.concat([scores, scores_df])
del xgb_classifier del xgb_classifier