diff --git a/machine_learning/helper.py b/machine_learning/helper.py index f008716..ddb6a72 100644 --- a/machine_learning/helper.py +++ b/machine_learning/helper.py @@ -491,6 +491,7 @@ def run_all_classification_models( scores = pd.concat([scores, scores_df]) del dummy_class del dummy_score + del dummy_confusion_matrix logistic_regression = linear_model.LogisticRegression() @@ -503,14 +504,33 @@ def run_all_classification_models( n_jobs=-1, scoring=metrics, ) + log_reg_confusion_matrix = cross_validate( + logistic_regression, + X=data_x, + y=data_y, + groups=data_groups, + cv=cross_validator, + n_jobs=-1, + scoring=confusion_matrix_scorer, + ) print("Logistic regression") scores_df = pd.DataFrame(log_reg_scores)[test_metrics] scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"]) + scores_df = pd.concat( + [ + scores_df, + aggregate_confusion_matrix(log_reg_confusion_matrix).rename( + columns={"sum": "mean"} + # Note: the column is misleadingly renamed to get concise output. + ), + ] + ) scores_df["method"] = "logistic_regression" scores = pd.concat([scores, scores_df]) del logistic_regression del log_reg_scores + del log_reg_confusion_matrix svc = svm.SVC() @@ -523,14 +543,33 @@ def run_all_classification_models( n_jobs=-1, scoring=metrics, ) + svc_confusion_matrix = cross_validate( + svc, + X=data_x, + y=data_y, + groups=data_groups, + cv=cross_validator, + n_jobs=-1, + scoring=confusion_matrix_scorer, + ) print("Support Vector Machine") scores_df = pd.DataFrame(svc_scores)[test_metrics] scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"]) + scores_df = pd.concat( + [ + scores_df, + aggregate_confusion_matrix(svc_confusion_matrix).rename( + columns={"sum": "mean"} + # Note: the column is misleadingly renamed to get concise output. + ), + ] + ) scores_df["method"] = "SVC" scores = pd.concat([scores, scores_df]) del svc del svc_scores + del svc_confusion_matrix gaussian_nb = naive_bayes.GaussianNB() @@ -543,14 +582,33 @@ def run_all_classification_models( n_jobs=-1, scoring=metrics, ) + gaussian_nb_confusion_matrix = cross_validate( + gaussian_nb, + X=data_x, + y=data_y, + groups=data_groups, + cv=cross_validator, + n_jobs=-1, + scoring=confusion_matrix_scorer, + ) print("Gaussian Naive Bayes") scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics] scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"]) + scores_df = pd.concat( + [ + scores_df, + aggregate_confusion_matrix(gaussian_nb_confusion_matrix).rename( + columns={"sum": "mean"} + # Note: the column is misleadingly renamed to get concise output. + ), + ] + ) scores_df["method"] = "gaussian_naive_bayes" scores = pd.concat([scores, scores_df]) del gaussian_nb del gaussian_nb_scores + del gaussian_nb_confusion_matrix sgdc = linear_model.SGDClassifier() @@ -563,14 +621,33 @@ def run_all_classification_models( n_jobs=-1, scoring=metrics, ) + sgdc_confusion_matrix = cross_validate( + sgdc, + X=data_x, + y=data_y, + groups=data_groups, + cv=cross_validator, + n_jobs=-1, + scoring=confusion_matrix_scorer, + ) print("Stochastic Gradient Descent") scores_df = pd.DataFrame(sgdc_scores)[test_metrics] scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"]) + scores_df = pd.concat( + [ + scores_df, + aggregate_confusion_matrix(sgdc_confusion_matrix).rename( + columns={"sum": "mean"} + # Note: the column is misleadingly renamed to get concise output. + ), + ] + ) scores_df["method"] = "stochastic_gradient_descent_classifier" scores = pd.concat([scores, scores_df]) del sgdc del sgdc_scores + del sgdc_confusion_matrix rfc = ensemble.RandomForestClassifier() @@ -583,14 +660,33 @@ def run_all_classification_models( n_jobs=-1, scoring=metrics, ) + rfc_confusion_matrix = cross_validate( + rfc, + X=data_x, + y=data_y, + groups=data_groups, + cv=cross_validator, + n_jobs=-1, + scoring=confusion_matrix_scorer, + ) print("Random Forest") scores_df = pd.DataFrame(rfc_scores)[test_metrics] scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"]) + scores_df = pd.concat( + [ + scores_df, + aggregate_confusion_matrix(rfc_confusion_matrix).rename( + columns={"sum": "mean"} + # Note: the column is misleadingly renamed to get concise output. + ), + ] + ) scores_df["method"] = "random_forest_classifier" scores = pd.concat([scores, scores_df]) del rfc del rfc_scores + del rfc_confusion_matrix xgb_classifier = XGBClassifier() @@ -603,13 +699,32 @@ def run_all_classification_models( n_jobs=-1, scoring=metrics, ) + xgb_confusion_matrix = cross_validate( + xgb_classifier, + X=data_x, + y=data_y, + groups=data_groups, + cv=cross_validator, + n_jobs=-1, + scoring=confusion_matrix_scorer, + ) print("XGBoost") scores_df = pd.DataFrame(xgb_scores)[test_metrics] scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"]) + scores_df = pd.concat( + [ + scores_df, + aggregate_confusion_matrix(xgb_confusion_matrix).rename( + columns={"sum": "mean"} + # Note: the column is misleadingly renamed to get concise output. + ), + ] + ) scores_df["method"] = "XGBoost_classifier" scores = pd.concat([scores, scores_df]) del xgb_classifier del xgb_scores + del xgb_confusion_matrix return scores