From 123b78d438a305fb550101cf5e7eba27b106f4f2 Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Sun, 16 Aug 2020 16:08:51 -0400 Subject: [PATCH] Fix minor bugs of modeling.py: f1-macro and proba --- src/models/modeling.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/models/modeling.py b/src/models/modeling.py index 0639b744..57a50eb7 100644 --- a/src/models/modeling.py +++ b/src/models/modeling.py @@ -117,10 +117,10 @@ for train_index, test_index in outer_cv.split(data_x): # Inner cross validation if min(targets_value_counts) >= 6: # SMOTE requires n_neighbors <= n_samples, the default value of n_neighbors is 6 - clf = GridSearchCV(estimator=createPipeline(model, "SMOTE"), param_grid=model_hyperparams, cv=inner_cv, scoring="f1_micro") + clf = GridSearchCV(estimator=createPipeline(model, "SMOTE"), param_grid=model_hyperparams, cv=inner_cv, scoring="f1_macro") else: # RandomOverSampler: over-sample the minority class(es) by picking samples at random with replacement. - clf = GridSearchCV(estimator=createPipeline(model, "RandomOverSampler"), param_grid=model_hyperparams, cv=inner_cv, scoring="f1_micro") + clf = GridSearchCV(estimator=createPipeline(model, "RandomOverSampler"), param_grid=model_hyperparams, cv=inner_cv, scoring="f1_macro") clf.fit(train_x, train_y.values.ravel()) # Collect results and parameters @@ -129,10 +129,7 @@ for train_index, test_index in outer_cv.split(data_x): pred_y = pred_y + cur_fold_pred proba_of_two_categories = clf.predict_proba(test_x).tolist() - if cur_fold_pred[0]: - pred_y_prob = pred_y_prob + [row[proba_of_two_categories[0].index(max(proba_of_two_categories[0]))] for row in proba_of_two_categories] - else: - pred_y_prob = pred_y_prob + [row[proba_of_two_categories[0].index(min(proba_of_two_categories[0]))] for row in proba_of_two_categories] + pred_y_prob = pred_y_prob + [probabilities[clf.classes_.tolist().index(1)] for probabilities in proba_of_two_categories] true_y = true_y + test_y.values.ravel().tolist() pid = pid + test_y.index.tolist() # each test partition (fold) in the outer cv is a participant (LeaveOneOut cv)