Add undersampling method (with on/off parameter).

2022-12-13 17:01:46 +01:00 · 2022-12-13 17:01:46 +01:00 · 164d12ed2f
parent 0a45e35164
commit 164d12ed2f
2 changed files with 44 additions and 16 deletions
--- a/config/environment.yml
+++ b/config/environment.yml
@ -7,6 +7,7 @@ dependencies:
  - black
  - isort
  - flake8
+  - imbalanced-learn=0.10.0
  - jupyterlab
  - jupytext
  - mypy
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@ -43,17 +43,19 @@ if nb_dir not in sys.path:
 # ## Set script's parameters
 cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+under_sampling = True # (bool) Will train and test data on balanced dataset (using undersampling method)

 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv")

 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input.set_index(index_columns, inplace=True)
+model_input['target'].value_counts()

 # %% jupyter={"source_hidden": true}
-bins = [-10, 0, 10] # bins for z-scored targets
-# bins = [1, 2.5, 4] # bins for stressfulness (1-4) target
+# bins = [-10, 0, 10] # bins for z-scored targets
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
 model_input['target'].value_counts(), edges
 # model_input = model_input[model_input['target'] != "medium"]
@ -61,6 +63,20 @@ model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x

 model_input['target'].value_counts()

+# %% jupyter={"source_hidden": true}
+# Undersampling: balance the classes by randomly dropping rows of the larger one.
+# NOTE: this runs before the CV split, so the test folds are balanced as well.
+if under_sampling:
+    no_stress = model_input[model_input['target'] == 0]
+    stress = model_input[model_input['target'] == 1]
+    # Sample whichever class is larger; sampling more rows than exist would raise.
+    minority, majority = sorted([stress, no_stress], key=len)
+    # Fixed seed keeps the balanced subset (and the CV scores) reproducible.
+    majority = majority.sample(n=len(minority), random_state=42)
+    model_input = pd.concat([minority, majority], axis=0)
+    print(model_input["target"].value_counts())
+
+# %% jupyter={"source_hidden": true}
 if cv_method_str == 'half_logo':
    model_input['pid_index'] = model_input.groupby('pid').cumcount()
    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
@ -119,11 +135,12 @@ dummy_classifier = cross_validate(
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
-    scoring=('accuracy', 'average_precision', 'recall', 'f1')
+    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(dummy_classifier['test_accuracy']))
-print("Precision", np.mean(dummy_classifier['test_average_precision']))
+print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
+print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
+print("Precision", np.mean(dummy_classifier['test_precision']))
 print("Recall", np.mean(dummy_classifier['test_recall']))
 print("F1", np.mean(dummy_classifier['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
@ -146,7 +163,8 @@ log_reg_scores = cross_validate(
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(log_reg_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
+print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
 print("Precision", np.mean(log_reg_scores['test_precision']))
 print("Recall", np.mean(log_reg_scores['test_recall']))
 print("F1", np.mean(log_reg_scores['test_f1']))
@ -170,7 +188,8 @@ svc_scores = cross_validate(
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(svc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
 print("Precision", np.mean(svc_scores['test_precision']))
 print("Recall", np.mean(svc_scores['test_recall']))
 print("F1", np.mean(svc_scores['test_f1']))
@ -195,7 +214,8 @@ gaussian_nb_scores = cross_validate(
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(gaussian_nb_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
+print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
 print("Precision", np.mean(gaussian_nb_scores['test_precision']))
 print("Recall", np.mean(gaussian_nb_scores['test_recall']))
 print("F1", np.mean(gaussian_nb_scores['test_f1']))
@ -220,7 +240,8 @@ sgdc_scores = cross_validate(
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(sgdc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
 print("Precision", np.mean(sgdc_scores['test_precision']))
 print("Recall", np.mean(sgdc_scores['test_recall']))
 print("F1", np.mean(sgdc_scores['test_f1']))
@ -245,7 +266,8 @@ knn_scores = cross_validate(
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(knn_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
+print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
 print("Precision", np.mean(knn_scores['test_precision']))
 print("Recall", np.mean(knn_scores['test_recall']))
 print("F1", np.mean(knn_scores['test_f1']))
@ -270,7 +292,8 @@ dtree_scores = cross_validate(
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(dtree_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
+print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
 print("Precision", np.mean(dtree_scores['test_precision']))
 print("Recall", np.mean(dtree_scores['test_recall']))
 print("F1", np.mean(dtree_scores['test_f1']))
@ -295,7 +318,8 @@ rfc_scores = cross_validate(
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(rfc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
 print("Precision", np.mean(rfc_scores['test_precision']))
 print("Recall", np.mean(rfc_scores['test_recall']))
 print("F1", np.mean(rfc_scores['test_f1']))
@ -320,7 +344,8 @@ gbc_scores = cross_validate(
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(gbc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
 print("Precision", np.mean(gbc_scores['test_precision']))
 print("Recall", np.mean(gbc_scores['test_recall']))
 print("F1", np.mean(gbc_scores['test_f1']))
@ -345,7 +370,8 @@ lgbm_scores = cross_validate(
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(lgbm_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
+print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
 print("Precision", np.mean(lgbm_scores['test_precision']))
 print("Recall", np.mean(lgbm_scores['test_recall']))
 print("F1", np.mean(lgbm_scores['test_f1']))
@ -370,7 +396,8 @@ xgb_classifier_scores = cross_validate(
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(xgb_classifier_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
+print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
 print("Precision", np.mean(xgb_classifier_scores['test_precision']))
 print("Recall", np.mean(xgb_classifier_scores['test_recall']))
 print("F1", np.mean(xgb_classifier_scores['test_f1']))