Add undersampling method (with on/off parameter).

ml_pipeline
Primoz 2022-12-13 17:01:46 +01:00
parent 0a45e35164
commit 164d12ed2f
2 changed files with 44 additions and 16 deletions

View File

@ -7,6 +7,7 @@ dependencies:
- black - black
- isort - isort
- flake8 - flake8
- imbalanced-learn=0.10.0
- jupyterlab - jupyterlab
- jupytext - jupytext
- mypy - mypy

View File

@ -43,17 +43,19 @@ if nb_dir not in sys.path:
# ## Set script's parameters # ## Set script's parameters
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
under_sampling = True  # (bool) If True, train and evaluate on a class-balanced dataset (random undersampling of the majority class)
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv") model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv")
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input.set_index(index_columns, inplace=True) model_input.set_index(index_columns, inplace=True)
model_input['target'].value_counts()
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
bins = [-10, 0, 10] # bins for z-scored targets # bins = [-10, 0, 10] # bins for z-scored targets
# bins = [1, 2.5, 4] # bins for stressfulness (1-4) target bins = [-1, 0, 4] # bins for stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high'] model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
model_input['target'].value_counts(), edges model_input['target'].value_counts(), edges
# model_input = model_input[model_input['target'] != "medium"] # model_input = model_input[model_input['target'] != "medium"]
@ -61,6 +63,20 @@ model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x
model_input['target'].value_counts() model_input['target'].value_counts()
# %% jupyter={"source_hidden": true}
# Undersampling: balance the binary target by randomly downsampling the
# larger class to the size of the smaller one.
# NOTE(review): the original sampled `no_stress` down to len(stress), which
# raises ValueError if class 1 ever outnumbers class 0; it also used no
# random_state, so every run trained on a different subset. Both fixed below.
if under_sampling:
    no_stress = model_input[model_input['target'] == 0]
    stress = model_input[model_input['target'] == 1]
    # Downsample whichever class is larger; fixed seed for reproducible CV.
    n_minority = min(len(no_stress), len(stress))
    model_input = pd.concat(
        [
            stress.sample(n=n_minority, random_state=42),
            no_stress.sample(n=n_minority, random_state=42),
        ],
        axis=0,
    )
    model_input["target"].value_counts()
# %% jupyter={"source_hidden": true}
if cv_method_str == 'half_logo': if cv_method_str == 'half_logo':
model_input['pid_index'] = model_input.groupby('pid').cumcount() model_input['pid_index'] = model_input.groupby('pid').cumcount()
model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count') model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
@ -119,11 +135,12 @@ dummy_classifier = cross_validate(
cv=cv_method, cv=cv_method,
n_jobs=-1, n_jobs=-1,
error_score='raise', error_score='raise',
scoring=('accuracy', 'average_precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1')
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc", np.mean(dummy_classifier['test_accuracy'])) print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
print("Precision", np.mean(dummy_classifier['test_average_precision'])) print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
print("Precision", np.mean(dummy_classifier['test_precision']))
print("Recall", np.mean(dummy_classifier['test_recall'])) print("Recall", np.mean(dummy_classifier['test_recall']))
print("F1", np.mean(dummy_classifier['test_f1'])) print("F1", np.mean(dummy_classifier['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1]) print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
@ -146,7 +163,8 @@ log_reg_scores = cross_validate(
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1')
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc", np.mean(log_reg_scores['test_accuracy'])) print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
print("Precision", np.mean(log_reg_scores['test_precision'])) print("Precision", np.mean(log_reg_scores['test_precision']))
print("Recall", np.mean(log_reg_scores['test_recall'])) print("Recall", np.mean(log_reg_scores['test_recall']))
print("F1", np.mean(log_reg_scores['test_f1'])) print("F1", np.mean(log_reg_scores['test_f1']))
@ -170,7 +188,8 @@ svc_scores = cross_validate(
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1')
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc", np.mean(svc_scores['test_accuracy'])) print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
print("Precision", np.mean(svc_scores['test_precision'])) print("Precision", np.mean(svc_scores['test_precision']))
print("Recall", np.mean(svc_scores['test_recall'])) print("Recall", np.mean(svc_scores['test_recall']))
print("F1", np.mean(svc_scores['test_f1'])) print("F1", np.mean(svc_scores['test_f1']))
@ -195,7 +214,8 @@ gaussian_nb_scores = cross_validate(
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1')
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc", np.mean(gaussian_nb_scores['test_accuracy'])) print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
print("Precision", np.mean(gaussian_nb_scores['test_precision'])) print("Precision", np.mean(gaussian_nb_scores['test_precision']))
print("Recall", np.mean(gaussian_nb_scores['test_recall'])) print("Recall", np.mean(gaussian_nb_scores['test_recall']))
print("F1", np.mean(gaussian_nb_scores['test_f1'])) print("F1", np.mean(gaussian_nb_scores['test_f1']))
@ -220,7 +240,8 @@ sgdc_scores = cross_validate(
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1')
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc", np.mean(sgdc_scores['test_accuracy'])) print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
print("Precision", np.mean(sgdc_scores['test_precision'])) print("Precision", np.mean(sgdc_scores['test_precision']))
print("Recall", np.mean(sgdc_scores['test_recall'])) print("Recall", np.mean(sgdc_scores['test_recall']))
print("F1", np.mean(sgdc_scores['test_f1'])) print("F1", np.mean(sgdc_scores['test_f1']))
@ -245,7 +266,8 @@ knn_scores = cross_validate(
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1')
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc", np.mean(knn_scores['test_accuracy'])) print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
print("Precision", np.mean(knn_scores['test_precision'])) print("Precision", np.mean(knn_scores['test_precision']))
print("Recall", np.mean(knn_scores['test_recall'])) print("Recall", np.mean(knn_scores['test_recall']))
print("F1", np.mean(knn_scores['test_f1'])) print("F1", np.mean(knn_scores['test_f1']))
@ -270,7 +292,8 @@ dtree_scores = cross_validate(
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1')
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc", np.mean(dtree_scores['test_accuracy'])) print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
print("Precision", np.mean(dtree_scores['test_precision'])) print("Precision", np.mean(dtree_scores['test_precision']))
print("Recall", np.mean(dtree_scores['test_recall'])) print("Recall", np.mean(dtree_scores['test_recall']))
print("F1", np.mean(dtree_scores['test_f1'])) print("F1", np.mean(dtree_scores['test_f1']))
@ -295,7 +318,8 @@ rfc_scores = cross_validate(
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1')
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc", np.mean(rfc_scores['test_accuracy'])) print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
print("Precision", np.mean(rfc_scores['test_precision'])) print("Precision", np.mean(rfc_scores['test_precision']))
print("Recall", np.mean(rfc_scores['test_recall'])) print("Recall", np.mean(rfc_scores['test_recall']))
print("F1", np.mean(rfc_scores['test_f1'])) print("F1", np.mean(rfc_scores['test_f1']))
@ -320,7 +344,8 @@ gbc_scores = cross_validate(
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1')
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc", np.mean(gbc_scores['test_accuracy'])) print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
print("Precision", np.mean(gbc_scores['test_precision'])) print("Precision", np.mean(gbc_scores['test_precision']))
print("Recall", np.mean(gbc_scores['test_recall'])) print("Recall", np.mean(gbc_scores['test_recall']))
print("F1", np.mean(gbc_scores['test_f1'])) print("F1", np.mean(gbc_scores['test_f1']))
@ -345,7 +370,8 @@ lgbm_scores = cross_validate(
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1')
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc", np.mean(lgbm_scores['test_accuracy'])) print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
print("Precision", np.mean(lgbm_scores['test_precision'])) print("Precision", np.mean(lgbm_scores['test_precision']))
print("Recall", np.mean(lgbm_scores['test_recall'])) print("Recall", np.mean(lgbm_scores['test_recall']))
print("F1", np.mean(lgbm_scores['test_f1'])) print("F1", np.mean(lgbm_scores['test_f1']))
@ -370,7 +396,8 @@ xgb_classifier_scores = cross_validate(
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1')
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc", np.mean(xgb_classifier_scores['test_accuracy'])) print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
print("Precision", np.mean(xgb_classifier_scores['test_precision'])) print("Precision", np.mean(xgb_classifier_scores['test_precision']))
print("Recall", np.mean(xgb_classifier_scores['test_recall'])) print("Recall", np.mean(xgb_classifier_scores['test_recall']))
print("F1", np.mean(xgb_classifier_scores['test_f1'])) print("F1", np.mean(xgb_classifier_scores['test_f1']))