Add undersampling method (with on/off parameter).

parent 0a45e35164
commit 164d12ed2f
@@ -7,6 +7,7 @@ dependencies:
 - black
 - isort
 - flake8
+- imbalanced-learn=0.10.0
 - jupyterlab
 - jupytext
 - mypy

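After updating the environment, a quick sanity check (illustrative, not part of the commit) confirms the pin resolved; note the package installs as imbalanced-learn but imports as imblearn:

    # Run once after updating the conda environment; the import name
    # differs from the package name.
    import imblearn
    print(imblearn.__version__)  # expect 0.10.0, matching the pin above
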
@@ -43,17 +43,19 @@ if nb_dir not in sys.path:
 # ## Set script's parameters
 cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
+under_sampling = True # (bool) Will train and test data on balanced dataset (using undersampling method)

 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv")

 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input.set_index(index_columns, inplace=True)
+model_input['target'].value_counts()

 # %% jupyter={"source_hidden": true}
-bins = [-10, 0, 10] # bins for z-scored targets
-# bins = [1, 2.5, 4] # bins for stressfulness (1-4) target
+# bins = [-10, 0, 10] # bins for z-scored targets
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) # ['low', 'medium', 'high']
 model_input['target'].value_counts(), edges
 # model_input = model_input[model_input['target'] != "medium"]

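For reference, a minimal sketch (made-up values, not part of the commit) of what the new bins do: with right=True, bins=[-1, 0, 4] splits the 0-4 stressfulness score into (-1, 0] -> 'low' and (0, 4] -> 'high', so only a score of exactly 0 counts as low stress:

    import pandas as pd

    scores = pd.Series([0, 1, 2, 3, 4])  # hypothetical stressfulness values
    labels = pd.cut(scores, bins=[-1, 0, 4], labels=['low', 'high'], right=True)
    print(labels.tolist())  # ['low', 'high', 'high', 'high', 'high']
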
@@ -61,6 +63,20 @@ model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x

 model_input['target'].value_counts()

+# %% jupyter={"source_hidden": true}
+# UnderSampling
+if under_sampling:
+    model_input.groupby("pid").count()
+    no_stress = model_input[model_input['target'] == 0]
+    stress = model_input[model_input['target'] == 1]
+
+    no_stress = no_stress.sample(n=len(stress))
+    model_input = pd.concat([stress, no_stress], axis=0)
+
+    model_input["target"].value_counts()
+
+
+# %% jupyter={"source_hidden": true}
 if cv_method_str == 'half_logo':
     model_input['pid_index'] = model_input.groupby('pid').cumcount()
     model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')

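The commit pins imbalanced-learn above yet balances the classes manually with pandas .sample(), which draws a fresh random subset on every run since no random_state is set. A minimal sketch of the library-based equivalent, in case the dependency is meant to take over this block later (the call and seed are assumptions, not something the commit makes):

    from imblearn.under_sampling import RandomUnderSampler

    # Sketch only: downsample the majority class to the minority class size,
    # like the manual block above, but reproducibly via random_state.
    rus = RandomUnderSampler(random_state=19)  # seed value is arbitrary/illustrative
    X_res, y_res = rus.fit_resample(model_input.drop(columns=['target']), model_input['target'])
    model_input = X_res.assign(target=y_res)
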
@@ -119,11 +135,12 @@ dummy_classifier = cross_validate(
     cv=cv_method,
     n_jobs=-1,
     error_score='raise',
-    scoring=('accuracy', 'average_precision', 'recall', 'f1')
+    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(dummy_classifier['test_accuracy']))
-print("Precision", np.mean(dummy_classifier['test_average_precision']))
+print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
+print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
+print("Precision", np.mean(dummy_classifier['test_precision']))
 print("Recall", np.mean(dummy_classifier['test_recall']))
 print("F1", np.mean(dummy_classifier['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])

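The reworked report cell now prints both a median and a mean accuracy; np.nanmedian ignores any NaN fold scores, whereas np.mean would propagate them. The largest-n line pulls the top n_sl fold accuracies without a full sort; a toy illustration with made-up values:

    import numpy as np

    acc = np.array([0.55, 0.72, 0.61, 0.68])  # hypothetical per-fold accuracies
    n_sl = 2
    # np.partition(-acc, n_sl) moves the n_sl largest values (negated) to the front
    # without fully sorting; np.sort + [::-1] then orders just those, descending.
    print(np.sort(-np.partition(-acc, n_sl)[:n_sl])[::-1])  # [0.72 0.68]
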
@@ -146,7 +163,8 @@ log_reg_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(log_reg_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
+print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
 print("Precision", np.mean(log_reg_scores['test_precision']))
 print("Recall", np.mean(log_reg_scores['test_recall']))
 print("F1", np.mean(log_reg_scores['test_f1']))

@@ -170,7 +188,8 @@ svc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(svc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
 print("Precision", np.mean(svc_scores['test_precision']))
 print("Recall", np.mean(svc_scores['test_recall']))
 print("F1", np.mean(svc_scores['test_f1']))

@@ -195,7 +214,8 @@ gaussian_nb_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(gaussian_nb_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
+print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
 print("Precision", np.mean(gaussian_nb_scores['test_precision']))
 print("Recall", np.mean(gaussian_nb_scores['test_recall']))
 print("F1", np.mean(gaussian_nb_scores['test_f1']))

@@ -220,7 +240,8 @@ sgdc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(sgdc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
 print("Precision", np.mean(sgdc_scores['test_precision']))
 print("Recall", np.mean(sgdc_scores['test_recall']))
 print("F1", np.mean(sgdc_scores['test_f1']))

@@ -245,7 +266,8 @@ knn_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(knn_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
+print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
 print("Precision", np.mean(knn_scores['test_precision']))
 print("Recall", np.mean(knn_scores['test_recall']))
 print("F1", np.mean(knn_scores['test_f1']))

@@ -270,7 +292,8 @@ dtree_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(dtree_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
+print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
 print("Precision", np.mean(dtree_scores['test_precision']))
 print("Recall", np.mean(dtree_scores['test_recall']))
 print("F1", np.mean(dtree_scores['test_f1']))

@@ -295,7 +318,8 @@ rfc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(rfc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
 print("Precision", np.mean(rfc_scores['test_precision']))
 print("Recall", np.mean(rfc_scores['test_recall']))
 print("F1", np.mean(rfc_scores['test_f1']))

@@ -320,7 +344,8 @@ gbc_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(gbc_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
+print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
 print("Precision", np.mean(gbc_scores['test_precision']))
 print("Recall", np.mean(gbc_scores['test_recall']))
 print("F1", np.mean(gbc_scores['test_f1']))

@@ -345,7 +370,8 @@ lgbm_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(lgbm_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
+print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
 print("Precision", np.mean(lgbm_scores['test_precision']))
 print("Recall", np.mean(lgbm_scores['test_recall']))
 print("F1", np.mean(lgbm_scores['test_f1']))

@@ -370,7 +396,8 @@ xgb_classifier_scores = cross_validate(
     scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": true}
-print("Acc", np.mean(xgb_classifier_scores['test_accuracy']))
+print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
+print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
 print("Precision", np.mean(xgb_classifier_scores['test_precision']))
 print("Recall", np.mean(xgb_classifier_scores['test_recall']))
 print("F1", np.mean(xgb_classifier_scores['test_f1']))