diff --git a/config/environment.yml b/config/environment.yml index 62cb210..42d947b 100644 --- a/config/environment.yml +++ b/config/environment.yml @@ -7,6 +7,7 @@ dependencies: - black - isort - flake8 + - imbalanced-learn=0.10.0 - jupyterlab - jupytext - mypy diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py index 33d1125..f539025 100644 --- a/exploration/ml_pipeline_classification.py +++ b/exploration/ml_pipeline_classification.py @@ -43,17 +43,19 @@ if nb_dir not in sys.path: # ## Set script's parameters cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs +under_sampling = True # (bool) If True, train and test on a class-balanced dataset (no-stress rows are randomly undersampled) # %% jupyter={"source_hidden": true} -model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv") +model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv") # %% jupyter={"source_hidden": true} index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] model_input.set_index(index_columns, inplace=True) +model_input['target'].value_counts() # %% jupyter={"source_hidden": true} -bins = [-10, 0, 10] # bins for z-scored targets -# bins = [1, 2.5, 4] # bins for stressfulness (1-4) target +# bins = [-10, 0, 10] # bins for z-scored targets +bins = [-1, 0, 4] # bins for stressfulness (0-4) target model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high'] model_input['target'].value_counts(), edges # model_input = model_input[model_input['target'] != "medium"] @@ -61,6 +63,20 @@ model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x 
model_input['target'].value_counts() +# %% jupyter={"source_hidden": true} +# Undersampling: keep every stress row (target == 1) and draw an equally sized, seeded random sample of no-stress rows (target == 0); assumes no-stress is the larger class +if under_sampling: + model_input.groupby("pid").count() + no_stress = model_input[model_input['target'] == 0] + stress = model_input[model_input['target'] == 1] + + no_stress = no_stress.sample(n=len(stress), random_state=42) + model_input = pd.concat([stress,no_stress], axis=0) + + model_input["target"].value_counts() + + +# %% jupyter={"source_hidden": true} if cv_method_str == 'half_logo': model_input['pid_index'] = model_input.groupby('pid').cumcount() model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count') @@ -119,11 +135,12 @@ dummy_classifier = cross_validate( cv=cv_method, n_jobs=-1, error_score='raise', - scoring=('accuracy', 'average_precision', 'recall', 'f1') + scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(dummy_classifier['test_accuracy'])) -print("Precision", np.mean(dummy_classifier['test_average_precision'])) +print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy'])) +print("Acc (mean)", np.mean(dummy_classifier['test_accuracy'])) +print("Precision", np.mean(dummy_classifier['test_precision'])) print("Recall", np.mean(dummy_classifier['test_recall'])) print("F1", np.mean(dummy_classifier['test_f1'])) print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1]) @@ -146,7 +163,8 @@ log_reg_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(log_reg_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy'])) +print("Acc (mean)", np.mean(log_reg_scores['test_accuracy'])) print("Precision", np.mean(log_reg_scores['test_precision'])) print("Recall", np.mean(log_reg_scores['test_recall'])) print("F1", np.mean(log_reg_scores['test_f1'])) @@ -170,7 +188,8 @@ svc_scores = cross_validate( scoring=('accuracy', 
'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(svc_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(svc_scores['test_accuracy'])) +print("Acc (mean)", np.mean(svc_scores['test_accuracy'])) print("Precision", np.mean(svc_scores['test_precision'])) print("Recall", np.mean(svc_scores['test_recall'])) print("F1", np.mean(svc_scores['test_f1'])) @@ -195,7 +214,8 @@ gaussian_nb_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(gaussian_nb_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy'])) +print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy'])) print("Precision", np.mean(gaussian_nb_scores['test_precision'])) print("Recall", np.mean(gaussian_nb_scores['test_recall'])) print("F1", np.mean(gaussian_nb_scores['test_f1'])) @@ -220,7 +240,8 @@ sgdc_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(sgdc_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy'])) +print("Acc (mean)", np.mean(sgdc_scores['test_accuracy'])) print("Precision", np.mean(sgdc_scores['test_precision'])) print("Recall", np.mean(sgdc_scores['test_recall'])) print("F1", np.mean(sgdc_scores['test_f1'])) @@ -245,7 +266,8 @@ knn_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(knn_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(knn_scores['test_accuracy'])) +print("Acc (mean)", np.mean(knn_scores['test_accuracy'])) print("Precision", np.mean(knn_scores['test_precision'])) print("Recall", np.mean(knn_scores['test_recall'])) print("F1", np.mean(knn_scores['test_f1'])) @@ -270,7 +292,8 @@ dtree_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% 
jupyter={"source_hidden": true} -print("Acc", np.mean(dtree_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy'])) +print("Acc (mean)", np.mean(dtree_scores['test_accuracy'])) print("Precision", np.mean(dtree_scores['test_precision'])) print("Recall", np.mean(dtree_scores['test_recall'])) print("F1", np.mean(dtree_scores['test_f1'])) @@ -295,7 +318,8 @@ rfc_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(rfc_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy'])) +print("Acc (mean)", np.mean(rfc_scores['test_accuracy'])) print("Precision", np.mean(rfc_scores['test_precision'])) print("Recall", np.mean(rfc_scores['test_recall'])) print("F1", np.mean(rfc_scores['test_f1'])) @@ -320,7 +344,8 @@ gbc_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(gbc_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy'])) +print("Acc (mean)", np.mean(gbc_scores['test_accuracy'])) print("Precision", np.mean(gbc_scores['test_precision'])) print("Recall", np.mean(gbc_scores['test_recall'])) print("F1", np.mean(gbc_scores['test_f1'])) @@ -345,7 +370,8 @@ lgbm_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", np.mean(lgbm_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy'])) +print("Acc (mean)", np.mean(lgbm_scores['test_accuracy'])) print("Precision", np.mean(lgbm_scores['test_precision'])) print("Recall", np.mean(lgbm_scores['test_recall'])) print("F1", np.mean(lgbm_scores['test_f1'])) @@ -370,7 +396,8 @@ xgb_classifier_scores = cross_validate( scoring=('accuracy', 'precision', 'recall', 'f1') ) # %% jupyter={"source_hidden": true} -print("Acc", 
np.mean(xgb_classifier_scores['test_accuracy'])) +print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy'])) +print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy'])) print("Precision", np.mean(xgb_classifier_scores['test_precision'])) print("Recall", np.mean(xgb_classifier_scores['test_recall'])) print("F1", np.mean(xgb_classifier_scores['test_f1']))