diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
index f539025..736e3db 100644
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@@ -42,11 +42,12 @@ if nb_dir not in sys.path:
 # %% [markdown]
 # ## Set script's parameters
 cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
-n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
-under_sampling = True # (bool) Will train and test data on balanced dataset (using undersampling method)
+n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
+undersampling = True # (bool) If True, train and test on a balanced dataset (using the undersampling method)
 
 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv")
+# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
 
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
@@ -65,7 +66,7 @@ model_input['target'].value_counts()
 
 # %% jupyter={"source_hidden": true}
 # UnderSampling
-if under_sampling:
+if undersampling:
     model_input.groupby("pid").count()
     no_stress = model_input[model_input['target'] == 0]
     stress = model_input[model_input['target'] == 1]
@@ -315,7 +316,8 @@ rfc_scores = cross_validate(
     cv=cv_method,
     n_jobs=-1,
     error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    return_estimator=True
 )
 # %% jupyter={"source_hidden": true}
 print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
@@ -326,6 +328,34 @@ print("F1", np.mean(rfc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
 
+# %% [markdown]
+# ### Feature importance (RFC)
+
+# %% jupyter={"source_hidden": true}
+# Gather the importances of every CV estimator first and average them in a
+# single pass, so that each fold carries equal weight in the reported mean.
+feature_names = list(train_x.columns)
+fold_importances = []
+for idx, estimator in enumerate(rfc_scores['estimator']):
+    print("\nFeatures sorted by their score for estimator {}:".format(idx))
+    feature_importances = pd.DataFrame(estimator.feature_importances_,
+                                       index=feature_names,
+                                       columns=['importance'])
+    print(feature_importances.sort_values('importance', ascending=False).head(10))
+    fold_importances.append(feature_importances)
+
+rfc_es_fimp = pd.concat(fold_importances).groupby(level=0).mean()
+
+pd.set_option('display.max_rows', 100)
+print(rfc_es_fimp.sort_values('importance', ascending=False).head(100))
+
+rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
+
+rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
+
+# Value distribution of a single feature column.
+train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
+
 # %% [markdown]
 # ### Gradient Boosting Classifier
 
@@ -403,3 +433,5 @@ print("Recall", np.mean(xgb_classifier_scores['test_recall']))
 print("F1", np.mean(xgb_classifier_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %%