Add feature importance check.

2022-12-15 16:43:13 +01:00 · 2022-12-15 16:43:13 +01:00 · a61ab9ee51
parent 164d12ed2f
commit a61ab9ee51
1 changed files with 30 additions and 4 deletions
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@ -42,11 +42,12 @@ if nb_dir not in sys.path:
 # %% [markdown]
 # ## Set script's parameters
 cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
-n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
-under_sampling = True # (bool) Will train and test data on balanced dataset (using undersampling method)
+n_sl = 3 # Number of largest/smallest per-split accuracies (of the chosen CV) to print
+undersampling = True # (bool) If True, train and test on a balanced dataset (obtained by undersampling)

 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv")
+# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]

 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
@ -65,7 +66,7 @@ model_input['target'].value_counts()

 # %% jupyter={"source_hidden": true}
 # UnderSampling
-if under_sampling:
+if undersampling:
    model_input.groupby("pid").count()
    no_stress = model_input[model_input['target'] == 0]
    stress = model_input[model_input['target'] == 1]
@ -315,7 +316,8 @@ rfc_scores = cross_validate(
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
+    scoring=('accuracy', 'precision', 'recall', 'f1'), 
+    return_estimator=True
 )
 # %% jupyter={"source_hidden": true}
 print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
@ -326,6 +328,28 @@ print("F1", np.mean(rfc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))

+# %% [markdown]
+# ### Feature importance (RFC)
+
+# %% jupyter={"source_hidden": true}
+# Collect one importance frame per CV estimator; averaging once at the end weights all folds equally.
+feature_names = list(train_x.columns)
+fold_importances = []
+for idx, estimator in enumerate(rfc_scores['estimator']):
+    print("\nFeatures sorted by their score for estimator {}:".format(idx))
+    feature_importances = pd.DataFrame(estimator.feature_importances_,
+                                       index=feature_names, columns=['importance'])
+    print(feature_importances.sort_values('importance', ascending=False).head(10))
+    fold_importances.append(feature_importances)
+# Mean importance per feature over all estimators (rows are grouped by feature name).
+rfc_es_fimp = pd.concat(fold_importances).groupby(level=0).mean()
+rfc_es_fimp_sorted = rfc_es_fimp.sort_values('importance', ascending=False)
+pd.set_option('display.max_rows', 100)
+print(rfc_es_fimp_sorted.head(100))
+rfc_es_fimp_sorted.head(30).plot.bar()
+rfc_es_fimp_sorted.tail(30).plot.bar()
+train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
+
 # %% [markdown]
 # ### Gradient Boosting Classifier

@ -403,3 +427,5 @@ print("Recall", np.mean(xgb_classifier_scores['test_recall']))
 print("F1", np.mean(xgb_classifier_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %%