Add feature importance check.

ml_pipeline
Primoz 2022-12-15 16:43:13 +01:00
parent 164d12ed2f
commit a61ab9ee51
1 changed files with 30 additions and 4 deletions

View File

@ -42,11 +42,12 @@ if nb_dir not in sys.path:
# %% [markdown] # %% [markdown]
# ## Set script's parameters # ## Set script's parameters
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
under_sampling = True # (bool) Will train and test data on balanced dataset (using undersampling method) undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method)
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv") model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv")
# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
@ -65,7 +66,7 @@ model_input['target'].value_counts()
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
# UnderSampling # UnderSampling
if under_sampling: if undersampling:
model_input.groupby("pid").count() model_input.groupby("pid").count()
no_stress = model_input[model_input['target'] == 0] no_stress = model_input[model_input['target'] == 0]
stress = model_input[model_input['target'] == 1] stress = model_input[model_input['target'] == 1]
@ -315,7 +316,8 @@ rfc_scores = cross_validate(
cv=cv_method, cv=cv_method,
n_jobs=-1, n_jobs=-1,
error_score='raise', error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=('accuracy', 'precision', 'recall', 'f1'),
return_estimator=True
) )
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy'])) print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
@ -326,6 +328,28 @@ print("F1", np.mean(rfc_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1]) print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl])) print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### Feature importance (RFC)
# %% jupyter={"source_hidden": true}
# Average the random-forest feature importances over all estimators returned
# by cross_validate (one fitted estimator per CV split, available because
# return_estimator=True was requested).
#
# The per-estimator frames are collected first and averaged in a single step
# afterwards. Re-averaging the running result with each new estimator inside
# the loop would compute a mean of means, which weights later estimators more
# heavily than earlier ones.
feature_names = list(train_x.columns)
per_estimator_importances = []
for idx, estimator in enumerate(rfc_scores['estimator']):
    print("\nFeatures sorted by their score for estimator {}:".format(idx))
    feature_importances = pd.DataFrame(
        estimator.feature_importances_,
        index=feature_names,
        columns=['importance'],
    )
    print(feature_importances.sort_values('importance', ascending=False).head(10))
    per_estimator_importances.append(feature_importances)

# Every feature name appears once per estimator in the concatenated index, so
# grouping on the index yields the unweighted mean importance per feature.
rfc_es_fimp = pd.concat(per_estimator_importances).groupby(level=0).mean()

# Sort once and reuse the result for the printout and both plots.
rfc_es_fimp_sorted = rfc_es_fimp.sort_values('importance', ascending=False)

pd.set_option('display.max_rows', 100)
print(rfc_es_fimp_sorted.head(100))

# Bar charts of the 30 most and the 30 least important features.
rfc_es_fimp_sorted.head(30).plot.bar()
rfc_es_fimp_sorted.tail(30).plot.bar()

# Value distribution of a single feature, displayed as notebook cell output.
train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
# %% [markdown] # %% [markdown]
# ### Gradient Boosting Classifier # ### Gradient Boosting Classifier
@ -403,3 +427,5 @@ print("Recall", np.mean(xgb_classifier_scores['test_recall']))
print("F1", np.mean(xgb_classifier_scores['test_f1'])) print("F1", np.mean(xgb_classifier_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1]) print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])) print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
# %%