Add feature importance check.
parent
164d12ed2f
commit
a61ab9ee51
|
@ -42,11 +42,12 @@ if nb_dir not in sys.path:
|
|||
# %% [markdown]
|
||||
# ## Set script's parameters
|
||||
cv_method_str = 'logo' # Cross-validation method, one of: logo, half_logo, 5kfold (could be regarded as a hyperparameter)
|
||||
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
under_sampling = True # (bool) Will train and test data on balanced dataset (using undersampling method)
|
||||
n_sl = 3 # Number of largest/smallest per-fold accuracies to print for each cross-validated model
|
||||
undersampling = True # (bool) If True, train and test on a balanced dataset (obtained by undersampling)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv")
|
||||
# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
|
@ -65,7 +66,7 @@ model_input['target'].value_counts()
|
|||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# UnderSampling
|
||||
if under_sampling:
|
||||
if undersampling:
|
||||
model_input.groupby("pid").count()
|
||||
no_stress = model_input[model_input['target'] == 0]
|
||||
stress = model_input[model_input['target'] == 1]
|
||||
|
@ -315,7 +316,8 @@ rfc_scores = cross_validate(
|
|||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1'),
|
||||
return_estimator=True
|
||||
)
|
||||
# %% jupyter={"source_hidden": true}
|
||||
print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
|
||||
|
@ -326,6 +328,28 @@ print("F1", np.mean(rfc_scores['test_f1']))
|
|||
# Top n_sl fold accuracies in descending order: partitioning the negated scores
# moves the n_sl largest values to the front, then they are sorted and reversed.
# NOTE: np.partition requires n_sl to be smaller than the number of folds.
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
# Bottom n_sl fold accuracies in ascending order (partition, then sort the front slice).
# NOTE: np.partition requires n_sl to be smaller than the number of folds.
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Feature importance (RFC)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Report the feature importances of every cross-validation estimator and
# average them across folds into `rfc_es_fimp` (index: feature name,
# single column: 'importance').
feature_names = list(train_x.columns)
per_fold_importances = []
for idx, estimator in enumerate(rfc_scores['estimator']):
    print("\nFeatures sorted by their score for estimator {}:".format(idx))
    feature_importances = pd.DataFrame(
        estimator.feature_importances_,
        index=feature_names,
        columns=['importance'],
    )
    print(feature_importances.sort_values('importance', ascending=False).head(10))
    per_fold_importances.append(feature_importances)

# Average once over all folds. Re-averaging the running mean with each new
# fold inside the loop would weight later folds more heavily
# (e.g. e0/4 + e1/4 + e2/2 for three folds) instead of giving the true mean.
rfc_es_fimp = pd.concat(per_fold_importances).groupby(level=0).mean()
|
||||
|
||||
pd.set_option('display.max_rows', 100)
|
||||
print(rfc_es_fimp.sort_values('importance', ascending=False).head(100))
|
||||
|
||||
rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
|
||||
|
||||
rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
|
||||
|
||||
train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Gradient Boosting Classifier
|
||||
|
||||
|
@ -403,3 +427,5 @@ print("Recall", np.mean(xgb_classifier_scores['test_recall']))
|
|||
print("F1", np.mean(xgb_classifier_scores['test_f1']))
|
||||
# Top n_sl fold accuracies in descending order: partitioning the negated scores
# moves the n_sl largest values to the front, then they are sorted and reversed.
# NOTE: np.partition requires n_sl to be smaller than the number of folds.
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
# Bottom n_sl fold accuracies in ascending order (partition, then sort the front slice).
# NOTE: np.partition requires n_sl to be smaller than the number of folds.
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %%
|
||||
|
|
Loading…
Reference in New Issue