Add feature importance check.
parent 164d12ed2f
commit a61ab9ee51

@@ -42,11 +42,12 @@ if nb_dir not in sys.path:
 # %% [markdown]
 # ## Set script's parameters
 cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
-n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
-under_sampling = True # (bool) Will train and test data on balanced dataset (using undersampling method)
+n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
+undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method)
 
 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0/input_appraisal_stressfulness_event_mean.csv")
+# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
 
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
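The `cv_method_str` parameter above names the cross-validation scheme used later in the script. As a hedged sketch only (the script's actual mapping, and the exact meaning of `half_logo`, are not shown in this hunk), such a string is typically resolved to a scikit-learn splitter along these lines:

# Hedged sketch: resolving cv_method_str to a scikit-learn splitter.
# 'half_logo' is assumed here to reuse LeaveOneGroupOut with a custom group column.
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold

def resolve_cv_method(cv_method_str):
    if cv_method_str in ('logo', 'half_logo'):
        return LeaveOneGroupOut()  # one group (e.g. participant) held out per split
    if cv_method_str == '5kfold':
        return StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    raise ValueError(f"Unknown cv_method_str: {cv_method_str}")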
@@ -65,7 +66,7 @@ model_input['target'].value_counts()
 
 # %% jupyter={"source_hidden": true}
 # UnderSampling
-if under_sampling:
+if undersampling:
     model_input.groupby("pid").count()
     no_stress = model_input[model_input['target'] == 0]
     stress = model_input[model_input['target'] == 1]
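The undersampling block above only splits the rows by target; the balancing itself falls outside this hunk. A minimal sketch, assuming the majority class is downsampled to the minority-class size with pandas (the script's actual code may differ):

# Hedged sketch (assumed continuation): balance classes by undersampling the majority
n_minority = min(len(no_stress), len(stress))
model_input_balanced = pd.concat([
    no_stress.sample(n=n_minority, random_state=0),
    stress.sample(n=n_minority, random_state=0),
]).sample(frac=1, random_state=0)  # shuffle rows after concatenation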
@@ -315,7 +316,8 @@ rfc_scores = cross_validate(
     cv=cv_method,
     n_jobs=-1,
     error_score='raise',
-    scoring=('accuracy', 'precision', 'recall', 'f1')
+    scoring=('accuracy', 'precision', 'recall', 'f1'),
+    return_estimator=True
 )
 # %% jupyter={"source_hidden": true}
 print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
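For context on the `return_estimator=True` addition above: scikit-learn's `cross_validate` then returns the fitted estimator from each split under the `'estimator'` key, which is what the feature-importance cell added below iterates over. A minimal self-contained sketch (toy data, not the script's own input):

# Minimal sketch: return_estimator=True exposes one fitted model per CV split
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

X, y = load_breast_cancer(return_X_y=True)
scores = cross_validate(RandomForestClassifier(random_state=0), X, y,
                        cv=5, return_estimator=True)
for fold, model in enumerate(scores['estimator']):
    print(fold, model.feature_importances_[:5])  # importances of the first 5 features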
@@ -326,6 +328,28 @@ print("F1", np.mean(rfc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
 
+# %% [markdown]
+# ### Feature importance (RFC)
+
+# %% jupyter={"source_hidden": true}
+rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns))
+for idx, estimator in enumerate(rfc_scores['estimator']):
+    print("\nFeatures sorted by their score for estimator {}:".format(idx))
+    feature_importances = pd.DataFrame(estimator.feature_importances_,
+                                       index=list(train_x.columns),
+                                       columns=['importance'])
+    print(feature_importances.sort_values('importance', ascending=False).head(10))
+    rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean()
+
+pd.set_option('display.max_rows', 100)
+print(rfc_es_fimp.sort_values('importance', ascending=False).head(100))
+
+rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
+
+rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
+
+train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
+
 # %% [markdown]
 # ### Gradient Boosting Classifier
 
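The added cell above keeps a running average of per-fold importances via `pd.concat(...).groupby(level=0).mean()`. Below is a small illustrative sketch of that pandas pattern with made-up values, collecting every fold first and averaging once at the end:

# Illustrative sketch: averaging per-fold feature importances by feature name
import pandas as pd

per_fold = [
    pd.DataFrame({'importance': [0.6, 0.4]}, index=['feat_a', 'feat_b']),  # fold 0
    pd.DataFrame({'importance': [0.5, 0.5]}, index=['feat_a', 'feat_b']),  # fold 1
]
mean_importance = pd.concat(per_fold).groupby(level=0).mean()
print(mean_importance.sort_values('importance', ascending=False))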
@@ -403,3 +427,5 @@ print("Recall", np.mean(xgb_classifier_scores['test_recall']))
 print("F1", np.mean(xgb_classifier_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
+
+# %%
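The `Largest`/`Smallest {n_sl} ACC` prints rely on `np.partition` to pick out the n extreme fold accuracies without sorting the whole array. A small illustrative example of that idiom:

# Illustrative sketch: n largest / n smallest values via np.partition
import numpy as np

acc = np.array([0.71, 0.55, 0.90, 0.62, 0.83])
n = 2
largest = np.sort(-np.partition(-acc, n)[:n])[::-1]  # n largest, descending
smallest = np.sort(np.partition(acc, n)[:n])         # n smallest, ascending
print(largest, smallest)  # [0.9 0.83] [0.55 0.62]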