Change ML model and add CV to sequential feat_select. Add std lines to plots.
parent
e3aef2dae7
commit
ea3f805ba7
|
@ -23,8 +23,8 @@ import matplotlib.pyplot as plt
|
|||
import pandas as pd
|
||||
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
|
||||
from sklearn import metrics
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
|
@ -82,7 +82,7 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
|
|||
while excluding the 'pid' and 'target' columns. It then splits the data into training
|
||||
and test sets, using a test size of 0.25 for the first split and 0.2 for the second split.
|
||||
A SimpleImputer is used to fill in missing values with median values.
|
||||
A RandomForestClassifier is then used to fit the training set and make predictions
|
||||
A GaussianNB classifier is then used to fit the training set and make predictions
|
||||
on the test set. Finally, accuracy, precision, recall and F1 scores are printed
|
||||
for each substring group depending on whether or not include_group
|
||||
is set to True or False.
|
||||
|
@ -90,7 +90,7 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
|
|||
"""
|
||||
|
||||
best_sensor = None
|
||||
best_recall_score, best_f1_sore = None, None
|
||||
best_recall_score, best_f1_score = None, None
|
||||
|
||||
for fgroup_substr in groups_substrings:
|
||||
if fgroup_substr is None:
|
||||
|
@ -105,16 +105,21 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
|
|||
|
||||
|
||||
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
|
||||
X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
|
||||
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||
X = imputer.fit_transform(X)
|
||||
|
||||
X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
|
||||
nb = GaussianNB()
|
||||
model_cv = cross_validate(
|
||||
nb,
|
||||
X=imputer.fit_transform(X),
|
||||
y=y,
|
||||
cv=StratifiedKFold(n_splits=5, shuffle=True),
|
||||
n_jobs=-1,
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
|
||||
|
||||
rfc = RandomForestClassifier(random_state=0)
|
||||
rfc.fit(X_train, y_train)
|
||||
y_pred = rfc.predict(X_test)
|
||||
|
||||
if print_flag:
|
||||
if include_group:
|
||||
|
@ -125,26 +130,34 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
|
|||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
|
||||
|
||||
acc = metrics.accuracy_score(y_test, y_pred)
|
||||
prec = metrics.precision_score(y_test, y_pred)
|
||||
rec = metrics.recall_score(y_test, y_pred)
|
||||
f1 = metrics.f1_score(y_test, y_pred)
|
||||
acc = np.mean(model_cv['test_accuracy'])
|
||||
acc_std = np.std(model_cv['test_accuracy'])
|
||||
|
||||
prec = np.mean(model_cv['test_precision'])
|
||||
prec_std = np.std(model_cv['test_precision'])
|
||||
|
||||
rec = np.mean(model_cv['test_recall'])
|
||||
rec_std = np.std(model_cv['test_recall'])
|
||||
|
||||
f1 = np.mean(model_cv['test_f1'])
|
||||
f1_std = np.std(model_cv['test_f1'])
|
||||
|
||||
if print_flag:
|
||||
print("************************************************")
|
||||
print("Accuracy", acc)
|
||||
print("Precision", prec)
|
||||
print("Recall", rec)
|
||||
print("F1", f1, "\n")
|
||||
print(f"Accuracy: {acc} (sd={acc_std})")
|
||||
print(f"Precison: {prec} (sd={prec_std})")
|
||||
print(f"Recall: {rec} (sd={rec_std})")
|
||||
print(f"F1: {f1} (sd={f1_std})\n")
|
||||
|
||||
if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score):
|
||||
if (not best_recall_score and not best_f1_score) or (rec > best_recall_score):
|
||||
best_sensor = fgroup_substr
|
||||
best_recall_score, best_f1_sore = rec, f1
|
||||
best_recall_score, best_f1_score = rec, f1
|
||||
best_recall_score_std, best_f1_score_std = rec_std, f1_std
|
||||
|
||||
return best_sensor, best_recall_score, best_f1_sore
|
||||
return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std
|
||||
|
||||
# %% [markdown]
|
||||
# ### sensor big feature groups (phone, empatica, demografical)
|
||||
# ### sensor big feature groups (phone, empatica, demographical)
|
||||
big_groups_substr = ["phone_", "empatica_", "demo_"]
|
||||
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)
|
||||
|
||||
|
@ -189,11 +202,11 @@ def find_sensor_group_features_importance(model_input, sensor_groups_strings):
|
|||
with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
|
||||
|
||||
|
||||
best_sensor, best_recall_score, best_f1_sore = \
|
||||
best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std = \
|
||||
make_predictions_with_sensor_groups(model_input,
|
||||
groups_substrings=sensor_groups_strings, include_group=True,
|
||||
with_cols=with_cols)
|
||||
sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore))
|
||||
sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std ))
|
||||
print(f"\nAdded sensor: {best_sensor}\n")
|
||||
sensor_groups_strings.remove(best_sensor)
|
||||
|
||||
|
@ -210,14 +223,17 @@ def sort_tuples_to_lists(list_of_tuples):
|
|||
to 4 decimal places. The third list, y_fscore, contains the third element of each tuple
|
||||
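rounded to 4 decimal places. The method returns all three lists, followed by the recall and F-score standard-deviation lists (the fourth and fifth elements of each tuple, rounded to 4 decimal places).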
|
||||
"""
|
||||
xs, y_recall, y_fscore = [], [], []
|
||||
xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
|
||||
for a_tuple in list_of_tuples:
|
||||
xs.append(a_tuple[0])
|
||||
y_recall.append(round(a_tuple[1], 4))
|
||||
y_fscore.append(round(a_tuple[2], 4))
|
||||
return xs, y_recall, y_fscore
|
||||
recall_std.append(round(a_tuple[3], 4))
|
||||
fscore_std.append(round(a_tuple[4], 4))
|
||||
return xs, y_recall, y_fscore, recall_std, fscore_std
|
||||
|
||||
def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, title="Sequential addition of features and its F1, and recall scores"):
|
||||
def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
|
||||
title="Sequential addition of features and its F1, and recall scores"):
|
||||
"""
|
||||
This function plots the sequential progress of feature addition scores using two subplots.
|
||||
The first subplot is for recall scores and the second subplot is for F1-scores.
|
||||
|
@ -230,13 +246,20 @@ def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore,
|
|||
"""
|
||||
|
||||
fig, ax = plt.subplots(nrows=2, sharex=True)
|
||||
ax[0].plot(xs, np.array(y_recall)+np.array(recall_std), linestyle=":", color='m') # Upper SD
|
||||
ax[0].plot(xs, y_recall, color='red')
|
||||
ax[0].plot(xs, np.array(y_recall)-np.array(recall_std), linestyle=":", color='m') # Lower SD
|
||||
mrec_indx = np.argmax(y_recall)
|
||||
ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
|
||||
ax[0].legend(["Upper std", "Mean Recall", "Lower std"])
|
||||
|
||||
ax[1].plot(xs, np.array(y_fscore)+np.array(fscore_std), linestyle=":", color='c') # Upper SD
|
||||
ax[1].plot(xs, y_fscore)
|
||||
ax[1].plot(xs, np.array(y_fscore)-np.array(fscore_std), linestyle=":", color='c') # Lower SD
|
||||
mfscore_indx = np.argmax(y_fscore)
|
||||
ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
|
||||
ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])
|
||||
|
||||
fig.set_size_inches(18.5, 10.5)
|
||||
|
||||
ax[0].title.set_text('Recall scores')
|
||||
|
@ -245,15 +268,23 @@ def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore,
|
|||
plt.xticks(rotation=90)
|
||||
plt.show()
|
||||
|
||||
# %%
|
||||
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
|
||||
"phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_data_yield_", "phone_light_",
|
||||
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
|
||||
|
||||
# sensors_features_groups = ["phone_", "empatica_", "demo_"]
|
||||
|
||||
# %%
|
||||
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
|
||||
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
|
||||
xs, y_recall, y_fscore = sort_tuples_to_lists(sensor_groups_importance_scores)
|
||||
xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Visualize sensors groups F1 and recall scores
|
||||
print(sensor_groups_importance_scores)
|
||||
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)
|
||||
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
|
||||
title="Sequential addition of sensors and its F1, and recall scores")
|
||||
|
||||
# %%
|
||||
# Take the most important feature group and investigate it feature-by-feature
|
||||
|
|
Loading…
Reference in New Issue