Change ML model and add CV to sequential feat_select. Add std lines to plots.
parent e3aef2dae7
commit ea3f805ba7
@@ -23,8 +23,8 @@ import matplotlib.pyplot as plt
 import pandas as pd
 
 from sklearn.impute import SimpleImputer
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import GaussianNB
+from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
 from sklearn import metrics
 
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
@@ -82,7 +82,7 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
     while excluding the 'pid' and 'target' columns. It then splits the data into training
     and test sets, using a test size of 0.25 for the first split and 0.2 for the second split.
     A SimpleImputer is used to fill in missing values with median values.
-    A RandomForestClassifier is then used to fit the training set and make predictions
+    A GaussianNB classifier is then used to fit the training set and make predictions
     on the test set. Finally, accuracy, precision, recall and F1 scores are printed
     for each substring group depending on whether or not include_group
     is set to True or False.
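Note: the pipeline this docstring describes reduces to a handful of sklearn calls. A minimal, self-contained sketch for readers skimming the diff (the toy DataFrame and column names are illustrative stand-ins for the repo's `model_input`, not its real data):

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Toy stand-in for the real feature frame: numeric features,
# a participant id column ('pid') and a binary 'target'.
rng = np.random.default_rng(0)
df = pd.DataFrame({"phone_f1": rng.normal(size=200),
                   "empatica_f1": rng.normal(size=200),
                   "pid": np.arange(200),
                   "target": rng.integers(0, 2, size=200)})
df.loc[::7, "phone_f1"] = np.nan  # inject missing values

X, y = df.drop(columns=["pid", "target"]), df["target"]
X = SimpleImputer(missing_values=np.nan, strategy="median").fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

y_pred = GaussianNB().fit(X_train, y_train).predict(X_test)
print(metrics.recall_score(y_test, y_pred), metrics.f1_score(y_test, y_pred))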
@@ -90,7 +90,7 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
     """
 
     best_sensor = None
-    best_recall_score, best_f1_sore = None, None
+    best_recall_score, best_f1_score = None, None
 
     for fgroup_substr in groups_substrings:
         if fgroup_substr is None:
@@ -105,16 +105,21 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
 
 
         X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
 
-        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
-        X = imputer.fit_transform(X)
 
         X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
 
+        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+
+        nb = GaussianNB()
+        model_cv = cross_validate(
+            nb,
+            X=imputer.fit_transform(X),
+            y=y,
+            cv=StratifiedKFold(n_splits=5, shuffle=True),
+            n_jobs=-1,
+            scoring=('accuracy', 'precision', 'recall', 'f1')
+        )
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
-
-        rfc = RandomForestClassifier(random_state=0)
-        rfc.fit(X_train, y_train)
-        y_pred = rfc.predict(X_test)
 
         if print_flag:
             if include_group:
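For reference, `cross_validate` returns a dict of per-fold arrays keyed `test_<metric>`; the next hunk reduces them to means and standard deviations. A quick sketch on synthetic data showing that return shape:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=200, random_state=19)
cv_res = cross_validate(GaussianNB(), X=X, y=y,
                        cv=StratifiedKFold(n_splits=5, shuffle=True),
                        scoring=('accuracy', 'precision', 'recall', 'f1'))
print(cv_res['test_recall'])  # one score per fold
print(np.mean(cv_res['test_recall']), np.std(cv_res['test_recall']))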
@@ -125,26 +130,34 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
 
-            acc = metrics.accuracy_score(y_test, y_pred)
-            prec = metrics.precision_score(y_test, y_pred)
-            rec = metrics.recall_score(y_test, y_pred)
-            f1 = metrics.f1_score(y_test, y_pred)
+            acc = np.mean(model_cv['test_accuracy'])
+            acc_std = np.std(model_cv['test_accuracy'])
+
+            prec = np.mean(model_cv['test_precision'])
+            prec_std = np.std(model_cv['test_precision'])
+
+            rec = np.mean(model_cv['test_recall'])
+            rec_std = np.std(model_cv['test_recall'])
+
+            f1 = np.mean(model_cv['test_f1'])
+            f1_std = np.std(model_cv['test_f1'])
 
         if print_flag:
             print("************************************************")
-            print("Accuracy", acc)
-            print("Precision", prec)
-            print("Recall", rec)
-            print("F1", f1, "\n")
+            print(f"Accuracy: {acc} (sd={acc_std})")
+            print(f"Precision: {prec} (sd={prec_std})")
+            print(f"Recall: {rec} (sd={rec_std})")
+            print(f"F1: {f1} (sd={f1_std})\n")
 
-        if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score):
+        if (not best_recall_score and not best_f1_score) or (rec > best_recall_score):
             best_sensor = fgroup_substr
-            best_recall_score, best_f1_sore = rec, f1
+            best_recall_score, best_f1_score = rec, f1
+            best_recall_score_std, best_f1_score_std = rec_std, f1_std
 
-    return best_sensor, best_recall_score, best_f1_sore
+    return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std
 
 
 # %% [markdown]
-# ### sensor big feature groups (phone, empatica, demografical)
+# ### sensor big feature groups (phone, empatica, demographical)
 big_groups_substr = ["phone_", "empatica_", "demo_"]
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)
@@ -189,11 +202,11 @@ def find_sensor_group_features_importance(model_input, sensor_groups_strings):
         with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
 
 
-        best_sensor, best_recall_score, best_f1_sore = \
+        best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std = \
             make_predictions_with_sensor_groups(model_input,
                                                 groups_substrings=sensor_groups_strings, include_group=True,
                                                 with_cols=with_cols)
-        sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore))
+        sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std))
         print(f"\nAdded sensor: {best_sensor}\n")
         sensor_groups_strings.remove(best_sensor)
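Stripped of the modelling details, the function patched above is a greedy forward selection over sensor groups: each round keeps the group whose addition maximizes recall. A compact sketch of that control flow (`score_subset` is a hypothetical stand-in for the cross-validated evaluation above):

def forward_select(candidates, score_subset):
    # Each round, evaluate every remaining candidate together with the
    # already-selected groups and keep the one with the best score.
    selected, history = [], []
    while candidates:
        best = max(candidates, key=lambda c: score_subset(selected + [c]))
        selected.append(best)
        candidates.remove(best)  # mirrors sensor_groups_strings.remove(...)
        history.append((best, score_subset(selected)))
    return history

# Usage sketch: history = forward_select(["phone_", "empatica_", "demo_"], my_scorer)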
@@ -210,14 +223,17 @@ def sort_tuples_to_lists(list_of_tuples):
     to 4 decimal places. The third list, y_fscore, contains the third element of each tuple
     rounded to 4 decimal places. The method returns all three lists.
     """
-    xs, y_recall, y_fscore = [], [], []
+    xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
     for a_tuple in list_of_tuples:
         xs.append(a_tuple[0])
         y_recall.append(round(a_tuple[1], 4))
         y_fscore.append(round(a_tuple[2], 4))
-    return xs, y_recall, y_fscore
+        recall_std.append(round(a_tuple[3], 4))
+        fscore_std.append(round(a_tuple[4], 4))
+    return xs, y_recall, y_fscore, recall_std, fscore_std
 
 
-def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, title="Sequential addition of features and its F1, and recall scores"):
+def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+                                                        title="Sequential addition of features and its F1, and recall scores"):
     """
     This function plots the sequential progress of feature addition scores using two subplots.
     The first subplot is for recall scores and the second subplot is for F1-scores.
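To make the new 5-tuple unpacking concrete, here is sort_tuples_to_lists on two illustrative score tuples (the values are invented for the example):

scores = [("phone_locations_", 0.71234, 0.65321, 0.04111, 0.03987),
          ("empatica_temperature_", 0.69888, 0.64102, 0.05012, 0.04233)]
xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(scores)
# xs         == ['phone_locations_', 'empatica_temperature_']
# y_recall   == [0.7123, 0.6989]
# recall_std == [0.0411, 0.0501]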
@@ -230,13 +246,20 @@ def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore,
     """
 
     fig, ax = plt.subplots(nrows=2, sharex=True)
+    ax[0].plot(xs, np.array(y_recall)+np.array(recall_std), linestyle=":", color='m')  # Upper SD
     ax[0].plot(xs, y_recall, color='red')
+    ax[0].plot(xs, np.array(y_recall)-np.array(recall_std), linestyle=":", color='m')  # Lower SD
     mrec_indx = np.argmax(y_recall)
     ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
+    ax[0].legend(["Upper std", "Mean Recall", "Lower std"])
 
+    ax[1].plot(xs, np.array(y_fscore)+np.array(fscore_std), linestyle=":", color='c')  # Upper SD
     ax[1].plot(xs, y_fscore)
+    ax[1].plot(xs, np.array(y_fscore)-np.array(fscore_std), linestyle=":", color='c')  # Lower SD
     mfscore_indx = np.argmax(y_fscore)
     ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
+    ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])
 
     fig.set_size_inches(18.5, 10.5)
 
     ax[0].title.set_text('Recall scores')
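The ±1 std envelope above is drawn as two dotted lines plus a three-entry legend. An alternative rendering (not part of this commit) is a shaded band via `fill_between`; a sketch with made-up values:

import numpy as np
import matplotlib.pyplot as plt

labels = ["phone_locations_", "empatica_temperature_", "phone_calls_"]
mean = np.array([0.70, 0.72, 0.71])
std = np.array([0.04, 0.03, 0.05])

x = np.arange(len(labels))
fig, ax = plt.subplots()
ax.plot(x, mean, color="red", label="Mean Recall")
ax.fill_between(x, mean - std, mean + std, alpha=0.2, label="±1 std")  # shaded band
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=90)
ax.legend()
plt.show()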
@@ -245,15 +268,23 @@ def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore,
     plt.xticks(rotation=90)
     plt.show()
 
 
+# %%
+sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                           "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_data_yield_", "phone_light_",
+                           "phone_locations_", "phone_messages", "phone_screen_"]  # , "phone_speech_"]
+
+# sensors_features_groups = ["phone_", "empatica_", "demo_"]
+
+# %%
 # sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
 sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
-xs, y_recall, y_fscore = sort_tuples_to_lists(sensor_groups_importance_scores)
+xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)
 
 # %% [markdown]
 # ### Visualize sensors groups F1 and recall scores
 print(sensor_groups_importance_scores)
-plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)
+plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+                                                    title="Sequential addition of sensors and its F1, and recall scores")
 
 # %%
 # Take the most important feature group and investigate it feature-by-feature