Change ML model and add CV to sequential feat_select. Add std lines to plots.

ml_pipeline
Primoz 2023-02-06 11:09:15 +01:00
parent e3aef2dae7
commit ea3f805ba7
1 changed files with 61 additions and 30 deletions

View File

@ -23,8 +23,8 @@ import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
from sklearn.impute import SimpleImputer from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn import metrics from sklearn import metrics
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}} # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
@ -82,7 +82,7 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
while excluding the 'pid' and 'target' columns. It then splits the data into training while excluding the 'pid' and 'target' columns. It then splits the data into training
and test sets, using a test size of 0.25 for the first split and 0.2 for the second split. and test sets, using a test size of 0.25 for the first split and 0.2 for the second split.
A SimpleImputer is used to fill in missing values with median values. A SimpleImputer is used to fill in missing values with median values.
A RandomForestClassifier is then used to fit the training set and make predictions A GaussianNB classifier is then used to fit the training set and make predictions
on the test set. Finally, accuracy, precision, recall and F1 scores are printed on the test set. Finally, accuracy, precision, recall and F1 scores are printed
for each substring group depending on whether or not include_group for each substring group depending on whether or not include_group
is set to True or False. is set to True or False.
@ -90,7 +90,7 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
""" """
best_sensor = None best_sensor = None
best_recall_score, best_f1_sore = None, None best_recall_score, best_f1_score = None, None
for fgroup_substr in groups_substrings: for fgroup_substr in groups_substrings:
if fgroup_substr is None: if fgroup_substr is None:
@ -105,16 +105,21 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target'] X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
imputer = SimpleImputer(missing_values=np.nan, strategy='median') imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)
X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2) nb = GaussianNB()
model_cv = cross_validate(
nb,
X=imputer.fit_transform(X),
y=y,
cv=StratifiedKFold(n_splits=5, shuffle=True),
n_jobs=-1,
scoring=('accuracy', 'precision', 'recall', 'f1')
)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2) X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
if print_flag: if print_flag:
if include_group: if include_group:
@ -125,26 +130,34 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.") warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
acc = metrics.accuracy_score(y_test, y_pred) acc = np.mean(model_cv['test_accuracy'])
prec = metrics.precision_score(y_test, y_pred) acc_std = np.std(model_cv['test_accuracy'])
rec = metrics.recall_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred) prec = np.mean(model_cv['test_precision'])
prec_std = np.std(model_cv['test_precision'])
rec = np.mean(model_cv['test_recall'])
rec_std = np.std(model_cv['test_recall'])
f1 = np.mean(model_cv['test_f1'])
f1_std = np.std(model_cv['test_f1'])
if print_flag: if print_flag:
print("************************************************") print("************************************************")
print("Accuracy", acc) print(f"Accuracy: {acc} (sd={acc_std})")
print("Precision", prec) print(f"Precison: {prec} (sd={prec_std})")
print("Recall", rec) print(f"Recall: {rec} (sd={rec_std})")
print("F1", f1, "\n") print(f"F1: {f1} (sd={f1_std})\n")
if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score): if (not best_recall_score and not best_f1_score) or (rec > best_recall_score):
best_sensor = fgroup_substr best_sensor = fgroup_substr
best_recall_score, best_f1_sore = rec, f1 best_recall_score, best_f1_score = rec, f1
best_recall_score_std, best_f1_score_std = rec_std, f1_std
return best_sensor, best_recall_score, best_f1_sore return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std
# %% [markdown] # %% [markdown]
# ### sensor big feature groups (phone, empatica, demografical) # ### sensor big feature groups (phone, empatica, demographical)
big_groups_substr = ["phone_", "empatica_", "demo_"] big_groups_substr = ["phone_", "empatica_", "demo_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False) make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)
@ -189,11 +202,11 @@ def find_sensor_group_features_importance(model_input, sensor_groups_strings):
with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)] with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
best_sensor, best_recall_score, best_f1_sore = \ best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std = \
make_predictions_with_sensor_groups(model_input, make_predictions_with_sensor_groups(model_input,
groups_substrings=sensor_groups_strings, include_group=True, groups_substrings=sensor_groups_strings, include_group=True,
with_cols=with_cols) with_cols=with_cols)
sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore)) sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std ))
print(f"\nAdded sensor: {best_sensor}\n") print(f"\nAdded sensor: {best_sensor}\n")
sensor_groups_strings.remove(best_sensor) sensor_groups_strings.remove(best_sensor)
@ -210,14 +223,17 @@ def sort_tuples_to_lists(list_of_tuples):
to 4 decimal places. The third list, y_fscore, contains the third element of each tuple to 4 decimal places. The third list, y_fscore, contains the third element of each tuple
rounded to 4 decimal places. The method returns all three lists. rounded to 4 decimal places. The method returns all three lists.
""" """
xs, y_recall, y_fscore = [], [], [] xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
for a_tuple in list_of_tuples: for a_tuple in list_of_tuples:
xs.append(a_tuple[0]) xs.append(a_tuple[0])
y_recall.append(round(a_tuple[1], 4)) y_recall.append(round(a_tuple[1], 4))
y_fscore.append(round(a_tuple[2], 4)) y_fscore.append(round(a_tuple[2], 4))
return xs, y_recall, y_fscore recall_std.append(round(a_tuple[3], 4))
fscore_std.append(round(a_tuple[4], 4))
return xs, y_recall, y_fscore, recall_std, fscore_std
def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, title="Sequential addition of features and its F1, and recall scores"): def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
title="Sequential addition of features and its F1, and recall scores"):
""" """
This function plots the sequential progress of feature addition scores using two subplots. This function plots the sequential progress of feature addition scores using two subplots.
The first subplot is for recall scores and the second subplot is for F1-scores. The first subplot is for recall scores and the second subplot is for F1-scores.
@ -230,13 +246,20 @@ def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore,
""" """
fig, ax = plt.subplots(nrows=2, sharex=True) fig, ax = plt.subplots(nrows=2, sharex=True)
ax[0].plot(xs, np.array(y_recall)+np.array(recall_std), linestyle=":", color='m') # Upper SD
ax[0].plot(xs, y_recall, color='red') ax[0].plot(xs, y_recall, color='red')
ax[0].plot(xs, np.array(y_recall)-np.array(recall_std), linestyle=":", color='m') # Lower SD
mrec_indx = np.argmax(y_recall) mrec_indx = np.argmax(y_recall)
ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black') ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
ax[0].legend(["Upper std", "Mean Recall", "Lower std"])
ax[1].plot(xs, np.array(y_fscore)+np.array(fscore_std), linestyle=":", color='c') # Upper SD
ax[1].plot(xs, y_fscore) ax[1].plot(xs, y_fscore)
ax[1].plot(xs, np.array(y_fscore)-np.array(fscore_std), linestyle=":", color='c') # Lower SD
mfscore_indx = np.argmax(y_fscore) mfscore_indx = np.argmax(y_fscore)
ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black') ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])
fig.set_size_inches(18.5, 10.5) fig.set_size_inches(18.5, 10.5)
ax[0].title.set_text('Recall scores') ax[0].title.set_text('Recall scores')
@ -245,15 +268,23 @@ def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore,
plt.xticks(rotation=90) plt.xticks(rotation=90)
plt.show() plt.show()
# %%
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
"phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_data_yield_", "phone_light_",
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
# sensors_features_groups = ["phone_", "empatica_", "demo_"]
# %%
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr) # sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups) sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
xs, y_recall, y_fscore = sort_tuples_to_lists(sensor_groups_importance_scores) xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)
# %% [markdown] # %% [markdown]
# ### Visualize sensors groups F1 and recall scores # ### Visualize sensors groups F1 and recall scores
print(sensor_groups_importance_scores) print(sensor_groups_importance_scores)
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore) plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
title="Sequential addition of sensors and its F1, and recall scores")
# %% # %%
# Take the most important feature group and investigate it feature-by-feature # Take the most important feature group and investigate it feature-by-feature