Add vizualization sections for sequential addition of sensors' features.

ml_pipeline
Primoz 2023-02-01 13:51:56 +01:00
parent 07ef72dec5
commit b286753696
1 changed files with 106 additions and 21 deletions

View File

@ -71,8 +71,9 @@ demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control'
# %%
# Get phone and non-phone columns
import warnings
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[]):
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
"""
This function makes predictions with sensor groups.
It takes in a dataframe (df), a list of group substrings (groups_substrings)
@ -105,34 +106,36 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
print(fgroup_substr, X.shape)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)
X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)
X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
if include_group:
print("\nPrediction with", fgroup_substr)
else:
print("\nPrediction without", fgroup_substr)
if print_flag:
if include_group:
print("\nPrediction with", fgroup_substr)
else:
print("\nPrediction without", fgroup_substr)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
acc = metrics.accuracy_score(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred)
rec = metrics.recall_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)
acc = metrics.accuracy_score(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred)
rec = metrics.recall_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)
print("************************************************")
print("Accuracy", acc)
print("Precision", prec)
print("Recall", rec)
print("F1", f1, "\n")
if print_flag:
print("************************************************")
print("Accuracy", acc)
print("Precision", prec)
print("Recall", rec)
print("F1", f1, "\n")
if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score):
best_sensor = fgroup_substr
@ -166,6 +169,17 @@ sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "e
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
# %%
def find_sensor_group_features_importance(model_input, sensor_groups_strings):
"""
This function finds the importance of sensor groups for a given model input. It takes two parameters:
model_input and sensor_groups_strings. It creates an empty list called sensor_importance_scores,
which will be populated with tuples containing the best sensor, its recall score, and its F1 score.
It then makes a copy of the model input and the sensor groups strings. It then loops through each group
in the list of strings, creating a list of important columns from the sensor importance scores list.
It then calls make_predictions_with_sensor_groups to determine the best sensor, its recall score,
and its F1 score. These values are added to the sensor importance scores list as a tuple. The function
then removes that best sensor from the list of strings before looping again until all groups have been evaluated.
Finally, it returns the populated list of tuples containing all sensors' scores.
"""
sensor_importance_scores = []
model_input = model_input.copy()
sensor_groups_strings = sensor_groups_strings.copy()
@ -185,15 +199,86 @@ def find_sensor_group_features_importance(model_input, sensor_groups_strings):
return sensor_importance_scores
# %%
# Method for sorting list of tuples into 3 lists
def sort_tuples_to_lists(list_of_tuples):
"""
sort_tuples_to_lists(list_of_tuples) is a method that takes in a list of tuples as an argument
and sorts them into three separate lists. The first list, xs, contains the first element
of each tuple. The second list, yrecall, contains the second element of each tuple rounded
to 4 decimal places. The third list, y_fscore, contains the third element of each tuple
rounded to 4 decimal places. The method returns all three lists.
"""
xs, y_recall, y_fscore = [], [], []
for a_tuple in list_of_tuples:
xs.append(a_tuple[0])
y_recall.append(round(a_tuple[1], 4))
y_fscore.append(round(a_tuple[2], 4))
return xs, y_recall, y_fscore
def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, title="Sequential addition of features and its F1, and recall scores"):
"""
This function plots the sequential progress of feature addition scores using two subplots.
The first subplot is for recall scores and the second subplot is for F1-scores.
The parameters xs, yrecall, and yfscore are used to plot the data on the respective axes.
The title of the plot can be specified by the user using the parameter title.
The maximum recall index and maximum F1-score index are also plotted using a black dot.
The figure size is set to 18.5 inches in width and 10.5 inches in height,
and the x-axis labels are rotated by 90 degrees. Finally, the plot is displayed
using plt.show().
"""
fig, ax = plt.subplots(nrows=2, sharex=True)
ax[0].plot(xs, y_recall, color='red')
mrec_indx = np.argmax(y_recall)
ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
ax[1].plot(xs, y_fscore)
mfscore_indx = np.argmax(y_fscore)
ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
fig.set_size_inches(18.5, 10.5)
ax[0].title.set_text('Recall scores')
ax[1].title.set_text('F1-scores')
plt.suptitle(title, fontsize=14)
plt.xticks(rotation=90)
plt.show()
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
xs, y_recall, y_fscore = sort_tuples_to_lists(sensor_groups_importance_scores)
# %% [markdown]
# ### Visualize sensors groups F1 and recall scores
print(sensor_groups_importance_scores)
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)
# %%
best_sensor_group = sensor_groups_importance_scores[0][0]
# Take the most important feature group and investigate it feature-by-feature
best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
best_sensor_group_importance_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
xs, y_recall, y_fscore = sort_tuples_to_lists(best_sensor_features_scores)
# %% [markdown]
# ### Visualize best sensor's F1 and recall scores
print(best_sensor_features_scores)
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)
# %%
# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
# TODO: visualization of sensor_group and best_sensor_group importance (Recall/f1-score chart)
for sensor_group in sensor_groups_importance_scores:
current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
xs, y_recall, y_fscore = sort_tuples_to_lists(current_sensor_features_scores)
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore,
title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores")
# %%