Add visualization sections for sequential addition of sensors' features.
parent
07ef72dec5
commit
b286753696
|
@ -71,8 +71,9 @@ demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control'
|
|||
|
||||
# %%
|
||||
# Get phone and non-phone columns
|
||||
import warnings
|
||||
|
||||
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[]):
|
||||
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
|
||||
"""
|
||||
This function makes predictions with sensor groups.
|
||||
It takes in a dataframe (df), a list of group substrings (groups_substrings)
|
||||
|
@ -105,34 +106,36 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
|
|||
|
||||
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
|
||||
|
||||
print(fgroup_substr, X.shape)
|
||||
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||
X = imputer.fit_transform(X)
|
||||
|
||||
X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)
|
||||
X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
|
||||
|
||||
rfc = RandomForestClassifier(random_state=0)
|
||||
rfc.fit(X_train, y_train)
|
||||
y_pred = rfc.predict(X_test)
|
||||
|
||||
if include_group:
|
||||
print("\nPrediction with", fgroup_substr)
|
||||
else:
|
||||
print("\nPrediction without", fgroup_substr)
|
||||
if print_flag:
|
||||
if include_group:
|
||||
print("\nPrediction with", fgroup_substr)
|
||||
else:
|
||||
print("\nPrediction without", fgroup_substr)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
|
||||
|
||||
acc = metrics.accuracy_score(y_test, y_pred)
|
||||
prec = metrics.precision_score(y_test, y_pred)
|
||||
rec = metrics.recall_score(y_test, y_pred)
|
||||
f1 = metrics.f1_score(y_test, y_pred)
|
||||
acc = metrics.accuracy_score(y_test, y_pred)
|
||||
prec = metrics.precision_score(y_test, y_pred)
|
||||
rec = metrics.recall_score(y_test, y_pred)
|
||||
f1 = metrics.f1_score(y_test, y_pred)
|
||||
|
||||
print("************************************************")
|
||||
print("Accuracy", acc)
|
||||
print("Precision", prec)
|
||||
print("Recall", rec)
|
||||
print("F1", f1, "\n")
|
||||
if print_flag:
|
||||
print("************************************************")
|
||||
print("Accuracy", acc)
|
||||
print("Precision", prec)
|
||||
print("Recall", rec)
|
||||
print("F1", f1, "\n")
|
||||
|
||||
if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score):
|
||||
best_sensor = fgroup_substr
|
||||
|
@ -166,6 +169,17 @@ sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "e
|
|||
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
|
||||
# %%
|
||||
def find_sensor_group_features_importance(model_input, sensor_groups_strings):
|
||||
"""
|
||||
This function finds the importance of sensor groups for a given model input. It takes two parameters:
|
||||
model_input and sensor_groups_strings. It creates an empty list called sensor_importance_scores,
|
||||
which will be populated with tuples containing the best sensor, its recall score, and its F1 score.
|
||||
It then makes a copy of the model input and the sensor groups strings. It then loops through each group
|
||||
in the list of strings, creating a list of important columns from the sensor importance scores list.
|
||||
It then calls make_predictions_with_sensor_groups to determine the best sensor, its recall score,
|
||||
and its F1 score. These values are added to the sensor importance scores list as a tuple. The function
|
||||
then removes that best sensor from the list of strings before looping again until all groups have been evaluated.
|
||||
Finally, it returns the populated list of tuples containing all sensors' scores.
|
||||
"""
|
||||
sensor_importance_scores = []
|
||||
model_input = model_input.copy()
|
||||
sensor_groups_strings = sensor_groups_strings.copy()
|
||||
|
@ -185,15 +199,86 @@ def find_sensor_group_features_importance(model_input, sensor_groups_strings):
|
|||
|
||||
return sensor_importance_scores
|
||||
|
||||
|
||||
# %%
|
||||
# Method for sorting list of tuples into 3 lists
|
||||
def sort_tuples_to_lists(list_of_tuples):
    """
    Split a sequence of 3-element records into three parallel lists.

    Despite the name, no sorting is performed: the input order is preserved.

    Args:
        list_of_tuples: Iterable of indexable records. Element 0 is copied
            unchanged; elements 1 and 2 are rounded to 4 decimal places.
            (In this file the records are presumably
            (sensor/feature name, recall, F1) tuples -- confirm against
            find_sensor_group_features_importance.)

    Returns:
        A tuple ``(xs, y_recall, y_fscore)`` of lists: the first elements,
        the rounded second elements, and the rounded third elements.
        An empty input yields three empty lists.
    """
    # Materialize once so that one-shot iterables (e.g. generators) can be
    # traversed by each of the three comprehensions below.
    rows = list(list_of_tuples)
    xs = [row[0] for row in rows]
    y_recall = [round(row[1], 4) for row in rows]
    y_fscore = [round(row[2], 4) for row in rows]
    return xs, y_recall, y_fscore
|
||||
|
||||
def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, title="Sequential addition of features and its F1, and recall scores"):
    """
    Plot recall and F1 scores as two stacked line charts sharing one x-axis.

    The top subplot shows ``y_recall`` (red line), the bottom subplot shows
    ``y_fscore``. In each subplot the maximum value is marked with a black
    dot (the first maximum if there are ties, as returned by ``np.argmax``).
    The figure is sized 18.5 x 10.5 inches, x tick labels are rotated by
    90 degrees, and the figure is displayed with ``plt.show()``.

    Args:
        xs: Sequence of x-axis values (labels), one per score.
        y_recall: Sequence of recall scores, parallel to ``xs``.
        y_fscore: Sequence of F1 scores, parallel to ``xs``.
        title: Figure-level title.

    Returns:
        None. If any of the three sequences is empty, nothing is plotted
        and a short notice is printed instead.
    """
    # np.argmax raises ValueError on an empty sequence; skip the figure
    # instead of crashing the caller (e.g. a loop over several groups).
    if len(xs) == 0 or len(y_recall) == 0 or len(y_fscore) == 0:
        print("Nothing to plot for:", title)
        return

    fig, ax = plt.subplots(nrows=2, sharex=True)
    fig.set_size_inches(18.5, 10.5)

    # Top subplot: recall curve with its maximum highlighted.
    ax[0].plot(xs, y_recall, color='red')
    mrec_indx = np.argmax(y_recall)
    ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
    ax[0].set_title('Recall scores')

    # Bottom subplot: F1 curve with its maximum highlighted.
    ax[1].plot(xs, y_fscore)
    mfscore_indx = np.argmax(y_fscore)
    ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
    ax[1].set_title('F1-scores')

    fig.suptitle(title, fontsize=14)
    # plt.xticks acts on the current axes, i.e. the bottom subplot created
    # last; with sharex=True only that subplot shows x tick labels.
    plt.xticks(rotation=90)
    plt.show()
|
||||
|
||||
|
||||
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
# Score the sensor groups, then split the resulting tuples into plot-ready lists.
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
xs, y_recall, y_fscore = sort_tuples_to_lists(sensor_groups_importance_scores)

# %% [markdown]
# ### Visualize sensors groups F1 and recall scores
print(sensor_groups_importance_scores)
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)

# %%
# Take the most important feature group and investigate it feature-by-feature
best_sensor_group = sensor_groups_importance_scores[0][0]  # take the highest rated sensor group
best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]

# Run the (expensive) sequential evaluation once; its result is the only one used below.
best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)

xs, y_recall, y_fscore = sort_tuples_to_lists(best_sensor_features_scores)

# %% [markdown]
# ### Visualize best sensor's F1 and recall scores
print(best_sensor_features_scores)
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)

# %%
# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature

# TODO: visualization of sensor_group and best_sensor_group importance (Recall/f1-score chart)
for sensor_group in sensor_groups_importance_scores:
    # Element 0 of each score tuple is the group's column-name prefix.
    current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
    current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
    xs, y_recall, y_fscore = sort_tuples_to_lists(current_sensor_features_scores)

    plot_sequential_progress_of_feature_addition_scores(
        xs, y_recall, y_fscore,
        title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores")
|
||||
|
||||
|
||||
# %%
|
||||
|
|
Loading…
Reference in New Issue