Add visualization sections for sequential addition of sensors' features.
parent
07ef72dec5
commit
b286753696
|
@ -71,8 +71,9 @@ demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control'
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
# Get phone and non-phone columns
|
# Get phone and non-phone columns
|
||||||
|
import warnings
|
||||||
|
|
||||||
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[]):
|
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
|
||||||
"""
|
"""
|
||||||
This function makes predictions with sensor groups.
|
This function makes predictions with sensor groups.
|
||||||
It takes in a dataframe (df), a list of group substrings (groups_substrings)
|
It takes in a dataframe (df), a list of group substrings (groups_substrings)
|
||||||
|
@ -105,34 +106,36 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
|
||||||
|
|
||||||
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
|
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
|
||||||
|
|
||||||
print(fgroup_substr, X.shape)
|
|
||||||
|
|
||||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||||
X = imputer.fit_transform(X)
|
X = imputer.fit_transform(X)
|
||||||
|
|
||||||
X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
|
X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
|
||||||
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)
|
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
|
||||||
|
|
||||||
rfc = RandomForestClassifier(random_state=0)
|
rfc = RandomForestClassifier(random_state=0)
|
||||||
rfc.fit(X_train, y_train)
|
rfc.fit(X_train, y_train)
|
||||||
y_pred = rfc.predict(X_test)
|
y_pred = rfc.predict(X_test)
|
||||||
|
|
||||||
if include_group:
|
if print_flag:
|
||||||
print("\nPrediction with", fgroup_substr)
|
if include_group:
|
||||||
else:
|
print("\nPrediction with", fgroup_substr)
|
||||||
print("\nPrediction without", fgroup_substr)
|
else:
|
||||||
|
print("\nPrediction without", fgroup_substr)
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
|
||||||
|
|
||||||
acc = metrics.accuracy_score(y_test, y_pred)
|
acc = metrics.accuracy_score(y_test, y_pred)
|
||||||
prec = metrics.precision_score(y_test, y_pred)
|
prec = metrics.precision_score(y_test, y_pred)
|
||||||
rec = metrics.recall_score(y_test, y_pred)
|
rec = metrics.recall_score(y_test, y_pred)
|
||||||
f1 = metrics.f1_score(y_test, y_pred)
|
f1 = metrics.f1_score(y_test, y_pred)
|
||||||
|
|
||||||
print("************************************************")
|
if print_flag:
|
||||||
print("Accuracy", acc)
|
print("************************************************")
|
||||||
print("Precision", prec)
|
print("Accuracy", acc)
|
||||||
print("Recall", rec)
|
print("Precision", prec)
|
||||||
print("F1", f1, "\n")
|
print("Recall", rec)
|
||||||
|
print("F1", f1, "\n")
|
||||||
|
|
||||||
if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score):
|
if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score):
|
||||||
best_sensor = fgroup_substr
|
best_sensor = fgroup_substr
|
||||||
|
@ -166,6 +169,17 @@ sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "e
|
||||||
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
|
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
|
||||||
# %%
|
# %%
|
||||||
def find_sensor_group_features_importance(model_input, sensor_groups_strings):
|
def find_sensor_group_features_importance(model_input, sensor_groups_strings):
|
||||||
|
"""
|
||||||
|
This function finds the importance of sensor groups for a given model input. It takes two parameters:
|
||||||
|
model_input and sensor_groups_strings. It creates an empty list called sensor_importance_scores,
|
||||||
|
which will be populated with tuples containing the best sensor, its recall score, and its F1 score.
|
||||||
|
It then makes a copy of the model input and the sensor groups strings. It then loops through each group
|
||||||
|
in the list of strings, creating a list of important columns from the sensor importance scores list.
|
||||||
|
It then calls make_predictions_with_sensor_groups to determine the best sensor, its recall score,
|
||||||
|
and its F1 score. These values are added to the sensor importance scores list as a tuple. The function
|
||||||
|
then removes that best sensor from the list of strings before looping again until all groups have been evaluated.
|
||||||
|
Finally, it returns the populated list of tuples containing all sensors' scores.
|
||||||
|
"""
|
||||||
sensor_importance_scores = []
|
sensor_importance_scores = []
|
||||||
model_input = model_input.copy()
|
model_input = model_input.copy()
|
||||||
sensor_groups_strings = sensor_groups_strings.copy()
|
sensor_groups_strings = sensor_groups_strings.copy()
|
||||||
|
@ -185,15 +199,86 @@ def find_sensor_group_features_importance(model_input, sensor_groups_strings):
|
||||||
|
|
||||||
return sensor_importance_scores
|
return sensor_importance_scores
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Helper that splits a list of 3-tuples into three parallel lists (input order is kept)
|
||||||
|
def sort_tuples_to_lists(list_of_tuples):
    """
    Split a list of tuples into three parallel lists.

    Despite the name, no reordering takes place: the elements are emitted in
    the same order in which the tuples appear in ``list_of_tuples``.

    Parameters:
        list_of_tuples: iterable of indexable items with at least three
            elements. Element 0 is copied as-is; elements 1 and 2 must be
            numbers accepted by ``round``.

    Returns:
        A tuple ``(xs, y_recall, y_fscore)`` where
        ``xs`` holds the first element of each tuple,
        ``y_recall`` holds the second element rounded to 4 decimal places and
        ``y_fscore`` holds the third element rounded to 4 decimal places.
        All three lists are empty when the input is empty.
    """
    xs, y_recall, y_fscore = [], [], []
    # A single pass keeps the function usable with one-shot iterables as well.
    for a_tuple in list_of_tuples:
        xs.append(a_tuple[0])
        y_recall.append(round(a_tuple[1], 4))
        y_fscore.append(round(a_tuple[2], 4))
    return xs, y_recall, y_fscore
|
||||||
|
|
||||||
|
def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, title="Sequential addition of features and its F1, and recall scores"):
    """
    Plot recall and F1 scores of sequentially added features on two stacked subplots.

    The upper subplot shows ``y_recall`` over ``xs`` (red line), the lower one
    shows ``y_fscore`` over ``xs``. In each subplot the position of the maximum
    score is highlighted with a black dot. The subplots share the x-axis, the
    figure is sized to 18.5 x 10.5 inches, the x tick labels are rotated by
    90 degrees, and the figure is displayed with ``plt.show()``.

    Parameters:
        xs: sequence of x-axis labels, one per score.
        y_recall: sequence of recall scores aligned with ``xs``.
        y_fscore: sequence of F1 scores aligned with ``xs``.
        title: overall figure title.

    Returns:
        None. When any of the three sequences is empty nothing is drawn and a
        warning is issued instead, because ``np.argmax`` cannot be evaluated on
        an empty sequence.
    """
    # Guard first, before any figure is created, so an empty result (e.g. a
    # sensor group without matching columns) does not abort a plotting loop
    # or leave a dangling empty figure behind.
    if min(len(xs), len(y_recall), len(y_fscore)) == 0:
        import warnings
        warnings.warn("Nothing to plot: received an empty list of scores.")
        return

    fig, ax = plt.subplots(nrows=2, sharex=True)

    # Recall curve with its maximum highlighted.
    ax[0].plot(xs, y_recall, color='red')
    mrec_indx = np.argmax(y_recall)
    ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "o", color='black')

    # F1 curve with its maximum highlighted.
    ax[1].plot(xs, y_fscore)
    mfscore_indx = np.argmax(y_fscore)
    ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "o", color='black')

    fig.set_size_inches(18.5, 10.5)

    ax[0].title.set_text('Recall scores')
    ax[1].title.set_text('F1-scores')
    fig.suptitle(title, fontsize=14)
    # Acts on the current axes (the lower subplot created last).
    plt.xticks(rotation=90)
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
|
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
|
||||||
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
|
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
|
||||||
|
xs, y_recall, y_fscore = sort_tuples_to_lists(sensor_groups_importance_scores)
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Visualize sensors groups F1 and recall scores
|
||||||
|
print(sensor_groups_importance_scores)
|
||||||
|
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
best_sensor_group = sensor_groups_importance_scores[0][0]
|
# Take the most important feature group and investigate it feature-by-feature
|
||||||
|
best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
|
||||||
best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
|
best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
|
||||||
|
|
||||||
best_sensor_group_importance_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
|
best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
|
||||||
|
|
||||||
|
xs, y_recall, y_fscore = sort_tuples_to_lists(best_sensor_features_scores)
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Visualize best sensor's F1 and recall scores
|
||||||
|
print(best_sensor_features_scores)
|
||||||
|
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
|
||||||
|
|
||||||
# TODO: visualization of sensor_group and best_sensor_group importance (Recall/f1-score chart)
|
for sensor_group in sensor_groups_importance_scores:
|
||||||
|
current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
|
||||||
|
current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
|
||||||
|
xs, y_recall, y_fscore = sort_tuples_to_lists(current_sensor_features_scores)
|
||||||
|
|
||||||
|
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore,
|
||||||
|
title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores")
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
|
Loading…
Reference in New Issue