From b286753696d745e659d86421e9491e985ade9dcc Mon Sep 17 00:00:00 2001 From: Primoz Date: Wed, 1 Feb 2023 13:51:56 +0100 Subject: [PATCH] Add visualization sections for sequential addition of sensors' features. --- exploration/expl_features_groups_analysis.py | 127 ++++++++++++++++--- 1 file changed, 106 insertions(+), 21 deletions(-) diff --git a/exploration/expl_features_groups_analysis.py b/exploration/expl_features_groups_analysis.py index b2021a4..6c57d48 100644 --- a/exploration/expl_features_groups_analysis.py +++ b/exploration/expl_features_groups_analysis.py @@ -71,8 +71,9 @@ demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control' # %% # Get phone and non-phone columns +import warnings -def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[]): +def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False): """ This function makes predictions with sensor groups. It takes in a dataframe (df), a list of group substrings (groups_substrings) @@ -105,34 +106,36 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target'] - print(fgroup_substr, X.shape) - imputer = SimpleImputer(missing_values=np.nan, strategy='median') X = imputer.fit_transform(X) - X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2) + X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2) + X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2) rfc = RandomForestClassifier(random_state=0) rfc.fit(X_train, y_train) y_pred = rfc.predict(X_test) - if include_group: - print("\nPrediction with", fgroup_substr) - else: - print("\nPrediction without", fgroup_substr) + if print_flag: + if include_group: 
+ print("\nPrediction with", fgroup_substr) + else: + print("\nPrediction without", fgroup_substr) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.") - acc = metrics.accuracy_score(y_test, y_pred) - prec = metrics.precision_score(y_test, y_pred) - rec = metrics.recall_score(y_test, y_pred) - f1 = metrics.f1_score(y_test, y_pred) + acc = metrics.accuracy_score(y_test, y_pred) + prec = metrics.precision_score(y_test, y_pred) + rec = metrics.recall_score(y_test, y_pred) + f1 = metrics.f1_score(y_test, y_pred) - print("************************************************") - print("Accuracy", acc) - print("Precision", prec) - print("Recall", rec) - print("F1", f1, "\n") + if print_flag: + print("************************************************") + print("Accuracy", acc) + print("Precision", prec) + print("Recall", rec) + print("F1", f1, "\n") if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score): best_sensor = fgroup_substr @@ -166,6 +169,17 @@ sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "e "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"] # %% def find_sensor_group_features_importance(model_input, sensor_groups_strings): + """ + This function finds the importance of sensor groups for a given model input. It takes two parameters: + model_input and sensor_groups_strings. It creates an empty list called sensor_importance_scores, + which will be populated with tuples containing the best sensor, its recall score, and its F1 score. + It then makes a copy of the model input and the sensor groups strings. It then loops through each group + in the list of strings, creating a list of important columns from the sensor importance scores list. 
+ It then calls make_predictions_with_sensor_groups to determine the best sensor, its recall score, + and its F1 score. These values are added to the sensor importance scores list as a tuple. The function + then removes that best sensor from the list of strings before looping again until all groups have been evaluated. + Finally, it returns the populated list of tuples containing all sensors' scores. + """ sensor_importance_scores = [] model_input = model_input.copy() sensor_groups_strings = sensor_groups_strings.copy() @@ -185,15 +199,86 @@ def find_sensor_group_features_importance(model_input, sensor_groups_strings): return sensor_importance_scores + +# %% +# Method for sorting list of tuples into 3 lists +def sort_tuples_to_lists(list_of_tuples): + """ + sort_tuples_to_lists(list_of_tuples) is a method that takes in a list of tuples as an argument + and sorts them into three separate lists. The first list, xs, contains the first element + of each tuple. The second list, y_recall, contains the second element of each tuple rounded + to 4 decimal places. The third list, y_fscore, contains the third element of each tuple + rounded to 4 decimal places. The method returns all three lists. + """ + xs, y_recall, y_fscore = [], [], [] + for a_tuple in list_of_tuples: + xs.append(a_tuple[0]) + y_recall.append(round(a_tuple[1], 4)) + y_fscore.append(round(a_tuple[2], 4)) + return xs, y_recall, y_fscore + +def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, title="Sequential addition of features and its F1, and recall scores"): + """ + This function plots the sequential progress of feature addition scores using two subplots. + The first subplot is for recall scores and the second subplot is for F1-scores. + The parameters xs, y_recall, and y_fscore are used to plot the data on the respective axes. + The title of the plot can be specified by the user using the parameter title. 
+ The maximum recall index and maximum F1-score index are also plotted using a black dot. + The figure size is set to 18.5 inches in width and 10.5 inches in height, + and the x-axis labels are rotated by 90 degrees. Finally, the plot is displayed + using plt.show(). + """ + + fig, ax = plt.subplots(nrows=2, sharex=True) + ax[0].plot(xs, y_recall, color='red') + mrec_indx = np.argmax(y_recall) + ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black') + + ax[1].plot(xs, y_fscore) + mfscore_indx = np.argmax(y_fscore) + ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black') + fig.set_size_inches(18.5, 10.5) + + ax[0].title.set_text('Recall scores') + ax[1].title.set_text('F1-scores') + plt.suptitle(title, fontsize=14) + plt.xticks(rotation=90) + plt.show() + + # sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr) sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups) +xs, y_recall, y_fscore = sort_tuples_to_lists(sensor_groups_importance_scores) + +# %% [markdown] +# ### Visualize sensors groups F1 and recall scores +print(sensor_groups_importance_scores) +plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore) # %% -best_sensor_group = sensor_groups_importance_scores[0][0] +# Take the most important feature group and investigate it feature-by-feature +best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)] -best_sensor_group_importance_scores = find_sensor_group_features_importance(model_input, best_sensor_features) +best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features) + +xs, y_recall, y_fscore = sort_tuples_to_lists(best_sensor_features_scores) + +# %% [markdown] +# ### Visualize best sensor's F1 and recall scores 
+print(best_sensor_features_scores) +plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore) # %% +# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature -# TODO: visualization of sensor_group and best_sensor_group importance (Recall/f1-score chart) \ No newline at end of file +for sensor_group in sensor_groups_importance_scores: + current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])] + current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features) + xs, y_recall, y_fscore = sort_tuples_to_lists(current_sensor_features_scores) + + plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, + title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores") + + +# %%