From b286753696d745e659d86421e9491e985ade9dcc Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Wed, 1 Feb 2023 13:51:56 +0100
Subject: [PATCH] Add visualization sections for sequential addition of
 sensors' features.

---
 exploration/expl_features_groups_analysis.py | 127 ++++++++++++++++---
 1 file changed, 106 insertions(+), 21 deletions(-)

diff --git a/exploration/expl_features_groups_analysis.py b/exploration/expl_features_groups_analysis.py
index b2021a4..6c57d48 100644
--- a/exploration/expl_features_groups_analysis.py
+++ b/exploration/expl_features_groups_analysis.py
@@ -71,8 +71,9 @@ demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control'
 
 # %%
 # Get phone and non-phone columns
+import warnings
 
-def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[]):
+def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
     """
     This function makes predictions with sensor groups. 
     It takes in a dataframe (df), a list of group substrings (groups_substrings) 
@@ -105,34 +106,36 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
 
         X, y  = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
 
-        print(fgroup_substr, X.shape)
-
         imputer = SimpleImputer(missing_values=np.nan, strategy='median')
         X = imputer.fit_transform(X)
 
-        X, _, y, _ =  train_test_split(X, y, random_state=19, test_size=0.25)
-        X_train, X_test, y_train, y_test =  train_test_split(X, y, random_state=2, test_size=0.2)
+        X, _, y, _ =  train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
+        X_train, X_test, y_train, y_test =  train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
         
         rfc = RandomForestClassifier(random_state=0)
         rfc.fit(X_train, y_train)
         y_pred = rfc.predict(X_test)
 
-        if include_group:
-            print("\nPrediction with", fgroup_substr)
-        else:
-            print("\nPrediction without", fgroup_substr)
+        if print_flag:
+            if include_group:
+                print("\nPrediction with", fgroup_substr)
+            else:
+                print("\nPrediction without", fgroup_substr)
 
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
 
-        acc = metrics.accuracy_score(y_test, y_pred)
-        prec = metrics.precision_score(y_test, y_pred)
-        rec = metrics.recall_score(y_test, y_pred)
-        f1 = metrics.f1_score(y_test, y_pred)
+            acc = metrics.accuracy_score(y_test, y_pred)
+            prec = metrics.precision_score(y_test, y_pred)
+            rec = metrics.recall_score(y_test, y_pred)
+            f1 = metrics.f1_score(y_test, y_pred)
 
-        print("************************************************")
-        print("Accuracy", acc)
-        print("Precision", prec)
-        print("Recall", rec)
-        print("F1", f1, "\n")
+        if print_flag:
+            print("************************************************")
+            print("Accuracy", acc)
+            print("Precision", prec)
+            print("Recall", rec)
+            print("F1", f1, "\n")
 
         if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score):
             best_sensor = fgroup_substr
@@ -166,6 +169,17 @@ sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "e
                         "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
 # %%
 def find_sensor_group_features_importance(model_input, sensor_groups_strings):
+    """
+    This function finds the importance of sensor groups for a given model input. It takes two parameters: 
+    model_input and sensor_groups_strings. It creates an empty list called sensor_importance_scores, 
+    which will be populated with tuples containing the best sensor, its recall score, and its F1 score. 
+    It then makes a copy of the model input and the sensor groups strings. It then loops through each group 
+    in the list of strings, creating a list of important columns from the sensor importance scores list. 
+    It then calls make_predictions_with_sensor_groups to determine the best sensor, its recall score, 
+    and its F1 score. These values are added to the sensor importance scores list as a tuple. The function 
+    then removes that best sensor from the list of strings before looping again until all groups have been evaluated. 
+    Finally, it returns the populated list of tuples containing all sensors' scores. 
+    """
     sensor_importance_scores = []
     model_input = model_input.copy()
     sensor_groups_strings = sensor_groups_strings.copy()
@@ -185,15 +199,86 @@ def find_sensor_group_features_importance(model_input, sensor_groups_strings):
     
     return sensor_importance_scores
 
+
+# %%
+# Method for sorting list of tuples into 3 lists
+def sort_tuples_to_lists(list_of_tuples):
+    """
+    sort_tuples_to_lists(list_of_tuples) is a function that takes in a list of tuples as an argument 
+    and splits them into three separate lists, keeping the input order. The first list, xs, contains the first element 
+    of each tuple. The second list, y_recall, contains the second element of each tuple rounded 
+    to 4 decimal places. The third list, y_fscore, contains the third element of each tuple 
+    rounded to 4 decimal places. The function returns all three lists. 
+    """
+    xs, y_recall, y_fscore = [], [], []
+    for a_tuple in list_of_tuples:
+        xs.append(a_tuple[0])
+        y_recall.append(round(a_tuple[1], 4))
+        y_fscore.append(round(a_tuple[2], 4))
+    return xs, y_recall, y_fscore
+
+def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, title="Sequential addition of features and its F1, and recall scores"):
+    """
+    This function plots the sequential progress of feature addition scores using two subplots. 
+    The first subplot is for recall scores and the second subplot is for F1-scores. 
+    The parameters xs, y_recall, and y_fscore are used to plot the data on the respective axes. 
+    The title of the plot can be specified by the user using the parameter title. 
+    The points with the maximum recall and the maximum F1-score are each marked with a black dot. 
+    The figure size is set to 18.5 inches in width and 10.5 inches in height, 
+    and the x-axis labels are rotated by 90 degrees. Finally, the plot is displayed 
+    using plt.show().
+    """
+    
+    fig, ax = plt.subplots(nrows=2, sharex=True)
+    ax[0].plot(xs, y_recall, color='red')
+    mrec_indx = np.argmax(y_recall)
+    ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
+
+    ax[1].plot(xs, y_fscore)
+    mfscore_indx = np.argmax(y_fscore)
+    ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
+    fig.set_size_inches(18.5, 10.5)
+
+    ax[0].title.set_text('Recall scores')
+    ax[1].title.set_text('F1-scores')
+    plt.suptitle(title, fontsize=14)
+    plt.xticks(rotation=90)
+    plt.show()
+
+
 # sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
 sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
+xs, y_recall, y_fscore = sort_tuples_to_lists(sensor_groups_importance_scores)
+
+# %% [markdown]
+# ### Visualize sensors groups F1 and recall scores
+print(sensor_groups_importance_scores)
+plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)
 
 # %%
-best_sensor_group = sensor_groups_importance_scores[0][0]
+# Take the most important feature group and investigate it feature-by-feature
+best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
 best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
 
-best_sensor_group_importance_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
+best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
+
+xs, y_recall, y_fscore = sort_tuples_to_lists(best_sensor_features_scores)
+
+# %% [markdown]
+# ### Visualize best sensor's F1 and recall scores
+print(best_sensor_features_scores)
+plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)
 
 # %%
+# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
 
-# TODO: visualization of sensor_group and best_sensor_group importance (Recall/f1-score chart)
\ No newline at end of file
+for sensor_group in sensor_groups_importance_scores:
+    current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
+    current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
+    xs, y_recall, y_fscore = sort_tuples_to_lists(current_sensor_features_scores)
+
+    plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, 
+    title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores")
+
+
+# %%