Add visualization sections for sequential addition of sensors' features.

2023-02-01 13:51:56 +01:00 · 2023-02-01 13:51:56 +01:00 · b286753696
parent 07ef72dec5
commit b286753696
1 changed files with 106 additions and 21 deletions
--- a/exploration/expl_features_groups_analysis.py
+++ b/exploration/expl_features_groups_analysis.py
@ -71,8 +71,9 @@ demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control'
 # %%
 # Get phone and non-phone columns
 import warnings
-def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[]):
+def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
    """
    This function makes predictions with sensor groups. 
    It takes in a dataframe (df), a list of group substrings (groups_substrings) 
@ -105,29 +106,31 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
        X, y  = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
        print(fgroup_substr, X.shape)
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        X = imputer.fit_transform(X)
-        X, _, y, _ =  train_test_split(X, y, random_state=19, test_size=0.25)
+        X, _, y, _ =  train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
-        X_train, X_test, y_train, y_test =  train_test_split(X, y, random_state=2, test_size=0.2)
+        X_train, X_test, y_train, y_test =  train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
        rfc = RandomForestClassifier(random_state=0)
        rfc.fit(X_train, y_train)
        y_pred = rfc.predict(X_test)
        if print_flag:
            if include_group:
                print("\nPrediction with", fgroup_substr)
            else:
                print("\nPrediction without", fgroup_substr)
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
            acc = metrics.accuracy_score(y_test, y_pred)
            prec = metrics.precision_score(y_test, y_pred)
            rec = metrics.recall_score(y_test, y_pred)
            f1 = metrics.f1_score(y_test, y_pred)
        if print_flag:
            print("************************************************")
            print("Accuracy", acc)
            print("Precision", prec)
@ -166,6 +169,17 @@ sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "e
                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
 # %%
 def find_sensor_group_features_importance(model_input, sensor_groups_strings):
    """
    This function finds the importance of sensor groups for a given model input. It takes two parameters: 
    model_input and sensor_groups_strings. It creates an empty list called sensor_importance_scores, 
    which will be populated with tuples containing the best sensor, its recall score, and its F1 score. 
    It then makes a copy of the model input and the sensor groups strings. It then loops through each group 
    in the list of strings, creating a list of important columns from the sensor importance scores list. 
    It then calls make_predictions_with_sensor_groups to determine the best sensor, its recall score, 
    and its F1 score. These values are added to the sensor importance scores list as a tuple. The function 
    then removes that best sensor from the list of strings before looping again until all groups have been evaluated. 
    Finally, it returns the populated list of tuples containing all sensors' scores. 
    """
    sensor_importance_scores = []
    model_input = model_input.copy()
    sensor_groups_strings = sensor_groups_strings.copy()
@ -185,15 +199,86 @@ def find_sensor_group_features_importance(model_input, sensor_groups_strings):
    return sensor_importance_scores
 # %%
 # Method for splitting a list of 3-element tuples into 3 parallel lists
 def sort_tuples_to_lists(list_of_tuples):
    """
    sort_tuples_to_lists(list_of_tuples) is a method that takes in a list of tuples as an argument 
    and sorts them into three separate lists. The first list, xs, contains the first element 
    of each tuple. The second list, yrecall, contains the second element of each tuple rounded 
    to 4 decimal places. The third list, y_fscore, contains the third element of each tuple 
    rounded to 4 decimal places. The method returns all three lists. 
    """
    xs, y_recall, y_fscore = [], [], []
    for a_tuple in list_of_tuples:
        xs.append(a_tuple[0])
        y_recall.append(round(a_tuple[1], 4))
        y_fscore.append(round(a_tuple[2], 4))
    return xs, y_recall, y_fscore
 def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, title="Sequential addition of features and its F1, and recall scores"):
    """
    Plot recall and F1 scores as two stacked subplots sharing the x-axis.

    Parameters:
        xs: x-axis values, one per score.
        y_recall: recall scores, plotted in red on the upper subplot.
        y_fscore: F1 scores, plotted on the lower subplot.
        title: overall figure title.

    On each subplot the point with the maximum score is marked with a black dot.
    The figure is sized to 18.5 x 10.5 inches, the x tick labels are rotated
    by 90 degrees, and the figure is displayed with plt.show().

    If any of the three sequences is empty, a warning is issued and nothing is
    plotted, because np.argmax raises ValueError on an empty sequence.
    """
    # Guard before creating the figure so an empty input leaves no stray figure behind.
    if len(xs) == 0 or len(y_recall) == 0 or len(y_fscore) == 0:
        warnings.warn("Nothing to plot: at least one of xs, y_recall, y_fscore is empty.")
        return

    fig, ax = plt.subplots(nrows=2, sharex=True)

    # Upper subplot: recall curve with its maximum highlighted.
    ax[0].plot(xs, y_recall, color='red')
    mrec_indx = np.argmax(y_recall)
    ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')

    # Lower subplot: F1 curve with its maximum highlighted.
    ax[1].plot(xs, y_fscore)
    mfscore_indx = np.argmax(y_fscore)
    ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')

    fig.set_size_inches(18.5, 10.5)
    ax[0].title.set_text('Recall scores')
    ax[1].title.set_text('F1-scores')
    plt.suptitle(title, fontsize=14)
    plt.xticks(rotation=90)
    plt.show()
 # sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
 sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
 xs, y_recall, y_fscore = sort_tuples_to_lists(sensor_groups_importance_scores)
 # %% [markdown]
 # ### Visualize sensors groups F1 and recall scores
 print(sensor_groups_importance_scores)
 plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)
 # %%
-best_sensor_group = sensor_groups_importance_scores[0][0]
+# Take the most important feature group and investigate it feature-by-feature
 best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
 best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
-best_sensor_group_importance_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
+best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
 xs, y_recall, y_fscore = sort_tuples_to_lists(best_sensor_features_scores)
 # %% [markdown]
 # ### Visualize best sensor's F1 and recall scores
 print(best_sensor_features_scores)
 plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)
 # %%
 # This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
-# TODO: visualization of sensor_group and best_sensor_group importance (Recall/f1-score chart)
+for sensor_group in sensor_groups_importance_scores:
    current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
    current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
    xs, y_recall, y_fscore = sort_tuples_to_lists(current_sensor_features_scores)
    plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, 
    title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores")
 # %%