Change ML model and add CV to sequential feat_select. Add std lines to plots.

2023-02-06 11:09:15 +01:00 · 2023-02-06 11:09:15 +01:00 · ea3f805ba7
parent e3aef2dae7
commit ea3f805ba7
1 changed files with 61 additions and 30 deletions
--- a/exploration/expl_features_groups_analysis.py
+++ b/exploration/expl_features_groups_analysis.py
@@ -23,8 +23,8 @@ import matplotlib.pyplot as plt
 import pandas as pd

 from sklearn.impute import SimpleImputer
-from sklearn.ensemble import RandomForestClassifier 
-from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import GaussianNB  
+from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
 from sklearn import metrics 

 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
@@ -82,7 +82,7 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
    while excluding the 'pid' and 'target' columns. It then splits the data into training 
    and test sets, using a test size of 0.25 for the first split and 0.2 for the second split. 
    A SimpleImputer is used to fill in missing values with median values. 
-    A RandomForestClassifier is then used to fit the training set and make predictions 
+    A GaussianNB classifier is then used to fit the training set and make predictions 
    on the test set. Finally, accuracy, precision, recall and F1 scores are printed 
    for each substring group depending on whether or not include_group 
    is set to True or False.
@@ -90,7 +90,7 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
    """
    
    best_sensor = None
-    best_recall_score, best_f1_sore = None, None
+    best_recall_score, best_f1_score = None, None

    for fgroup_substr in groups_substrings:
        if fgroup_substr is None:
@@ -105,16 +105,21 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru


        X, y  = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
+        X, _, y, _ =  train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
        
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
-        X = imputer.fit_transform(X)
        
-        X, _, y, _ =  train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
+        nb = GaussianNB()
+        model_cv = cross_validate(
+            nb,
+            X=imputer.fit_transform(X),
+            y=y,
+            cv=StratifiedKFold(n_splits=5, shuffle=True),
+            n_jobs=-1,
+            scoring=('accuracy', 'precision', 'recall', 'f1')
+        )
        X_train, X_test, y_train, y_test =  train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
        
-        rfc = RandomForestClassifier(random_state=0)
-        rfc.fit(X_train, y_train)
-        y_pred = rfc.predict(X_test)

        if print_flag:
            if include_group:
@@ -125,26 +130,34 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")

-            acc = metrics.accuracy_score(y_test, y_pred)
-            prec = metrics.precision_score(y_test, y_pred)
-            rec = metrics.recall_score(y_test, y_pred)
-            f1 = metrics.f1_score(y_test, y_pred)
+            acc = np.mean(model_cv['test_accuracy'])
+            acc_std = np.std(model_cv['test_accuracy'])
+            
+            prec = np.mean(model_cv['test_precision'])
+            prec_std = np.std(model_cv['test_precision'])
+            
+            rec = np.mean(model_cv['test_recall'])
+            rec_std = np.std(model_cv['test_recall'])
+            
+            f1 = np.mean(model_cv['test_f1'])
+            f1_std = np.std(model_cv['test_f1'])

        if print_flag:
            print("************************************************")
-            print("Accuracy", acc)
-            print("Precision", prec)
-            print("Recall", rec)
-            print("F1", f1, "\n")
+            print(f"Accuracy: {acc} (sd={acc_std})")
+            print(f"Precison: {prec} (sd={prec_std})")
+            print(f"Recall: {rec} (sd={rec_std})")
+            print(f"F1: {f1} (sd={f1_std})\n")

-        if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score):
+        if (not best_recall_score and not best_f1_score) or (rec > best_recall_score):
            best_sensor = fgroup_substr
-            best_recall_score, best_f1_sore = rec, f1
+            best_recall_score, best_f1_score = rec, f1
+            best_recall_score_std, best_f1_score_std = rec_std, f1_std
        
-    return best_sensor, best_recall_score, best_f1_sore
+    return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std 

 # %% [markdown]
-# ### sensor big feature groups (phone, empatica, demografical)
+# ### sensor big feature groups (phone, empatica, demographical)
 big_groups_substr = ["phone_", "empatica_", "demo_"]
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)

@@ -189,11 +202,11 @@ def find_sensor_group_features_importance(model_input, sensor_groups_strings):
        with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
        

-        best_sensor, best_recall_score, best_f1_sore = \
+        best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std  = \
            make_predictions_with_sensor_groups(model_input, 
            groups_substrings=sensor_groups_strings, include_group=True, 
            with_cols=with_cols)
-        sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore))
+        sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std ))
        print(f"\nAdded sensor: {best_sensor}\n")
        sensor_groups_strings.remove(best_sensor)
    
@@ -210,14 +223,17 @@ def sort_tuples_to_lists(list_of_tuples):
    to 4 decimal places. The third list, y_fscore, contains the third element of each tuple 
    rounded to 4 decimal places. The method returns all three lists. 
    """
-    xs, y_recall, y_fscore = [], [], []
+    xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
    for a_tuple in list_of_tuples:
        xs.append(a_tuple[0])
        y_recall.append(round(a_tuple[1], 4))
        y_fscore.append(round(a_tuple[2], 4))
-    return xs, y_recall, y_fscore
+        recall_std.append(round(a_tuple[3], 4))
+        fscore_std.append(round(a_tuple[4], 4))
+    return xs, y_recall, y_fscore, recall_std, fscore_std

-def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, title="Sequential addition of features and its F1, and recall scores"):
+def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+                                                        title="Sequential addition of features and its F1, and recall scores"):
    """
    This function plots the sequential progress of feature addition scores using two subplots. 
    The first subplot is for recall scores and the second subplot is for F1-scores. 
@@ -230,13 +246,20 @@ def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore,
    """
    
    fig, ax = plt.subplots(nrows=2, sharex=True)
+    ax[0].plot(xs, np.array(y_recall)+np.array(recall_std), linestyle=":", color='m') # Upper SD
    ax[0].plot(xs, y_recall, color='red')
+    ax[0].plot(xs, np.array(y_recall)-np.array(recall_std), linestyle=":", color='m') # Lower SD
    mrec_indx = np.argmax(y_recall)
    ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
+    ax[0].legend(["Upper std", "Mean Recall", "Lower std"])

+    ax[1].plot(xs, np.array(y_fscore)+np.array(fscore_std), linestyle=":", color='c') # Upper SD
    ax[1].plot(xs, y_fscore)
+    ax[1].plot(xs, np.array(y_fscore)-np.array(fscore_std), linestyle=":", color='c') # Lower SD
    mfscore_indx = np.argmax(y_fscore)
    ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
+    ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])
+    
    fig.set_size_inches(18.5, 10.5)

    ax[0].title.set_text('Recall scores')
@@ -245,15 +268,23 @@ def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore,
    plt.xticks(rotation=90)
    plt.show()

+# %%
+sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_data_yield_", "phone_light_",
+                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]

+# sensors_features_groups = ["phone_", "empatica_", "demo_"]
+
+# %%
 # sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
 sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
-xs, y_recall, y_fscore = sort_tuples_to_lists(sensor_groups_importance_scores)
+xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)

 # %% [markdown]
 # ### Visualize sensors groups F1 and recall scores
 print(sensor_groups_importance_scores)
-plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore)
+plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+                                                    title="Sequential addition of sensors and its F1, and recall scores")

 # %%
 # Take the most important feature group and investigate it feature-by-feature