Implement algorithm for sequential adding of the most important features.

2023-01-25 14:19:29 +01:00 · 2023-01-25 14:19:29 +01:00 · 07ef72dec5
parent 85e572fca0
commit 07ef72dec5
1 changed files with 67 additions and 21 deletions
--- a/exploration/expl_features_groups_analysis.py
+++ b/exploration/expl_features_groups_analysis.py
@ -72,7 +72,7 @@ demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control'
 # %%
 # Get phone and non-phone columns

-def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
+def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[]):
    """
    This function makes predictions with sensor groups. 
    It takes in a dataframe (df), a list of group substrings (groups_substrings) 
@ -87,6 +87,10 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
    is set to True or False.

    """
+    
+    best_sensor = None
+    best_recall_score, best_f1_sore = None, None
+
    for fgroup_substr in groups_substrings:
        if fgroup_substr is None:
            feature_group_cols = list(df.columns)
@ -99,7 +103,10 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
                feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]


-        X, y  = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target']
+        X, y  = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
+
+        print(fgroup_substr, X.shape)
+
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        X = imputer.fit_transform(X)

@ -115,39 +122,78 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
        else:
            print("\nPrediction without", fgroup_substr)

+
+        acc = metrics.accuracy_score(y_test, y_pred)
+        prec = metrics.precision_score(y_test, y_pred)
+        rec = metrics.recall_score(y_test, y_pred)
+        f1 = metrics.f1_score(y_test, y_pred)
+
        print("************************************************")
-        print("Accuracy", metrics.accuracy_score(y_test, y_pred))
-        print("Precision", metrics.precision_score(y_test, y_pred))
-        print("Recall", metrics.recall_score(y_test, y_pred))
-        print("F1", metrics.f1_score(y_test, y_pred), "\n")
+        print("Accuracy", acc)
+        print("Precision", prec)
+        print("Recall", rec)
+        print("F1", f1, "\n")
+
+        if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score):
+            best_sensor = fgroup_substr
+            best_recall_score, best_f1_sore = rec, f1
+        
+    return best_sensor, best_recall_score, best_f1_sore

 # %% [markdown]
-# ### Senzor big feature groups (phone, empatica, demografical)
-groups_substr = [None, "phone_", "empatica_", "demo_"]
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
+# ### Sensor big feature groups (phone, empatica, demographic)
+big_groups_substr = ["phone_", "empatica_", "demo_"]
+make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)

 # %% [markdown]
 # ### Empatica sezor groups
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
-e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
+# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)

 # %% [markdown]
 # ### Phone sensor groups
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
-phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", 
-                "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
+# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", 
+#                 "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
+# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)

 # %%
 # Write all the sensors  (phone, empatica), seperate other (demographical) cols also

-sensors_features_groups = ["_", "phone_", "empatica_", "empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
-                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", "phone_light_"
-                        "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
+sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_data_yield_", "phone_light_",
+                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
+# %%
+def find_sensor_group_features_importance(model_input, sensor_groups_strings):
+    sensor_importance_scores = []
+    model_input = model_input.copy()
+    sensor_groups_strings = sensor_groups_strings.copy()
+    groups_len = len(sensor_groups_strings)
+    for i in range(groups_len):
+        important_cols = [col[0] for col in sensor_importance_scores]
+        with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
        

+        best_sensor, best_recall_score, best_f1_sore = \
+            make_predictions_with_sensor_groups(model_input, 
+            groups_substrings=sensor_groups_strings, include_group=True, 
+            with_cols=with_cols)
+        sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore))
+        print(f"\nAdded sensor: {best_sensor}\n")
+        sensor_groups_strings.remove(best_sensor)
    
-print([col for col in model_input.columns if "phone_" not in col or "empatica_" not in col])
+    return sensor_importance_scores
+
+# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
+sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)

 # %%
+best_sensor_group = sensor_groups_importance_scores[0][0]
+best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
+
+best_sensor_group_importance_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
+
+# %%
+
+# TODO: visualization of sensor_group and best_sensor_group importance (Recall/f1-score chart)