From 07ef72dec5367c15b95b959e0f854fb047c86945 Mon Sep 17 00:00:00 2001 From: Primoz Date: Wed, 25 Jan 2023 14:19:29 +0100 Subject: [PATCH] Implement algorithm for sequential adding of the most important features. --- exploration/expl_features_groups_analysis.py | 88 +++++++++++++++----- 1 file changed, 67 insertions(+), 21 deletions(-) diff --git a/exploration/expl_features_groups_analysis.py b/exploration/expl_features_groups_analysis.py index bc46533..b2021a4 100644 --- a/exploration/expl_features_groups_analysis.py +++ b/exploration/expl_features_groups_analysis.py @@ -72,7 +72,7 @@ demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control' # %% # Get phone and non-phone columns -def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True): +def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[]): """ This function makes predictions with sensor groups. It takes in a dataframe (df), a list of group substrings (groups_substrings) @@ -87,6 +87,10 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru is set to True or False. """ + + best_sensor = None + best_recall_score, best_f1_sore = None, None + for fgroup_substr in groups_substrings: if fgroup_substr is None: feature_group_cols = list(df.columns) @@ -99,7 +103,10 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']] - X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target'] + X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target'] + + print(fgroup_substr, X.shape) + imputer = SimpleImputer(missing_values=np.nan, strategy='median') X = imputer.fit_transform(X) @@ -115,39 +122,78 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru else: print("\nPrediction without", fgroup_substr) + + acc = metrics.accuracy_score(y_test, y_pred) + prec = metrics.precision_score(y_test, y_pred) + rec = metrics.recall_score(y_test, y_pred) + f1 = metrics.f1_score(y_test, y_pred) + print("************************************************") - print("Accuracy", metrics.accuracy_score(y_test, y_pred)) - print("Precision", metrics.precision_score(y_test, y_pred)) - print("Recall", metrics.recall_score(y_test, y_pred)) - print("F1", metrics.f1_score(y_test, y_pred), "\n") + print("Accuracy", acc) + print("Precision", prec) + print("Recall", rec) + print("F1", f1, "\n") + + if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score): + best_sensor = fgroup_substr + best_recall_score, best_f1_sore = rec, f1 + + return best_sensor, best_recall_score, best_f1_sore # %% [markdown] -# ### Senzor big feature groups (phone, empatica, demografical) -groups_substr = [None, "phone_", "empatica_", "demo_"] -make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False) +# ### sensor big feature groups (phone, empatica, demografical) +big_groups_substr = ["phone_", "empatica_", "demo_"] +make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False) # %% [markdown] # ### Empatica sezor groups -make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True) -e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"] -make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False) +# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True) +# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"] +# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False) # %% [markdown] # ### Phone sensor groups -make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True) -phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", - "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"] -make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False) +# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True) +# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", +# "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"] +# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False) # %% # Write all the sensors (phone, empatica), seperate other (demographical) cols also -sensors_features_groups = ["_", "phone_", "empatica_", "empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_", - "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", "phone_light_" - "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"] +sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_", + "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_data_yield_", "phone_light_", + "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"] +# %% +def find_sensor_group_features_importance(model_input, sensor_groups_strings): + sensor_importance_scores = [] + model_input = model_input.copy() + sensor_groups_strings = sensor_groups_strings.copy() + groups_len = len(sensor_groups_strings) + for i in range(groups_len): + important_cols = [col[0] for col in sensor_importance_scores] + with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)] + + best_sensor, best_recall_score, best_f1_sore = \ + make_predictions_with_sensor_groups(model_input, + groups_substrings=sensor_groups_strings, include_group=True, + with_cols=with_cols) + sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore)) + print(f"\nAdded sensor: {best_sensor}\n") + sensor_groups_strings.remove(best_sensor) + + return sensor_importance_scores - -print([col for col in model_input.columns if "phone_" not in col or "empatica_" not in col]) +# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr) +sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups) # %% +best_sensor_group = sensor_groups_importance_scores[0][0] +best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)] + +best_sensor_group_importance_scores = find_sensor_group_features_importance(model_input, best_sensor_features) + +# %% + +# TODO: visualization of sensor_group and best_sensor_group importance (Recall/f1-score chart) \ No newline at end of file