Implement algorithm for sequential adding of the most important features.

ml_pipeline
Primoz 2023-01-25 14:19:29 +01:00
parent 85e572fca0
commit 07ef72dec5
1 changed files with 67 additions and 21 deletions

View File

@ -72,7 +72,7 @@ demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control'
# %%
# Get phone and non-phone columns
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[]):
"""
This function makes predictions with sensor groups.
It takes in a dataframe (df), a list of group substrings (groups_substrings)
@ -87,6 +87,10 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
is set to True or False.
"""
best_sensor = None
best_recall_score, best_f1_sore = None, None
for fgroup_substr in groups_substrings:
if fgroup_substr is None:
feature_group_cols = list(df.columns)
@ -99,7 +103,10 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target']
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
print(fgroup_substr, X.shape)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)
@ -115,39 +122,78 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
else:
print("\nPrediction without", fgroup_substr)
acc = metrics.accuracy_score(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred)
rec = metrics.recall_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)
print("************************************************")
print("Accuracy", metrics.accuracy_score(y_test, y_pred))
print("Precision", metrics.precision_score(y_test, y_pred))
print("Recall", metrics.recall_score(y_test, y_pred))
print("F1", metrics.f1_score(y_test, y_pred), "\n")
print("Accuracy", acc)
print("Precision", prec)
print("Recall", rec)
print("F1", f1, "\n")
if (not best_recall_score and not best_f1_sore) or (rec > best_recall_score):
best_sensor = fgroup_substr
best_recall_score, best_f1_sore = rec, f1
return best_sensor, best_recall_score, best_f1_sore
# %% [markdown]
# ### Senzor big feature groups (phone, empatica, demografical)
groups_substr = [None, "phone_", "empatica_", "demo_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
# ### sensor big feature groups (phone, empatica, demografical)
big_groups_substr = ["phone_", "empatica_", "demo_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)
# %% [markdown]
# ### Empatica sezor groups
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
# %% [markdown]
# ### Phone sensor groups
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_",
"phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_",
# "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
# %%
# Write all the sensors (phone, empatica), seperate other (demographical) cols also
sensors_features_groups = ["_", "phone_", "empatica_", "empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
"phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", "phone_light_"
"phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
"phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_data_yield_", "phone_light_",
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
# %%
def find_sensor_group_features_importance(model_input, sensor_groups_strings):
sensor_importance_scores = []
model_input = model_input.copy()
sensor_groups_strings = sensor_groups_strings.copy()
groups_len = len(sensor_groups_strings)
for i in range(groups_len):
important_cols = [col[0] for col in sensor_importance_scores]
with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
best_sensor, best_recall_score, best_f1_sore = \
make_predictions_with_sensor_groups(model_input,
groups_substrings=sensor_groups_strings, include_group=True,
with_cols=with_cols)
sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore))
print(f"\nAdded sensor: {best_sensor}\n")
sensor_groups_strings.remove(best_sensor)
return sensor_importance_scores
print([col for col in model_input.columns if "phone_" not in col or "empatica_" not in col])
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
# %%
best_sensor_group = sensor_groups_importance_scores[0][0]
best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
best_sensor_group_importance_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
# %%
# TODO: visualization of sensor_group and best_sensor_group importance (Recall/f1-score chart)