# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline

import math
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn import metrics

# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)

categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]

# Fill missing categorical values with the column mode
categorical_features = categorical_features.fillna(mode_categorical_features)

# One-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# Binarize the target: stressfulness 0 becomes class 0, stressfulness 1-4 becomes class 1
bins = [-1, 0, 4]  # bins for the stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)

print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
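
# %%
# A quick, illustrative check of the binarization above (hypothetical values, not
# dataset rows): pd.cut with bins [-1, 0, 4] maps a stressfulness of 0 to class 0
# and stressfulness 1-4 to class 1.
print(pd.cut(pd.Series([0, 1, 2, 4]), bins=[-1, 0, 4], labels=[0, 1]).tolist())  # -> [0, 1, 1, 1]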

# %%
# Add a prefix to the demographic features
demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile',
                 'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']

new_names = [(col, "demo_" + col) for col in demo_features]
model_input.rename(columns=dict(new_names), inplace=True)

demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio',
                 'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M',
                 'demo_startlanguage_nl', 'demo_startlanguage_sl']

# %%
# Get phone and non-phone columns
import warnings


def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=None, print_flag=False):
    """
    Evaluate predictions for groups of sensor features.

    Takes a dataframe (df), a list of group substrings (groups_substrings)
    and an optional parameter include_group (default True). For each
    substring it builds the list of columns that contain the substring
    (or, if include_group is False, that do not contain it), excluding the
    'pid' and 'target' columns. A stratified 80 % subset of the data is
    selected with train_test_split, missing values are filled with column
    medians by a SimpleImputer, and a GaussianNB classifier is evaluated
    with stratified 5-fold cross-validation. If print_flag is True,
    accuracy, precision, recall and F1 scores are printed for each
    substring group.

    Returns the group with the best mean recall, together with its mean
    recall and F1 scores and their standard deviations.
    """
    if with_cols is None:
        with_cols = []

    best_sensor = None
    best_recall_score, best_f1_score = None, None
    best_recall_score_std, best_f1_score_std = None, None

    for fgroup_substr in groups_substrings:
        if fgroup_substr is None:
            feature_group_cols = list(df.columns)
            feature_group_cols.remove("pid")
            feature_group_cols.remove("target")
        else:
            if include_group:
                feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
            else:
                feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]

        X, y = df.drop(columns=['target', 'pid'])[feature_group_cols + with_cols], df['target']
        X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)

        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        nb = GaussianNB()

        with warnings.catch_warnings():
            # Some folds may predict no positive samples, which makes precision ill-defined
            warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
            model_cv = cross_validate(
                nb,
                X=imputer.fit_transform(X),
                y=y,
                cv=StratifiedKFold(n_splits=5, shuffle=True),
                n_jobs=-1,
                scoring=('accuracy', 'precision', 'recall', 'f1')
            )

        if print_flag:
            if include_group:
                print("\nPrediction with", fgroup_substr)
            else:
                print("\nPrediction without", fgroup_substr)

        acc = np.mean(model_cv['test_accuracy'])
        acc_std = np.std(model_cv['test_accuracy'])

        prec = np.mean(model_cv['test_precision'])
        prec_std = np.std(model_cv['test_precision'])

        rec = np.mean(model_cv['test_recall'])
        rec_std = np.std(model_cv['test_recall'])

        f1 = np.mean(model_cv['test_f1'])
        f1_std = np.std(model_cv['test_f1'])

        if print_flag:
            print("************************************************")
            print(f"Accuracy: {acc} (sd={acc_std})")
            print(f"Precision: {prec} (sd={prec_std})")
            print(f"Recall: {rec} (sd={rec_std})")
            print(f"F1: {f1} (sd={f1_std})\n")

        if best_recall_score is None or rec > best_recall_score:
            best_sensor = fgroup_substr
            best_recall_score, best_f1_score = rec, f1
            best_recall_score_std, best_f1_score_std = rec_std, f1_std

    return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std
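
# %%
# Illustrative usage sketch (commented out; the single-group argument here is an
# assumption, not a run from the analysis): evaluate a model restricted to columns
# containing "phone_" and print the cross-validated scores.
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=["phone_"],
#                                     include_group=True, print_flag=True)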

# %% [markdown]
# ### Sensor big feature groups (phone, empatica, demographic)

# %%
big_groups_substr = ["phone_", "empatica_", "demo_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)

# %% [markdown]
# ### Empatica sensor groups

# %%
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)

# %% [markdown]
# ### Phone sensor groups

# %%
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_",
#                  "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)

# %%
# List all the sensor groups (phone, empatica); the other (demographic) cols are kept separate
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
                           "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
                           "phone_locations_", "phone_messages", "phone_screen_"]  # , "phone_speech_"]

# %%
def find_sensor_group_features_importance(model_input, sensor_groups_strings):
    """
    Greedy forward selection over sensor groups.

    Takes the model input and a list of sensor-group substrings, and builds
    a list sensor_importance_scores of tuples (best sensor, its mean recall
    and F1 scores, and their standard deviations). In each iteration it
    collects the columns of all previously selected groups, calls
    make_predictions_with_sensor_groups with those columns fixed to find
    the best remaining sensor, appends that sensor's scores to the list,
    and removes the sensor from the candidate list. This repeats until all
    groups have been evaluated. Returns the populated list of tuples.
    """
    sensor_importance_scores = []
    model_input = model_input.copy()
    sensor_groups_strings = sensor_groups_strings.copy()
    groups_len = len(sensor_groups_strings)
    for _ in range(groups_len):
        important_cols = [col[0] for col in sensor_importance_scores]
        with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]

        best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std = \
            make_predictions_with_sensor_groups(model_input,
                                                groups_substrings=sensor_groups_strings, include_group=True,
                                                with_cols=with_cols)
        sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std))
        print(f"\nAdded sensor: {best_sensor}\n")
        sensor_groups_strings.remove(best_sensor)

    return sensor_importance_scores
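
# %%
# Illustrative call (commented out): greedily rank the big feature groups defined
# earlier; any list of group substrings used elsewhere in this notebook works here.
# scores = find_sensor_group_features_importance(model_input, ["phone_", "empatica_", "demo_"])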

# %%
# Method for unpacking a list of score tuples into five parallel lists
def sort_tuples_to_lists(list_of_tuples):
    """
    Takes a list of score tuples and unpacks them into five separate lists.
    The first list, xs, contains the first element of each tuple (the sensor
    or feature name). The lists y_recall and y_fscore contain the second and
    third elements (mean recall and F1 score), and recall_std and fscore_std
    contain the fourth and fifth elements (the corresponding standard
    deviations), each rounded to 4 decimal places. Returns all five lists.
    """
    xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
    for a_tuple in list_of_tuples:
        xs.append(a_tuple[0])
        y_recall.append(round(a_tuple[1], 4))
        y_fscore.append(round(a_tuple[2], 4))
        recall_std.append(round(a_tuple[3], 4))
        fscore_std.append(round(a_tuple[4], 4))
    return xs, y_recall, y_fscore, recall_std, fscore_std

def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
                                                        title="Sequential addition of features and their F1 and recall scores"):
    """
    Plots the sequential progress of feature-addition scores in two subplots:
    recall scores in the first subplot and F1-scores in the second. The
    parameters xs, y_recall, and y_fscore are plotted on the respective axes,
    with dotted lines marking one standard deviation above and below each
    mean. The plot title can be set via the title parameter. The maximum
    recall and the maximum F1-score are each marked with a black dot. The
    figure size is set to 18.5 by 10.5 inches, the x-axis labels are rotated
    by 90 degrees, and the plot is displayed with plt.show().
    """

    fig, ax = plt.subplots(nrows=2, sharex=True)
    ax[0].plot(xs, np.array(y_recall) + np.array(recall_std), linestyle=":", color='m')  # upper SD
    ax[0].plot(xs, y_recall, color='red')
    ax[0].plot(xs, np.array(y_recall) - np.array(recall_std), linestyle=":", color='m')  # lower SD
    mrec_indx = np.argmax(y_recall)
    ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
    ax[0].legend(["Upper std", "Mean Recall", "Lower std"])

    ax[1].plot(xs, np.array(y_fscore) + np.array(fscore_std), linestyle=":", color='c')  # upper SD
    ax[1].plot(xs, y_fscore)
    ax[1].plot(xs, np.array(y_fscore) - np.array(fscore_std), linestyle=":", color='c')  # lower SD
    mfscore_indx = np.argmax(y_fscore)
    ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
    ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])

    fig.set_size_inches(18.5, 10.5)

    ax[0].title.set_text('Recall scores')
    ax[1].title.set_text('F1-scores')
    plt.suptitle(title, fontsize=14)
    plt.xticks(rotation=90)
    plt.show()

# %%
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
                           "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
                           "phone_locations_", "phone_messages", "phone_screen_"]  # , "phone_speech_"]

# sensors_features_groups = ["phone_", "empatica_", "demo_"]

# %%
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)

# %% [markdown]
# ### Visualize sensor groups' F1 and recall scores

# %%
print(sensor_groups_importance_scores)
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
                                                    title="Sequential addition of sensors and their F1 and recall scores")

# %%
# Take the most important feature group and investigate it feature by feature
best_sensor_group = sensor_groups_importance_scores[0][0]  # take the highest-rated sensor group
best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]

# best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)

# xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(best_sensor_features_scores)

# %% [markdown]
# ### Visualize the best sensor's F1 and recall scores

# %%
# print(best_sensor_features_scores)
# plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
#                                                     title="Sequential addition of the best sensor's features with F1 and recall scores")

# %%
# This section iterates over all sensor groups and investigates sequential feature
# importance feature by feature. It also saves the sequence of scores for all
# sensors' features in an Excel file.
seq_columns = ["sensor_name", "feature_sequence", "recall", "f1_score", "recall_std", "f1_std"]
feature_sequence = pd.DataFrame(columns=seq_columns)
for sensor_group in sensor_groups_importance_scores:
    current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
    current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
    xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(current_sensor_features_scores)
    feature_sequence = pd.concat([feature_sequence, pd.DataFrame({"sensor_name": sensor_group[0], "feature_sequence": [xs], "recall": [y_recall],
                                                                  "f1_score": [y_fscore], "recall_std": [recall_std], "f1_std": [fscore_std]})])

    plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
                                                        title=f"Sequential addition of features for {sensor_group[0]} and their F1 and recall scores")

feature_sequence.to_excel("all_sensors_sequential_addition_scores.xlsx", index=False)

# %%
# TODO: a method that reads the data from the Excel file specified above, and a
# method that selects only the features whose score is at most thresh % below the
# maximum value (best for recall, possibly for F1). This method should additionally
# take the threshold as a parameter.
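
# %%
# A minimal sketch of the TODO above, not a final implementation. It assumes the
# column layout written by the loop above and that Excel stores the list-valued
# cells as strings (hence ast.literal_eval). The function name and the `thresh`
# semantics (percent below the per-sensor maximum) are assumptions.
import ast


def select_features_within_threshold(excel_path, thresh=5.0, score_col="recall"):
    """Return, per sensor, the features whose score is within thresh % of the max."""
    scores = pd.read_excel(excel_path)
    selected = {}
    for _, row in scores.iterrows():
        features = ast.literal_eval(str(row["feature_sequence"]))
        values = ast.literal_eval(str(row[score_col]))
        cutoff = max(values) * (1 - thresh / 100)
        selected[row["sensor_name"]] = [f for f, v in zip(features, values) if v >= cutoff]
    return selected


# Example usage (reads the file written above):
# select_features_within_threshold("all_sensors_sequential_addition_scores.xlsx", thresh=5.0)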

# %%
|