stress_at_work_analysis/exploration/expl_features_groups_analys...

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline

import os, sys, math

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn import metrics

# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
# Load the model input table; the four segment-identifying columns become the (multi-)index.
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)

# Categorical columns: two fixed ones plus every column whose name contains
# "mostcommonactivity" or "homelabel".
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()
# mode() can return several rows when there are ties; iloc[0] keeps the first mode per column.
mode_categorical_features = categorical_features.mode().iloc[0]

# Fill missing categorical values with the per-column mode
categorical_features = categorical_features.fillna(mode_categorical_features)

# One-hot encoding (skipped when the categorical frame is empty)
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

# Replace the raw categorical columns with their one-hot encoded counterparts.
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# Binarize the target
# With right=True the intervals are (-1, 0] -> label 0 and (0, 4] -> label 1.
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
# pd.cut returns a categorical column, which is why 'target' is expected in the
# "non-numeric" printout below. `edges` is not used anywhere else in the visible code.
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)

# Sanity check: list the columns that are not numeric and compare full vs. numeric-only shapes.
print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)


# %%
# Add the "demo_" prefix to the demographic features so that they can be selected
# by substring in the same way as the "phone_" and "empatica_" columns.
demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile',
                'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']

# (old name, new name) pairs; rename() with its default settings leaves the frame
# unchanged for any listed name that is not actually a column.
new_names = [(col, "demo_"+col) for col in demo_features]
model_input.rename(columns=dict(new_names), inplace=True)

# Prefixed names after the rename. Note that this list additionally contains 'target'.
# NOTE(review): `demo_features` is not referenced again in the visible code.
demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio',
                'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M',
                'demo_startlanguage_nl', 'demo_startlanguage_sl']

# %%
# Get phone and non-phone columns
import warnings

def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=None, print_flag=False,
                                        random_state=None):
    """
    Score feature groups with cross-validated Gaussian naive Bayes and return the best one.

    For every entry of ``groups_substrings`` a set of feature columns is selected from ``df``
    ('pid' and 'target' are never used as features):

    * entry is ``None``: all feature columns;
    * ``include_group=True``: the columns whose name contains the substring;
    * ``include_group=False``: the columns whose name does NOT contain the substring.

    The columns in ``with_cols`` are always added to the selection (duplicates are dropped).
    A stratified 80 % subset of the rows (fixed ``random_state=19``) is kept, missing values
    are replaced with the column median, and a ``GaussianNB`` model is evaluated with
    shuffled, stratified 5-fold cross-validation on accuracy, precision, recall and F1.

    Parameters:
        df: DataFrame that contains the feature columns plus 'pid' and 'target'.
        groups_substrings: iterable of substrings (or ``None`` entries), one evaluation each.
        include_group: select the matching columns (True) or everything else (False).
        with_cols: extra column names to include in every evaluation; ``None`` means none.
        print_flag: print the mean and standard deviation of every metric per group.
        random_state: seed for the cross-validation shuffle; ``None`` (default) keeps the
            folds random on each call, as before.

    Returns:
        ``(best_sensor, best_recall, best_f1, best_recall_std, best_f1_std)`` for the group
        with the highest mean recall (the first one wins ties). All five values are ``None``
        when ``groups_substrings`` is empty.

    Raises:
        KeyError: if ``df`` lacks the 'pid' or 'target' column, or a requested column.
    """
    extra_cols = [] if with_cols is None else list(with_cols)

    # 'pid' and 'target' are removed once, so no selection below can pick them up.
    features = df.drop(columns=['target', 'pid'])
    target = df['target']

    best_sensor = None
    best_recall_score, best_f1_score = None, None
    best_recall_score_std, best_f1_score_std = None, None

    for fgroup_substr in groups_substrings:
        if fgroup_substr is None:
            feature_group_cols = list(features.columns)
        elif include_group:
            feature_group_cols = [col for col in features.columns if fgroup_substr in col]
        else:
            feature_group_cols = [col for col in features.columns if fgroup_substr not in col]

        # A column can be matched by the group AND be listed in with_cols; selecting it
        # twice would feed the same feature to the model twice, so keep the first occurrence.
        selected_cols = list(dict.fromkeys(feature_group_cols + extra_cols))

        X, y = features[selected_cols], target
        # Only the 80 % part is used; the 20 % hold-out is discarded here.
        X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)

        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        nb = GaussianNB()

        with warnings.catch_warnings():
            # The filter must be active while the scores are computed, not afterwards.
            # NOTE(review): with n_jobs=-1 the scoring may run in worker processes where
            # this filter might not apply - confirm if the warning still shows up.
            warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")

            # NOTE(review): the imputer is fitted on all rows of X before the folds are
            # made, so the medians also see the validation folds of each split.
            model_cv = cross_validate(
                nb,
                X=imputer.fit_transform(X),
                y=y,
                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
                n_jobs=-1,
                scoring=('accuracy', 'precision', 'recall', 'f1')
            )

        acc = np.mean(model_cv['test_accuracy'])
        acc_std = np.std(model_cv['test_accuracy'])

        prec = np.mean(model_cv['test_precision'])
        prec_std = np.std(model_cv['test_precision'])

        rec = np.mean(model_cv['test_recall'])
        rec_std = np.std(model_cv['test_recall'])

        f1 = np.mean(model_cv['test_f1'])
        f1_std = np.std(model_cv['test_f1'])

        if print_flag:
            if include_group:
                print("\nPrediction with", fgroup_substr)
            else:
                print("\nPrediction without", fgroup_substr)
            print("************************************************")
            print(f"Accuracy: {acc} (sd={acc_std})")
            print(f"Precison: {prec} (sd={prec_std})")
            print(f"Recall: {rec} (sd={rec_std})")
            print(f"F1: {f1} (sd={f1_std})\n")

        # Explicit None check: a legitimate score of 0.0 must not be treated as "unset".
        if best_recall_score is None or rec > best_recall_score:
            best_sensor = fgroup_substr
            best_recall_score, best_f1_score = rec, f1
            best_recall_score_std, best_f1_score_std = rec_std, f1_std

    return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std

# %% [markdown]
# ### sensor big feature groups (phone, empatica, demographical)
# With include_group=False every run uses all feature columns whose name does NOT
# contain the given substring (i.e. one big group is left out at a time).
# NOTE(review): the returned tuple is discarded and print_flag keeps its default (False),
# so when executed as a plain script this call produces no visible output.
big_groups_substr = ["phone_", "empatica_", "demo_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)

# %% [markdown]
# ### Empatica sensor groups
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)

# %% [markdown]
# ### Phone sensor groups
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_",
#                 "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)

# %%
# List all sensor groups (phone, Empatica) by column-name substring; the other
# (demographic) columns are handled separately.
# NOTE(review): "phone_messages" has no trailing underscore, unlike the other entries,
# and "phone_speech_" is commented out - confirm both are intentional.

sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
# %%
def find_sensor_group_features_importance(model_input, sensor_groups_strings):
    """
    Rank sensor groups by greedy sequential (forward) addition.

    In every round each remaining group is evaluated with
    make_predictions_with_sensor_groups, together with the columns of all groups
    chosen in earlier rounds (columns whose name starts with a chosen group string).
    The winner of the round is recorded, announced with a print, and removed from
    the candidates. This repeats until no candidate is left.

    Neither argument is modified: both are copied before use.

    Returns a list with one tuple per round, in selection order:
    (group string, recall, F1, recall std, F1 std).
    """
    frame = model_input.copy()
    remaining_groups = sensor_groups_strings.copy()

    sensor_importance_scores = []
    chosen_prefixes = []  # group strings selected so far, in selection order

    # Exactly one candidate is removed per round, so this runs once per group.
    while remaining_groups:
        carried_cols = [
            col for col in frame.columns
            if any(col.startswith(prefix) for prefix in chosen_prefixes)
        ]

        winner, winner_recall, winner_f1, winner_recall_std, winner_f1_std = make_predictions_with_sensor_groups(
            frame,
            groups_substrings=remaining_groups,
            include_group=True,
            with_cols=carried_cols,
        )

        sensor_importance_scores.append(
            (winner, winner_recall, winner_f1, winner_recall_std, winner_f1_std)
        )
        chosen_prefixes.append(winner)
        print(f"\nAdded sensor: {winner}\n")
        remaining_groups.remove(winner)

    return sensor_importance_scores


# %%
# Helper that splits a list of 5-tuples into five parallel lists
def sort_tuples_to_lists(list_of_tuples):
    """
    Split a sequence of score tuples into five parallel lists.

    Each tuple is read by position: element 0 goes to xs unchanged, and elements
    1 to 4 (recall, F1, recall std, F1 std) are rounded to 4 decimal places and
    collected in y_recall, y_fscore, recall_std and fscore_std respectively.
    The order of the input is preserved.

    Returns (xs, y_recall, y_fscore, recall_std, fscore_std).
    """
    rows = list(list_of_tuples)

    xs = [row[0] for row in rows]
    # One rounded list per score position 1..4, in the order they are returned.
    y_recall, y_fscore, recall_std, fscore_std = (
        [round(row[position], 4) for row in rows] for position in range(1, 5)
    )
    return xs, y_recall, y_fscore, recall_std, fscore_std

def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
                                                        title="Sequential addition of features and its F1, and recall scores"):
    """
    Plot recall and F1 along a sequence of added features in two stacked panels.

    The upper panel shows y_recall and the lower panel y_fscore, both against xs
    on a shared x-axis. Each panel also draws dotted lines at mean + std and
    mean - std (recall_std / fscore_std) and marks the position of the maximum
    mean value with a black dot. The figure is sized 18.5 x 10.5 inches, gets
    `title` as its super-title, has its x tick labels rotated by 90 degrees and
    is shown with plt.show(). Nothing is returned.
    """

    def draw_panel(axis, means, spreads, band_color, mean_style, mean_label, heading):
        # Upper band, mean line, lower band - in this order so the legend labels line up.
        axis.plot(xs, np.array(means) + np.array(spreads), linestyle=":", color=band_color)
        axis.plot(xs, means, **mean_style)
        axis.plot(xs, np.array(means) - np.array(spreads), linestyle=":", color=band_color)

        # Highlight the step with the highest mean score.
        peak = np.argmax(means)
        axis.plot(xs[peak], means[peak], "-o", color='black')

        axis.legend(["Upper std", mean_label, "Lower std"])
        axis.title.set_text(heading)

    figure, (recall_axis, fscore_axis) = plt.subplots(nrows=2, sharex=True)

    draw_panel(recall_axis, y_recall, recall_std, 'm', {"color": 'red'}, "Mean Recall", 'Recall scores')
    # The F1 mean line keeps matplotlib's default colour.
    draw_panel(fscore_axis, y_fscore, fscore_std, 'c', {}, "Mean F1-score", 'F1-scores')

    figure.set_size_inches(18.5, 10.5)
    plt.suptitle(title, fontsize=14)
    plt.xticks(rotation=90)
    plt.show()

# %%
# Sensor groups (column-name substrings) used for the sequential-addition analysis below.
# NOTE(review): this repeats, entry for entry, the list already assigned to the same
# name in an earlier cell.
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]

# sensors_features_groups = ["phone_", "empatica_", "demo_"]

# %%
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
# Greedy sequential addition over the sensor groups; one result tuple per round,
# in the order in which the groups were selected.
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
# Split the result tuples into parallel lists for plotting.
xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)

# %% [markdown]
# ### Visualize sensors groups F1 and recall scores
print(sensor_groups_importance_scores)
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
                                                    title="Sequential addition of sensors and its F1, and recall scores")

# %%
# Take the most important feature group and investigate it feature-by-feature
# The first entry is the group chosen in the first round, i.e. the one with the
# highest mean recall when used on its own.
best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
# NOTE(review): best_sensor_features is only consumed by the commented-out code that follows.
best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]

# best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)

# xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(best_sensor_features_scores)

# %% [markdown]
# ### Visualize best sensor's F1 and recall scores
# print(best_sensor_features_scores)
# plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
#                                                     title="Best sensor addition it's features with F1 and recall scores")

# %%
# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
# It also saves the sequence of scores for all sensors' features in excel file
# Columns of the exported table. The two *_std columns are listed explicitly because
# every row carries them as well.
seq_columns = ["sensor_name", "feature_sequence", "recall", "f1_score", "recall_std", "f1_std"]

# Rows are collected in a plain list and turned into a DataFrame once at the end.
# Concatenating onto an initially empty frame inside the loop copies the table on every
# iteration and raises a FutureWarning in recent pandas versions.
feature_sequence_rows = []
for sensor_group in sensor_groups_importance_scores:
    sensor_name = sensor_group[0]

    # Run the sequential addition feature-by-feature within this sensor group:
    # each column of the group is treated as its own candidate.
    current_sensor_features = [col for col in model_input if col.startswith(sensor_name)]
    current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
    xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(current_sensor_features_scores)

    # One row per sensor group; the list-valued cells hold the whole selection sequence.
    feature_sequence_rows.append({"sensor_name": sensor_name, "feature_sequence": xs, "recall": y_recall,
                                  "f1_score": y_fscore, "recall_std": recall_std, "f1_std": fscore_std})

    plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
    title=f"Sequential addition of features for {sensor_name} and its F1, and recall scores")

feature_sequence = pd.DataFrame(feature_sequence_rows, columns=seq_columns)
# The index is not written, so the exported sheet has the same columns and rows as before.
feature_sequence.to_excel("all_sensors_sequential_addition_scores.xlsx", index=False)

# %%
# TODO: add a method that reads the data from the Excel file written above, and a second
# method that keeps only the features whose score is at most a threshold [%] below the
# maximum value (primarily for recall, possibly also for F1). The second method should take
# the threshold as a parameter.

# %%