# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline
import math
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
# Fill missing values with each column's mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# One-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)
# Binarize the target: 0 stays 0 (not stressful), 1-4 become 1 (stressful)
bins = [-1, 0, 4]  # bin edges for the stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
# %%
# Helper: evaluate a random forest with or without given sensor groups
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
    """
    Make predictions with (or without) groups of sensor features.

    For each substring in groups_substrings, select the columns of df whose
    names contain it (include_group=True) or do not contain it
    (include_group=False), always excluding the 'pid' and 'target' columns.
    A quarter of the rows is discarded by an initial train_test_split; the
    remainder is split 80/20 into training and test sets. Missing values are
    imputed with column medians, a RandomForestClassifier is fitted on the
    training set, and accuracy, precision, recall and F1 on the test set are
    printed for each group.
    """
    # Allow a single substring to be passed instead of a list
    # (a bare string would otherwise be iterated character by character).
    if isinstance(groups_substrings, str):
        groups_substrings = [groups_substrings]
    for fgroup_substr in groups_substrings:
        if include_group:
            feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
        else:
            feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
        X, y = df[feature_group_cols], df['target']
        # Impute missing values with column medians
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        X = imputer.fit_transform(X)
        # Discard 25% of the rows, then split the rest 80/20 into train/test
        X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)
        rfc = RandomForestClassifier(random_state=0)
        rfc.fit(X_train, y_train)
        y_pred = rfc.predict(X_test)
        if include_group:
            print("\nPrediction with", fgroup_substr)
        else:
            print("\nPrediction without", fgroup_substr)
        print("************************************************")
        print("Accuracy", metrics.accuracy_score(y_test, y_pred))
        print("Precision", metrics.precision_score(y_test, y_pred))
        print("Recall", metrics.recall_score(y_test, y_pred))
        print("F1", metrics.f1_score(y_test, y_pred), "\n")
# %%
model_input
# %%
# Predictions without each sensor group in turn
groups_substr = ["_", "phone_", "empatica_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
# %%
# Baseline with all features (every feature column name contains "_"),
# then predictions leaving out one Empatica sensor group at a time
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
# %%
# Create an empty list to store the feature column groups
feature_column_groups = []
# Iterate through each column in model_input
for column in model_input.columns:
    # Split the column name by '_'
    split_column = column.split('_')
    # Build every cumulative '_'-separated prefix of the column name
    prefix = ''
    for part in split_column:
        prefix += part + '_'
        # Record the prefix if it has not been seen before
        if prefix not in feature_column_groups:
            feature_column_groups.append(prefix)
# Print all distinct column-name prefixes; a sketch keeping only prefixes
# shared by more than one column follows below
print(feature_column_groups)
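# %%
# Sketch of the "more than one column" idea mentioned above (assumption: a
# prefix is a useful feature group only if at least two columns share it).
from collections import Counter

prefix_counts = Counter()
for column in model_input.columns:
    parts = column.split('_')
    for i in range(len(parts)):
        # Count every cumulative '_'-separated prefix, as in the loop above
        prefix_counts['_'.join(parts[: i + 1]) + '_'] += 1
shared_prefixes = [p for p, n in prefix_counts.items() if n > 1]
print(shared_prefixes)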
# %%
# Write out all the sensors (phone, empatica) and separate the other (demographic) cols as well (see the sketch below)
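# %%
# Sketch towards the note above (assumption: sensor columns start with
# 'phone_' or 'empatica_'; everything else is demographic or the target).
phone_cols = [col for col in model_input.columns if col.startswith("phone_")]
empatica_cols = [col for col in model_input.columns if col.startswith("empatica_")]
other_cols = [
    col
    for col in model_input.columns
    if col not in phone_cols and col not in empatica_cols and col != "target"
]
print(len(phone_cols), "phone,", len(empatica_cols), "empatica,", len(other_cols), "other columns")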