stress_at_work_analysis/exploration/ml_pipeline_classification.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.14.5
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline
import os
import sys

import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

from machine_learning.helper import (
    impute_encode_categorical_features,
    prepare_cross_validator,
    prepare_sklearn_data_format,
    run_all_classification_models,
)

# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"

# Make the parent of the current working directory importable.
# NOTE(review): the `machine_learning.helper` import above is executed before
# this path is appended, so it cannot rely on it -- confirm the intended order.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)


# %%
# Cross-validation method (could be regarded as a hyperparameter).
# Options noted by the author: "logo", "half_logo", "5kfold".
# The value is passed to the data-preparation helpers and is also embedded in
# the name of the results file written at the end of the notebook.
CV_METHOD = "logo"
# Number of largest/smallest accuracies (of a particular CV) to output.
# NOTE(review): not referenced anywhere in the visible part of this file.
N_SL = 3
# (bool) If True, train and test on a class-balanced dataset obtained by
# undersampling; if False, use the data as loaded.
UNDERSAMPLING = False

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# Load the prepared model input table; it must contain a "target" column.
model_input = pd.read_csv(
    filepath_or_buffer=(
        "E:/STRAWresults/20230415/daily/input_PANAS_negative_affect_mean.csv"
    )
)
# Optional: exclude the Empatica temperature features by uncommenting
# model_input = model_input[
#     model_input.columns.drop(
#         list(model_input.filter(regex='empatica_temperature'))
#     )
# ]

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# Distribution of the raw target values before binning.
model_input["target"].value_counts()

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# Binarize the target: values in (-1, 0] become "low" (0) and values in
# (0, 4] become "high" (1).
# NOTE(review): the bins below are described as stressfulness bins, while the
# file loaded above is a PANAS negative-affect target -- confirm they apply.
# bins = [-10, 0, 10] # bins for z-scored targets
bins = [-1, 0, 4]  # bins for stressfulness (0-4) target
model_input["target"], edges = pd.cut(
    model_input.target, bins=bins, labels=["low", "high"], retbins=True, right=True
)  # ['low', 'medium', 'high']

# pd.cut returns NaN for targets that are missing or fall outside the bin
# edges. Such rows have no valid label, so drop them explicitly instead of
# letting them be silently encoded as the "high" class further down.
unbinned = model_input["target"].isna()
if unbinned.any():
    print(
        f"Dropping {int(unbinned.sum())} row(s) whose target is missing "
        f"or outside the bins {bins}."
    )
    model_input = model_input.loc[~unbinned].copy()

model_input["target"].value_counts(), edges
# model_input = model_input[model_input['target'] != "medium"]

# Encode the two categories as integers: "low" -> 0, "high" -> 1.
model_input["target"] = (model_input["target"] == "high").astype("int64")

model_input["target"].value_counts()

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# UnderSampling
if UNDERSAMPLING:
    # Split the rows by binary label (0 = low / no stress, 1 = high / stress).
    no_stress = model_input[model_input["target"] == 0]
    stress = model_input[model_input["target"] == 1]

    # Balance the classes by downsampling to the size of the smaller one.
    # Sampling more rows than a class contains (without replacement) raises
    # a ValueError, so the "stress" class is reduced too when it is the
    # larger of the two.
    n_per_class = min(len(stress), len(no_stress))
    if len(stress) > n_per_class:
        stress = stress.sample(n=n_per_class)
    # NOTE(review): no random_state is set, so the sample differs per run.
    no_stress = no_stress.sample(n=n_per_class)
    model_input = pd.concat([stress, no_stress], axis=0)


# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# Run the project helper that imputes/encodes categorical features.
model_input_encoded = impute_encode_categorical_features(model_input)
# %%
# Split the encoded table into features, target and groups, then build the
# cross-validator for the configured CV method.
sklearn_inputs = prepare_sklearn_data_format(model_input_encoded, CV_METHOD)
data_x, data_y, data_groups = sklearn_inputs
cross_validator = prepare_cross_validator(
    data_x,
    data_y,
    data_groups,
    CV_METHOD,
)

# %%
# Sanity check: first entries of the target returned by
# prepare_sklearn_data_format.
data_y.head()

# %%
# Sanity check: last entries of the target.
data_y.tail()
# %%
# Sanity check: dimensions of the target.
data_y.shape
# %%
# Evaluate all classification models with the prepared cross-validator.
scores = run_all_classification_models(
    data_x,
    data_y,
    data_groups,
    cross_validator,
)
# %%
# Persist the scores; the CV method is part of the file name.
# NOTE(review): the file name mentions "JCQ_supervisor_support_regression"
# although this notebook runs classification on a PANAS input -- confirm the
# name is intended before relying on the output file.
scores_path = "../presentation/JCQ_supervisor_support_regression_" + CV_METHOD + ".csv"
scores.to_csv(scores_path, index=False)