stress_at_work_analysis/exploration/ml_pipeline_classification.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.14.5
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# from IPython.core.interactiveshell import InteractiveShell
from pathlib import Path

# matplotlib inline
# import os
# import sys
import pandas as pd

from machine_learning.helper import (
    impute_encode_categorical_features,
    prepare_cross_validator,
    prepare_sklearn_data_format,
    run_all_classification_models,
)

# InteractiveShell.ast_node_interactivity = "all"
#
# nb_dir = os.path.split(os.getcwd())[0]
# if nb_dir not in sys.path:
#     sys.path.append(nb_dir)


# %%
# Cross-validation method (could be regarded as a hyperparameter).
# Options noted by the author: logo, half_logo, 5kfold
CV_METHOD = "logo"
print(f"CV_METHOD: {CV_METHOD}")

# Number of largest/smallest accuracies (of particular CV) outputs
N_SL = 3

# (bool) If True this will train and test data on balanced dataset
# (using undersampling method)
UNDERSAMPLING = False

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Root folder holding the exported model-input CSV files.
PATH_BASE = Path("E:/STRAWresults/20230415")

# The three settings below select which input file is loaded; each one is
# echoed so that it shows up in the cell output / run log.
SEGMENT_TYPE = "period"
print(f"SEGMENT_TYPE: {SEGMENT_TYPE}")
SEGMENT_LENGTH = "30_minutes_before"
print(f"SEGMENT_LENGTH: {SEGMENT_LENGTH}")
TARGET_VARIABLE = "JCQ_job_control"
print(f"TARGET_VARIABLE: {TARGET_VARIABLE}")

# Target names containing "appraisal" get the segment type appended
# before the input file name is built.
if "appraisal" in TARGET_VARIABLE:
    TARGET_VARIABLE = f"{TARGET_VARIABLE}_{SEGMENT_TYPE}"

# <PATH_BASE>/<SEGMENT_LENGTH>/input_<TARGET_VARIABLE>_mean.csv
PATH_FULL = PATH_BASE.joinpath(SEGMENT_LENGTH, f"input_{TARGET_VARIABLE}_mean.csv")

model_input = pd.read_csv(PATH_FULL)

if SEGMENT_LENGTH == "daily":
    DAY_LENGTH = "daily"  # or "working"
    print(DAY_LENGTH)
    # Keep only the rows whose segment label contains the chosen day length.
    matches_day_length = model_input["local_segment"].str.contains(DAY_LENGTH)
    model_input = model_input[matches_day_length]

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Distribution of the raw target values before binning (shown as cell output).
model_input["target"].value_counts()

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# bins = [-10, 0, 10] # bins for z-scored targets
BINS = [-1, 0, 4]  # bins for stressfulness (0-4) target
print("BINS: ", BINS)
# With right=True the intervals are right-closed: (-1, 0] -> "low" and
# (0, 4] -> "high". The alternative label set noted by the author is
# ['low', 'medium', 'high'], which needs one more bin edge.
model_input["target"], edges = pd.cut(
    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
)
print(model_input["target"].value_counts())

# pd.cut yields NaN for missing targets and for values outside BINS. Left in
# place, astype(str) would turn them into "nan" and the binary encoding below
# would silently label them as class 1, so drop them explicitly instead.
outside_bins = model_input["target"].isna()
if outside_bins.any():
    print("Rows dropped (target missing or outside BINS): ", int(outside_bins.sum()))
    model_input = model_input[~outside_bins].copy()

REMOVE_MEDIUM = True
# Compare against the values: `"medium" in series` tests the index labels,
# not the data, so it can never detect the "medium" class.
if REMOVE_MEDIUM and (model_input["target"] == "medium").any():
    model_input = model_input[model_input["target"] != "medium"].copy()
    print(model_input["target"].value_counts())

# Binary encoding: "low" -> 0, every remaining class -> 1.
model_input["target"] = (model_input["target"].astype(str) != "low").astype("int64")

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# UnderSampling: balance the two classes by randomly downsampling the larger
# one to the size of the smaller one.
if UNDERSAMPLING:
    no_stress = model_input[model_input["target"] == 0]
    stress = model_input[model_input["target"] == 1]

    # Sampling without replacement raises ValueError when more rows are
    # requested than exist, so always shrink whichever class is the majority.
    # NOTE(review): no random_state is passed, so the selected rows differ
    # between runs.
    if len(stress) > len(no_stress):
        stress = stress.sample(n=len(no_stress))
    else:
        no_stress = no_stress.sample(n=len(stress))
    model_input = pd.concat([stress, no_stress], axis=0)


# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Preprocess the model input with the project helper before it is split up.
# NOTE(review): judging by its name, the helper imputes missing values and
# encodes categorical features - confirm in machine_learning.helper.
model_input_encoded = impute_encode_categorical_features(model_input)
# %%
# Derive the three objects that are handed to the cross-validator and to the
# models below. CV_METHOD is passed to both helpers.
# NOTE(review): presumably features, target and grouping labels - confirm the
# exact return types in machine_learning.helper.
data_x, data_y, data_groups = prepare_sklearn_data_format(
    model_input_encoded, CV_METHOD
)
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)

# %%
# Notebook sanity checks on the target: first rows, last rows and shape are
# shown as cell outputs.
data_y.head()

# %%
data_y.tail()
# %%
data_y.shape
# %%
# Run the project's classification models; the result is written to CSV in
# the final cell.
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
# %%
# Results are written relative to the current working directory.
PATH_OUTPUT = Path("..") / Path("presentation/results")
# Create the folder up front: without it to_csv fails on a missing directory
# and the scores computed above are lost.
PATH_OUTPUT.mkdir(parents=True, exist_ok=True)

# File name encodes the run configuration:
# <target>_<segment length>_classification<bins>_<cv method>.csv
path_output_full = PATH_OUTPUT / (
    f"{TARGET_VARIABLE}_{SEGMENT_LENGTH}_classification{BINS}_{CV_METHOD}.csv"
)
scores.to_csv(path_output_full, index=False)
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00			`# ---`
			`# jupyter:`
			`# jupytext:`
			`# formats: ipynb,py:percent`
			`# text_representation:`
			`# extension: .py`
			`# format_name: percent`
			`# format_version: '1.3'`
Update classification runner. 2023-05-10 23:17:44 +02:00			`# jupytext_version: 1.14.5`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00			`# kernelspec:`
			`# display_name: straw2analysis`
			`# language: python`
			`# name: straw2analysis`
			`# ---`

Format comments. 2023-05-11 16:51:38 +02:00			`# %% jupyter={"outputs_hidden": false, "source_hidden": false}`
Set path programmatically. 2023-05-18 16:36:46 +02:00			`# from IPython.core.interactiveshell import InteractiveShell`
			`from pathlib import Path`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00
Set path programmatically. 2023-05-18 16:36:46 +02:00			`# matplotlib inline`
			`# import os`
			`# import sys`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00			`import pandas as pd`

Update classification runner. 2023-05-10 23:17:44 +02:00			`from machine_learning.helper import (`
			`impute_encode_categorical_features,`
			`prepare_cross_validator,`
			`prepare_sklearn_data_format,`
			`run_all_classification_models,`
			`)`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00
Set path programmatically. 2023-05-18 16:36:46 +02:00			`# InteractiveShell.ast_node_interactivity = "all"`
			`#`
			`# nb_dir = os.path.split(os.getcwd())[0]`
			`# if nb_dir not in sys.path:`
			`# sys.path.append(nb_dir)`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00
Unhide jupyter code cells and outputs. 2023-01-04 21:25:12 +01:00
Update classification runner. 2023-05-10 23:17:44 +02:00			`# %%`
			`CV_METHOD = "logo" # logo, half_logo, 5kfold`
			`# Cross-validation method (could be regarded as a hyperparameter)`
Set path programmatically. 2023-05-18 16:36:46 +02:00			`print("CV_METHOD: " + CV_METHOD)`
Update classification runner. 2023-05-10 23:17:44 +02:00			`N_SL = 3 # Number of largest/smallest accuracies (of particular CV) outputs`
			`UNDERSAMPLING = False`
			`# (bool) If True this will train and test data on balanced dataset`
			`# (using undersampling method)`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00
Format comments. 2023-05-11 16:51:38 +02:00			`# %% jupyter={"outputs_hidden": false, "source_hidden": false}`
Set path programmatically. 2023-05-18 16:36:46 +02:00			`PATH_BASE = Path("E:/STRAWresults/20230415")`

			`SEGMENT_TYPE = "period"`
			`print("SEGMENT_TYPE: " + SEGMENT_TYPE)`
			`SEGMENT_LENGTH = "30_minutes_before"`
			`print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)`
Better handling of input filename. 2023-05-18 19:03:53 +02:00			`TARGET_VARIABLE = "JCQ_job_control"`
Set path programmatically. 2023-05-18 16:36:46 +02:00			`print("TARGET_VARIABLE: " + TARGET_VARIABLE)`

Better handling of input filename. 2023-05-18 19:03:53 +02:00			`if "appraisal" in TARGET_VARIABLE:`
			`TARGET_VARIABLE += "_"`
			`TARGET_VARIABLE += SEGMENT_TYPE`

			`PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")`
Set path programmatically. 2023-05-18 16:36:46 +02:00
			`model_input = pd.read_csv(PATH_FULL)`

			`if SEGMENT_LENGTH == "daily":`
			`DAY_LENGTH = "daily" # or "working"`
			`print(DAY_LENGTH)`
			`model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00
Format comments. 2023-05-11 16:51:38 +02:00			`# %% jupyter={"outputs_hidden": false, "source_hidden": false}`
Update classification runner. 2023-05-10 23:17:44 +02:00			`model_input["target"].value_counts()`
Improve general ml classification pipeline script. 2022-11-22 14:31:49 +01:00
Format comments. 2023-05-11 16:51:38 +02:00			`# %% jupyter={"outputs_hidden": false, "source_hidden": false}`
Add undersampling method (with on/off parameter). 2022-12-13 17:01:46 +01:00			`# bins = [-10, 0, 10] # bins for z-scored targets`
Set more parameters as user-specified constants. 2023-05-18 18:06:32 +02:00			`BINS = [-1, 0, 4] # bins for stressfulness (0-4) target`
			`print("BINS: ", BINS)`
Update classification runner. 2023-05-10 23:17:44 +02:00			`model_input["target"], edges = pd.cut(`
Set more parameters as user-specified constants. 2023-05-18 18:06:32 +02:00			`model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True`
Update classification runner. 2023-05-10 23:17:44 +02:00			`) # ['low', 'medium', 'high']`
Set more parameters as user-specified constants. 2023-05-18 18:06:32 +02:00			`print(model_input["target"].value_counts())`
			`REMOVE_MEDIUM = True`
			`if ("medium" in model_input["target"]) and REMOVE_MEDIUM:`
			`model_input = model_input[model_input["target"] != "medium"]`
			`print(model_input["target"].value_counts())`

Update classification runner. 2023-05-10 23:17:44 +02:00			`model_input["target"] = (`
			`model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)`
			`)`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00
Format comments. 2023-05-11 16:51:38 +02:00			`# %% jupyter={"outputs_hidden": false, "source_hidden": false}`
Add undersampling method (with on/off parameter). 2022-12-13 17:01:46 +01:00			`# UnderSampling`
Update classification runner. 2023-05-10 23:17:44 +02:00			`if UNDERSAMPLING:`
			`no_stress = model_input[model_input["target"] == 0]`
			`stress = model_input[model_input["target"] == 1]`
Improve general ml classification pipeline script. 2022-11-22 14:31:49 +01:00
Update classification runner. 2023-05-10 23:17:44 +02:00			`no_stress = no_stress.sample(n=len(stress))`
			`model_input = pd.concat([stress, no_stress], axis=0)`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00

Format comments. 2023-05-11 16:51:38 +02:00			`# %% jupyter={"outputs_hidden": false, "source_hidden": false}`
Update classification runner. 2023-05-10 23:17:44 +02:00			`model_input_encoded = impute_encode_categorical_features(model_input)`
Use stratified downsampling. And run all models with a method from machine_learning.helper. 2023-01-04 21:25:42 +01:00			`# %%`
Update classification runner. 2023-05-10 23:17:44 +02:00			`data_x, data_y, data_groups = prepare_sklearn_data_format(`
			`model_input_encoded, CV_METHOD`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00			`)`
Update classification runner. 2023-05-10 23:17:44 +02:00			`cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)`
Add feature importance check. 2022-12-15 16:43:13 +01:00
Update classification runner. 2023-05-10 23:17:44 +02:00			`# %%`
			`data_y.head()`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00
Update classification runner. 2023-05-10 23:17:44 +02:00			`# %%`
			`data_y.tail()`
			`# %%`
			`data_y.shape`
			`# %%`
			`scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)`
			`# %%`
Set output path programmatically. 2023-05-18 18:40:54 +02:00			`PATH_OUTPUT = Path("..") / Path("presentation/results")`
			`path_output_full = PATH_OUTPUT / (`
Add bins to output filename. 2023-05-18 18:58:19 +02:00			`TARGET_VARIABLE`
			`+ "_"`
			`+ SEGMENT_LENGTH`
			`+ "_classification"`
			`+ str(BINS)`
			`+ "_"`
			`+ CV_METHOD`
			`+ ".csv"`
Add a script for ml classification pipeline. 2022-11-21 14:47:19 +01:00			`)`
Set output path programmatically. 2023-05-18 18:40:54 +02:00			`scores.to_csv(path_output_full, index=False)`