stress_at_work_analysis/exploration/ml_pipeline_classification.py

131 lines
3.7 KiB
Python
Raw Normal View History

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.14.5
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Imports for the classification pipeline. The commented-out lines are the
# notebook-only setup (IPython display mode, sys.path tweak) kept for reference.
# from IPython.core.interactiveshell import InteractiveShell
from pathlib import Path

# matplotlib inline
# import os
# import sys
import pandas as pd

from machine_learning.helper import (
    impute_encode_categorical_features,
    prepare_cross_validator,
    prepare_sklearn_data_format,
    run_all_classification_models,
)

# InteractiveShell.ast_node_interactivity = "all"
#
# nb_dir = os.path.split(os.getcwd())[0]
# if nb_dir not in sys.path:
#     sys.path.append(nb_dir)
# %%
# Run-level switches for the classification experiment.

# Cross-validation method (could be regarded as a hyperparameter).
# Supported values: "logo", "half_logo", "5kfold".
CV_METHOD = "logo"
print("CV_METHOD: " + CV_METHOD)

# Number of largest/smallest accuracies (of a particular CV) to output.
N_SL = 3

# If True, train and test on a class-balanced dataset obtained by
# undersampling.
UNDERSAMPLING = False
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Locate the input CSV: <PATH_BASE>/<SEGMENT_LENGTH>/input_<TARGET_VARIABLE>_mean.csv
PATH_BASE = Path("E:/STRAWresults/20230415")

SEGMENT_TYPE = "period"
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
SEGMENT_LENGTH = "30_minutes_before"
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)

TARGET_VARIABLE = "JCQ_job_control"
print("TARGET_VARIABLE: " + TARGET_VARIABLE)

# Appraisal targets get the segment type appended to their name
# (e.g. "<name>_period") before the file name is built.
if "appraisal" in TARGET_VARIABLE:
    TARGET_VARIABLE += "_"
    TARGET_VARIABLE += SEGMENT_TYPE

PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
# Load the model input table from the CSV selected above.
model_input = pd.read_csv(PATH_FULL)

# For daily segments, keep only rows whose "local_segment" label contains the
# chosen day type.
if SEGMENT_LENGTH == "daily":
    DAY_LENGTH = "daily"  # or "working"
    print(DAY_LENGTH)
    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Inspect how often each raw target value occurs.
model_input["target"].value_counts()
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Discretise the target into classes and encode it as 0 ("low") / 1 (other).
# bins = [-10, 0, 10]  # bins for z-scored targets
BINS = [-1, 0, 4]  # bins for stressfulness (0-4) target
print("BINS: ", BINS)

# Intervals are right-closed: (-1, 0] -> "low", (0, 4] -> "high".
# Alternative three-class labelling: ['low', 'medium', 'high'].
model_input["target"], edges = pd.cut(
    model_input["target"], bins=BINS, labels=["low", "high"], retbins=True, right=True
)
print(model_input["target"].value_counts())

REMOVE_MEDIUM = True
# Test the *values* for "medium": `"medium" in series` would check the index
# labels instead and therefore never trigger the removal.
if REMOVE_MEDIUM and (model_input["target"] == "medium").any():
    model_input = model_input[model_input["target"] != "medium"]
    print(model_input["target"].value_counts())

# NOTE(review): every label other than "low" is encoded as 1. That includes
# rows that fell outside BINS (NaN after pd.cut, "nan" after astype(str)) --
# confirm this is intended.
model_input["target"] = (
    model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
)
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# UnderSampling: balance the two classes by randomly down-sampling the larger
# one to the size of the smaller one.
if UNDERSAMPLING:
    no_stress = model_input[model_input["target"] == 0]
    stress = model_input[model_input["target"] == 1]

    # Sampling without replacement raises when n exceeds the class size, so
    # always shrink whichever class is larger.
    if len(stress) > len(no_stress):
        stress = stress.sample(n=len(no_stress))
    else:
        no_stress = no_stress.sample(n=len(stress))

    model_input = pd.concat([stress, no_stress], axis=0)
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Impute/encode categorical features via the project helper.
model_input_encoded = impute_encode_categorical_features(model_input)

# %%
# Split the encoded table into features, target and CV groups, then build the
# cross-validator for the configured CV_METHOD.
data_x, data_y, data_groups = prepare_sklearn_data_format(
    model_input_encoded, CV_METHOD
)
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
# %%
# Quick sanity checks on the prepared target before training.
data_y.head()

# %%
data_y.tail()

# %%
data_y.shape

# %%
# Run every classification model from the project helper with the prepared
# data and cross-validator; the result is written to CSV further below.
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
# %%
# Write the scores to
# ../presentation/results/<target>_<segment>_classification<bins>_<cv>.csv
# (relative to the current working directory). The bins appear in the file
# name in their list form, e.g. "[-1, 0, 4]".
PATH_OUTPUT = Path("..") / Path("presentation/results")
output_file_name = (
    f"{TARGET_VARIABLE}_{SEGMENT_LENGTH}_classification{BINS}_{CV_METHOD}.csv"
)
path_output_full = PATH_OUTPUT / output_file_name
scores.to_csv(path_output_full, index=False)