stress_at_work_analysis/exploration/ml_pipeline_regression.py


# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.14.5
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
# %%
import os
import sys
import pandas as pd

# Make the repository root importable before loading project helpers.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from machine_learning.helper import (
    impute_encode_categorical_features,
    prepare_cross_validator,
    prepare_sklearn_data_format,
    run_all_regression_models,
)
# %%
model_input = pd.read_csv(
    "../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
)
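# %%
# Quick look at the loaded features (added for illustration; not part of the
# original pipeline). The `local_segment` column is filtered on in the next cell.
model_input["local_segment"].head()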
# %%
model_input = model_input[model_input["local_segment"].str.contains("daily")]
# %%
CV_METHOD = "logo" # logo, half_logo, 5kfold
model_input_encoded = impute_encode_categorical_features(model_input)
# %%
data_x, data_y, data_groups = prepare_sklearn_data_format(
    model_input_encoded, CV_METHOD
)
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
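# %%
# Illustration (added): assuming "logo" stands for leave-one-group-out, as in
# scikit-learn's LeaveOneGroupOut, each fold holds out all rows of one group,
# so the fold count should match the number of distinct groups. This is an
# assumption about prepare_cross_validator, not taken from its code.
from sklearn.model_selection import LeaveOneGroupOut

LeaveOneGroupOut().get_n_splits(groups=data_groups)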
# %%
data_y.head()
# %%
data_y.tail()
# %%
data_y.shape
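# %%
# Added check: the feature matrix should have one row per target value, assuming
# prepare_sklearn_data_format returns aligned X and y.
data_x.shape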
# %%
scores = run_all_regression_models(data_x, data_y, data_groups, cross_validator)
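# %%
# Added: peek at the collected scores before writing them out (scores is saved
# with .to_csv below, so a DataFrame-like .head() is assumed to be available).
scores.head()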
# %%
scores.to_csv(
    "../presentation/JCQ_supervisor_support_regression_" + CV_METHOD + ".csv",
    index=False,
)