stress_at_work_analysis/exploration/ml_pipeline_regression.py

451 lines
11 KiB
Python
Raw Normal View History

# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %% jupyter={"source_hidden": true}
# %matplotlib inline
import os
import sys
import numpy as np
import pandas as pd
import xgboost as xg
2023-04-21 21:41:00 +02:00
from machine_learning.helper import prepare_regression_model_input
2023-04-21 21:34:54 +02:00
from sklearn import gaussian_process, kernel_ridge, linear_model, svm
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
# %% jupyter={"source_hidden": true}
2023-04-21 21:34:54 +02:00
model_input = pd.read_csv(
"../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
)
# %% jupyter={"source_hidden": true}
2023-04-21 21:34:54 +02:00
cv_method = "half_logo" # logo, half_logo, 5kfold
2023-04-21 21:41:00 +02:00
train_x, data_y, data_groups = prepare_regression_model_input(model_input, cv_method)
# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
train_x,
data_y,
groups=data_groups,
)
# Defaults to 5 k folds in cross_validate method
2023-04-21 21:34:54 +02:00
if cv_method != "logo" and cv_method != "half_logo":
logo = None
# %% jupyter={"source_hidden": true}
sum(data_y.isna())
# %% [markdown]
# ### Baseline: Dummy Regression (mean)
dummy_regr = DummyRegressor(strategy="mean")
# %% jupyter={"source_hidden": true}
2023-04-21 21:34:54 +02:00
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true}
dummy_regressor = cross_validate(
dummy_regr,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
2023-04-21 21:34:54 +02:00
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
)
2023-04-21 21:34:54 +02:00
print(
"Negative Mean Squared Error",
np.median(dummy_regressor["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(dummy_regressor["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(dummy_regressor["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(dummy_regressor["test_r2"]))
# %% [markdown]
# ### Linear Regression
# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()
# %% jupyter={"source_hidden": true}
2023-04-21 21:34:54 +02:00
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true}
lin_reg_scores = cross_validate(
lin_reg_rapids,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
2023-04-21 21:34:54 +02:00
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
)
print(
"Negative Mean Squared Error",
np.median(lin_reg_scores["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(lin_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(lin_reg_scores["test_neg_root_mean_squared_error"]),
)
2023-04-21 21:34:54 +02:00
print("R2", np.median(lin_reg_scores["test_r2"]))
# %% [markdown]
# ### XGBRegressor Linear Regression
# %% jupyter={"source_hidden": true}
2023-04-21 21:34:54 +02:00
xgb_r = xg.XGBRegressor(objective="reg:squarederror", n_estimators=10)
# %% jupyter={"source_hidden": true}
2023-04-21 21:34:54 +02:00
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true}
xgb_reg_scores = cross_validate(
xgb_r,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
2023-04-21 21:34:54 +02:00
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
)
2023-04-21 21:34:54 +02:00
print(
"Negative Mean Squared Error",
np.median(xgb_reg_scores["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(xgb_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(xgb_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(xgb_reg_scores["test_r2"]))
# %% [markdown]
# ### XGBRegressor Pseudo Huber Error Regression
# %% jupyter={"source_hidden": true}
2023-04-21 21:34:54 +02:00
xgb_psuedo_huber_r = xg.XGBRegressor(objective="reg:pseudohubererror", n_estimators=10)
# %% jupyter={"source_hidden": true}
2023-04-21 21:34:54 +02:00
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# %% jupyter={"source_hidden": true}
xgb_psuedo_huber_reg_scores = cross_validate(
xgb_psuedo_huber_r,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
2023-04-21 21:34:54 +02:00
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
)
print(
"Negative Mean Squared Error",
np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_absolute_error"]),
)
2023-04-21 21:34:54 +02:00
print(
"Negative Root Mean Squared Error",
np.median(xgb_psuedo_huber_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(xgb_psuedo_huber_reg_scores["test_r2"]))
# %% [markdown]
# ### Ridge regression
# %% jupyter={"source_hidden": true}
2023-04-21 21:34:54 +02:00
ridge_reg = linear_model.Ridge(alpha=0.5)
# %% tags=[] jupyter={"source_hidden": true}
ridge_reg_scores = cross_validate(
ridge_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
2023-04-21 21:34:54 +02:00
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
)
print(
"Negative Mean Squared Error",
np.median(ridge_reg_scores["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(ridge_reg_scores["test_neg_mean_absolute_error"]),
)
2023-04-21 21:34:54 +02:00
print(
"Negative Root Mean Squared Error",
np.median(ridge_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(ridge_reg_scores["test_r2"]))
# %% [markdown]
# ### Lasso
# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)
# %% jupyter={"source_hidden": true}
lasso_reg_score = cross_validate(
lasso_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
2023-04-21 21:34:54 +02:00
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
)
print(
"Negative Mean Squared Error",
np.median(lasso_reg_score["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(lasso_reg_score["test_neg_mean_absolute_error"]),
)
2023-04-21 21:34:54 +02:00
print(
"Negative Root Mean Squared Error",
np.median(lasso_reg_score["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(lasso_reg_score["test_r2"]))
# %% [markdown]
# ### Bayesian Ridge
# %% jupyter={"source_hidden": true}
bayesian_ridge_reg = linear_model.BayesianRidge()
# %% jupyter={"source_hidden": true}
bayesian_ridge_reg_score = cross_validate(
bayesian_ridge_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
2023-04-21 21:34:54 +02:00
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
)
print(
"Negative Mean Squared Error",
np.median(bayesian_ridge_reg_score["test_neg_mean_squared_error"]),
)
print(
"Negative Mean Absolute Error",
np.median(bayesian_ridge_reg_score["test_neg_mean_absolute_error"]),
)
2023-04-21 21:34:54 +02:00
print(
"Negative Root Mean Squared Error",
np.median(bayesian_ridge_reg_score["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(bayesian_ridge_reg_score["test_r2"]))
# %% [markdown]
# ### RANSAC (outlier robust regression)
# %% jupyter={"source_hidden": true}
ransac_reg = linear_model.RANSACRegressor()
# %% jupyter={"source_hidden": true}
ransac_reg_scores = cross_validate(
ransac_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
2023-04-21 21:34:54 +02:00
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
)
print(
"Negative Mean Squared Error",
np.median(ransac_reg_scores["test_neg_mean_squared_error"]),
)
2023-04-21 21:34:54 +02:00
print(
"Negative Mean Absolute Error",
np.median(ransac_reg_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(ransac_reg_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(ransac_reg_scores["test_r2"]))
# %% [markdown]
# ### Support vector regression
# %% jupyter={"source_hidden": true}
svr = svm.SVR()
# %% jupyter={"source_hidden": true}
svr_scores = cross_validate(
svr,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
2023-04-21 21:34:54 +02:00
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
)
print(
"Negative Mean Squared Error", np.median(svr_scores["test_neg_mean_squared_error"])
)
2023-04-21 21:34:54 +02:00
print(
"Negative Mean Absolute Error",
np.median(svr_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(svr_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(svr_scores["test_r2"]))
# %% [markdown]
# ### Kernel Ridge regression
# %% jupyter={"source_hidden": true}
kridge = kernel_ridge.KernelRidge()
# %% jupyter={"source_hidden": true}
kridge_scores = cross_validate(
kridge,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
2023-04-21 21:34:54 +02:00
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
)
print(
"Negative Mean Squared Error",
np.median(kridge_scores["test_neg_mean_squared_error"]),
)
2023-04-21 21:34:54 +02:00
print(
"Negative Mean Absolute Error",
np.median(kridge_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(kridge_scores["test_neg_root_mean_squared_error"]),
)
print("R2", np.median(kridge_scores["test_r2"]))
# %% [markdown]
# ### Gaussian Process Regression
# %% jupyter={"source_hidden": true}
gpr = gaussian_process.GaussianProcessRegressor()
# %% jupyter={"source_hidden": true}
gpr_scores = cross_validate(
gpr,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
2023-04-21 21:34:54 +02:00
scoring=(
"r2",
"neg_mean_squared_error",
"neg_mean_absolute_error",
"neg_root_mean_squared_error",
),
)
print(
"Negative Mean Squared Error", np.median(gpr_scores["test_neg_mean_squared_error"])
)
print(
"Negative Mean Absolute Error",
np.median(gpr_scores["test_neg_mean_absolute_error"]),
)
print(
"Negative Root Mean Squared Error",
np.median(gpr_scores["test_neg_root_mean_squared_error"]),
)
2023-04-21 21:34:54 +02:00
print("R2", np.median(gpr_scores["test_r2"]))
# %%