Reformat ml_pipeline_regression.py

master
junos 2023-04-21 21:34:54 +02:00
parent 583ee82e80
commit 48118f125d
1 changed file with 256 additions and 99 deletions

ml_pipeline_regression.py

@@ -15,64 +15,69 @@
 # %% jupyter={"source_hidden": true}
 # %matplotlib inline
-import datetime
-import importlib
 import os
 import sys
 import numpy as np
-import matplotlib.pyplot as plt
 import pandas as pd
-import seaborn as sns
-import yaml
-from pyprojroot import here
-from sklearn import linear_model, svm, kernel_ridge, gaussian_process
-from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
-from sklearn.metrics import mean_squared_error, r2_score
-from sklearn.impute import SimpleImputer
-from sklearn.dummy import DummyRegressor
 import xgboost as xg
-from IPython.core.interactiveshell import InteractiveShell
-InteractiveShell.ast_node_interactivity = "all"
+from sklearn import gaussian_process, kernel_ridge, linear_model, svm
+from sklearn.dummy import DummyRegressor
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+# from IPython.core.interactiveshell import InteractiveShell
+# InteractiveShell.ast_node_interactivity = "all"
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
     sys.path.append(nb_dir)
-import machine_learning.features_sensor
-import machine_learning.labels
-import machine_learning.model
+# %% jupyter={"source_hidden": true}
+model_input = pd.read_csv(
+    "../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
+)
+# %% [markdown]
+# # RAPIDS models
+# %% [markdown]
+# ## PANAS negative affect
 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
-# %% jupyter={"source_hidden": true}
-index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
-#if "pid" in model_input.columns:
-#    index_columns.append("pid")
+index_columns = [
+    "local_segment",
+    "local_segment_label",
+    "local_segment_start_datetime",
+    "local_segment_end_datetime",
+]
+# if "pid" in model_input.columns:
+#     index_columns.append("pid")
 model_input.set_index(index_columns, inplace=True)
-cv_method = 'half_logo' # logo, half_logo, 5kfold
+cv_method = "half_logo"  # logo, half_logo, 5kfold
-if cv_method == 'logo':
+if cv_method == "logo":
-    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+    data_x, data_y, data_groups = (
+        model_input.drop(["target", "pid"], axis=1),
+        model_input["target"],
+        model_input["pid"],
+    )
 else:
-    model_input['pid_index'] = model_input.groupby('pid').cumcount()
-    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
-    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
-    model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
-    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
+    model_input["pid_index"] = model_input.groupby("pid").cumcount()
+    model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
+    model_input["pid_index"] = (
+        model_input["pid_index"] / model_input["pid_count"] + 1
+    ).round()
+    model_input["pid_half"] = (
+        model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
+    )
+    data_x, data_y, data_groups = (
+        model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
+        model_input["target"],
+        model_input["pid_half"],
+    )
 # %% jupyter={"source_hidden": true}
 categorical_feature_colnames = ["gender", "startlanguage"]
-additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+additional_categorical_features = [
+    col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col
+]
 categorical_feature_colnames += additional_categorical_features
 # %% jupyter={"source_hidden": true}
@@ -109,7 +114,7 @@ logo.get_n_splits(
 )
 # Defaults to 5 k folds in cross_validate method
-if cv_method != 'logo' and cv_method != 'half_logo':
+if cv_method != "logo" and cv_method != "half_logo":
     logo = None
 # %% jupyter={"source_hidden": true}
@@ -120,7 +125,7 @@ sum(data_y.isna())
 dummy_regr = DummyRegressor(strategy="mean")
 # %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
 # %% jupyter={"source_hidden": true}
 dummy_regressor = cross_validate(
@@ -130,12 +135,26 @@ dummy_regressor = cross_validate(
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+    scoring=(
+        "r2",
+        "neg_mean_squared_error",
+        "neg_mean_absolute_error",
+        "neg_root_mean_squared_error",
+    ),
 )
-print("Negative Mean Squared Error", np.median(dummy_regressor['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(dummy_regressor['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(dummy_regressor['test_neg_root_mean_squared_error']))
-print("R2", np.median(dummy_regressor['test_r2']))
+print(
+    "Negative Mean Squared Error",
+    np.median(dummy_regressor["test_neg_mean_squared_error"]),
+)
+print(
+    "Negative Mean Absolute Error",
+    np.median(dummy_regressor["test_neg_mean_absolute_error"]),
+)
+print(
+    "Negative Root Mean Squared Error",
+    np.median(dummy_regressor["test_neg_root_mean_squared_error"]),
+)
+print("R2", np.median(dummy_regressor["test_r2"]))
 # %% [markdown]
 # ### Linear Regression
@@ -143,7 +162,7 @@ print("R2", np.median(dummy_regressor['test_r2']))
 # %% jupyter={"source_hidden": true}
 lin_reg_rapids = linear_model.LinearRegression()
 # %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
 # %% jupyter={"source_hidden": true}
 lin_reg_scores = cross_validate(
@@ -153,19 +172,33 @@ lin_reg_scores = cross_validate(
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+    scoring=(
+        "r2",
+        "neg_mean_squared_error",
+        "neg_mean_absolute_error",
+        "neg_root_mean_squared_error",
+    ),
 )
-print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(lin_reg_scores['test_r2']))
+print(
+    "Negative Mean Squared Error",
+    np.median(lin_reg_scores["test_neg_mean_squared_error"]),
+)
+print(
+    "Negative Mean Absolute Error",
+    np.median(lin_reg_scores["test_neg_mean_absolute_error"]),
+)
+print(
+    "Negative Root Mean Squared Error",
+    np.median(lin_reg_scores["test_neg_root_mean_squared_error"]),
+)
+print("R2", np.median(lin_reg_scores["test_r2"]))
 # %% [markdown]
 # ### XGBRegressor Linear Regression
 # %% jupyter={"source_hidden": true}
-xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10)
+xgb_r = xg.XGBRegressor(objective="reg:squarederror", n_estimators=10)
 # %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
 # %% jupyter={"source_hidden": true}
 xgb_reg_scores = cross_validate(
@@ -175,19 +208,33 @@ xgb_reg_scores = cross_validate(
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+    scoring=(
+        "r2",
+        "neg_mean_squared_error",
+        "neg_mean_absolute_error",
+        "neg_root_mean_squared_error",
+    ),
 )
-print("Negative Mean Squared Error", np.median(xgb_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(xgb_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(xgb_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(xgb_reg_scores['test_r2']))
+print(
+    "Negative Mean Squared Error",
+    np.median(xgb_reg_scores["test_neg_mean_squared_error"]),
+)
+print(
+    "Negative Mean Absolute Error",
+    np.median(xgb_reg_scores["test_neg_mean_absolute_error"]),
+)
+print(
+    "Negative Root Mean Squared Error",
+    np.median(xgb_reg_scores["test_neg_root_mean_squared_error"]),
+)
+print("R2", np.median(xgb_reg_scores["test_r2"]))
 # %% [markdown]
 # ### XGBRegressor Pseudo Huber Error Regression
 # %% jupyter={"source_hidden": true}
-xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10)
+xgb_psuedo_huber_r = xg.XGBRegressor(objective="reg:pseudohubererror", n_estimators=10)
 # %% jupyter={"source_hidden": true}
-imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
 # %% jupyter={"source_hidden": true}
 xgb_psuedo_huber_reg_scores = cross_validate(
@@ -197,18 +244,32 @@ xgb_psuedo_huber_reg_scores = cross_validate(
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+    scoring=(
+        "r2",
+        "neg_mean_squared_error",
+        "neg_mean_absolute_error",
+        "neg_root_mean_squared_error",
+    ),
 )
-print("Negative Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(xgb_psuedo_huber_reg_scores['test_r2']))
+print(
+    "Negative Mean Squared Error",
+    np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_squared_error"]),
+)
+print(
+    "Negative Mean Absolute Error",
+    np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_absolute_error"]),
+)
+print(
+    "Negative Root Mean Squared Error",
+    np.median(xgb_psuedo_huber_reg_scores["test_neg_root_mean_squared_error"]),
+)
+print("R2", np.median(xgb_psuedo_huber_reg_scores["test_r2"]))
 # %% [markdown]
 # ### Ridge regression
 # %% jupyter={"source_hidden": true}
-ridge_reg = linear_model.Ridge(alpha=.5)
+ridge_reg = linear_model.Ridge(alpha=0.5)
 # %% tags=[] jupyter={"source_hidden": true}
 ridge_reg_scores = cross_validate(
@@ -218,12 +279,26 @@ ridge_reg_scores = cross_validate(
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+    scoring=(
+        "r2",
+        "neg_mean_squared_error",
+        "neg_mean_absolute_error",
+        "neg_root_mean_squared_error",
+    ),
 )
-print("Negative Mean Squared Error", np.median(ridge_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(ridge_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(ridge_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(ridge_reg_scores['test_r2']))
+print(
+    "Negative Mean Squared Error",
+    np.median(ridge_reg_scores["test_neg_mean_squared_error"]),
+)
+print(
+    "Negative Mean Absolute Error",
+    np.median(ridge_reg_scores["test_neg_mean_absolute_error"]),
+)
+print(
+    "Negative Root Mean Squared Error",
+    np.median(ridge_reg_scores["test_neg_root_mean_squared_error"]),
+)
+print("R2", np.median(ridge_reg_scores["test_r2"]))
 # %% [markdown]
 # ### Lasso
@@ -239,12 +314,26 @@ lasso_reg_score = cross_validate(
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+    scoring=(
+        "r2",
+        "neg_mean_squared_error",
+        "neg_mean_absolute_error",
+        "neg_root_mean_squared_error",
+    ),
 )
-print("Negative Mean Squared Error", np.median(lasso_reg_score['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(lasso_reg_score['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(lasso_reg_score['test_neg_root_mean_squared_error']))
-print("R2", np.median(lasso_reg_score['test_r2']))
+print(
+    "Negative Mean Squared Error",
+    np.median(lasso_reg_score["test_neg_mean_squared_error"]),
+)
+print(
+    "Negative Mean Absolute Error",
+    np.median(lasso_reg_score["test_neg_mean_absolute_error"]),
+)
+print(
+    "Negative Root Mean Squared Error",
+    np.median(lasso_reg_score["test_neg_root_mean_squared_error"]),
+)
+print("R2", np.median(lasso_reg_score["test_r2"]))
 # %% [markdown]
 # ### Bayesian Ridge
@@ -260,12 +349,26 @@ bayesian_ridge_reg_score = cross_validate(
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+    scoring=(
+        "r2",
+        "neg_mean_squared_error",
+        "neg_mean_absolute_error",
+        "neg_root_mean_squared_error",
+    ),
 )
-print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
-print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
+print(
+    "Negative Mean Squared Error",
+    np.median(bayesian_ridge_reg_score["test_neg_mean_squared_error"]),
+)
+print(
+    "Negative Mean Absolute Error",
+    np.median(bayesian_ridge_reg_score["test_neg_mean_absolute_error"]),
+)
+print(
+    "Negative Root Mean Squared Error",
+    np.median(bayesian_ridge_reg_score["test_neg_root_mean_squared_error"]),
+)
+print("R2", np.median(bayesian_ridge_reg_score["test_r2"]))
 # %% [markdown]
 # ### RANSAC (outlier robust regression)
@@ -281,12 +384,26 @@ ransac_reg_scores = cross_validate(
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+    scoring=(
+        "r2",
+        "neg_mean_squared_error",
+        "neg_mean_absolute_error",
+        "neg_root_mean_squared_error",
+    ),
 )
-print("Negative Mean Squared Error", np.median(ransac_reg_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(ransac_reg_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(ransac_reg_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(ransac_reg_scores['test_r2']))
+print(
+    "Negative Mean Squared Error",
+    np.median(ransac_reg_scores["test_neg_mean_squared_error"]),
+)
+print(
+    "Negative Mean Absolute Error",
+    np.median(ransac_reg_scores["test_neg_mean_absolute_error"]),
+)
+print(
+    "Negative Root Mean Squared Error",
+    np.median(ransac_reg_scores["test_neg_root_mean_squared_error"]),
+)
+print("R2", np.median(ransac_reg_scores["test_r2"]))
 # %% [markdown]
 # ### Support vector regression
@@ -302,12 +419,25 @@ svr_scores = cross_validate(
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+    scoring=(
+        "r2",
+        "neg_mean_squared_error",
+        "neg_mean_absolute_error",
+        "neg_root_mean_squared_error",
+    ),
 )
-print("Negative Mean Squared Error", np.median(svr_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(svr_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(svr_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(svr_scores['test_r2']))
+print(
+    "Negative Mean Squared Error", np.median(svr_scores["test_neg_mean_squared_error"])
+)
+print(
+    "Negative Mean Absolute Error",
+    np.median(svr_scores["test_neg_mean_absolute_error"]),
+)
+print(
+    "Negative Root Mean Squared Error",
+    np.median(svr_scores["test_neg_root_mean_squared_error"]),
+)
+print("R2", np.median(svr_scores["test_r2"]))
 # %% [markdown]
 # ### Kernel Ridge regression
@@ -323,12 +453,26 @@ kridge_scores = cross_validate(
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+    scoring=(
+        "r2",
+        "neg_mean_squared_error",
+        "neg_mean_absolute_error",
+        "neg_root_mean_squared_error",
+    ),
 )
-print("Negative Mean Squared Error", np.median(kridge_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(kridge_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(kridge_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(kridge_scores['test_r2']))
+print(
+    "Negative Mean Squared Error",
+    np.median(kridge_scores["test_neg_mean_squared_error"]),
+)
+print(
+    "Negative Mean Absolute Error",
+    np.median(kridge_scores["test_neg_mean_absolute_error"]),
+)
+print(
+    "Negative Root Mean Squared Error",
+    np.median(kridge_scores["test_neg_root_mean_squared_error"]),
+)
+print("R2", np.median(kridge_scores["test_r2"]))
 # %% [markdown]
 # ### Gaussian Process Regression
@@ -345,11 +489,24 @@ gpr_scores = cross_validate(
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
-    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
+    scoring=(
+        "r2",
+        "neg_mean_squared_error",
+        "neg_mean_absolute_error",
+        "neg_root_mean_squared_error",
+    ),
 )
-print("Negative Mean Squared Error", np.median(gpr_scores['test_neg_mean_squared_error']))
-print("Negative Mean Absolute Error", np.median(gpr_scores['test_neg_mean_absolute_error']))
-print("Negative Root Mean Squared Error", np.median(gpr_scores['test_neg_root_mean_squared_error']))
-print("R2", np.median(gpr_scores['test_r2']))
+print(
+    "Negative Mean Squared Error", np.median(gpr_scores["test_neg_mean_squared_error"])
+)
+print(
+    "Negative Mean Absolute Error",
+    np.median(gpr_scores["test_neg_mean_absolute_error"]),
+)
+print(
+    "Negative Root Mean Squared Error",
+    np.median(gpr_scores["test_neg_root_mean_squared_error"]),
+)
+print("R2", np.median(gpr_scores["test_r2"]))
 # %%
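
Every estimator in this file is evaluated with the same cross_validate call (LeaveOneGroupOut or the half-participant grouping as cv, the same four scorers) and the same median-per-fold reporting. As a minimal sketch only, not part of this commit, the repeated blocks could be collapsed into a helper along the lines below; the function name evaluate_regressor and the SCORING constant are illustrative, and the caller is assumed to pass the already-imputed feature matrix used in the notebook.

# Illustrative helper, not part of the commit: run the shared cross-validation
# setup for one estimator and report the median of each score across folds.
import numpy as np
from sklearn.model_selection import cross_validate

SCORING = (
    "r2",
    "neg_mean_squared_error",
    "neg_mean_absolute_error",
    "neg_root_mean_squared_error",
)


def evaluate_regressor(estimator, X, y, groups, cv):
    scores = cross_validate(
        estimator,
        X=X,
        y=y,
        groups=groups,
        cv=cv,
        n_jobs=-1,
        scoring=SCORING,
    )
    # Same reporting as the print() blocks above: median over CV folds.
    return {metric: np.median(scores["test_" + metric]) for metric in SCORING}

It would be called, for example, as evaluate_regressor(linear_model.Ridge(alpha=0.5), imputer.fit_transform(data_x), data_y, data_groups, logo); whether the notebook imputes before or inside cross_validate is outside the hunks shown here, so treat that call as an assumption.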