# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": true}
# %matplotlib inline
import os
import sys

import numpy as np
import pandas as pd
import xgboost as xg
from sklearn import gaussian_process, kernel_ridge, linear_model, svm
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from sklearn.pipeline import make_pipeline

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

# Make the parent of the working directory importable *before* importing
# the project-local package below; the original order attempted the import
# first, so the path tweak could not help it.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from machine_learning.helper import (  # noqa: E402 isort:skip
    prepare_regression_model_input,
)

# %% jupyter={"source_hidden": true}
model_input = pd.read_csv(
    "../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
)

# %% jupyter={"source_hidden": true}
cv_method = "half_logo"  # logo, half_logo, 5kfold

train_x, data_y, data_groups = prepare_regression_model_input(model_input, cv_method)

# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
)

# Defaults to 5 k folds in cross_validate method
if cv_method not in ("logo", "half_logo"):
    logo = None

# %% jupyter={"source_hidden": true}
# Number of missing target values.
data_y.isna().sum()

# %% jupyter={"source_hidden": true}
# Scorers requested from cross_validate for every model.
SCORING = (
    "r2",
    "neg_mean_squared_error",
    "neg_mean_absolute_error",
    "neg_root_mean_squared_error",
)

# (printed label, key in the cross_validate result) pairs, in report order.
REPORTED_SCORES = (
    ("Negative Mean Squared Error", "test_neg_mean_squared_error"),
    ("Negative Mean Absolute Error", "test_neg_mean_absolute_error"),
    ("Negative Root Mean Squared Error", "test_neg_root_mean_squared_error"),
    ("R2", "test_r2"),
)


def evaluate_regressor(regressor, *, features, targets, groups, cv, imputer):
    """Cross-validate *regressor* and print the median of each test score.

    The imputer is chained in front of the regressor so that it is fitted
    on the training part of every split only. Fitting it once on the full
    feature matrix (as this notebook used to do) lets statistics of the
    held-out samples leak into training.

    Parameters
    ----------
    regressor : estimator
        Unfitted scikit-learn compatible regressor.
    features, targets, groups
        Passed to ``cross_validate`` as ``X``, ``y`` and ``groups``.
    cv
        Cross-validation splitter, or ``None`` for the ``cross_validate``
        default.
    imputer : transformer
        Unfitted transformer that fills missing feature values.

    Returns
    -------
    dict
        The result of ``cross_validate`` (includes one ``test_<scorer>``
        entry per scorer in ``SCORING``).
    """
    model = make_pipeline(imputer, regressor)
    scores = cross_validate(
        model,
        X=features,
        y=targets,
        groups=groups,
        cv=cv,
        n_jobs=-1,
        scoring=SCORING,
    )
    for label, key in REPORTED_SCORES:
        print(label, np.median(scores[key]))
    return scores


# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Arguments shared by every evaluation below.
cv_inputs = {
    "features": train_x,
    "targets": data_y,
    "groups": data_groups,
    "cv": logo,
    "imputer": imputer,
}

# %% [markdown]
# ### Baseline: Dummy Regression (mean)

# %% jupyter={"source_hidden": true}
dummy_regr = DummyRegressor(strategy="mean")

# %% jupyter={"source_hidden": true}
dummy_regressor = evaluate_regressor(dummy_regr, **cv_inputs)

# %% [markdown]
# ### Linear Regression

# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()

# %% jupyter={"source_hidden": true}
lin_reg_scores = evaluate_regressor(lin_reg_rapids, **cv_inputs)

# %% [markdown]
# ### XGBRegressor Linear Regression

# %% jupyter={"source_hidden": true}
xgb_r = xg.XGBRegressor(objective="reg:squarederror", n_estimators=10)

# %% jupyter={"source_hidden": true}
xgb_reg_scores = evaluate_regressor(xgb_r, **cv_inputs)

# %% [markdown]
# ### XGBRegressor Pseudo Huber Error Regression

# %% jupyter={"source_hidden": true}
xgb_psuedo_huber_r = xg.XGBRegressor(objective="reg:pseudohubererror", n_estimators=10)

# %% jupyter={"source_hidden": true}
xgb_psuedo_huber_reg_scores = evaluate_regressor(xgb_psuedo_huber_r, **cv_inputs)

# %% [markdown]
# ### Ridge regression

# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=0.5)

# %% tags=[] jupyter={"source_hidden": true}
ridge_reg_scores = evaluate_regressor(ridge_reg, **cv_inputs)

# %% [markdown]
# ### Lasso

# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)

# %% jupyter={"source_hidden": true}
lasso_reg_score = evaluate_regressor(lasso_reg, **cv_inputs)

# %% [markdown]
# ### Bayesian Ridge

# %% jupyter={"source_hidden": true}
bayesian_ridge_reg = linear_model.BayesianRidge()

# %% jupyter={"source_hidden": true}
bayesian_ridge_reg_score = evaluate_regressor(bayesian_ridge_reg, **cv_inputs)

# %% [markdown]
# ### RANSAC (outlier robust regression)

# %% jupyter={"source_hidden": true}
ransac_reg = linear_model.RANSACRegressor()

# %% jupyter={"source_hidden": true}
ransac_reg_scores = evaluate_regressor(ransac_reg, **cv_inputs)

# %% [markdown]
# ### Support vector regression

# %% jupyter={"source_hidden": true}
svr = svm.SVR()

# %% jupyter={"source_hidden": true}
svr_scores = evaluate_regressor(svr, **cv_inputs)

# %% [markdown]
# ### Kernel Ridge regression

# %% jupyter={"source_hidden": true}
kridge = kernel_ridge.KernelRidge()

# %% jupyter={"source_hidden": true}
kridge_scores = evaluate_regressor(kridge, **cv_inputs)

# %% [markdown]
# ### Gaussian Process Regression

# %% jupyter={"source_hidden": true}
gpr = gaussian_process.GaussianProcessRegressor()

# %% jupyter={"source_hidden": true}
gpr_scores = evaluate_regressor(gpr, **cv_inputs)

# %%