# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": true}
# %matplotlib inline
import os
import sys

import numpy as np
import pandas as pd
import xgboost as xg
from sklearn import gaussian_process, kernel_ridge, linear_model, svm
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from sklearn.pipeline import make_pipeline

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

# Make the parent of the current working directory importable.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# %% jupyter={"source_hidden": true}
model_input = pd.read_csv(
    "../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
)

# %% jupyter={"source_hidden": true}
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
# if "pid" in model_input.columns:
#     index_columns.append("pid")
model_input.set_index(index_columns, inplace=True)

cv_method = "half_logo"  # logo, half_logo, 5kfold
if cv_method == "logo":
    # One cross-validation group per "pid" value.
    data_x, data_y, data_groups = (
        model_input.drop(["target", "pid"], axis=1),
        model_input["target"],
        model_input["pid"],
    )
else:
    # Split the rows of every "pid" into two halves and use "<pid>_<half>" as
    # the group label.
    model_input["pid_index"] = model_input.groupby("pid").cumcount()
    model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")

    # cumcount / count lies in [0, 1); adding 1 and rounding gives 1 for rows
    # in the first half of a pid's rows and 2 for rows in the second half.
    model_input["pid_index"] = (
        model_input["pid_index"] / model_input["pid_count"] + 1
    ).round()
    model_input["pid_half"] = (
        model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
    )

    # Every helper column created above has to be removed from the features;
    # "pid_count" in particular is constant per pid and would otherwise be
    # fed to the models as a predictor.
    helper_columns = ["pid_index", "pid_count", "pid_half"]
    data_x, data_y, data_groups = (
        model_input.drop(["target", "pid"] + helper_columns, axis=1),
        model_input["target"],
        model_input["pid_half"],
    )

# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [
    col
    for col in data_x.columns
    if "mostcommonactivity" in col or "homelabel" in col
]
categorical_feature_colnames += additional_categorical_features

# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()

# %% jupyter={"source_hidden": true}
mode_categorical_features = categorical_features.mode().iloc[0]

# %% jupyter={"source_hidden": true}
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)

# %% jupyter={"source_hidden": true}
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

# %% jupyter={"source_hidden": true}
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)

# %% jupyter={"source_hidden": true}
train_x = pd.concat([numerical_features, categorical_features], axis=1)

# %% jupyter={"source_hidden": true}
train_x.dtypes

# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
)

# Defaults to 5 k folds in cross_validate method
if cv_method not in ("logo", "half_logo"):
    logo = None

# %% jupyter={"source_hidden": true}
sum(data_y.isna())

# %% jupyter={"source_hidden": true}
# Metrics computed by cross_validate for every model.
SCORING = (
    "r2",
    "neg_mean_squared_error",
    "neg_mean_absolute_error",
    "neg_root_mean_squared_error",
)

# (printed label, key in the cross_validate result), in reporting order.
REPORTED_SCORES = (
    ("Negative Mean Squared Error", "test_neg_mean_squared_error"),
    ("Negative Mean Absolute Error", "test_neg_mean_absolute_error"),
    ("Negative Root Mean Squared Error", "test_neg_root_mean_squared_error"),
    ("R2", "test_r2"),
)

# Missing feature values are replaced with the column mean.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")


def evaluate_model(model):
    """Cross-validate *model* and print the median of each test score.

    The model is chained after the mean imputer in a pipeline, so the
    imputation means are fitted on the training part of every split only and
    the held-out rows do not influence them.

    Reads the module-level ``train_x``, ``data_y``, ``data_groups`` and
    ``logo`` at call time.

    Parameters
    ----------
    model
        An unfitted scikit-learn compatible regressor.

    Returns
    -------
    dict
        The result of ``sklearn.model_selection.cross_validate``.
    """
    scores = cross_validate(
        make_pipeline(imputer, model),
        X=train_x,
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring=SCORING,
    )
    for label, key in REPORTED_SCORES:
        print(label, np.median(scores[key]))
    return scores


# %% [markdown]
# ### Baseline: Dummy Regression (mean)

# %% jupyter={"source_hidden": true}
dummy_regr = DummyRegressor(strategy="mean")

# %% jupyter={"source_hidden": true}
dummy_regressor = evaluate_model(dummy_regr)

# %% [markdown]
# ### Linear Regression

# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()

# %% jupyter={"source_hidden": true}
lin_reg_scores = evaluate_model(lin_reg_rapids)

# %% [markdown]
# ### XGBRegressor Linear Regression

# %% jupyter={"source_hidden": true}
xgb_r = xg.XGBRegressor(objective="reg:squarederror", n_estimators=10)

# %% jupyter={"source_hidden": true}
xgb_reg_scores = evaluate_model(xgb_r)

# %% [markdown]
# ### XGBRegressor Pseudo Huber Error Regression

# %% jupyter={"source_hidden": true}
xgb_psuedo_huber_r = xg.XGBRegressor(objective="reg:pseudohubererror", n_estimators=10)

# %% jupyter={"source_hidden": true}
xgb_psuedo_huber_reg_scores = evaluate_model(xgb_psuedo_huber_r)

# %% [markdown]
# ### Ridge regression

# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=0.5)

# %% tags=[] jupyter={"source_hidden": true}
ridge_reg_scores = evaluate_model(ridge_reg)

# %% [markdown]
# ### Lasso

# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)

# %% jupyter={"source_hidden": true}
lasso_reg_score = evaluate_model(lasso_reg)

# %% [markdown]
# ### Bayesian Ridge

# %% jupyter={"source_hidden": true}
bayesian_ridge_reg = linear_model.BayesianRidge()

# %% jupyter={"source_hidden": true}
bayesian_ridge_reg_score = evaluate_model(bayesian_ridge_reg)

# %% [markdown]
# ### RANSAC (outlier robust regression)

# %% jupyter={"source_hidden": true}
ransac_reg = linear_model.RANSACRegressor()

# %% jupyter={"source_hidden": true}
ransac_reg_scores = evaluate_model(ransac_reg)

# %% [markdown]
# ### Support vector regression

# %% jupyter={"source_hidden": true}
svr = svm.SVR()

# %% jupyter={"source_hidden": true}
svr_scores = evaluate_model(svr)

# %% [markdown]
# ### Kernel Ridge regression

# %% jupyter={"source_hidden": true}
kridge = kernel_ridge.KernelRidge()

# %% jupyter={"source_hidden": true}
kridge_scores = evaluate_model(kridge)

# %% [markdown]
# ### Gaussian Process Regression

# %% jupyter={"source_hidden": true}
gpr = gaussian_process.GaussianProcessRegressor()

# %% jupyter={"source_hidden": true}
gpr_scores = evaluate_model(gpr)

# %%