From b505fb2b6a07602d283e531b00ac048668f42776 Mon Sep 17 00:00:00 2001 From: junos Date: Wed, 10 May 2023 20:30:51 +0200 Subject: [PATCH] Thoroughly refactor regression runner. --- exploration/ml_pipeline_regression.py | 438 ++------------------------ machine_learning/helper.py | 154 +++++---- 2 files changed, 104 insertions(+), 488 deletions(-) diff --git a/exploration/ml_pipeline_regression.py b/exploration/ml_pipeline_regression.py index 6a6bcae..ed27364 100644 --- a/exploration/ml_pipeline_regression.py +++ b/exploration/ml_pipeline_regression.py @@ -13,438 +13,36 @@ # name: straw2analysis # --- -# %% jupyter={"source_hidden": true} -# %matplotlib inline +# %% import os import sys -import numpy as np import pandas as pd -import xgboost as xg -from machine_learning.helper import prepare_regression_model_input -from sklearn import gaussian_process, kernel_ridge, linear_model, svm -from sklearn.dummy import DummyRegressor -from sklearn.impute import SimpleImputer -from sklearn.model_selection import LeaveOneGroupOut, cross_validate -# from IPython.core.interactiveshell import InteractiveShell -# InteractiveShell.ast_node_interactivity = "all" +from machine_learning.helper import ( + impute_encode_categorical_features, + prepare_cross_validator, + prepare_sklearn_data_format, + run_all_regression_models, +) nb_dir = os.path.split(os.getcwd())[0] if nb_dir not in sys.path: sys.path.append(nb_dir) -# %% jupyter={"source_hidden": true} +# %% model_input = pd.read_csv( "../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv" ) -# %% jupyter={"source_hidden": true} -cv_method = "half_logo" # logo, half_logo, 5kfold - -train_x, data_y, data_groups = prepare_regression_model_input(model_input, cv_method) -# %% jupyter={"source_hidden": true} -logo = LeaveOneGroupOut() -logo.get_n_splits( - train_x, - data_y, - groups=data_groups, -) - -# Defaults to 5 k folds in cross_validate method -if cv_method != "logo" and cv_method != "half_logo": - logo = None - -# %% jupyter={"source_hidden": true} -sum(data_y.isna()) - -# %% [markdown] -# ### Baseline: Dummy Regression (mean) -dummy_regr = DummyRegressor(strategy="mean") - -# %% jupyter={"source_hidden": true} -imputer = SimpleImputer(missing_values=np.nan, strategy="mean") - -# %% jupyter={"source_hidden": true} -dummy_regressor = cross_validate( - dummy_regr, - X=imputer.fit_transform(train_x), - y=data_y, - groups=data_groups, - cv=logo, - n_jobs=-1, - scoring=( - "r2", - "neg_mean_squared_error", - "neg_mean_absolute_error", - "neg_root_mean_squared_error", - ), -) -print( - "Negative Mean Squared Error", - np.median(dummy_regressor["test_neg_mean_squared_error"]), -) -print( - "Negative Mean Absolute Error", - np.median(dummy_regressor["test_neg_mean_absolute_error"]), -) -print( - "Negative Root Mean Squared Error", - np.median(dummy_regressor["test_neg_root_mean_squared_error"]), -) -print("R2", np.median(dummy_regressor["test_r2"])) - -# %% [markdown] -# ### Linear Regression - -# %% jupyter={"source_hidden": true} -lin_reg_rapids = linear_model.LinearRegression() -# %% jupyter={"source_hidden": true} -imputer = SimpleImputer(missing_values=np.nan, strategy="mean") - -# %% jupyter={"source_hidden": true} -lin_reg_scores = cross_validate( - lin_reg_rapids, - X=imputer.fit_transform(train_x), - y=data_y, - groups=data_groups, - cv=logo, - n_jobs=-1, - scoring=( - "r2", - "neg_mean_squared_error", - "neg_mean_absolute_error", - "neg_root_mean_squared_error", - ), -) -print( - "Negative Mean Squared Error", - np.median(lin_reg_scores["test_neg_mean_squared_error"]), -) -print( - "Negative Mean Absolute Error", - np.median(lin_reg_scores["test_neg_mean_absolute_error"]), -) -print( - "Negative Root Mean Squared Error", - np.median(lin_reg_scores["test_neg_root_mean_squared_error"]), -) -print("R2", np.median(lin_reg_scores["test_r2"])) - -# %% [markdown] -# ### XGBRegressor Linear Regression -# %% jupyter={"source_hidden": true} -xgb_r = xg.XGBRegressor(objective="reg:squarederror", n_estimators=10) -# %% jupyter={"source_hidden": true} -imputer = SimpleImputer(missing_values=np.nan, strategy="mean") - -# %% jupyter={"source_hidden": true} -xgb_reg_scores = cross_validate( - xgb_r, - X=imputer.fit_transform(train_x), - y=data_y, - groups=data_groups, - cv=logo, - n_jobs=-1, - scoring=( - "r2", - "neg_mean_squared_error", - "neg_mean_absolute_error", - "neg_root_mean_squared_error", - ), -) -print( - "Negative Mean Squared Error", - np.median(xgb_reg_scores["test_neg_mean_squared_error"]), -) -print( - "Negative Mean Absolute Error", - np.median(xgb_reg_scores["test_neg_mean_absolute_error"]), -) -print( - "Negative Root Mean Squared Error", - np.median(xgb_reg_scores["test_neg_root_mean_squared_error"]), -) -print("R2", np.median(xgb_reg_scores["test_r2"])) - -# %% [markdown] -# ### XGBRegressor Pseudo Huber Error Regression -# %% jupyter={"source_hidden": true} -xgb_psuedo_huber_r = xg.XGBRegressor(objective="reg:pseudohubererror", n_estimators=10) -# %% jupyter={"source_hidden": true} -imputer = SimpleImputer(missing_values=np.nan, strategy="mean") - -# %% jupyter={"source_hidden": true} -xgb_psuedo_huber_reg_scores = cross_validate( - xgb_psuedo_huber_r, - X=imputer.fit_transform(train_x), - y=data_y, - groups=data_groups, - cv=logo, - n_jobs=-1, - scoring=( - "r2", - "neg_mean_squared_error", - "neg_mean_absolute_error", - "neg_root_mean_squared_error", - ), -) -print( - "Negative Mean Squared Error", - np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_squared_error"]), -) -print( - "Negative Mean Absolute Error", - np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_absolute_error"]), -) -print( - "Negative Root Mean Squared Error", - np.median(xgb_psuedo_huber_reg_scores["test_neg_root_mean_squared_error"]), -) -print("R2", np.median(xgb_psuedo_huber_reg_scores["test_r2"])) - -# %% [markdown] -# ### Ridge regression - -# %% jupyter={"source_hidden": true} -ridge_reg = linear_model.Ridge(alpha=0.5) - -# %% tags=[] jupyter={"source_hidden": true} -ridge_reg_scores = cross_validate( - ridge_reg, - X=imputer.fit_transform(train_x), - y=data_y, - groups=data_groups, - cv=logo, - n_jobs=-1, - scoring=( - "r2", - "neg_mean_squared_error", - "neg_mean_absolute_error", - "neg_root_mean_squared_error", - ), -) -print( - "Negative Mean Squared Error", - np.median(ridge_reg_scores["test_neg_mean_squared_error"]), -) -print( - "Negative Mean Absolute Error", - np.median(ridge_reg_scores["test_neg_mean_absolute_error"]), -) -print( - "Negative Root Mean Squared Error", - np.median(ridge_reg_scores["test_neg_root_mean_squared_error"]), -) -print("R2", np.median(ridge_reg_scores["test_r2"])) - -# %% [markdown] -# ### Lasso - -# %% jupyter={"source_hidden": true} -lasso_reg = linear_model.Lasso(alpha=0.1) - -# %% jupyter={"source_hidden": true} -lasso_reg_score = cross_validate( - lasso_reg, - X=imputer.fit_transform(train_x), - y=data_y, - groups=data_groups, - cv=logo, - n_jobs=-1, - scoring=( - "r2", - "neg_mean_squared_error", - "neg_mean_absolute_error", - "neg_root_mean_squared_error", - ), -) -print( - "Negative Mean Squared Error", - np.median(lasso_reg_score["test_neg_mean_squared_error"]), -) -print( - "Negative Mean Absolute Error", - np.median(lasso_reg_score["test_neg_mean_absolute_error"]), -) -print( - "Negative Root Mean Squared Error", - np.median(lasso_reg_score["test_neg_root_mean_squared_error"]), -) -print("R2", np.median(lasso_reg_score["test_r2"])) - -# %% [markdown] -# ### Bayesian Ridge - -# %% jupyter={"source_hidden": true} -bayesian_ridge_reg = linear_model.BayesianRidge() - -# %% jupyter={"source_hidden": true} -bayesian_ridge_reg_score = cross_validate( - bayesian_ridge_reg, - X=imputer.fit_transform(train_x), - y=data_y, - groups=data_groups, - cv=logo, - n_jobs=-1, - scoring=( - "r2", - "neg_mean_squared_error", - "neg_mean_absolute_error", - "neg_root_mean_squared_error", - ), -) -print( - "Negative Mean Squared Error", - np.median(bayesian_ridge_reg_score["test_neg_mean_squared_error"]), -) -print( - "Negative Mean Absolute Error", - np.median(bayesian_ridge_reg_score["test_neg_mean_absolute_error"]), -) -print( - "Negative Root Mean Squared Error", - np.median(bayesian_ridge_reg_score["test_neg_root_mean_squared_error"]), -) -print("R2", np.median(bayesian_ridge_reg_score["test_r2"])) - -# %% [markdown] -# ### RANSAC (outlier robust regression) - -# %% jupyter={"source_hidden": true} -ransac_reg = linear_model.RANSACRegressor() - -# %% jupyter={"source_hidden": true} -ransac_reg_scores = cross_validate( - ransac_reg, - X=imputer.fit_transform(train_x), - y=data_y, - groups=data_groups, - cv=logo, - n_jobs=-1, - scoring=( - "r2", - "neg_mean_squared_error", - "neg_mean_absolute_error", - "neg_root_mean_squared_error", - ), -) -print( - "Negative Mean Squared Error", - np.median(ransac_reg_scores["test_neg_mean_squared_error"]), -) -print( - "Negative Mean Absolute Error", - np.median(ransac_reg_scores["test_neg_mean_absolute_error"]), -) -print( - "Negative Root Mean Squared Error", - np.median(ransac_reg_scores["test_neg_root_mean_squared_error"]), -) -print("R2", np.median(ransac_reg_scores["test_r2"])) - -# %% [markdown] -# ### Support vector regression - -# %% jupyter={"source_hidden": true} -svr = svm.SVR() - -# %% jupyter={"source_hidden": true} -svr_scores = cross_validate( - svr, - X=imputer.fit_transform(train_x), - y=data_y, - groups=data_groups, - cv=logo, - n_jobs=-1, - scoring=( - "r2", - "neg_mean_squared_error", - "neg_mean_absolute_error", - "neg_root_mean_squared_error", - ), -) -print( - "Negative Mean Squared Error", np.median(svr_scores["test_neg_mean_squared_error"]) -) -print( - "Negative Mean Absolute Error", - np.median(svr_scores["test_neg_mean_absolute_error"]), -) -print( - "Negative Root Mean Squared Error", - np.median(svr_scores["test_neg_root_mean_squared_error"]), -) -print("R2", np.median(svr_scores["test_r2"])) - -# %% [markdown] -# ### Kernel Ridge regression - -# %% jupyter={"source_hidden": true} -kridge = kernel_ridge.KernelRidge() - -# %% jupyter={"source_hidden": true} -kridge_scores = cross_validate( - kridge, - X=imputer.fit_transform(train_x), - y=data_y, - groups=data_groups, - cv=logo, - n_jobs=-1, - scoring=( - "r2", - "neg_mean_squared_error", - "neg_mean_absolute_error", - "neg_root_mean_squared_error", - ), -) -print( - "Negative Mean Squared Error", - np.median(kridge_scores["test_neg_mean_squared_error"]), -) -print( - "Negative Mean Absolute Error", - np.median(kridge_scores["test_neg_mean_absolute_error"]), -) -print( - "Negative Root Mean Squared Error", - np.median(kridge_scores["test_neg_root_mean_squared_error"]), -) -print("R2", np.median(kridge_scores["test_r2"])) - -# %% [markdown] -# ### Gaussian Process Regression - -# %% jupyter={"source_hidden": true} -gpr = gaussian_process.GaussianProcessRegressor() - -# %% jupyter={"source_hidden": true} - -gpr_scores = cross_validate( - gpr, - X=imputer.fit_transform(train_x), - y=data_y, - groups=data_groups, - cv=logo, - n_jobs=-1, - scoring=( - "r2", - "neg_mean_squared_error", - "neg_mean_absolute_error", - "neg_root_mean_squared_error", - ), -) -print( - "Negative Mean Squared Error", np.median(gpr_scores["test_neg_mean_squared_error"]) -) -print( - "Negative Mean Absolute Error", - np.median(gpr_scores["test_neg_mean_absolute_error"]), -) -print( - "Negative Root Mean Squared Error", - np.median(gpr_scores["test_neg_root_mean_squared_error"]), -) -print("R2", np.median(gpr_scores["test_r2"])) - # %% +CV_METHOD = "half_logo" # logo, half_logo, 5kfold + +model_input_encoded = impute_encode_categorical_features(model_input) +# %% +data_x, data_y, data_groups = prepare_sklearn_data_format( + model_input_encoded, CV_METHOD +) +cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD) +# %% +scores = run_all_regression_models(data_x, data_y, data_groups, cross_validator) diff --git a/machine_learning/helper.py b/machine_learning/helper.py index 20ac7eb..e3ea457 100644 --- a/machine_learning/helper.py +++ b/machine_learning/helper.py @@ -11,7 +11,12 @@ from sklearn import ( svm, ) from sklearn.dummy import DummyClassifier, DummyRegressor -from sklearn.model_selection import LeaveOneGroupOut, cross_validate +from sklearn.model_selection import ( + BaseCrossValidator, + LeaveOneGroupOut, + StratifiedKFold, + cross_validate, +) from xgboost import XGBClassifier, XGBRegressor @@ -73,7 +78,40 @@ def insert_row(df, row): return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True) -def prepare_sklearn_data_format(model_input, cv_method="logo"): +def impute_encode_categorical_features(model_input: pd.DataFrame) -> pd.DataFrame: + categorical_feature_col_names = [ + "gender", + "startlanguage", + "limesurvey_demand_control_ratio_quartile", + ] + additional_categorical_features = [ + col + for col in model_input.columns + if "mostcommonactivity" in col or "homelabel" in col + ] + categorical_feature_col_names += additional_categorical_features + + categorical_features = model_input[categorical_feature_col_names].copy() + + mode_categorical_features = categorical_features.mode().iloc[0] + # fillna with mode + categorical_features = categorical_features.fillna(mode_categorical_features) + # one-hot encoding + categorical_features = categorical_features.apply( + lambda col: col.astype("category") + ) + if not categorical_features.empty: + categorical_features = pd.get_dummies(categorical_features) + + numerical_features = model_input.drop(categorical_feature_col_names, axis=1) + + model_input = pd.concat([numerical_features, categorical_features], axis=1) + return model_input + + +def prepare_sklearn_data_format( + model_input: pd.DataFrame, cv_method: str = "logo" +) -> tuple: index_columns = [ "local_segment", "local_segment_label", @@ -107,50 +145,30 @@ def prepare_sklearn_data_format(model_input, cv_method="logo"): return data_x, data_y, data_groups -def prepare_regression_model_input(model_input, cv_method="logo"): - data_x, data_y, data_groups = prepare_sklearn_data_format( - model_input, cv_method=cv_method - ) - - categorical_feature_colnames = [ - "gender", - "startlanguage", - "limesurvey_demand_control_ratio_quartile", - ] - additional_categorical_features = [ - col - for col in data_x.columns - if "mostcommonactivity" in col or "homelabel" in col - ] - categorical_feature_colnames += additional_categorical_features - - categorical_features = data_x[categorical_feature_colnames].copy() - - mode_categorical_features = categorical_features.mode().iloc[0] - # fillna with mode - categorical_features = categorical_features.fillna(mode_categorical_features) - # one-hot encoding - categorical_features = categorical_features.apply( - lambda col: col.astype("category") - ) - if not categorical_features.empty: - categorical_features = pd.get_dummies(categorical_features) - - numerical_features = data_x.drop(categorical_feature_colnames, axis=1) - - train_x = pd.concat([numerical_features, categorical_features], axis=1) - - return train_x, data_y, data_groups +def prepare_cross_validator( + data_x: pd.DataFrame, + data_y: pd.DataFrame, + data_groups: pd.DataFrame, + cv_method: str = "logo", +) -> BaseCrossValidator: + if cv_method == "logo" or cv_method == "half_logo": + cv = LeaveOneGroupOut() + cv.get_n_splits( + data_x, + data_y, + groups=data_groups, + ) + else: + cv = StratifiedKFold(n_splits=5, shuffle=True) + return cv -def run_all_regression_models(train_x, data_y, data_groups): - # Prepare cross validation - logo = LeaveOneGroupOut() - logo.get_n_splits( - train_x, - data_y, - groups=data_groups, - ) +def run_all_regression_models( + data_x: pd.DataFrame, + data_y: pd.DataFrame, + data_groups: pd.DataFrame, + cross_validator: BaseCrossValidator, +) -> pd.DataFrame: metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"] test_metrics = ["test_" + metric for metric in metrics] scores = pd.DataFrame(columns=["method", "max", "nanmedian"]) @@ -159,10 +177,10 @@ def run_all_regression_models(train_x, data_y, data_groups): dummy_regr = DummyRegressor(strategy="mean") dummy_regr_scores = cross_validate( dummy_regr, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -177,10 +195,10 @@ def run_all_regression_models(train_x, data_y, data_groups): lin_reg_rapids = linear_model.LinearRegression() lin_reg_scores = cross_validate( lin_reg_rapids, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -195,10 +213,10 @@ def run_all_regression_models(train_x, data_y, data_groups): ridge_reg = linear_model.Ridge(alpha=0.5) ridge_reg_scores = cross_validate( ridge_reg, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -212,10 +230,10 @@ def run_all_regression_models(train_x, data_y, data_groups): lasso_reg = linear_model.Lasso(alpha=0.1) lasso_reg_score = cross_validate( lasso_reg, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -229,10 +247,10 @@ def run_all_regression_models(train_x, data_y, data_groups): bayesian_ridge_reg = linear_model.BayesianRidge() bayesian_ridge_reg_score = cross_validate( bayesian_ridge_reg, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -246,10 +264,10 @@ def run_all_regression_models(train_x, data_y, data_groups): ransac_reg = linear_model.RANSACRegressor() ransac_reg_score = cross_validate( ransac_reg, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -263,10 +281,10 @@ def run_all_regression_models(train_x, data_y, data_groups): svr = svm.SVR() svr_score = cross_validate( svr, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -280,10 +298,10 @@ def run_all_regression_models(train_x, data_y, data_groups): kridge = kernel_ridge.KernelRidge() kridge_score = cross_validate( kridge, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -297,10 +315,10 @@ def run_all_regression_models(train_x, data_y, data_groups): gpr = gaussian_process.GaussianProcessRegressor() gpr_score = cross_validate( gpr, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -314,10 +332,10 @@ def run_all_regression_models(train_x, data_y, data_groups): rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1) rfr_score = cross_validate( rfr, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -331,10 +349,10 @@ def run_all_regression_models(train_x, data_y, data_groups): xgb = XGBRegressor() xgb_score = cross_validate( xgb, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, ) @@ -348,10 +366,10 @@ def run_all_regression_models(train_x, data_y, data_groups): ada = ensemble.AdaBoostRegressor() ada_score = cross_validate( ada, - X=train_x, + X=data_x, y=data_y, groups=data_groups, - cv=logo, + cv=cross_validator, n_jobs=-1, scoring=metrics, )