From 8131626c4abb4ad5c7f24e5b76f08b0a9d8de658 Mon Sep 17 00:00:00 2001 From: junos Date: Wed, 7 Dec 2022 21:25:05 +0100 Subject: [PATCH] Include more metrics in regression helper methods. --- machine_learning/helper.py | 186 ++++++++++++++++++++++++------------- 1 file changed, 120 insertions(+), 66 deletions(-) diff --git a/machine_learning/helper.py b/machine_learning/helper.py index dd57393..5d999fe 100644 --- a/machine_learning/helper.py +++ b/machine_learning/helper.py @@ -1,6 +1,6 @@ from pathlib import Path from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble -from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate +from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate from sklearn.metrics import mean_squared_error, r2_score from sklearn.impute import SimpleImputer from sklearn.dummy import DummyRegressor @@ -66,7 +66,7 @@ def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> P def insert_row(df, row): return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True) -def prepare_model_input(input_csv): +def prepare_regression_model_input(input_csv): model_input = pd.read_csv(input_csv) @@ -76,10 +76,9 @@ def prepare_model_input(input_csv): data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"] categorical_feature_colnames = ["gender", "startlanguage", "limesurvey_demand_control_ratio_quartile"] + additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col] + categorical_feature_colnames += additional_categorical_features #TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully - #additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col] - #TODO: check if mostcommonactivity is indeed a categorical features after aggregating - #categorical_feature_colnames += additional_categorical_features categorical_features = data_x[categorical_feature_colnames].copy() mode_categorical_features = categorical_features.mode().iloc[0] # fillna with mode @@ -96,172 +95,227 @@ def prepare_model_input(input_csv): return train_x, data_y, data_groups -def run_all_models(input_csv): +def run_all_regression_models(input_csv): # Prepare data - train_x, data_y, data_groups = prepare_model_input(input_csv) + data_x, data_y, data_groups = prepare_regression_model_input(input_csv) # Prepare cross validation logo = LeaveOneGroupOut() logo.get_n_splits( - train_x, + data_x, data_y, groups=data_groups, ) - scores = pd.DataFrame(columns=["method", "median", "max"]) + metrics = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error'] + test_metrics = ["test_" + metric for metric in metrics] + scores = pd.DataFrame(columns=["method", "max", "nanmedian"]) # Validate models - lin_reg_rapids = linear_model.LinearRegression() - lin_reg_scores = cross_val_score( - lin_reg_rapids, - X=train_x, + dummy_regr = DummyRegressor(strategy="mean") + dummy_regr_scores = cross_validate( + dummy_regr, + X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, - scoring='r2' + scoring=metrics + ) + print("Dummy model:") + print("R^2: ", np.nanmedian(dummy_regr_scores['test_r2'])) + + scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "dummy" + scores = pd.concat([scores, scores_df]) + + lin_reg_rapids = linear_model.LinearRegression() + lin_reg_scores = cross_validate( + lin_reg_rapids, + X=data_x, + y=data_y, + groups=data_groups, + cv=logo, + n_jobs=-1, + scoring=metrics ) print("Linear regression:") - print(np.median(lin_reg_scores)) - scores = insert_row(scores, ["Linear regression",np.median(lin_reg_scores),np.max(lin_reg_scores)]) + print("R^2: ", np.nanmedian(lin_reg_scores['test_r2'])) + + scores_df = pd.DataFrame(lin_reg_scores)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "linear_reg" + scores = pd.concat([scores, scores_df]) ridge_reg = linear_model.Ridge(alpha=.5) - ridge_reg_scores = cross_val_score( + ridge_reg_scores = cross_validate( ridge_reg, - X=train_x, + X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, - scoring="r2" + scoring=metrics ) print("Ridge regression") - print(np.median(ridge_reg_scores)) - scores = insert_row(scores, ["Ridge regression",np.median(ridge_reg_scores),np.max(ridge_reg_scores)]) + + scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "ridge_reg" + scores = pd.concat([scores, scores_df]) + lasso_reg = linear_model.Lasso(alpha=0.1) - lasso_reg_score = cross_val_score( + lasso_reg_score = cross_validate( lasso_reg, - X=train_x, + X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, - scoring="r2" + scoring=metrics ) print("Lasso regression") - print(np.median(lasso_reg_score)) - scores = insert_row(scores, ["Lasso regression",np.median(lasso_reg_score),np.max(lasso_reg_score)]) + + scores_df = pd.DataFrame(lasso_reg_score)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "lasso_reg" + scores = pd.concat([scores, scores_df]) bayesian_ridge_reg = linear_model.BayesianRidge() - bayesian_ridge_reg_score = cross_val_score( + bayesian_ridge_reg_score = cross_validate( bayesian_ridge_reg, - X=train_x, + X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, - scoring="r2" + scoring=metrics ) print("Bayesian Ridge") - print(np.median(bayesian_ridge_reg_score)) - scores = insert_row(scores, ["Bayesian Ridge",np.median(bayesian_ridge_reg_score),np.max(bayesian_ridge_reg_score)]) + + scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "bayesian_ridge" + scores = pd.concat([scores, scores_df]) ransac_reg = linear_model.RANSACRegressor() - ransac_reg_score = cross_val_score( + ransac_reg_score = cross_validate( ransac_reg, - X=train_x, + X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, - scoring="r2" + scoring=metrics ) print("RANSAC (outlier robust regression)") - print(np.median(ransac_reg_score)) - scores = insert_row(scores, ["RANSAC",np.median(ransac_reg_score),np.max(ransac_reg_score)]) + + scores_df = pd.DataFrame(ransac_reg_score)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "RANSAC" + scores = pd.concat([scores, scores_df]) svr = svm.SVR() - svr_score = cross_val_score( + svr_score = cross_validate( svr, - X=train_x, + X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, - scoring="r2" + scoring=metrics ) print("Support vector regression") - print(np.median(svr_score)) - scores = insert_row(scores, ["Support vector regression",np.median(svr_score),np.max(svr_score)]) + + scores_df = pd.DataFrame(svr_score)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "SVR" + scores = pd.concat([scores, scores_df]) kridge = kernel_ridge.KernelRidge() - kridge_score = cross_val_score( + kridge_score = cross_validate( kridge, - X=train_x, + X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, - scoring="r2" + scoring=metrics ) print("Kernel Ridge regression") - print(np.median(kridge_score)) - scores = insert_row(scores, ["Kernel Ridge regression",np.median(kridge_score),np.max(kridge_score)]) + + scores_df = pd.DataFrame(kridge_score)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "kernel_ridge" + scores = pd.concat([scores, scores_df]) gpr = gaussian_process.GaussianProcessRegressor() - gpr_score = cross_val_score( + gpr_score = cross_validate( gpr, - X=train_x, + X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, - scoring="r2" + scoring=metrics ) print("Gaussian Process Regression") - print(np.median(gpr_score)) - scores = insert_row(scores, ["Gaussian Process Regression",np.median(gpr_score),np.max(gpr_score)]) + + scores_df = pd.DataFrame(gpr_score)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "gaussian_proc" + scores = pd.concat([scores, scores_df]) rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1) - rfr_score = cross_val_score( + rfr_score = cross_validate( rfr, - X=train_x, + X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, - scoring="r2" + scoring=metrics ) print("Random Forest Regression") - print(np.median(rfr_score)) - scores = insert_row(scores, ["Random Forest Regression",np.median(rfr_score),np.max(rfr_score)]) + + scores_df = pd.DataFrame(rfr_score)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "random_forest" + scores = pd.concat([scores, scores_df]) xgb = XGBRegressor() - xgb_score = cross_val_score( + xgb_score = cross_validate( xgb, - X=train_x, + X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, - scoring="r2" + scoring=metrics ) print("XGBoost Regressor") - print(np.median(xgb_score)) - scores = insert_row(scores, ["XGBoost Regressor",np.median(xgb_score),np.max(xgb_score)]) + + scores_df = pd.DataFrame(xgb_score)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "XGBoost" + scores = pd.concat([scores, scores_df]) ada = ensemble.AdaBoostRegressor() - ada_score = cross_val_score( + ada_score = cross_validate( ada, - X=train_x, + X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, - scoring="r2" + scoring=metrics ) print("ADA Boost Regressor") - print(np.median(ada_score)) - scores = insert_row(scores, ["ADA Boost Regressor",np.median(ada_score),np.max(ada_score)]) + + scores_df = pd.DataFrame(ada_score)[test_metrics] + scores_df = scores_df.agg(['max', np.nanmedian]).transpose() + scores_df["method"] = "ADA_boost" + scores = pd.concat([scores, scores_df]) return scores