Add xgboost to dependencies and reformat helper.py.

master
junos 2023-04-21 21:33:06 +02:00
parent 59552c18a9
commit 583ee82e80
2 changed files with 131 additions and 138 deletions


@@ -22,4 +22,5 @@ dependencies:
 - scikit-learn
 - sqlalchemy
 - statsmodels
 - tabulate
+- xgboost
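The new dependency backs the XGBRegressor and XGBClassifier estimators imported in helper.py below. A minimal sanity check after recreating the environment (a sketch, assuming a conda-style workflow; not part of the commit):

# Sanity check that the new xgboost dependency resolved correctly;
# instantiating with defaults is enough to catch a broken install.
import xgboost
from xgboost import XGBClassifier, XGBRegressor

print(xgboost.__version__)
XGBRegressor()
XGBClassifier()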


@@ -1,15 +1,18 @@
 from pathlib import Path
-from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
-from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
-from sklearn.metrics import mean_squared_error, r2_score
-from sklearn.impute import SimpleImputer
-from sklearn.dummy import DummyRegressor, DummyClassifier
-from xgboost import XGBRegressor, XGBClassifier
-import xgboost as xg
-import pandas as pd
+
 import numpy as np
+import pandas as pd
+from sklearn import (
+    ensemble,
+    gaussian_process,
+    kernel_ridge,
+    linear_model,
+    naive_bayes,
+    svm,
+)
+from sklearn.dummy import DummyClassifier, DummyRegressor
+from sklearn.model_selection import LeaveOneGroupOut, cross_validate
+from xgboost import XGBClassifier, XGBRegressor
 
 
 def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
@@ -65,28 +68,48 @@ def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path:
     full_path = folder / export_filename
     return full_path
 
 
 def insert_row(df, row):
     return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
 
 
 def prepare_regression_model_input(input_csv):
     model_input = pd.read_csv(input_csv)
-    index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+    index_columns = [
+        "local_segment",
+        "local_segment_label",
+        "local_segment_start_datetime",
+        "local_segment_end_datetime",
+    ]
     model_input.set_index(index_columns, inplace=True)
-    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
+    data_x, data_y, data_groups = (
+        model_input.drop(["target", "pid"], axis=1),
+        model_input["target"],
+        model_input["pid"],
+    )
 
-    categorical_feature_colnames = ["gender", "startlanguage", "limesurvey_demand_control_ratio_quartile"]
-    additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
+    categorical_feature_colnames = [
+        "gender",
+        "startlanguage",
+        "limesurvey_demand_control_ratio_quartile",
+    ]
+    additional_categorical_features = [
+        col
+        for col in data_x.columns
+        if "mostcommonactivity" in col or "homelabel" in col
+    ]
     categorical_feature_colnames += additional_categorical_features
-    #TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
+    # TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
     categorical_features = data_x[categorical_feature_colnames].copy()
     mode_categorical_features = categorical_features.mode().iloc[0]
     # fillna with mode
     categorical_features = categorical_features.fillna(mode_categorical_features)
     # one-hot encoding
-    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+    categorical_features = categorical_features.apply(
+        lambda col: col.astype("category")
+    )
     if not categorical_features.empty:
         categorical_features = pd.get_dummies(categorical_features)
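As a side note, the mode-fill and one-hot steps kept intact above behave like this small self-contained sketch (column names and values invented for illustration):

import pandas as pd

categorical_features = pd.DataFrame(
    {"gender": ["F", None, "M"], "startlanguage": ["en", "de", None]}
)
# Row of per-column modes; ties are resolved by taking the first mode.
mode_categorical_features = categorical_features.mode().iloc[0]
categorical_features = categorical_features.fillna(mode_categorical_features)
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
# One indicator column per category: gender_F, gender_M, startlanguage_de, ...
print(pd.get_dummies(categorical_features))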
@@ -108,7 +131,7 @@ def run_all_regression_models(input_csv):
         data_y,
         groups=data_groups,
     )
-    metrics = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']
+    metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
     test_metrics = ["test_" + metric for metric in metrics]
 
     scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
@@ -121,13 +144,13 @@ def run_all_regression_models(input_csv):
         groups=data_groups,
         cv=logo,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("Dummy model:")
-    print("R^2: ", np.nanmedian(dummy_regr_scores['test_r2']))
+    print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
     scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "dummy"
     scores = pd.concat([scores, scores_df])
@@ -139,17 +162,17 @@ def run_all_regression_models(input_csv):
         groups=data_groups,
         cv=logo,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("Linear regression:")
-    print("R^2: ", np.nanmedian(lin_reg_scores['test_r2']))
+    print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
     scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "linear_reg"
     scores = pd.concat([scores, scores_df])
 
-    ridge_reg = linear_model.Ridge(alpha=.5)
+    ridge_reg = linear_model.Ridge(alpha=0.5)
     ridge_reg_scores = cross_validate(
         ridge_reg,
         X=data_x,
@@ -157,16 +180,15 @@ def run_all_regression_models(input_csv):
         groups=data_groups,
         cv=logo,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("Ridge regression")
     scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "ridge_reg"
     scores = pd.concat([scores, scores_df])
 
     lasso_reg = linear_model.Lasso(alpha=0.1)
     lasso_reg_score = cross_validate(
         lasso_reg,
@@ -175,12 +197,12 @@ def run_all_regression_models(input_csv):
         groups=data_groups,
         cv=logo,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("Lasso regression")
     scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "lasso_reg"
     scores = pd.concat([scores, scores_df])
@@ -192,12 +214,12 @@ def run_all_regression_models(input_csv):
         groups=data_groups,
         cv=logo,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("Bayesian Ridge")
     scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "bayesian_ridge"
     scores = pd.concat([scores, scores_df])
@@ -209,29 +231,23 @@ def run_all_regression_models(input_csv):
         groups=data_groups,
         cv=logo,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("RANSAC (outlier robust regression)")
     scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "RANSAC"
     scores = pd.concat([scores, scores_df])
 
     svr = svm.SVR()
     svr_score = cross_validate(
-        svr,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring=metrics
+        svr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
     )
     print("Support vector regression")
     scores_df = pd.DataFrame(svr_score)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "SVR"
     scores = pd.concat([scores, scores_df])
@@ -243,80 +259,56 @@ def run_all_regression_models(input_csv):
         groups=data_groups,
         cv=logo,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("Kernel Ridge regression")
     scores_df = pd.DataFrame(kridge_score)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "kernel_ridge"
     scores = pd.concat([scores, scores_df])
 
     gpr = gaussian_process.GaussianProcessRegressor()
     gpr_score = cross_validate(
-        gpr,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring=metrics
+        gpr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
     )
     print("Gaussian Process Regression")
     scores_df = pd.DataFrame(gpr_score)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "gaussian_proc"
     scores = pd.concat([scores, scores_df])
 
     rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
     rfr_score = cross_validate(
-        rfr,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring=metrics
+        rfr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
     )
     print("Random Forest Regression")
     scores_df = pd.DataFrame(rfr_score)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "random_forest"
     scores = pd.concat([scores, scores_df])
 
     xgb = XGBRegressor()
     xgb_score = cross_validate(
-        xgb,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring=metrics
+        xgb, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
    )
     print("XGBoost Regressor")
     scores_df = pd.DataFrame(xgb_score)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "XGBoost"
     scores = pd.concat([scores, scores_df])
 
     ada = ensemble.AdaBoostRegressor()
     ada_score = cross_validate(
-        ada,
-        X=data_x,
-        y=data_y,
-        groups=data_groups,
-        cv=logo,
-        n_jobs=-1,
-        scoring=metrics
+        ada, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
     )
     print("ADA Boost Regressor")
     scores_df = pd.DataFrame(ada_score)[test_metrics]
-    scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
+    scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
     scores_df["method"] = "ADA_boost"
     scores = pd.concat([scores, scores_df])
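One note on the repeated reduction above: every model block collapses the cross_validate output with .agg(["max", np.nanmedian]). The nan-aware median presumably matters because, under leave-one-group-out, a fold's R² can come back as NaN when the held-out group has too few samples. A self-contained sketch with invented fold scores:

import numpy as np
import pandas as pd

cv_scores = {
    "test_r2": [0.2, np.nan, 0.4],  # one fold's R² is undefined
    "test_neg_mean_absolute_error": [-1.0, -0.8, -1.2],
}
test_metrics = list(cv_scores)
scores_df = pd.DataFrame(cv_scores)[test_metrics]
# One row per metric, columns "max" and "nanmedian";
# np.nanmedian ignores the NaN fold instead of letting it poison the summary.
print(scores_df.agg(["max", np.nanmedian]).transpose())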
@@ -324,7 +316,7 @@ def run_all_regression_models(input_csv):
 
 def run_all_classification_models(data_x, data_y, data_groups, cv_method):
-    metrics = ['accuracy', 'average_precision', 'recall', 'f1']
+    metrics = ["accuracy", "average_precision", "recall", "f1"]
     test_metrics = ["test_" + metric for metric in metrics]
 
     scores = pd.DataFrame(columns=["method", "max", "mean"])
@@ -332,127 +324,127 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
     dummy_class = DummyClassifier(strategy="most_frequent")
     dummy_score = cross_validate(
         dummy_class,
         X=data_x,
         y=data_y,
         groups=data_groups,
         cv=cv_method,
         n_jobs=-1,
-        error_score='raise',
-        scoring=metrics
+        error_score="raise",
+        scoring=metrics,
     )
     print("Dummy")
     scores_df = pd.DataFrame(dummy_score)[test_metrics]
-    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df = scores_df.agg(["max", "mean"]).transpose()
     scores_df["method"] = "Dummy"
     scores = pd.concat([scores, scores_df])
 
     logistic_regression = linear_model.LogisticRegression()
     log_reg_scores = cross_validate(
         logistic_regression,
         X=data_x,
         y=data_y,
         groups=data_groups,
         cv=cv_method,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("Logistic regression")
     scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
-    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df = scores_df.agg(["max", "mean"]).transpose()
     scores_df["method"] = "logistic_reg"
     scores = pd.concat([scores, scores_df])
 
     svc = svm.SVC()
     svc_scores = cross_validate(
         svc,
         X=data_x,
         y=data_y,
         groups=data_groups,
         cv=cv_method,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("Support Vector Machine")
     scores_df = pd.DataFrame(svc_scores)[test_metrics]
-    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df = scores_df.agg(["max", "mean"]).transpose()
     scores_df["method"] = "svc"
     scores = pd.concat([scores, scores_df])
 
     gaussian_nb = naive_bayes.GaussianNB()
     gaussian_nb_scores = cross_validate(
         gaussian_nb,
         X=data_x,
         y=data_y,
         groups=data_groups,
         cv=cv_method,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("Gaussian Naive Bayes")
     scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
-    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df = scores_df.agg(["max", "mean"]).transpose()
     scores_df["method"] = "gaussian_naive_bayes"
     scores = pd.concat([scores, scores_df])
 
     sgdc = linear_model.SGDClassifier()
     sgdc_scores = cross_validate(
         sgdc,
         X=data_x,
         y=data_y,
         groups=data_groups,
         cv=cv_method,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("Stochastic Gradient Descent")
     scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
-    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df = scores_df.agg(["max", "mean"]).transpose()
     scores_df["method"] = "stochastic_gradient_descent"
     scores = pd.concat([scores, scores_df])
 
     rfc = ensemble.RandomForestClassifier()
     rfc_scores = cross_validate(
         rfc,
         X=data_x,
         y=data_y,
         groups=data_groups,
         cv=cv_method,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("Random Forest")
     scores_df = pd.DataFrame(rfc_scores)[test_metrics]
-    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df = scores_df.agg(["max", "mean"]).transpose()
     scores_df["method"] = "random_forest"
     scores = pd.concat([scores, scores_df])
 
     xgb_classifier = XGBClassifier()
     xgb_scores = cross_validate(
         xgb_classifier,
         X=data_x,
         y=data_y,
         groups=data_groups,
         cv=cv_method,
         n_jobs=-1,
-        scoring=metrics
+        scoring=metrics,
     )
     print("XGBoost")
     scores_df = pd.DataFrame(xgb_scores)[test_metrics]
-    scores_df = scores_df.agg(['max', 'mean']).transpose()
+    scores_df = scores_df.agg(["max", "mean"]).transpose()
     scores_df["method"] = "xgboost"
     scores = pd.concat([scores, scores_df])
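Taken together, both runners share one pattern: score each model with cross_validate and participant-level groups so every fold holds out one subject. A hypothetical invocation (synthetic data; the participant and feature names here are invented, and the import path assumes the helper.py module named in the commit title):

import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut

from helper import run_all_classification_models

# Synthetic stand-in data: 6 fake participants with 10 samples each.
rng = np.random.default_rng(0)
data_x = pd.DataFrame(rng.normal(size=(60, 5)), columns=[f"f{i}" for i in range(5)])
data_y = pd.Series(rng.integers(0, 2, size=60), name="target")
data_groups = pd.Series(np.repeat([f"p{i:02d}" for i in range(6)], 10), name="pid")

# LeaveOneGroupOut holds out one participant per fold, mirroring the
# `logo` splitter that run_all_regression_models builds internally.
run_all_classification_models(data_x, data_y, data_groups, LeaveOneGroupOut())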