# %%
# The definitions below rely on these module-level imports of
# exploration/ex_all_feat_ml_pipeline.py:
#   numpy as np, pandas as pd
#   from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
#   from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
#   from sklearn.impute import SimpleImputer
#   from xgboost import XGBRegressor


# %%
def insert_row(df, row):
    """Return a new frame made of ``df`` followed by ``row`` as the last row.

    Args:
        df: DataFrame to extend. It is not modified.
        row: Sequence with one value per column of ``df``, in column order.

    Returns:
        A new DataFrame whose index is renumbered from 0.
    """
    new_row = pd.DataFrame([row], columns=df.columns)
    return pd.concat([df, new_row], ignore_index=True)


# %%
def _prepare_features(model_input):
    """Split the raw table into a feature frame, the target and the groups."""
    index_columns = [
        "local_segment",
        "local_segment_label",
        "local_segment_start_datetime",
        "local_segment_end_datetime",
    ]
    categorical_feature_colnames = ["gender", "startlanguage"]

    # Drop columns without any value and rows without a target.
    model_input = model_input.dropna(axis=1, how="all")
    model_input = model_input.dropna(axis=0, how="any", subset=["target"])
    model_input = model_input.set_index(index_columns)

    data_x = model_input.drop(["target", "pid"], axis=1)
    data_y = model_input["target"]
    data_groups = model_input["pid"]

    categorical_features = data_x[categorical_feature_colnames].copy()
    # Fill missing categories with the per-column mode. mode() has no rows
    # when there is no non-missing value at all, so only index it if it does.
    modes = categorical_features.mode()
    if not modes.empty:
        categorical_features = categorical_features.fillna(modes.iloc[0])
    # One-hot encoding.
    categorical_features = categorical_features.astype("category")
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)

    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
    train_x = pd.concat([numerical_features, categorical_features], axis=1)
    return train_x, data_y, data_groups


def _build_models():
    """Return (score-table label, printed heading, estimator) in run order."""
    return [
        ("Linear regression", "Linear regression:",
         linear_model.LinearRegression()),
        ("Ridge regression", "Ridge regression",
         linear_model.Ridge(alpha=0.5)),
        ("Lasso regression", "Lasso regression",
         linear_model.Lasso(alpha=0.1)),
        ("Bayesian Ridge", "Bayesian Ridge",
         linear_model.BayesianRidge()),
        ("RANSAC", "RANSAC (outlier robust regression)",
         linear_model.RANSACRegressor()),
        ("Support vector regression", "Support vector regression",
         svm.SVR()),
        ("Kernel Ridge regression", "Kernel Ridge regression",
         kernel_ridge.KernelRidge()),
        ("Gaussian Process Regression", "Gaussian Process Regression",
         gaussian_process.GaussianProcessRegressor()),
        ("Random Forest Regression", "Random Forest Regression",
         ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)),
        ("XGBoost Regressor", "XGBoost Regressor",
         XGBRegressor()),
        ("ADA Boost Regressor", "ADA Boost Regressor",
         ensemble.AdaBoostRegressor()),
    ]


def run_all_models(input_csv):
    """Cross-validate a fixed set of regressors on one model-input CSV.

    The CSV must contain the columns ``target`` and ``pid``, the four
    ``local_segment*`` columns used as index, and the categorical columns
    ``gender`` and ``startlanguage``; every remaining column is used as a
    numerical feature. Each model is scored with R2 under leave-one-group-out
    cross-validation, where the groups are the ``pid`` values. The heading and
    median score of every model are printed as it finishes.

    Args:
        input_csv: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        DataFrame with one row per model and the columns ``method``,
        ``median`` and ``max`` (median and maximum of the per-fold scores).
    """
    model_input = pd.read_csv(input_csv)
    train_x, data_y, data_groups = _prepare_features(model_input)

    # NOTE(review): the imputer is fitted on all rows before cross-validation,
    # so the column means used in every training fold also include the
    # held-out group. Moving it into a per-fold Pipeline would remove this
    # leak but changes the reported scores, so it is left as is here.
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    train_x_imputed = imputer.fit_transform(train_x)

    logo = LeaveOneGroupOut()

    rows = []
    for label, heading, estimator in _build_models():
        fold_scores = cross_val_score(
            estimator,
            X=train_x_imputed,
            y=data_y,
            groups=data_groups,
            cv=logo,
            n_jobs=-1,
            scoring="r2",
        )
        median_score = np.median(fold_scores)
        print(heading)
        print(median_score)
        rows.append([label, median_score, np.max(fold_scores)])

    # Build the table once instead of concatenating onto an empty frame.
    return pd.DataFrame(rows, columns=["method", "median", "max"])