diff --git a/exploration/ml_pipeline_regression.py b/exploration/ml_pipeline_regression.py index e4ea488..6a6bcae 100644 --- a/exploration/ml_pipeline_regression.py +++ b/exploration/ml_pipeline_regression.py @@ -21,6 +21,7 @@ import sys import numpy as np import pandas as pd import xgboost as xg +from machine_learning.helper import prepare_regression_model_input from sklearn import gaussian_process, kernel_ridge, linear_model, svm from sklearn.dummy import DummyRegressor from sklearn.impute import SimpleImputer @@ -39,72 +40,9 @@ model_input = pd.read_csv( ) # %% jupyter={"source_hidden": true} -index_columns = [ - "local_segment", - "local_segment_label", - "local_segment_start_datetime", - "local_segment_end_datetime", -] -# if "pid" in model_input.columns: -# index_columns.append("pid") -model_input.set_index(index_columns, inplace=True) - cv_method = "half_logo" # logo, half_logo, 5kfold -if cv_method == "logo": - data_x, data_y, data_groups = ( - model_input.drop(["target", "pid"], axis=1), - model_input["target"], - model_input["pid"], - ) -else: - model_input["pid_index"] = model_input.groupby("pid").cumcount() - model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count") - - model_input["pid_index"] = ( - model_input["pid_index"] / model_input["pid_count"] + 1 - ).round() - model_input["pid_half"] = ( - model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str) - ) - - data_x, data_y, data_groups = ( - model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), - model_input["target"], - model_input["pid_half"], - ) - -# %% jupyter={"source_hidden": true} -categorical_feature_colnames = ["gender", "startlanguage"] -additional_categorical_features = [ - col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col -] -categorical_feature_colnames += additional_categorical_features - -# %% jupyter={"source_hidden": true} -categorical_features = data_x[categorical_feature_colnames].copy() - -# %% jupyter={"source_hidden": true} -mode_categorical_features = categorical_features.mode().iloc[0] - -# %% jupyter={"source_hidden": true} -# fillna with mode -categorical_features = categorical_features.fillna(mode_categorical_features) - -# %% jupyter={"source_hidden": true} -# one-hot encoding -categorical_features = categorical_features.apply(lambda col: col.astype("category")) -if not categorical_features.empty: - categorical_features = pd.get_dummies(categorical_features) - -# %% jupyter={"source_hidden": true} -numerical_features = data_x.drop(categorical_feature_colnames, axis=1) - -# %% jupyter={"source_hidden": true} -train_x = pd.concat([numerical_features, categorical_features], axis=1) - -# %% jupyter={"source_hidden": true} -train_x.dtypes +train_x, data_y, data_groups = prepare_regression_model_input(model_input, cv_method) # %% jupyter={"source_hidden": true} logo = LeaveOneGroupOut() logo.get_n_splits( diff --git a/machine_learning/helper.py b/machine_learning/helper.py index 69822d0..aa4c870 100644 --- a/machine_learning/helper.py +++ b/machine_learning/helper.py @@ -73,9 +73,7 @@ def insert_row(df, row): return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True) -def prepare_regression_model_input(input_csv): - model_input = pd.read_csv(input_csv) - +def prepare_regression_model_input(model_input, cv_method="logo"): index_columns = [ "local_segment", "local_segment_label", @@ -84,11 +82,28 @@ def prepare_regression_model_input(input_csv): ] model_input.set_index(index_columns, inplace=True) - data_x, data_y, data_groups = ( - model_input.drop(["target", "pid"], axis=1), - model_input["target"], - model_input["pid"], - ) + if cv_method == "logo": + data_x, data_y, data_groups = ( + model_input.drop(["target", "pid"], axis=1), + model_input["target"], + model_input["pid"], + ) + else: + model_input["pid_index"] = model_input.groupby("pid").cumcount() + model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count") + + model_input["pid_index"] = ( + model_input["pid_index"] / model_input["pid_count"] + 1 + ).round() + model_input["pid_half"] = ( + model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str) + ) + + data_x, data_y, data_groups = ( + model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), + model_input["target"], + model_input["pid_half"], + ) categorical_feature_colnames = [ "gender", @@ -101,8 +116,9 @@ def prepare_regression_model_input(input_csv): if "mostcommonactivity" in col or "homelabel" in col ] categorical_feature_colnames += additional_categorical_features - # TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully + categorical_features = data_x[categorical_feature_colnames].copy() + mode_categorical_features = categorical_features.mode().iloc[0] # fillna with mode categorical_features = categorical_features.fillna(mode_categorical_features)