Use methods in helper.py.
parent
48118f125d
commit
c66e046014
|
@ -21,6 +21,7 @@ import sys
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import xgboost as xg
|
import xgboost as xg
|
||||||
|
from machine_learning.helper import prepare_regression_model_input
|
||||||
from sklearn import gaussian_process, kernel_ridge, linear_model, svm
|
from sklearn import gaussian_process, kernel_ridge, linear_model, svm
|
||||||
from sklearn.dummy import DummyRegressor
|
from sklearn.dummy import DummyRegressor
|
||||||
from sklearn.impute import SimpleImputer
|
from sklearn.impute import SimpleImputer
|
||||||
|
@ -39,72 +40,9 @@ model_input = pd.read_csv(
|
||||||
)
|
)
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
index_columns = [
|
|
||||||
"local_segment",
|
|
||||||
"local_segment_label",
|
|
||||||
"local_segment_start_datetime",
|
|
||||||
"local_segment_end_datetime",
|
|
||||||
]
|
|
||||||
# if "pid" in model_input.columns:
|
|
||||||
# index_columns.append("pid")
|
|
||||||
model_input.set_index(index_columns, inplace=True)
|
|
||||||
|
|
||||||
cv_method = "half_logo" # logo, half_logo, 5kfold
|
cv_method = "half_logo" # logo, half_logo, 5kfold
|
||||||
if cv_method == "logo":
|
|
||||||
data_x, data_y, data_groups = (
|
|
||||||
model_input.drop(["target", "pid"], axis=1),
|
|
||||||
model_input["target"],
|
|
||||||
model_input["pid"],
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
model_input["pid_index"] = model_input.groupby("pid").cumcount()
|
|
||||||
model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
|
|
||||||
|
|
||||||
model_input["pid_index"] = (
|
|
||||||
model_input["pid_index"] / model_input["pid_count"] + 1
|
|
||||||
).round()
|
|
||||||
model_input["pid_half"] = (
|
|
||||||
model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
|
|
||||||
)
|
|
||||||
|
|
||||||
data_x, data_y, data_groups = (
|
|
||||||
model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
|
|
||||||
model_input["target"],
|
|
||||||
model_input["pid_half"],
|
|
||||||
)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
|
||||||
additional_categorical_features = [
|
|
||||||
col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col
|
|
||||||
]
|
|
||||||
categorical_feature_colnames += additional_categorical_features
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
# fillna with mode
|
|
||||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
# one-hot encoding
|
|
||||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
|
||||||
if not categorical_features.empty:
|
|
||||||
categorical_features = pd.get_dummies(categorical_features)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
train_x.dtypes
|
|
||||||
|
|
||||||
|
train_x, data_y, data_groups = prepare_regression_model_input(model_input, cv_method)
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
logo = LeaveOneGroupOut()
|
logo = LeaveOneGroupOut()
|
||||||
logo.get_n_splits(
|
logo.get_n_splits(
|
||||||
|
|
|
@ -73,9 +73,7 @@ def insert_row(df, row):
|
||||||
return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
|
return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
|
||||||
|
|
||||||
|
|
||||||
def prepare_regression_model_input(input_csv):
|
def prepare_regression_model_input(model_input, cv_method="logo"):
|
||||||
model_input = pd.read_csv(input_csv)
|
|
||||||
|
|
||||||
index_columns = [
|
index_columns = [
|
||||||
"local_segment",
|
"local_segment",
|
||||||
"local_segment_label",
|
"local_segment_label",
|
||||||
|
@ -84,11 +82,28 @@ def prepare_regression_model_input(input_csv):
|
||||||
]
|
]
|
||||||
model_input.set_index(index_columns, inplace=True)
|
model_input.set_index(index_columns, inplace=True)
|
||||||
|
|
||||||
data_x, data_y, data_groups = (
|
if cv_method == "logo":
|
||||||
model_input.drop(["target", "pid"], axis=1),
|
data_x, data_y, data_groups = (
|
||||||
model_input["target"],
|
model_input.drop(["target", "pid"], axis=1),
|
||||||
model_input["pid"],
|
model_input["target"],
|
||||||
)
|
model_input["pid"],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
model_input["pid_index"] = model_input.groupby("pid").cumcount()
|
||||||
|
model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
|
||||||
|
|
||||||
|
model_input["pid_index"] = (
|
||||||
|
model_input["pid_index"] / model_input["pid_count"] + 1
|
||||||
|
).round()
|
||||||
|
model_input["pid_half"] = (
|
||||||
|
model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
|
||||||
|
)
|
||||||
|
|
||||||
|
data_x, data_y, data_groups = (
|
||||||
|
model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
|
||||||
|
model_input["target"],
|
||||||
|
model_input["pid_half"],
|
||||||
|
)
|
||||||
|
|
||||||
categorical_feature_colnames = [
|
categorical_feature_colnames = [
|
||||||
"gender",
|
"gender",
|
||||||
|
@ -101,8 +116,9 @@ def prepare_regression_model_input(input_csv):
|
||||||
if "mostcommonactivity" in col or "homelabel" in col
|
if "mostcommonactivity" in col or "homelabel" in col
|
||||||
]
|
]
|
||||||
categorical_feature_colnames += additional_categorical_features
|
categorical_feature_colnames += additional_categorical_features
|
||||||
# TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
|
|
||||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||||
|
|
||||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||||
# fillna with mode
|
# fillna with mode
|
||||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||||
|
|
Loading…
Reference in New Issue