2022-09-13 11:51:03 +02:00
|
|
|
from pprint import pprint
|
|
|
|
import sklearn.metrics
|
|
|
|
import autosklearn.regression
|
|
|
|
|
|
|
|
import datetime
|
|
|
|
import importlib
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
import pandas as pd
|
|
|
|
import seaborn as sns
|
|
|
|
import yaml
|
|
|
|
|
|
|
|
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
|
|
|
|
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, train_test_split
|
|
|
|
from sklearn.metrics import mean_squared_error, r2_score
|
|
|
|
from sklearn.impute import SimpleImputer
|
|
|
|
|
|
|
|
# Load the model input table (standardized data), then:
#   1. discard feature columns that contain no values at all,
#   2. discard rows for which the prediction target is missing.
model_input = (
    pd.read_csv("data/processed/models/population_model/z_input.csv")
    .dropna(axis="columns", how="all")
    .dropna(axis="index", how="any", subset=["target"])
)
|
|
|
|
|
|
|
|
# Columns that are treated as categorical and one-hot encoded below.
categorical_feature_colnames = ["gender", "startlanguage"]

# Work on a copy so that imputing values does not touch `model_input`.
categorical_features = model_input[categorical_feature_colnames].copy()

# Fill missing categories with the most frequent value of each column
# (the first row of `mode()` holds one mode per column).
mode_categorical_features = categorical_features.mode().iloc[0]
categorical_features = categorical_features.fillna(
    mode_categorical_features
).apply(lambda column: column.astype("category"))

# One-hot encode the categorical columns unless the frame is empty.
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

# Every remaining column is kept as-is and joined with the encoded
# categorical columns side by side.
numerical_features = model_input.drop(columns=categorical_feature_colnames)
model_in = pd.concat([numerical_features, categorical_features], axis=1)
|
|
|
|
|
|
|
|
# Segment identification columns are moved into the index so that they
# are not part of the feature matrix.
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
model_in = model_in.set_index(index_columns)
|
|
|
|
|
2022-09-14 16:13:03 +02:00
|
|
|
# Hold out 30 % of the rows for testing. The features are all columns
# except the target and the "pid" column.
# NOTE: no random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(
    model_in.drop(columns=["target", "pid"]),
    model_in["target"],
    test_size=0.30,
)
|
2022-09-13 11:51:03 +02:00
|
|
|
|
|
|
|
# Configure the auto-sklearn regressor; both limits are given in seconds.
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=4 * 60 * 60,  # 14400 s for the whole search
    per_run_time_limit=2 * 60,  # 120 s for a single model fit
)
# Run the model search on the training split.
automl.fit(X_train, y_train, dataset_name="straw")
|
|
|
|
|
|
|
|
# Print the leaderboard of the fitted search.
leaderboard = automl.leaderboard()
print(leaderboard)

# Pretty-print whatever `show_models()` reports about the fitted models.
fitted_models = automl.show_models()
pprint(fitted_models, indent=4)
|
|
|
|
|
|
|
|
# R2 on the data the model was fitted on.
train_predictions = automl.predict(X_train)
train_r2 = sklearn.metrics.r2_score(y_train, train_predictions)
print("Train R2 score:", train_r2)

# R2 on the held-out test split.
test_predictions = automl.predict(X_test)
test_r2 = sklearn.metrics.r2_score(y_test, test_predictions)
print("Test R2 score:", test_r2)
|
|
|
|
|
|
|
|
# Stop the script here so that nothing placed after this line is executed.
# `sys` is already imported at the top of the file, so it is not
# re-imported here.
sys.exit()
|