rapids/automl_test.py

from pprint import pprint
import sklearn.metrics
import autosklearn.regression

import datetime
import importlib
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml

from sklearn import linear_model, svm, kernel_ridge, gaussian_process
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Load the standardized model input.
model_input = pd.read_csv("data/processed/models/population_model/z_input.csv")

# Drop columns that carry no data at all, then rows that have no target value.
model_input.dropna(axis=1, how="all", inplace=True)
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)

# Categorical columns: fill gaps with the per-column mode, then one-hot encode.
categorical_feature_colnames = ["gender", "startlanguage"]
categorical_features = model_input[categorical_feature_colnames].copy()
# The emptiness check has to come first: DataFrame.mode() of an empty frame has
# no rows, so `.iloc[0]` would raise IndexError before the guard was reached.
if not categorical_features.empty:
    mode_categorical_features = categorical_features.mode().iloc[0]
    categorical_features = categorical_features.fillna(mode_categorical_features)
    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
    categorical_features = pd.get_dummies(categorical_features)

# Everything that is not listed as categorical is passed through unchanged.
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_in = pd.concat([numerical_features, categorical_features], axis=1)

# Move the segment descriptors into the index so they are not used as features.
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_in.set_index(index_columns, inplace=True)

# 70/30 hold-out split; "target" is the label and "pid" is excluded from the features.
# NOTE(review): no random_state is passed, so the split (and the scores printed
# below) differ between runs.
# NOTE(review): rows are split individually; if "pid" identifies a participant,
# the same participant can land in both train and test -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    model_in.drop(["target", "pid"], axis=1),
    model_in["target"],
    test_size=0.30,
)

# Both limits are in seconds (auto-sklearn API): 14400 s overall, 120 s per model run.
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=14400,
    per_run_time_limit=120,
)
automl.fit(X_train, y_train, dataset_name='straw')

# Report what the search found.
print(automl.leaderboard())
pprint(automl.show_models(), indent=4)

# Compare fit on the training data against the held-out data.
train_predictions = automl.predict(X_train)
print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
test_predictions = automl.predict(X_test)
print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))

# Stop here explicitly (sys is imported at the top of the file).
sys.exit()
Testing auto-sklearn 2022-09-13 11:51:03 +02:00			`from pprint import pprint`
			`import sklearn.metrics`
			`import autosklearn.regression`

			`import datetime`
			`import importlib`
			`import os`
			`import sys`

			`import numpy as np`
			`import matplotlib.pyplot as plt`
			`import pandas as pd`
			`import seaborn as sns`
			`import yaml`

			`from sklearn import linear_model, svm, kernel_ridge, gaussian_process`
			`from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, train_test_split`
			`from sklearn.metrics import mean_squared_error, r2_score`
			`from sklearn.impute import SimpleImputer`

			`model_input = pd.read_csv("data/processed/models/population_model/z_input.csv") # Standardizirani podatki`

			`model_input.dropna(axis=1, how="all", inplace=True)`
			`model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)`

			`categorical_feature_colnames = ["gender", "startlanguage"]`
			`categorical_features = model_input[categorical_feature_colnames].copy()`
			`mode_categorical_features = categorical_features.mode().iloc[0]`
			`categorical_features = categorical_features.fillna(mode_categorical_features)`
			`categorical_features = categorical_features.apply(lambda col: col.astype("category"))`
			`if not categorical_features.empty:`
			`categorical_features = pd.get_dummies(categorical_features)`
			`numerical_features = model_input.drop(categorical_feature_colnames, axis=1)`
			`model_in = pd.concat([numerical_features, categorical_features], axis=1)`

			`index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]`
			`model_in.set_index(index_columns, inplace=True)`

Writing testing scripts to determine the point of manual imputation. 2022-09-14 16:13:03 +02:00			`X_train, X_test, y_train, y_test = train_test_split(model_in.drop(["target", "pid"], axis=1), model_in["target"], test_size=0.30)`
Testing auto-sklearn 2022-09-13 11:51:03 +02:00
			`automl = autosklearn.regression.AutoSklearnRegressor(`
Writing testing scripts to determine the point of manual imputation. 2022-09-14 16:13:03 +02:00			`time_left_for_this_task=14400,`
Changes in AutoML params and environment.yml 2022-09-13 15:54:06 +02:00			`per_run_time_limit=120`
Testing auto-sklearn 2022-09-13 11:51:03 +02:00			`)`
			`automl.fit(X_train, y_train, dataset_name='straw')`

			`print(automl.leaderboard())`
			`pprint(automl.show_models(), indent=4)`

			`train_predictions = automl.predict(X_train)`
			`print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))`
			`test_predictions = automl.predict(X_test)`
			`print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))`

			`import sys`
			`sys.exit()`