rapids/automl_test.py

58 lines
2.2 KiB
Python
Raw Normal View History

2022-09-13 11:51:03 +02:00
from pprint import pprint
import sklearn.metrics
import autosklearn.regression
import datetime
import importlib
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
2022-10-10 18:45:38 +02:00
model_input = pd.read_csv("data/processed/models/population_model/input_PANAS_negative_affect_mean.csv") # Standardizirani podatki
2022-09-13 11:51:03 +02:00
model_input.dropna(axis=1, how="all", inplace=True)
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
categorical_feature_colnames = ["gender", "startlanguage"]
2022-10-10 18:45:38 +02:00
categorical_feature_colnames += [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
2022-09-13 11:51:03 +02:00
categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
categorical_features = categorical_features.fillna(mode_categorical_features)
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_in = pd.concat([numerical_features, categorical_features], axis=1)
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_in.set_index(index_columns, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(model_in.drop(["target", "pid"], axis=1), model_in["target"], test_size=0.30)
2022-09-13 11:51:03 +02:00
automl = autosklearn.regression.AutoSklearnRegressor(
2022-10-10 18:45:38 +02:00
time_left_for_this_task=7200,
per_run_time_limit=120
2022-09-13 11:51:03 +02:00
)
automl.fit(X_train, y_train, dataset_name='straw')
print(automl.leaderboard())
pprint(automl.show_models(), indent=4)
train_predictions = automl.predict(X_train)
print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
test_predictions = automl.predict(X_test)
print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))
import sys
sys.exit()