"""Fit an auto-sklearn regressor on the population-model input and report R2.

The script reads the prepared model input CSV, one-hot encodes the categorical
features (after filling their gaps with the per-column mode), splits the rows
into train and test sets, runs an auto-sklearn regression search, and prints
the leaderboard, the selected models, and the train/test R2 scores.
"""
import datetime
import importlib
import os
import sys
from pprint import pprint

import autosklearn.regression
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics
import yaml
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, train_test_split

INPUT_PATH = "data/processed/models/population_model/input_PANAS_negative_affect_mean.csv"
INDEX_COLUMNS = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
TEST_SIZE = 0.30
# Fixed seed so the train/test split (and therefore the printed R2 scores)
# is reproducible between runs.
RANDOM_STATE = 42

# Standardized data (per the original author's note; not verified here).
model_input = pd.read_csv(INPUT_PATH)

# Drop columns that carry no information at all and rows without a target.
model_input.dropna(axis=1, how="all", inplace=True)
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)

# Collect the categorical columns. The two fixed names are only kept when they
# survived the all-NaN column drop above; selecting a dropped column would
# raise a KeyError.
categorical_feature_colnames = [
    col for col in ["gender", "startlanguage"] if col in model_input.columns
]
categorical_feature_colnames += [
    col
    for col in model_input.columns
    if "mostcommonactivity" in col or "homelabel" in col
]

categorical_features = model_input[categorical_feature_colnames].copy()
if not categorical_features.empty:
    # Fill gaps with the most frequent value of each column. mode() returns an
    # empty frame when every value is missing, so guard before taking row 0.
    column_modes = categorical_features.mode()
    if not column_modes.empty:
        categorical_features = categorical_features.fillna(column_modes.iloc[0])
    categorical_features = categorical_features.apply(
        lambda col: col.astype("category")
    )
    categorical_features = pd.get_dummies(categorical_features)

numerical_features = model_input.drop(categorical_feature_colnames, axis=1)

model_in = pd.concat([numerical_features, categorical_features], axis=1)
model_in.set_index(INDEX_COLUMNS, inplace=True)

# NOTE(review): this is a row-wise random split and "pid" is dropped from the
# features, so rows with the same pid can land in both train and test. If pid
# identifies a participant, a group-wise split may be more appropriate - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    model_in.drop(["target", "pid"], axis=1),
    model_in["target"],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=7200,
    per_run_time_limit=120,
)
automl.fit(X_train, y_train, dataset_name='straw')

print(automl.leaderboard())
pprint(automl.show_models(), indent=4)

train_predictions = automl.predict(X_train)
print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
test_predictions = automl.predict(X_test)
print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))

# Explicit stop kept from the original; presumably it prevents any code that
# follows in the file from running - confirm before removing.
sys.exit()