# %%
import datetime
import importlib
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor
from sklearn.decomposition import PCA

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Make the project package importable when running from the notebooks directory.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import machine_learning.helper

# %%
segment = "intradaily_30_min"
target = "JCQ_job_demand"
csv_name = "./data/" + segment + "_all_targets/input_" + target + "_mean.csv"
# csv_name = "./data/daily_18_hours_all_targets/input_JCQ_job_demand_mean.csv"

# %%
data_x, data_y, data_groups = machine_learning.helper.prepare_model_input(csv_name)

# %%
data_y.head()

# %%
scores = machine_learning.helper.run_all_models(csv_name)

# %% jupyter={"source_hidden": true}
# Leave-one-group-out CV: each group in data_groups is held out once as the test fold.
logo = LeaveOneGroupOut()
logo.get_n_splits(
    data_x,
    data_y,
    groups=data_groups,
)

# %% [markdown]
# ### Baseline: Dummy Regression (mean)

# %%
dummy_regr = DummyRegressor(strategy="mean")

# %% jupyter={"source_hidden": true}
dummy_regr_scores = cross_validate(
    dummy_regr,
    X=data_x,
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print("Negative Mean Squared Error", np.median(dummy_regr_scores['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(dummy_regr_scores['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(dummy_regr_scores['test_neg_root_mean_squared_error']))
print("R2", np.median(dummy_regr_scores['test_r2']))

# %% [markdown]
# ### Random Forest

# %%
chosen_model = "Random Forest"
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
rfr_score = cross_validate(
    rfr,
    X=data_x,
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print("Negative Mean Squared Error", np.median(rfr_score['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(rfr_score['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(rfr_score['test_neg_root_mean_squared_error']))
print("R2", np.median(rfr_score['test_r2']))

# %%
y_predicted = cross_val_predict(rfr, data_x, data_y, groups=data_groups, cv=logo)

# %% [markdown]
# ### Bayesian Ridge

# %%
chosen_model = "Bayesian Ridge"
bayesian_ridge_reg = linear_model.BayesianRidge()
bayesian_ridge_reg_score = cross_validate(
    bayesian_ridge_reg,
    X=data_x,
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
print("R2", np.median(bayesian_ridge_reg_score['test_r2']))

# %%
y_predicted = cross_val_predict(bayesian_ridge_reg, data_x, data_y, groups=data_groups, cv=logo)

# %%
# Combine true targets, group labels and out-of-fold predictions into one frame for plotting.
data_y = pd.DataFrame(pd.concat([data_y, data_groups], axis=1))
data_y.rename(columns={"target": "y_true"}, inplace=True)
data_y["y_predicted"] = y_predicted
# %%
data_y.head()

# %%
g1 = sns.relplot(data=data_y, x="y_true", y="y_predicted")
# g1.set_axis_labels("true", "predicted")
# g1.map(plt.axhline, y=0, color=".7", dashes=(2, 1), zorder=0)
# g1.map(plt.axline, xy1=(0, 0), slope=1)
g1.set(title=",".join([segment, target, chosen_model]))
display(g1)
g1.savefig("_".join([segment, target, chosen_model, "relplot.pdf"]))

# %%
# Reshape to long format so true and predicted values can share one histogram variable.
data_y_long = pd.wide_to_long(
    data_y.reset_index(),
    i=["local_segment", "pid"],
    j="value",
    stubnames="y",
    sep="_",
    suffix=".+",
)

# %%
data_y_long.head()

# %%
g2 = sns.displot(data_y_long, x="y", hue="value", binwidth=0.1, height=5, aspect=1.5)
sns.move_legend(g2, "upper left", bbox_to_anchor=(.55, .45))
g2.set(title=",".join([segment, target, chosen_model]))
g2.savefig("_".join([segment, target, chosen_model, "hist.pdf"]))

# %%
pca = PCA(n_components=2)
pca.fit(data_x)
print(pca.explained_variance_ratio_)

# %%
data_x_pca = pca.transform(data_x)
data_pca = pd.DataFrame(
    pd.concat(
        [
            data_y.reset_index()["y_true"],
            pd.DataFrame(data_x_pca, columns=["pca_0", "pca_1"]),
        ],
        axis=1,
    )
)

# %%
data_pca

# %%
g3 = sns.relplot(data=data_pca, x="pca_0", y="pca_1", hue="y_true", palette=sns.color_palette("Spectral", as_cmap=True))
g3.set(
    title=",".join([segment, target, chosen_model])
    + "\n variance explained = "
    + str(round(sum(pca.explained_variance_ratio_), 2))
)
g3.savefig("_".join([segment, target, chosen_model, "PCA.pdf"]))

# %%
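# %% [markdown]
# ### Model comparison (sketch)
# A minimal sketch, not part of the original analysis flow: collect the median
# leave-one-group-out metrics computed above into a single table so the dummy
# baseline, random forest and Bayesian ridge models can be compared side by side.
# It only reuses the `cross_validate` result dictionaries from earlier cells;
# the names `model_cv_scores` and `cv_summary` are introduced here for illustration.

# %%
model_cv_scores = {
    "Dummy (mean)": dummy_regr_scores,
    "Random Forest": rfr_score,
    "Bayesian Ridge": bayesian_ridge_reg_score,
}
# One row per model, one column per test metric, each cell the median across folds.
cv_summary = pd.DataFrame(
    {
        name: {
            metric: np.median(values)
            for metric, values in cv_scores.items()
            if metric.startswith("test_")
        }
        for name, cv_scores in model_cv_scores.items()
    }
).T
cv_summary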