# %%
import datetime
import importlib
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
from sklearn.model_selection import (
    LeaveOneGroupOut,
    cross_val_score,
    cross_validate,
    cross_val_predict,
)
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor
from sklearn.decomposition import PCA
import xgboost as xg
from IPython.core.interactiveshell import InteractiveShell
# Imported explicitly so that `display(...)` below does not depend on the
# name being injected by the notebook kernel.
from IPython.display import display

InteractiveShell.ast_node_interactivity = "all"

# Make the parent of the current working directory importable; the
# `machine_learning` package is imported right after this, so the import
# must stay below the sys.path manipulation.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import machine_learning.helper

# %%
csv_name = "./data/daily_18_hours_all_targets/input_JCQ_job_demand_mean.csv"

# %%
# NOTE(review): presumably features, target and grouping labels (one group
# per participant?) - confirm against machine_learning.helper.
data_x, data_y, data_groups = machine_learning.helper.prepare_model_input(csv_name)

# %%
data_y.head()

# %%
scores = machine_learning.helper.run_all_models(csv_name)

# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
    data_x,
    data_y,
    groups=data_groups,
)

# %%
# Metrics evaluated for every model in this notebook.
SCORING = (
    "r2",
    "neg_mean_squared_error",
    "neg_mean_absolute_error",
    "neg_root_mean_squared_error",
)


def print_median_scores(cv_scores):
    """Print the median of each test metric in a `cross_validate` result.

    `cv_scores` is the dict returned by `cross_validate` when called with
    `scoring=SCORING`; the median is taken over the cross-validation folds.
    """
    print("Negative Mean Squared Error", np.median(cv_scores["test_neg_mean_squared_error"]))
    print("Negative Mean Absolute Error", np.median(cv_scores["test_neg_mean_absolute_error"]))
    print("Negative Root Mean Squared Error", np.median(cv_scores["test_neg_root_mean_squared_error"]))
    print("R2", np.median(cv_scores["test_r2"]))


# %% [markdown]
# ### Baseline: Dummy Regression (mean)

# %%
# Kept in a code cell of its own: code placed inside a markdown cell is
# presumably not executed when the file is opened as a notebook.
dummy_regr = DummyRegressor(strategy="mean")

# %% jupyter={"source_hidden": true}
# NOTE(review): the name `lin_reg_scores` is historical - these are the
# scores of the dummy (mean) baseline, not of a linear regression.
lin_reg_scores = cross_validate(
    dummy_regr,
    X=data_x,
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=SCORING,
)
print_median_scores(lin_reg_scores)

# %%
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
rfr_score = cross_validate(
    rfr,
    X=data_x,
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=SCORING,
)
print_median_scores(rfr_score)

# %%
# Out-of-fold predictions from the same leave-one-group-out splitter.
# No random_state is set on `rfr`, so these forests differ from the ones
# fitted by `cross_validate` above.
y_predicted = cross_val_predict(rfr, data_x, data_y, groups=data_groups, cv=logo)

# %%
# Build the true-vs-predicted frame BEFORE plotting: the plot below needs
# the "y_true" and "y_predicted" columns created here.
# Re-running this cell concatenates `data_groups` onto `data_y` again.
data_y = pd.concat([data_y, data_groups], axis=1).rename(columns={"target": "y_true"})
data_y["y_predicted"] = y_predicted

# %%
data_y.head()

# %%
# NOTE(review): titles and output file names say "Negative affect" / "NA",
# while `csv_name` points at the JCQ job demand target - confirm which one
# is intended.
g1 = sns.relplot(data=data_y, x="y_true", y="y_predicted")
#g1.set_axis_labels("true", "predicted")
g1.set(title="Negative affect, Random Forest")
display(g1)
g1.savefig("d18NArfr_relplot.pdf")

# %%
# Reshape y_true / y_predicted into one "y" column keyed by "value"
# ("true" / "predicted") so both distributions share a histogram.
# Assumes "local_segment" and "pid" are available after reset_index().
data_y_long = pd.wide_to_long(
    data_y.reset_index(),
    i=["local_segment", "pid"],
    j="value",
    stubnames="y",
    sep="_",
    suffix=".+",
)

# %%
data_y_long.head()

# %%
g2 = sns.displot(data_y_long, x="y", hue="value", binwidth=0.1, height=5, aspect=1.5)
sns.move_legend(g2, "upper left", bbox_to_anchor=(.55, .45))
g2.set(title="Negative affect, Random Forest")
g2.savefig("d18NArfr_hist.pdf")

# %%
pca = PCA(n_components=2)
# Fit once and keep the projection, so the printed variance ratios and the
# plotted components come from the same fit.
data_x_pca = pca.fit_transform(data_x)
print(pca.explained_variance_ratio_)

# %%
# Column labels must be an ordered sequence: a set could assign "pca_0" and
# "pca_1" to the components in arbitrary order.
data_pca = pd.concat(
    [
        data_y.reset_index()["y_true"],
        pd.DataFrame(data_x_pca, columns=["pca_0", "pca_1"]),
    ],
    axis=1,
)

# %%
data_pca

# %%
g3 = sns.relplot(
    data=data_pca,
    x="pca_0",
    y="pca_1",
    hue="y_true",
    palette=sns.color_palette("Spectral", as_cmap=True),
)
g3.savefig("d18NArfr_PCA.pdf")

# %%