Save results.

ml_pipeline
junos 2022-12-07 15:33:18 +01:00
parent ae2d7a038d
commit 71e1fcf8ca
18 changed files with 37 additions and 10 deletions

2
.gitignore vendored
View File

@ -9,3 +9,5 @@ __pycache__/
/data/features/ /data/features/
/data/baseline/ /data/baseline/
/data/*input*.csv /data/*input*.csv
/data/daily*
/data/intradaily*

View File

@ -16,7 +16,6 @@ from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor from sklearn.dummy import DummyRegressor
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
import xgboost as xg
from IPython.core.interactiveshell import InteractiveShell from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" InteractiveShell.ast_node_interactivity = "all"
@ -27,7 +26,10 @@ if nb_dir not in sys.path:
import machine_learning.helper import machine_learning.helper
# %% # %%
csv_name = "./data/daily_18_hours_all_targets/input_JCQ_job_demand_mean.csv" segment = "intradaily_30_min"
target = "JCQ_job_demand"
csv_name = "./data/" + segment + "_all_targets/input_" + target + "_mean.csv"
#csv_name = "./data/daily_18_hours_all_targets/input_JCQ_job_demand_mean.csv"
# %% # %%
data_x, data_y, data_groups = machine_learning.helper.prepare_model_input(csv_name) data_x, data_y, data_groups = machine_learning.helper.prepare_model_input(csv_name)
@ -66,7 +68,9 @@ print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_ab
print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error'])) print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
print("R2", np.median(lin_reg_scores['test_r2'])) print("R2", np.median(lin_reg_scores['test_r2']))
##################
# %% # %%
chosen_model = "Random Forest"
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1) rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
rfr_score = cross_validate( rfr_score = cross_validate(
rfr, rfr,
@ -84,13 +88,25 @@ print("R2", np.median(rfr_score['test_r2']))
# %% # %%
y_predicted = cross_val_predict(rfr, data_x, data_y, groups=data_groups, cv=logo) y_predicted = cross_val_predict(rfr, data_x, data_y, groups=data_groups, cv=logo)
#########################
# %%
# Bayesian ridge regression, scored with the same `logo` splitter and
# `data_groups` grouping that the random-forest cell passes to its own
# cross-validation calls.
chosen_model = "Bayesian Ridge"
bayesian_ridge_reg = linear_model.BayesianRidge()
bayesian_ridge_reg_score = cross_validate(
    bayesian_ridge_reg,
    X=data_x,
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        'r2',
        'neg_mean_squared_error',
        'neg_mean_absolute_error',
        'neg_root_mean_squared_error',
    ),
)
# Report the median of each per-split test score.
for _label, _score_key in (
    ("Negative Mean Absolute Error", 'test_neg_mean_absolute_error'),
    ("Negative Root Mean Squared Error", 'test_neg_root_mean_squared_error'),
    ("R2", 'test_r2'),
):
    print(_label, np.median(bayesian_ridge_reg_score[_score_key]))
# %% # %%
g1 = sns.relplot(data=data_y, x="y_true", y="y_predicted") y_predicted = cross_val_predict(bayesian_ridge_reg, data_x, data_y, groups=data_groups, cv=logo)
#g1.set_axis_labels("true", "predicted")
g1.set(title="Negative affect, Random Forest")
display(g1)
g1.savefig("d18NArfr_relplot.pdf")
# %% # %%
data_y = pd.DataFrame(pd.concat([data_y, data_groups], axis=1)) data_y = pd.DataFrame(pd.concat([data_y, data_groups], axis=1))
@ -100,6 +116,14 @@ data_y["y_predicted"] = y_predicted
# %% # %%
data_y.head() data_y.head()
# %%
# Scatter plot of the true target values against the cross-validated
# predictions, titled and saved under the current segment/target/model.
g1 = sns.relplot(data=data_y, x="y_true", y="y_predicted")
#g1.set_axis_labels("true", "predicted")
#g1.map(plt.axhline, y=0, color=".7", dashes=(2, 1), zorder=0)
#g1.map(plt.axline, xy1=(0,0), slope=1)
g1.set(title=",".join([segment, target, chosen_model]))
display(g1)
# "_".join already inserts the separator before the suffix, so the suffix
# carries no leading underscore (previously this produced "...__relplot.pdf",
# unlike the histogram file name, which uses a plain "hist.pdf" suffix).
g1.savefig("_".join([segment, target, chosen_model, "relplot.pdf"]))
# %% # %%
data_y_long = pd.wide_to_long( data_y_long = pd.wide_to_long(
@ -116,8 +140,8 @@ data_y_long.head()
# %% # %%
g2 = sns.displot(data_y_long, x="y", hue="value", binwidth=0.1, height=5, aspect=1.5) g2 = sns.displot(data_y_long, x="y", hue="value", binwidth=0.1, height=5, aspect=1.5)
sns.move_legend(g2, "upper left", bbox_to_anchor=(.55, .45)) sns.move_legend(g2, "upper left", bbox_to_anchor=(.55, .45))
g2.set(title="Negative affect, Random Forest") g2.set(title=",".join([segment, target, chosen_model]))
g2.savefig("d18NArfr_hist.pdf") g2.savefig("_".join([segment, target, chosen_model, "hist.pdf"]))
# %% # %%
pca = PCA(n_components=2) pca = PCA(n_components=2)
@ -133,6 +157,7 @@ data_pca
# %% # %%
g3 = sns.relplot(data = data_pca, x = "pca_0", y = "pca_1", hue = "y_true", palette = sns.color_palette("Spectral", as_cmap=True)) g3 = sns.relplot(data = data_pca, x = "pca_0", y = "pca_1", hue = "y_true", palette = sns.color_palette("Spectral", as_cmap=True))
g3.savefig("d18NArfr_PCA.pdf") g3.set(title=",".join([segment, target, chosen_model]) + "\n variance explained = " + str(round(sum(pca.explained_variance_ratio_), 2)))
# "_".join already inserts the separator before the suffix, so the suffix
# carries no leading underscore (previously this produced "...__PCA.pdf").
g3.savefig("_".join([segment, target, chosen_model, "PCA.pdf"]))
# %% # %%