stress_at_work_analysis/presentation/prox_comm_PANAS_nb.py

132 lines
3.0 KiB
Python

# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
# %%
# %matplotlib inline
import yaml
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
import os
import importlib
import matplotlib.pyplot as plt
import sys
import numpy as np
import seaborn as sns
import pandas as pd
# Put the parent of the current working directory on the import path, once.
# Presumably this is what lets the project-local `machine_learning` package be
# imported when the notebook is run from a sub-directory of the repository.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    # Guarded so that re-running the cell does not add duplicate entries.
    sys.path.append(nb_dir)
# %%
from machine_learning import pipeline, features_sensor, labels, model
# %%
# Re-execute the already-imported `labels` module in place; presumably so that
# edits made to its source while the kernel is running are picked up.
importlib.reload(labels)
# %%
# Build the sensor-feature object from its YAML configuration.
# The file is parsed with `yaml.safe_load` and the resulting mapping is
# expanded into the keyword arguments of `SensorFeatures`.
# The encoding is given explicitly so that parsing does not depend on the
# platform's locale default.
with open("./config/prox_comm_PANAS_features.yaml", encoding="utf-8") as file:
    sensor_features_params = yaml.safe_load(file)
sensor_features = features_sensor.SensorFeatures(**sensor_features_params)
# Left disabled, as in the original notebook:
# sensor_features.set_sensor_data()
# NOTE(review): `cached=True` presumably reuses previously computed features
# instead of recomputing them - confirm in `features_sensor`.
sensor_features.calculate_features(cached=True)
# %%
# Retrieve the calculated features, passing "all" for both selector arguments.
# NOTE(review): the meaning of the two selectors is defined in
# `features_sensor.SensorFeatures.get_features` - confirm there.
all_features = sensor_features.get_features("all","all")
# %%
# Build the labels object from its YAML configuration.
# The file is parsed with `yaml.safe_load` and the resulting mapping is
# expanded into the keyword arguments of `Labels`.
# The encoding is given explicitly so that parsing does not depend on the
# platform's locale default.
with open("./config/prox_comm_PANAS_labels.yaml", encoding="utf-8") as file:
    labels_params = yaml.safe_load(file)
labels_current = labels.Labels(**labels_params)
# Left disabled, as in the original notebook:
# labels_current.set_labels()
# NOTE(review): `cached=True` presumably reuses previously aggregated labels
# instead of recomputing them - confirm in `labels`.
labels_current.aggregate_labels(cached=True)
# %%
# Assemble the validation object from the features and the aggregated labels.
# Rows are grouped by "participant_id" and the cross-validation scheme is
# requested by the name "loso".
loso_features = sensor_features.get_features("all", "all")
loso_labels = labels_current.get_aggregated_labels()
model_validation = model.ModelValidation(
    loso_features,
    loso_labels,
    group_variable="participant_id",
    cv_name="loso",
)
# Plain ordinary-least-squares regression is the estimator under validation.
model_validation.model = linear_model.LinearRegression()
# Instantiate the cross-validation method selected through `cv_name`.
model_validation.set_cv_method()
# %%
# Run the cross-validation configured on `model_validation`.
# NOTE(review): the variable name suggests one R^2 score per fold - confirm in
# `model.ModelValidation.cross_validate`.
model_loso_r2 = model_validation.cross_validate()
# %%
# Show the individual scores, then their mean on the following line.
print(model_loso_r2, np.mean(model_loso_r2), sep="\n")
# %%
# Display only the strictly positive scores.
is_positive = model_loso_r2 > 0
model_loso_r2[is_positive]
# %%
# Splitter that holds out all rows of one group in each fold.
logo = LeaveOneGroupOut()
# %%
# Move the index into ordinary columns, then discard the two identifier
# columns so that only the model inputs / targets remain.
id_columns = ["participant_id", "date_lj"]
try_X = model_validation.X.reset_index().drop(columns=id_columns)
try_y = model_validation.y.reset_index().drop(columns=id_columns)
# %%
# Mean absolute error per held-out group.
# The "neg_mean_absolute_error" scorer reports the error with its sign
# flipped, so the scores are multiplied by -1 to get the error itself.
loso_folds = logo.split(X=try_X, y=try_y, groups=model_validation.groups)
neg_mae_per_fold = cross_val_score(
    estimator=model_validation.model,
    X=try_X,
    y=try_y,
    groups=model_validation.groups,
    cv=loso_folds,
    scoring="neg_mean_absolute_error",
)
model_loso_mean_absolute_error = -1 * neg_mae_per_fold
# %%
# Display the per-fold errors.
model_loso_mean_absolute_error
# %%
# Display the median of the per-fold errors.
np.median(model_loso_mean_absolute_error)
# %%
# Fit the estimator on every row at once (no held-out data).
model_validation.model.fit(X=try_X, y=try_y)
# %%
# Predict on the very same rows the model was fitted on.
Y_predicted = model_validation.model.predict(X=try_X)
# %%
# Put the observed and the predicted values side by side as "NA_true" and
# "NA_predicted", then stack them into one "NA" column; the part of the
# column name after "NA_" ends up in the new "value" index level.
try_y.rename(columns={"NA": "NA_true"}, inplace=True)
try_y["NA_predicted"] = Y_predicted
na_wide = try_y.reset_index()
NA_long = pd.wide_to_long(
    na_wide, stubnames="NA", i="index", j="value", sep="_", suffix=".+"
)
# %%
# Overlaid histograms of the "NA" values, one colour per "value" level.
g1 = sns.displot(
    data=NA_long,
    x="NA",
    hue="value",
    binwidth=0.1,
    height=5,
    aspect=1.5,
)
# Reposition the legend using the anchor point given in figure-relative units.
sns.move_legend(g1, loc="upper left", bbox_to_anchor=(0.55, 0.45))
g1.set_axis_labels(x_var="Daily mean", y_var="Day count")
# `display` is supplied by the IPython/Jupyter session, not imported here.
display(g1)
# Written relative to the current working directory.
g1.savefig("prox_comm_PANAS_predictions.pdf")
# %%
from sklearn.metrics import mean_absolute_error
# %%
# Error of the predictions stored in `try_y` against the observed values.
mean_absolute_error(y_true=try_y["NA_true"], y_pred=try_y["NA_predicted"])
# %%
# Display the cross-validated per-fold errors again, for comparison.
model_loso_mean_absolute_error