2023-05-31 21:00:18 +02:00
|
|
|
# ---
|
|
|
|
# jupyter:
|
|
|
|
# jupytext:
|
|
|
|
# formats: ipynb,py:percent
|
|
|
|
# text_representation:
|
|
|
|
# extension: .py
|
|
|
|
# format_name: percent
|
|
|
|
# format_version: '1.3'
|
|
|
|
# jupytext_version: 1.14.5
|
|
|
|
# kernelspec:
|
|
|
|
# display_name: straw2analysis
|
|
|
|
# language: python
|
|
|
|
# name: straw2analysis
|
|
|
|
# ---
|
|
|
|
|
|
|
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
import pandas as pd
|
2023-05-31 21:12:21 +02:00
|
|
|
import seaborn as sns
|
|
|
|
from sklearn.decomposition import PCA
|
2023-05-31 21:00:18 +02:00
|
|
|
|
|
|
|
from machine_learning.helper import (
|
|
|
|
impute_encode_categorical_features,
|
|
|
|
prepare_cross_validator,
|
|
|
|
prepare_sklearn_data_format,
|
|
|
|
run_all_classification_models,
|
|
|
|
)
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# Cross-validation method (could be regarded as a hyperparameter).
# Supported values: logo, half_logo, 5kfold
CV_METHOD = "logo"
print(f"CV_METHOD: {CV_METHOD}")

# Number of largest/smallest accuracies (of particular CV) outputs
N_SL = 3

# (bool) If True this will train and test data on balanced dataset
# (using undersampling method)
UNDERSAMPLING = False
|
|
|
|
|
|
|
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
|
|
|
# Root directory under which the per-segment result folders are looked up.
PATH_BASE = Path("E:/STRAWresults/20230415")

SEGMENT_TYPE = "period"
print(f"SEGMENT_TYPE: {SEGMENT_TYPE}")
# Name of the segment folder below PATH_BASE.
SEGMENT_LENGTH = "30_minutes_before"
print(f"SEGMENT_LENGTH: {SEGMENT_LENGTH}")

PATH_FULL = PATH_BASE.joinpath(SEGMENT_LENGTH, "features", "all_sensor_features.csv")
|
|
|
|
|
2023-05-31 22:25:39 +02:00
|
|
|
# Load the table stored at PATH_FULL (all_sensor_features.csv at this point).
# NOTE(review): this frame is not referenced again in the visible part of the
# script - confirm whether it is still needed.
all_features_with_baseline = pd.read_csv(PATH_FULL)
|
2023-05-31 21:00:18 +02:00
|
|
|
|
|
|
|
# %%
|
|
|
|
# Target names. Each one selects a cleaned feature file and, prefixed with
# "phone_esm_straw_", the column that is combined into the composite target.
TARGETS = [
    "PANAS_negative_affect_mean",
    "PANAS_positive_affect_mean",
    "JCQ_job_demand_mean",
    "JCQ_job_control_mean",
    "appraisal_stressfulness_period_mean",
]
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# Build one table holding the features plus one column per target.
# The first file read contributes all of its columns; every later file only
# contributes its own "phone_esm_straw_<target>" column, inner-joined on the
# "local_segment" index so that only segments present in all files remain.
all_features_cleaned = pd.DataFrame()
for target in TARGETS:
    PATH_FULL = PATH_BASE.joinpath(
        SEGMENT_LENGTH,
        "features",
        f"all_sensor_features_cleaned_straw_py_({target}).csv",
    )
    current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
    if not all_features_cleaned.empty:
        all_features_cleaned = all_features_cleaned.join(
            current_features[f"phone_esm_straw_{target}"],
            how="inner",
            rsuffix=f"_{target}",
        )
    else:
        all_features_cleaned = current_features
    # NOTE(review): the original indentation of this print is not recoverable
    # from the source; it is kept inside the loop so the shape is reported
    # after every join - confirm.
    print(all_features_cleaned.shape)
|
|
|
|
|
2023-05-31 21:12:21 +02:00
|
|
|
# %%
|
|
|
|
# Column names of the targets as they appear in all_features_cleaned.
TARGETS_PREFIXED = [f"phone_esm_straw_{target}" for target in TARGETS]

# Fit a single-component PCA on the target columns and report how much of
# their variance that one component explains.
pca = PCA(n_components=1)
pca.fit(all_features_cleaned[TARGETS_PREFIXED])
print(pca.explained_variance_ratio_)
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# Replace the individual target columns with one composite "target" column:
# their projection onto the first principal component.
# fit_transform fits the PCA again on the same columns before projecting.
model_input = all_features_cleaned.drop(TARGETS_PREFIXED, axis="columns")
model_input["target"] = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# Distribution of the composite target.
sns.histplot(x="target", data=model_input)

# %%
# 60th percentile of the composite target (displayed as the cell output).
model_input["target"].quantile(0.6)
|
|
|
|
|
2023-05-31 21:00:18 +02:00
|
|
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
|
|
|
# Bin edges applied to the composite (PCA) target: (-10, 0] -> "low",
# (0, 10] -> "high". The same edges were previously noted for z-scored targets.
BINS = [-10, 0, 10]
print("BINS: ", BINS)

# Two labels here; a three-bin setup would use ['low', 'medium', 'high'].
model_input["target"], edges = pd.cut(
    model_input["target"],
    bins=BINS,
    labels=["low", "high"],
    right=True,
    retbins=True,
)
print(model_input["target"].value_counts())
|
|
|
|
# If True, rows labelled "medium" are dropped and the target becomes binary
# (0 = "low", 1 = anything else); otherwise the three labels map to 0/1/2.
REMOVE_MEDIUM = True
if REMOVE_MEDIUM:
    # Compare against the values: `"medium" in series` tests the index labels,
    # so the previous membership check never detected any "medium" rows.
    is_medium = model_input["target"] == "medium"
    if is_medium.any():
        # .copy() so the column assignment below writes to an independent
        # frame instead of a slice of the unfiltered one.
        model_input = model_input[~is_medium].copy()
    # Binary encoding: "low" -> 0, every other value -> 1.
    model_input["target"] = (
        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
    )
else:
    model_input["target"] = model_input["target"].map(
        {"low": 0, "medium": 1, "high": 2}
    )
print(model_input["target"].value_counts())
|
|
|
|
|
|
|
|
|
|
|
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
|
|
|
# Undersampling: balance the two classes by randomly down-sampling the larger
# one to the size of the smaller one.
if UNDERSAMPLING:
    no_stress = model_input[model_input["target"] == 0]
    stress = model_input[model_input["target"] == 1]

    # Sampling more rows than a class contains raises a ValueError, so always
    # draw from whichever class is larger. When class 0 is the larger (or
    # equal) one this is exactly the previous behaviour.
    if len(stress) > len(no_stress):
        stress = stress.sample(n=len(no_stress))
    else:
        no_stress = no_stress.sample(n=len(stress))
    model_input = pd.concat([stress, no_stress], axis=0)
|
|
|
|
|
|
|
|
|
2023-05-31 22:25:39 +02:00
|
|
|
# %%
|
|
|
|
# Target whose prepared input file is used as the source of the extra
# (baseline) columns selected further below.
TARGET_VARIABLE = "PANAS_negative_affect"
print(f"TARGET_VARIABLE: {TARGET_VARIABLE}")

PATH_FULL_HELP = PATH_BASE.joinpath(
    SEGMENT_LENGTH, f"input_{TARGET_VARIABLE}_mean.csv"
)

model_input_with_baseline = pd.read_csv(PATH_FULL_HELP, index_col="local_segment")
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# Columns that exist in the helper input file but not yet in model_input,
# kept in the order in which they appear in the helper file.
baseline_col_names = [
    column
    for column in model_input_with_baseline.columns
    if column not in model_input.columns
]
print(baseline_col_names)

# %%
# Attach those columns by index (left join keeps every row of model_input),
# then move the index back into an ordinary column.
model_input = model_input.join(
    model_input_with_baseline[baseline_col_names],
    how="left",
)
model_input = model_input.reset_index()
|
|
|
|
|
|
|
|
# %%
|
2023-05-31 21:00:18 +02:00
|
|
|
# Run the project helper on the joined table.
# NOTE(review): judging by its name it imputes and encodes the categorical
# features; the exact behaviour lives in machine_learning.helper - confirm.
model_input_encoded = impute_encode_categorical_features(model_input)

# %%
# Split the encoded table into the three objects used below and build the
# cross-validator for the configured CV_METHOD.
# NOTE(review): presumably features, target and grouping labels - verify
# against machine_learning.helper.
data_x, data_y, data_groups = prepare_sklearn_data_format(
    model_input_encoded, CV_METHOD
)
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
|
|
|
|
|
|
|
# %%
|
|
|
|
# Quick look at the first entries of data_y (displayed as the cell output).
data_y.head()

# %%
# ... and at the last entries.
data_y.tail()

# %%
# Size of data_y.
data_y.shape

# %%
# Run the project helper over the prepared data with the cross-validator.
# The result is written to CSV in the following cell.
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
|
|
|
|
# %%
|
|
|
|
# Results are written relative to the parent of the working directory.
PATH_OUTPUT = Path("..", "presentation/results")
# The file name encodes the segment length, the bin edges and the CV method.
path_output_full = PATH_OUTPUT / (
    f"composite_{SEGMENT_LENGTH}_classification{BINS}_{CV_METHOD}.csv"
)
scores.to_csv(path_output_full, index=False)
|