stress_at_work_analysis/exploration/ml_pipeline_classification_...

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.14.5
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
from pathlib import Path

import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA

from machine_learning.helper import (
    impute_encode_categorical_features,
    prepare_cross_validator,
    prepare_sklearn_data_format,
    run_all_classification_models,
)

# %%
# Cross-validation method (could be regarded as a hyperparameter).
# Options listed by the author: logo, half_logo, 5kfold
CV_METHOD = "logo"
print(f"CV_METHOD: {CV_METHOD}")
# Number of largest/smallest accuracies (of particular CV) outputs
N_SL = 3
# (bool) If True this will train and test data on balanced dataset
# (using undersampling method)
UNDERSAMPLING = False

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Base directory of the input data (machine-specific absolute path).
PATH_BASE = Path("E:/STRAWresults/20230415")

SEGMENT_TYPE = "period"
print(f"SEGMENT_TYPE: {SEGMENT_TYPE}")
SEGMENT_LENGTH = "30_minutes_before"
print(f"SEGMENT_LENGTH: {SEGMENT_LENGTH}")

# CSV with all sensor features for the selected segment length.
PATH_FULL = PATH_BASE.joinpath(
    SEGMENT_LENGTH, "features", "all_sensor_features.csv"
)

all_features_with_baseline = pd.read_csv(PATH_FULL)

# %%
# Target column stems; every entry carries the "_mean" suffix.
TARGETS = [
    f"{scale}_mean"
    for scale in (
        "PANAS_negative_affect",
        "PANAS_positive_affect",
        "JCQ_job_demand",
        "JCQ_job_control",
        "appraisal_stressfulness_period",
    )
]

# %%
# Assemble one table indexed by local_segment: the first target's cleaned
# file is taken whole, and every later file contributes only its own target
# column through an inner join on the index.
all_features_cleaned = pd.DataFrame()
for target in TARGETS:
    PATH_FULL = PATH_BASE.joinpath(
        SEGMENT_LENGTH,
        "features",
        f"all_sensor_features_cleaned_straw_py_({target}).csv",
    )
    current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
    if not all_features_cleaned.empty:
        target_column = current_features[f"phone_esm_straw_{target}"]
        all_features_cleaned = all_features_cleaned.join(
            target_column, how="inner", rsuffix=f"_{target}"
        )
    else:
        all_features_cleaned = current_features
    # Track how the inner joins change the table size.
    print(all_features_cleaned.shape)

# %%
# Names of the target columns as they appear in the joined feature table.
TARGETS_PREFIXED = [f"phone_esm_straw_{target}" for target in TARGETS]
# Fit a single-component PCA on the target columns and report how much of
# their variance that component explains.
pca = PCA(n_components=1)
pca.fit(all_features_cleaned[TARGETS_PREFIXED])
print(pca.explained_variance_ratio_)

# %%
# The model input is every column except the individual target columns.
model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
# `pca` has already been fitted on exactly these columns, so project with
# transform() instead of refitting the estimator on the same data.
model_input["target"] = pca.transform(all_features_cleaned[TARGETS_PREFIXED])

# %%
# Histogram of the composite target.
sns.histplot(model_input, x="target")

# %%
# 0.6 quantile of the composite target (shown as the cell output).
model_input["target"].quantile(0.6)

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Discretise the composite target (first principal component) into classes.
# With right=True the bins are (-10, 0] -> "low" and (0, 10] -> "high".
BINS = [-10, 0, 10]  # bin edges for the PCA-based composite target
print("BINS: ", BINS)
model_input["target"], edges = pd.cut(
    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
)  # a three-class setup would use ['low', 'medium', 'high']
print(model_input["target"].value_counts())
REMOVE_MEDIUM = True
if REMOVE_MEDIUM:
    # Filter on the values directly: `"medium" in series` checks the index
    # labels rather than the values, so it cannot serve as a guard here.
    # The explicit copy keeps the assignment below from writing into a slice
    # of the previous frame.
    model_input = model_input[model_input["target"] != "medium"].copy()
    # NOTE(review): scores outside BINS are NaN after pd.cut and are encoded
    # as 1 by this mapping, because only the string "low" maps to 0.
    model_input["target"] = (
        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
    )
else:
    # Keep all classes and encode them as ordinal integers.
    model_input["target"] = model_input["target"].map(
        {"low": 0, "medium": 1, "high": 2}
    )
    print(model_input["target"].value_counts())


# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# UnderSampling: balance the two classes by randomly downsampling the larger
# one to the size of the smaller one.
if UNDERSAMPLING:
    no_stress = model_input[model_input["target"] == 0]
    stress = model_input[model_input["target"] == 1]

    # DataFrame.sample without replacement raises when n exceeds the number
    # of rows, so only the class that is actually larger gets sampled.
    n_minority = min(len(stress), len(no_stress))
    if len(no_stress) > n_minority:
        no_stress = no_stress.sample(n=n_minority)
    elif len(stress) > n_minority:
        stress = stress.sample(n=n_minority)
    model_input = pd.concat([stress, no_stress], axis=0)


# %%
# Load the prepared model-input file of one target; later cells take the
# columns from it that model_input does not have yet.
TARGET_VARIABLE = "PANAS_negative_affect"
print(f"TARGET_VARIABLE: {TARGET_VARIABLE}")

PATH_FULL_HELP = PATH_BASE.joinpath(
    SEGMENT_LENGTH, f"input_{TARGET_VARIABLE}_mean.csv"
)

model_input_with_baseline = pd.read_csv(PATH_FULL_HELP, index_col="local_segment")

# %%
# Columns that exist only in the baseline table, kept in their file order.
baseline_columns = model_input_with_baseline.columns
baseline_col_names = baseline_columns[
    ~baseline_columns.isin(model_input.columns)
].tolist()
print(baseline_col_names)

# %%
# Left join on the index keeps every row of model_input and attaches the
# baseline-only columns; the index is then turned back into a column.
baseline_features = model_input_with_baseline[baseline_col_names]
model_input = model_input.join(baseline_features, how="left").reset_index()

# %%
# Hand the joined table to the project helper from machine_learning.helper.
# NOTE(review): judging by its name it imputes missing values and encodes
# categorical features; the implementation is not visible here, so confirm.
model_input_encoded = impute_encode_categorical_features(model_input)

# %%
# Split the encoded table into the pieces used for cross-validation.
# Both helpers receive CV_METHOD; how they interpret it is defined in
# machine_learning.helper and not shown in this file.
data_x, data_y, data_groups = prepare_sklearn_data_format(
    model_input_encoded, CV_METHOD
)
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)

# %%
# Quick look at the first rows of the prepared target.
data_y.head()

# %%
# ... and at the last rows.
data_y.tail()
# %%
# Size of the prepared target.
data_y.shape
# %%
# Run the classification models through the project helper and collect the
# result in `scores`, which is written to CSV further down.
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
# %%
# Results are written relative to the current working directory. The file
# name encodes the segment length, the bin edges and the CV method.
PATH_OUTPUT = Path("..") / Path("presentation/results")
path_output_full = PATH_OUTPUT / (
    "composite_"
    + SEGMENT_LENGTH
    + "_classification"
    + str(BINS)
    + "_"
    + CV_METHOD
    + ".csv"
)
# Create the output directory first so the write does not fail on a fresh
# checkout where presentation/results does not exist yet.
path_output_full.parent.mkdir(parents=True, exist_ok=True)
scores.to_csv(path_output_full, index=False)
Add analysis for composite score of stress. 2023-05-31 21:00:18 +02:00			`# ---`
			`# jupyter:`
			`# jupytext:`
			`# formats: ipynb,py:percent`
			`# text_representation:`
			`# extension: .py`
			`# format_name: percent`
			`# format_version: '1.3'`
			`# jupytext_version: 1.14.5`
			`# kernelspec:`
			`# display_name: straw2analysis`
			`# language: python`
			`# name: straw2analysis`
			`# ---`

			`# %% jupyter={"outputs_hidden": false, "source_hidden": false}`
			`from pathlib import Path`

			`import pandas as pd`
Add PCA for composite target. 2023-05-31 21:12:21 +02:00			`import seaborn as sns`
			`from sklearn.decomposition import PCA`
Add analysis for composite score of stress. 2023-05-31 21:00:18 +02:00
			`from machine_learning.helper import (`
			`impute_encode_categorical_features,`
			`prepare_cross_validator,`
			`prepare_sklearn_data_format,`
			`run_all_classification_models,`
			`)`

			`# %%`
			`CV_METHOD = "logo" # logo, half_logo, 5kfold`
			`# Cross-validation method (could be regarded as a hyperparameter)`
			`print("CV_METHOD: " + CV_METHOD)`
			`N_SL = 3 # Number of largest/smallest accuracies (of particular CV) outputs`
			`UNDERSAMPLING = False`
			`# (bool) If True this will train and test data on balanced dataset`
			`# (using undersampling method)`

			`# %% jupyter={"outputs_hidden": false, "source_hidden": false}`
			`PATH_BASE = Path("E:/STRAWresults/20230415")`

			`SEGMENT_TYPE = "period"`
			`print("SEGMENT_TYPE: " + SEGMENT_TYPE)`
			`SEGMENT_LENGTH = "30_minutes_before"`
			`print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)`

			`PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"`

Add baseline features. 2023-05-31 22:25:39 +02:00			`all_features_with_baseline = pd.read_csv(PATH_FULL)`
Add analysis for composite score of stress. 2023-05-31 21:00:18 +02:00
			`# %%`
			`TARGETS = [`
			`"PANAS_negative_affect_mean",`
			`"PANAS_positive_affect_mean",`
			`"JCQ_job_demand_mean",`
			`"JCQ_job_control_mean",`
			`"appraisal_stressfulness_period_mean",`
			`]`

			`# %%`
			`all_features_cleaned = pd.DataFrame()`
			`for target in TARGETS:`
			`PATH_FULL = (`
			`PATH_BASE`
			`/ SEGMENT_LENGTH`
			`/ "features"`
			`/ ("all_sensor_features_cleaned_straw_py_(" + target + ").csv")`
			`)`
			`current_features = pd.read_csv(PATH_FULL, index_col="local_segment")`
			`if all_features_cleaned.empty:`
			`all_features_cleaned = current_features`
			`else:`
			`all_features_cleaned = all_features_cleaned.join(`
			`current_features[("phone_esm_straw_" + target)],`
			`how="inner",`
			`rsuffix="_" + target,`
			`)`
			`print(all_features_cleaned.shape)`

Add PCA for composite target. 2023-05-31 21:12:21 +02:00			`# %%`
			`pca = PCA(n_components=1)`
			`TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]`
			`pca.fit(all_features_cleaned[TARGETS_PREFIXED])`
			`print(pca.explained_variance_ratio_)`

			`# %%`
			`model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)`
			`model_input["target"] = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])`

			`# %%`
			`sns.histplot(data=model_input, x="target")`

			`# %%`
			`model_input.target.quantile(0.6)`

Add analysis for composite score of stress. 2023-05-31 21:00:18 +02:00			`# %% jupyter={"outputs_hidden": false, "source_hidden": false}`
			`# bins = [-10, 0, 10] # bins for z-scored targets`
Add PCA for composite target. 2023-05-31 21:12:21 +02:00			`BINS = [-10, 0, 10] # bins for stressfulness (0-4) target`
Add analysis for composite score of stress. 2023-05-31 21:00:18 +02:00			`print("BINS: ", BINS)`
			`model_input["target"], edges = pd.cut(`
			`model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True`
			`) # ['low', 'medium', 'high']`
			`print(model_input["target"].value_counts())`
			`REMOVE_MEDIUM = True`
Improve removal of "medium" class. 2023-05-31 22:46:49 +02:00			`if REMOVE_MEDIUM:`
			`if "medium" in model_input["target"]:`
			`model_input = model_input[model_input["target"] != "medium"]`
Add analysis for composite score of stress. 2023-05-31 21:00:18 +02:00			`model_input["target"] = (`
			`model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)`
			`)`
			`else:`
			`model_input["target"] = model_input["target"].map(`
			`{"low": 0, "medium": 1, "high": 2}`
			`)`
			`print(model_input["target"].value_counts())`


			`# %% jupyter={"outputs_hidden": false, "source_hidden": false}`
			`# UnderSampling`
			`if UNDERSAMPLING:`
			`no_stress = model_input[model_input["target"] == 0]`
			`stress = model_input[model_input["target"] == 1]`

			`no_stress = no_stress.sample(n=len(stress))`
			`model_input = pd.concat([stress, no_stress], axis=0)`


Add baseline features. 2023-05-31 22:25:39 +02:00			`# %%`
			`TARGET_VARIABLE = "PANAS_negative_affect"`
			`print("TARGET_VARIABLE: " + TARGET_VARIABLE)`

			`PATH_FULL_HELP = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")`

			`model_input_with_baseline = pd.read_csv(PATH_FULL_HELP, index_col="local_segment")`

			`# %%`
			`baseline_col_names = [`
			`col for col in model_input_with_baseline.columns if col not in model_input.columns`
			`]`
			`print(baseline_col_names)`

			`# %%`
			`model_input = model_input.join(`
			`model_input_with_baseline[baseline_col_names], how="left"`
			`)`
			`model_input.reset_index(inplace=True)`

			`# %%`
Add analysis for composite score of stress. 2023-05-31 21:00:18 +02:00			`model_input_encoded = impute_encode_categorical_features(model_input)`
Add baseline features. 2023-05-31 22:25:39 +02:00
Add analysis for composite score of stress. 2023-05-31 21:00:18 +02:00			`# %%`
			`data_x, data_y, data_groups = prepare_sklearn_data_format(`
			`model_input_encoded, CV_METHOD`
			`)`
			`cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)`

			`# %%`
			`data_y.head()`

			`# %%`
			`data_y.tail()`
			`# %%`
			`data_y.shape`
			`# %%`
			`scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)`
			`# %%`
			`PATH_OUTPUT = Path("..") / Path("presentation/results")`
			`path_output_full = PATH_OUTPUT / (`
			`"composite_"`
			`+ SEGMENT_LENGTH`
			`+ "_classification"`
			`+ str(BINS)`
			`+ "_"`
			`+ CV_METHOD`
			`+ ".csv"`
			`)`
			`scores.to_csv(path_output_full, index=False)`