From 78807b941ca35e384f863c2238c1932b320e9dce Mon Sep 17 00:00:00 2001
From: junos
Date: Wed, 31 May 2023 21:00:18 +0200
Subject: [PATCH] Add analysis for composite score of stress.

---
 .../ml_pipeline_classification_composite.py  | 142 ++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 exploration/ml_pipeline_classification_composite.py

diff --git a/exploration/ml_pipeline_classification_composite.py b/exploration/ml_pipeline_classification_composite.py
new file mode 100644
index 0000000..a7a5fab
--- /dev/null
+++ b/exploration/ml_pipeline_classification_composite.py
@@ -0,0 +1,142 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+from pathlib import Path
+
+import pandas as pd
+
+from machine_learning.helper import (
+    impute_encode_categorical_features,
+    prepare_cross_validator,
+    prepare_sklearn_data_format,
+    run_all_classification_models,
+)
+
+# %%
+CV_METHOD = "logo"  # logo, half_logo, 5kfold
+# Cross-validation method (could be regarded as a hyperparameter)
+print("CV_METHOD: " + CV_METHOD)
+N_SL = 3  # Number of largest/smallest accuracies (of a particular CV) to output
+UNDERSAMPLING = False
+# (bool) If True, train and test on a balanced dataset
+# (obtained by undersampling the majority class)
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+PATH_BASE = Path("E:/STRAWresults/20230415")
+
+SEGMENT_TYPE = "period"
+print("SEGMENT_TYPE: " + SEGMENT_TYPE)
+SEGMENT_LENGTH = "30_minutes_before"
+print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
+
+PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"
+
+model_input = pd.read_csv(PATH_FULL)
+
+if SEGMENT_LENGTH == "daily":
+    DAY_LENGTH = "daily"  # or "working"
+    print(DAY_LENGTH)
+    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
+
+# %%
+TARGETS = [
+    "PANAS_negative_affect_mean",
+    "PANAS_positive_affect_mean",
+    "JCQ_job_demand_mean",
+    "JCQ_job_control_mean",
+    "appraisal_stressfulness_period_mean",
+]
+
+# %%
+all_features_cleaned = pd.DataFrame()
+for target in TARGETS:
+    PATH_FULL = (
+        PATH_BASE
+        / SEGMENT_LENGTH
+        / "features"
+        / ("all_sensor_features_cleaned_straw_py_(" + target + ").csv")
+    )
+    current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
+    if all_features_cleaned.empty:
+        all_features_cleaned = current_features
+    else:
+        all_features_cleaned = all_features_cleaned.join(
+            current_features["phone_esm_straw_" + target],
+            how="inner",
+            rsuffix="_" + target,
+        )
+    print(all_features_cleaned.shape)
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# bins = [-10, 0, 10]  # bins for z-scored targets
+BINS = [-1, 0, 4]  # bins for the stressfulness (0-4) target
+print("BINS: ", BINS)
+model_input["target"], edges = pd.cut(
+    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
+)  # with three bins, the labels would be ['low', 'medium', 'high']
+print(model_input["target"].value_counts())
+REMOVE_MEDIUM = True
+if ("medium" in model_input["target"].values) and REMOVE_MEDIUM:
+    model_input = model_input[model_input["target"] != "medium"]
+    model_input["target"] = (
+        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
+    )
+else:
+    model_input["target"] = model_input["target"].map(
+        {"low": 0, "medium": 1, "high": 2}
+    )
+    print(model_input["target"].value_counts())
+
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# Undersampling
+if UNDERSAMPLING:
+    no_stress = model_input[model_input["target"] == 0]
+    stress = model_input[model_input["target"] == 1]
+
+    no_stress = no_stress.sample(n=len(stress))
+    model_input = pd.concat([stress, no_stress], axis=0)
+
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+model_input_encoded = impute_encode_categorical_features(model_input)
+# %%
+data_x, data_y, data_groups = prepare_sklearn_data_format(
+    model_input_encoded, CV_METHOD
+)
+cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
+
+# %%
+data_y.head()
+
+# %%
+data_y.tail()
+# %%
+data_y.shape
+# %%
+scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
+# %%
+PATH_OUTPUT = Path("..") / Path("presentation/results")
+path_output_full = PATH_OUTPUT / (
+    "composite_"
+    + SEGMENT_LENGTH
+    + "_classification"
+    + str(BINS)
+    + "_"
+    + CV_METHOD
+    + ".csv"
+)
+scores.to_csv(path_output_full, index=False)
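
Note on the binning step: with BINS = [-1, 0, 4] and right=True, pd.cut places a stressfulness answer of 0 in the interval (-1, 0] ("low") and answers 1-4 in (0, 4] ("high"), so no "medium" label is produced and the REMOVE_MEDIUM branch only matters for a three-bin setup. A minimal, self-contained sketch; the example answers are illustrative, not taken from the STRAW data:

    import pandas as pd

    answers = pd.Series([0, 0, 1, 2, 4])  # hypothetical stressfulness answers on a 0-4 scale
    binned = pd.cut(answers, bins=[-1, 0, 4], labels=["low", "high"], right=True)
    print(binned.tolist())  # ['low', 'low', 'high', 'high', 'high']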
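
Note on cross-validation: prepare_cross_validator lives in machine_learning.helper and its implementation is not part of this patch. Assuming that CV_METHOD == "logo" stands for leave-one-group-out over participants (an assumption, not confirmed by this commit), a rough scikit-learn sketch of that setup could look like the following, with synthetic data standing in for data_x, data_y and data_groups:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

    rng = np.random.default_rng(0)
    X = rng.normal(size=(40, 5))         # 40 time segments, 5 sensor features
    y = rng.integers(0, 2, size=40)      # binary low/high stress target
    groups = np.repeat(np.arange(8), 5)  # 8 participants, 5 segments each

    # Each fold holds out every segment of one participant, so the model is
    # always evaluated on a person it has not seen during training.
    scores = cross_val_score(
        LogisticRegression(max_iter=1000), X, y, groups=groups, cv=LeaveOneGroupOut()
    )
    print(scores.mean())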