Add analysis for composite score of stress.

2023-05-31 21:00:18 +02:00 · 2023-05-31 21:00:18 +02:00 · 78807b941c
parent a9af113c9c
commit 78807b941c
1 changed files with 142 additions and 0 deletions
--- a/exploration/ml_pipeline_classification_composite.py
+++ b/exploration/ml_pipeline_classification_composite.py
@ -0,0 +1,142 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.14.5
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 from pathlib import Path
 import pandas as pd
 from machine_learning.helper import (
    impute_encode_categorical_features,
    prepare_cross_validator,
    prepare_sklearn_data_format,
    run_all_classification_models,
 )
 # %%
 # Cross-validation method (could be regarded as a hyperparameter).
 # Options listed by the author: logo, half_logo, 5kfold
 CV_METHOD = "logo"
 print(f"CV_METHOD: {CV_METHOD}")
 # Number of largest/smallest accuracies (of particular CV) outputs
 N_SL = 3
 # (bool) If True, train and test on a balanced dataset
 # (obtained with the undersampling method).
 UNDERSAMPLING = False
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 # Build the path to the table of all sensor features for the selected
 # segment length and load it.
 PATH_BASE = Path("E:/STRAWresults/20230415")
 SEGMENT_TYPE = "period"
 print(f"SEGMENT_TYPE: {SEGMENT_TYPE}")
 SEGMENT_LENGTH = "30_minutes_before"
 print(f"SEGMENT_LENGTH: {SEGMENT_LENGTH}")
 PATH_FULL = PATH_BASE.joinpath(SEGMENT_LENGTH, "features", "all_sensor_features.csv")
 model_input = pd.read_csv(PATH_FULL)
 # For daily segments, keep only rows whose local_segment contains DAY_LENGTH.
 if SEGMENT_LENGTH == "daily":
    DAY_LENGTH = "daily"  # or "working"
    print(DAY_LENGTH)
    keep_rows = model_input["local_segment"].str.contains(DAY_LENGTH)
    model_input = model_input[keep_rows]
 # %%
 # Target columns whose cleaned feature files are combined below.
 TARGETS = [
    "PANAS_negative_affect_mean",
    "PANAS_positive_affect_mean",
    "JCQ_job_demand_mean",
    "JCQ_job_control_mean",
    "appraisal_stressfulness_period_mean",
 ]
 # %%
 # While the accumulated table is still empty, adopt the current file as a
 # whole; afterwards inner-join only the current target's column on the
 # local_segment index. The running shape is printed after every file.
 all_features_cleaned = pd.DataFrame()
 for target in TARGETS:
    PATH_FULL = PATH_BASE.joinpath(
        SEGMENT_LENGTH,
        "features",
        f"all_sensor_features_cleaned_straw_py_({target}).csv",
    )
    current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
    if not all_features_cleaned.empty:
        target_column = current_features[f"phone_esm_straw_{target}"]
        all_features_cleaned = all_features_cleaned.join(
            target_column, how="inner", rsuffix=f"_{target}"
        )
    else:
        all_features_cleaned = current_features
    print(all_features_cleaned.shape)
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 # Discretise the continuous target into labelled classes and encode them
 # as integers.
 # bins = [-10, 0, 10] # bins for z-scored targets
 BINS = [-1, 0, 4]  # bins for stressfulness (0-4) target
 print("BINS: ", BINS)
 model_input["target"], edges = pd.cut(
    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
 )  # ['low', 'medium', 'high']
 print(model_input["target"].value_counts())
 REMOVE_MEDIUM = True
 # `"medium" in series` would look at the index labels rather than the
 # values, so test the values explicitly.
 has_medium = (model_input["target"] == "medium").any()
 if has_medium and REMOVE_MEDIUM:
    # Copy after filtering so the assignment below writes to an independent
    # frame instead of a slice of the original one.
    model_input = model_input[model_input["target"] != "medium"].copy()
    # Binary encoding: "low" -> 0, everything else -> 1.
    model_input["target"] = (
        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
    )
 else:
    # NOTE(review): with only the two labels above, "high" is encoded as 2,
    # while the undersampling cell selects target == 1 -- confirm that this
    # encoding is intended for the two-class case.
    model_input["target"] = model_input["target"].map(
        {"low": 0, "medium": 1, "high": 2}
    )
    print(model_input["target"].value_counts())
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 # UnderSampling
 # When enabled, draw as many target == 0 rows as there are target == 1 rows
 # and continue with the two subsets stacked together.
 if UNDERSAMPLING:
    stress = model_input.loc[model_input["target"] == 1]
    no_stress = model_input.loc[model_input["target"] == 0].sample(n=len(stress))
    model_input = pd.concat([stress, no_stress])
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 # Run the table through the project's impute/encode helper.
 model_input_encoded = impute_encode_categorical_features(model_input)
 # %%
 # Convert to the x / y / groups triple (presumably features, labels and
 # CV groups -- see machine_learning.helper) and build the cross-validator
 # for the selected CV_METHOD.
 data_x, data_y, data_groups = prepare_sklearn_data_format(
    model_input_encoded,
    CV_METHOD,
 )
 cross_validator = prepare_cross_validator(
    data_x,
    data_y,
    data_groups,
    CV_METHOD,
 )
 # %%
 # Notebook display: first entries of data_y.
 data_y.head()
 # %%
 # Notebook display: last entries of data_y.
 data_y.tail()
 # %%
 # Notebook display: dimensions of data_y.
 data_y.shape
 # %%
 # Evaluate the classification models through the project helper using the
 # prepared data and cross-validator; the result is saved to CSV below.
 scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
 # %%
 # Write the scores to a CSV whose name records the segment length, the bins
 # and the cross-validation method.
 PATH_OUTPUT = Path("..", "presentation/results")
 output_file_name = f"composite_{SEGMENT_LENGTH}_classification{BINS}_{CV_METHOD}.csv"
 path_output_full = PATH_OUTPUT / output_file_name
 scores.to_csv(path_output_full, index=False)