From 78807b941ca35e384f863c2238c1932b320e9dce Mon Sep 17 00:00:00 2001
From: junos
Date: Wed, 31 May 2023 21:00:18 +0200
Subject: [PATCH] Add analysis for composite score of stress.

---
 .../ml_pipeline_classification_composite.py  | 142 ++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 exploration/ml_pipeline_classification_composite.py

diff --git a/exploration/ml_pipeline_classification_composite.py b/exploration/ml_pipeline_classification_composite.py
new file mode 100644
index 0000000..a7a5fab
--- /dev/null
+++ b/exploration/ml_pipeline_classification_composite.py
@@ -0,0 +1,142 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.14.5
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+from pathlib import Path
+
+import pandas as pd
+
+from machine_learning.helper import (
+    impute_encode_categorical_features,
+    prepare_cross_validator,
+    prepare_sklearn_data_format,
+    run_all_classification_models,
+)
+
+# %%
+CV_METHOD = "logo"  # logo, half_logo, 5kfold
+# Cross-validation method (could be regarded as a hyperparameter)
+print("CV_METHOD: " + CV_METHOD)
+N_SL = 3  # Number of largest/smallest accuracies (of a particular CV) to output
+UNDERSAMPLING = False
+# (bool) If True, train and test on a balanced dataset
+# (obtained by undersampling the majority class)
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+PATH_BASE = Path("E:/STRAWresults/20230415")
+
+SEGMENT_TYPE = "period"
+print("SEGMENT_TYPE: " + SEGMENT_TYPE)
+SEGMENT_LENGTH = "30_minutes_before"
+print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
+
+PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"
+
+model_input = pd.read_csv(PATH_FULL)
+
+if SEGMENT_LENGTH == "daily":
+    DAY_LENGTH = "daily"  # or "working"
+    print(DAY_LENGTH)
+    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
+
+# %%
+TARGETS = [
+    "PANAS_negative_affect_mean",
+    "PANAS_positive_affect_mean",
+    "JCQ_job_demand_mean",
+    "JCQ_job_control_mean",
+    "appraisal_stressfulness_period_mean",
+]
+
+# %%
+all_features_cleaned = pd.DataFrame()
+for target in TARGETS:
+    PATH_FULL = (
+        PATH_BASE
+        / SEGMENT_LENGTH
+        / "features"
+        / ("all_sensor_features_cleaned_straw_py_(" + target + ").csv")
+    )
+    current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
+    if all_features_cleaned.empty:
+        all_features_cleaned = current_features
+    else:
+        all_features_cleaned = all_features_cleaned.join(
+            current_features["phone_esm_straw_" + target],
+            how="inner",
+            rsuffix="_" + target,
+        )
+    print(all_features_cleaned.shape)
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# bins = [-10, 0, 10]  # bins for z-scored targets
+BINS = [-1, 0, 4]  # bins for the stressfulness (0-4) target
+print("BINS: ", BINS)
+model_input["target"], edges = pd.cut(
+    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
+)  # with three bins, the labels would be ['low', 'medium', 'high']
+print(model_input["target"].value_counts())
+REMOVE_MEDIUM = True
+if ("medium" in model_input["target"].values) and REMOVE_MEDIUM:
+    model_input = model_input[model_input["target"] != "medium"]
+    model_input["target"] = (
+        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
+    )
+else:
+    model_input["target"] = model_input["target"].map(
+        {"low": 0, "medium": 1, "high": 2}
+    )
+    print(model_input["target"].value_counts())
+
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+# Undersampling
+if UNDERSAMPLING:
+    no_stress = model_input[model_input["target"] == 0]
+    stress = model_input[model_input["target"] == 1]
+
+    no_stress = no_stress.sample(n=len(stress))
+    model_input = pd.concat([stress, no_stress], axis=0)
+
+
+# %% jupyter={"outputs_hidden": false, "source_hidden": false}
+model_input_encoded = impute_encode_categorical_features(model_input)
+# %%
+data_x, data_y, data_groups = prepare_sklearn_data_format(
+    model_input_encoded, CV_METHOD
+)
+cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
+
+# %%
+data_y.head()
+
+# %%
+data_y.tail()
+# %%
+data_y.shape
+# %%
+scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
+# %%
+PATH_OUTPUT = Path("..") / Path("presentation/results")
+path_output_full = PATH_OUTPUT / (
+    "composite_"
+    + SEGMENT_LENGTH
+    + "_classification"
+    + str(BINS)
+    + "_"
+    + CV_METHOD
+    + ".csv"
+)
+scores.to_csv(path_output_full, index=False)
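
Note on the binning step: with BINS = [-1, 0, 4] and right=True, pd.cut places a stressfulness answer of 0 in the interval (-1, 0] ("low") and answers 1-4 in (0, 4] ("high"), so no "medium" label is produced and the REMOVE_MEDIUM branch only matters for a three-bin setup. A minimal, self-contained sketch; the example answers are illustrative, not taken from the STRAW data:

    import pandas as pd

    answers = pd.Series([0, 0, 1, 2, 4])  # hypothetical stressfulness answers on a 0-4 scale
    binned = pd.cut(answers, bins=[-1, 0, 4], labels=["low", "high"], right=True)
    print(binned.tolist())  # ['low', 'low', 'high', 'high', 'high']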
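
Note on cross-validation: prepare_cross_validator lives in machine_learning.helper and its implementation is not part of this patch. Assuming that CV_METHOD == "logo" stands for leave-one-group-out over participants (an assumption, not confirmed by this commit), a rough scikit-learn sketch of that setup could look like the following, with synthetic data standing in for data_x, data_y and data_groups:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

    rng = np.random.default_rng(0)
    X = rng.normal(size=(40, 5))         # 40 time segments, 5 sensor features
    y = rng.integers(0, 2, size=40)      # binary low/high stress target
    groups = np.repeat(np.arange(8), 5)  # 8 participants, 5 segments each

    # Each fold holds out every segment of one participant, so the model is
    # always evaluated on a person it has not seen during training.
    scores = cross_val_score(
        LogisticRegression(max_iter=1000), X, y, groups=groups, cv=LeaveOneGroupOut()
    )
    print(scores.mean())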