diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
index eca7210..33fbfcb 100644
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@@ -14,12 +14,13 @@
 # ---

 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
-# %matplotlib inline
-import os
-import sys
+# from IPython.core.interactiveshell import InteractiveShell
+from pathlib import Path
+# matplotlib inline
+# import os
+# import sys
 import pandas as pd
-from IPython.core.interactiveshell import InteractiveShell

 from machine_learning.helper import (
     impute_encode_categorical_features,
@@ -28,30 +29,44 @@ from machine_learning.helper import (
     run_all_classification_models,
 )

-InteractiveShell.ast_node_interactivity = "all"
-
-nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path:
-    sys.path.append(nb_dir)
+# InteractiveShell.ast_node_interactivity = "all"
+#
+# nb_dir = os.path.split(os.getcwd())[0]
+# if nb_dir not in sys.path:
+#     sys.path.append(nb_dir)

 # %%
 CV_METHOD = "logo"  # logo, half_logo, 5kfold
 # Cross-validation method (could be regarded as a hyperparameter)
+print("CV_METHOD: " + CV_METHOD)

 N_SL = 3  # Number of largest/smallest accuracies (of particular CV) outputs

 UNDERSAMPLING = False
 # (bool) If True this will train and test data on balanced dataset
 # (using undersampling method)

 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
-model_input = pd.read_csv(
-    "E:/STRAWresults/20230415/stress_event/input_appraisal_stressfulness_event_mean.csv"
+PATH_BASE = Path("E:/STRAWresults/20230415")
+
+SEGMENT_TYPE = "period"
+print("SEGMENT_TYPE: " + SEGMENT_TYPE)
+SEGMENT_LENGTH = "30_minutes_before"
+print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
+TARGET_VARIABLE = "appraisal_stressfulness"
+print("TARGET_VARIABLE: " + TARGET_VARIABLE)
+
+PATH_FULL = (
+    PATH_BASE
+    / SEGMENT_LENGTH
+    / ("input_" + TARGET_VARIABLE + "_" + SEGMENT_TYPE + "_mean.csv")
 )
-# model_input =
-# model_input[model_input.columns.drop(
-#     list(model_input.filter(regex='empatica_temperature'))
-# )]
-# model_input = model_input[model_input['local_segment'].str.contains("daily")]
+
+model_input = pd.read_csv(PATH_FULL)
+
+if SEGMENT_LENGTH == "daily":
+    DAY_LENGTH = "daily"  # or "working"
+    print(DAY_LENGTH)
+    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]

 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 model_input["target"].value_counts()
@@ -63,7 +78,7 @@ model_input["target"], edges = pd.cut(
     model_input.target, bins=bins, labels=["low", "high"], retbins=True, right=True
 )  # ['low', 'medium', 'high']
 model_input["target"].value_counts(), edges
-# model_input = model_input[model_input['target'] != "medium"]
+model_input = model_input[model_input["target"] != "medium"]
 model_input["target"] = (
     model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
 )
@@ -99,7 +114,7 @@ data_y.shape
 scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)

 # %%
 scores.to_csv(
-    "../presentation/appraisal_stressfulness_event_classification_"
+    "../presentation/results/appraisal_stressfulness_awake_classification_"
     + CV_METHOD
     + ".csv",
     index=False,