# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.14.5
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# %matplotlib inline
import os
import sys

import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

from machine_learning.helper import (
    impute_encode_categorical_features,
    prepare_cross_validator,
    prepare_sklearn_data_format,
    run_all_classification_models,
)

InteractiveShell.ast_node_interactivity = "all"

# Make the repository root importable when running from the notebooks directory.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# %%
CV_METHOD = "logo"  # Cross-validation method: logo, half_logo, or 5kfold (could be regarded as a hyperparameter)
N_SL = 3  # Number of largest/smallest accuracies (of a particular CV) to output
UNDERSAMPLING = False  # If True, train and test on a dataset balanced by undersampling the majority class

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
model_input = pd.read_csv(
    "E:/STRAWresults/20230415/stress_event/input_appraisal_stressfulness_event_mean.csv"
)

# Optional filters (disabled): drop Empatica temperature features or keep only daily segments.
# model_input = model_input[
#     model_input.columns.drop(list(model_input.filter(regex="empatica_temperature")))
# ]
# model_input = model_input[model_input["local_segment"].str.contains("daily")]

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
model_input["target"].value_counts()

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Bin the raw target into two classes.
# bins = [-10, 0, 10]  # bins for z-scored targets
bins = [-1, 0, 4]  # bins for the stressfulness (0-4) target
model_input["target"], edges = pd.cut(
    model_input.target, bins=bins, labels=["low", "high"], retbins=True, right=True
)  # alternative labels: ["low", "medium", "high"]
model_input["target"].value_counts(), edges

# model_input = model_input[model_input["target"] != "medium"]
model_input["target"] = (
    model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
)
model_input["target"].value_counts()

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Undersampling: sample the majority (no-stress) class down to the size of the
# minority (stress) class so both classes are balanced.
if UNDERSAMPLING:
    no_stress = model_input[model_input["target"] == 0]
    stress = model_input[model_input["target"] == 1]
    no_stress = no_stress.sample(n=len(stress))
    model_input = pd.concat([stress, no_stress], axis=0)

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
model_input_encoded = impute_encode_categorical_features(model_input)

# %%
data_x, data_y, data_groups = prepare_sklearn_data_format(
    model_input_encoded, CV_METHOD
)
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)

# %%
data_y.head()

# %%
data_y.tail()

# %%
data_y.shape

# %%
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)

# %%
scores.to_csv(
    "../presentation/appraisal_stressfulness_event_classification_"
    + CV_METHOD
    + ".csv",
    index=False,
)
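
# %% [markdown]
# The cell below is an illustrative sketch, not part of the original pipeline. It shows,
# on a few synthetic values, how the binning cell above maps raw stressfulness scores
# (0-4) onto the binary target: with `bins=[-1, 0, 4]`, a score of 0 falls into "low"
# (encoded 0) and scores 1-4 fall into "high" (encoded 1).

# %%
# Synthetic scores for illustration only; the real targets come from the STRAW CSV above.
_demo = pd.DataFrame({"raw_score": [0, 1, 2, 3, 4]})
_demo["binned"] = pd.cut(_demo["raw_score"], bins=[-1, 0, 4], labels=["low", "high"])
_demo["binary"] = _demo["binned"].astype(str).apply(lambda x: 0 if x == "low" else 1)
_demo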
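
# %% [markdown]
# The cell below is a minimal sketch of how `CV_METHOD` could map onto scikit-learn
# splitters, assuming "logo" means leave-one-group-out over participants and "5kfold"
# a stratified 5-fold split; "half_logo" is omitted because its semantics are not
# visible in this notebook. The actual mapping lives in
# `machine_learning.helper.prepare_cross_validator` and may differ.

# %%
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold


def sketch_cross_validator(cv_method: str):
    """Hypothetical CV_METHOD-to-splitter mapping for illustration only."""
    if cv_method == "logo":
        # Each participant (group) forms exactly one held-out fold.
        return LeaveOneGroupOut()
    if cv_method == "5kfold":
        # Stratified 5-fold split that ignores participant grouping.
        return StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    raise ValueError(f"Unknown CV method: {cv_method}")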