# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.14.5
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Notebook purpose: load a pre-computed model-input CSV, discretise its
# "target" column into classes, optionally balance the classes, run the
# classification models from machine_learning.helper under the chosen
# cross-validation scheme and write the resulting scores to a CSV file.

# from IPython.core.interactiveshell import InteractiveShell
from pathlib import Path

# matplotlib inline
# import os
# import sys
import pandas as pd

from machine_learning.helper import (
    impute_encode_categorical_features,
    prepare_cross_validator,
    prepare_sklearn_data_format,
    run_all_classification_models,
)

# InteractiveShell.ast_node_interactivity = "all"
#
# nb_dir = os.path.split(os.getcwd())[0]
# if nb_dir not in sys.path:
#     sys.path.append(nb_dir)

# %%
# Cross-validation method (could be regarded as a hyperparameter).
CV_METHOD = "logo"  # logo, half_logo, 5kfold
print("CV_METHOD: " + CV_METHOD)

# Number of largest/smallest accuracies (of particular CV) outputs.
# NOTE(review): not referenced anywhere else in this script.
N_SL = 3

# (bool) If True this will train and test data on balanced dataset
# (using undersampling method).
UNDERSAMPLING = False

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
PATH_BASE = Path("E:/STRAWresults/20230415")

SEGMENT_TYPE = "period"
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
SEGMENT_LENGTH = "30_minutes_before"
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
TARGET_VARIABLE = "JCQ_job_control"
print("TARGET_VARIABLE: " + TARGET_VARIABLE)

# Appraisal-stressfulness targets carry the segment type as a name suffix.
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
    TARGET_VARIABLE += "_"
    TARGET_VARIABLE += SEGMENT_TYPE

PATH_FULL = PATH_BASE / SEGMENT_LENGTH / f"input_{TARGET_VARIABLE}_mean.csv"

model_input = pd.read_csv(PATH_FULL)

if SEGMENT_LENGTH == "daily":
    # Keep only the rows whose segment label contains the chosen day type.
    DAY_LENGTH = "daily"  # or "working"
    print(DAY_LENGTH)
    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# Distribution of the raw (not yet binned) target values.
model_input["target"].value_counts()

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# bins = [-10, 0, 10]  # bins for z-scored targets
BINS = [-1, 0, 4]  # bins for stressfulness (0-4) target
# NOTE(review): TARGET_VARIABLE above is not a stressfulness target; values
# outside (-1, 4] become NaN after pd.cut -- confirm the bins fit its range.
print("BINS: ", BINS)

# One label per bin; use ['low', 'medium', 'high'] together with three bins.
model_input["target"], edges = pd.cut(
    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
)
print(model_input["target"].value_counts())

REMOVE_MEDIUM = True
# `"medium" in series` would test the Series *index*, not its values, so the
# presence of the label has to be checked on the values explicitly.
has_medium = bool(model_input["target"].isin(["medium"]).any())

if has_medium and REMOVE_MEDIUM:
    # Drop the middle class and encode the two remaining classes as 0/1.
    # .copy() avoids assigning into a slice of the original frame.
    model_input = model_input[model_input["target"] != "medium"].copy()
    # NOTE: every value other than "low" (including unbinned NaN rows, which
    # astype(str) renders as "nan") is encoded as 1.
    model_input["target"] = (
        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
    )
elif has_medium:
    # Three-class problem: keep the middle class.
    model_input["target"] = model_input["target"].map(
        {"low": 0, "medium": 1, "high": 2}
    )
else:
    # Two-class problem: encode as 0/1 so that the positive class is 1, which
    # is what the undersampling cell below selects on.
    model_input["target"] = model_input["target"].map({"low": 0, "high": 1})

print(model_input["target"].value_counts())

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# UnderSampling
if UNDERSAMPLING:
    no_stress = model_input[model_input["target"] == 0]
    stress = model_input[model_input["target"] == 1]

    # Downsample whichever class is larger; sampling more rows than a class
    # holds (without replacement) would raise a ValueError.
    if len(no_stress) >= len(stress):
        no_stress = no_stress.sample(n=len(stress))
    else:
        stress = stress.sample(n=len(no_stress))
    model_input = pd.concat([stress, no_stress], axis=0)

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
model_input_encoded = impute_encode_categorical_features(model_input)

# %%
data_x, data_y, data_groups = prepare_sklearn_data_format(
    model_input_encoded, CV_METHOD
)
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)

# %%
data_y.head()

# %%
data_y.tail()

# %%
data_y.shape

# %%
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)

# %%
PATH_OUTPUT = Path("..") / Path("presentation/results")
# The file name embeds the bin edges exactly as str(BINS) renders them.
path_output_full = PATH_OUTPUT / (
    f"{TARGET_VARIABLE}_{SEGMENT_LENGTH}_classification{BINS}_{CV_METHOD}.csv"
)
scores.to_csv(path_output_full, index=False)