# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.14.5
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
from pathlib import Path

import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA

from machine_learning.helper import (
    impute_encode_categorical_features,
    prepare_cross_validator,
    prepare_sklearn_data_format,
    run_all_classification_models,
)

# %%
CV_METHOD = "logo"  # logo, half_logo, 5kfold
# Cross-validation method (could be regarded as a hyperparameter)
print("CV_METHOD: " + CV_METHOD)
N_SL = 3  # Number of largest/smallest accuracies (of particular CV) outputs
UNDERSAMPLING = False
# (bool) If True this will train and test data on balanced dataset
# (using undersampling method)

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
PATH_BASE = Path("E:/STRAWresults/20230415")

SEGMENT_TYPE = "period"
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
SEGMENT_LENGTH = "30_minutes_before"
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"

# NOTE(review): `model_input` is reassigned from `all_features_cleaned` further
# down before it is ever used, so this load (and the "daily" filter below) does
# not influence the results - confirm whether it can be dropped.
model_input = pd.read_csv(PATH_FULL)

if SEGMENT_LENGTH == "daily":
    DAY_LENGTH = "daily"  # or "working"
    print(DAY_LENGTH)
    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]

# %%
TARGETS = [
    "PANAS_negative_affect_mean",
    "PANAS_positive_affect_mean",
    "JCQ_job_demand_mean",
    "JCQ_job_control_mean",
    "appraisal_stressfulness_period_mean",
]

# %%
# Build one table indexed by local_segment: the full cleaned feature file of the
# first target, plus only the target column of every following file.
# The inner join keeps just the segments present in all files.
all_features_cleaned = pd.DataFrame()

for target in TARGETS:
    # A local name is used so that the PATH_FULL constant is not overwritten.
    path_target_features = (
        PATH_BASE
        / SEGMENT_LENGTH
        / "features"
        / ("all_sensor_features_cleaned_straw_py_(" + target + ").csv")
    )
    current_features = pd.read_csv(path_target_features, index_col="local_segment")
    if all_features_cleaned.empty:
        all_features_cleaned = current_features
    else:
        all_features_cleaned = all_features_cleaned.join(
            current_features[("phone_esm_straw_" + target)],
            how="inner",
            rsuffix="_" + target,
        )
    # NOTE(review): reports the shape after each file; the original layout of
    # this line (inside vs. after the loop) should be confirmed.
    print(all_features_cleaned.shape)

# %%
pca = PCA(n_components=1)
TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]
# Fit once and keep the projection; it is reused as the composite target below,
# so the model does not have to be fitted a second time on the same data.
composite_scores = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])
print(pca.explained_variance_ratio_)

# %%
model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
# fit_transform returns an (n_samples, 1) array; take its single column.
model_input["target"] = composite_scores[:, 0]

# %%
sns.histplot(data=model_input, x="target")

# %%
model_input.target.quantile(0.6)

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# The target is the first principal component of the prefixed target columns;
# it is split at zero into a "low" and a "high" class.
BINS = [-10, 0, 10]
print("BINS: ", BINS)

model_input["target"], edges = pd.cut(
    model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
)
# For a three-class split use four bin edges and labels ['low', 'medium', 'high'].
print(model_input["target"].value_counts())

REMOVE_MEDIUM = True
# `"medium" in series` would test the index labels, not the values, so the
# presence of the class is checked on the values explicitly.
has_medium = model_input["target"].isin(["medium"]).any()

if has_medium and REMOVE_MEDIUM:
    # Drop the middle class and encode the remainder as binary labels.
    model_input = model_input[model_input["target"] != "medium"].copy()
    model_input["target"] = (
        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
    )
elif has_medium:
    model_input["target"] = model_input["target"].map(
        {"low": 0, "medium": 1, "high": 2}
    )
else:
    # Only two classes are present: encode them as 0/1 so that the undersampling
    # cell below, which selects target == 1, finds the "high" rows.
    model_input["target"] = model_input["target"].map({"low": 0, "high": 1})

print(model_input["target"].value_counts())

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
# UnderSampling
if UNDERSAMPLING:
    no_stress = model_input[model_input["target"] == 0]
    stress = model_input[model_input["target"] == 1]

    # Shrink whichever class is larger to the size of the smaller one; sampling
    # more rows than a class holds (without replacement) would raise.
    n_per_class = min(len(stress), len(no_stress))
    no_stress = no_stress.sample(n=n_per_class)
    if len(stress) > n_per_class:
        stress = stress.sample(n=n_per_class)

    model_input = pd.concat([stress, no_stress], axis=0)

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
model_input_encoded = impute_encode_categorical_features(model_input)
# %%
data_x, data_y, data_groups = prepare_sklearn_data_format(
    model_input_encoded, CV_METHOD
)
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)

# %%
data_y.head()

# %%
data_y.tail()

# %%
data_y.shape

# %%
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)

# %%
# The file name records the segment length, the bin edges and the CV method.
PATH_OUTPUT = Path("..") / Path("presentation/results")
path_output_full = PATH_OUTPUT / (
    f"composite_{SEGMENT_LENGTH}_classification{BINS}_{CV_METHOD}.csv"
)
scores.to_csv(path_output_full, index=False)