diff --git a/exploration/ml_pipeline_classification_composite.py b/exploration/ml_pipeline_classification_composite.py
index a7a5fab..9c67269 100644
--- a/exploration/ml_pipeline_classification_composite.py
+++ b/exploration/ml_pipeline_classification_composite.py
@@ -17,6 +17,8 @@
 from pathlib import Path
 
 import pandas as pd
+import seaborn as sns
+from sklearn.decomposition import PCA
 
 from machine_learning.helper import (
     impute_encode_categorical_features,
@@ -80,9 +82,36 @@ for target in TARGETS:
     )
 print(all_features_cleaned.shape)
 
+# %%
+# Fit a single-component PCA on the prefixed target columns; its first
+# component is used below as one composite target.
+pca = PCA(n_components=1)
+TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]
+pca.fit(all_features_cleaned[TARGETS_PREFIXED])
+# Share of the targets' variance captured by the composite.
+print(pca.explained_variance_ratio_)
+
+# %%
+model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
+# Project with the PCA fitted above instead of refitting, so the target is
+# built from exactly the component whose explained variance was printed.
+# transform() returns shape (n_samples, 1); ravel() makes the column 1-D.
+model_input["target"] = pca.transform(
+    all_features_cleaned[TARGETS_PREFIXED]
+).ravel()
+
+# %%
+# Distribution of the composite target.
+sns.histplot(data=model_input, x="target")
+
+# %%
+# 60th percentile of the composite target.
+model_input.target.quantile(0.6)
+
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
 # bins = [-10, 0, 10]  # bins for z-scored targets
-BINS = [-1, 0, 4]  # bins for stressfulness (0-4) target
+# Cut at 0; values outside (-10, 10] would become NaN in pd.cut below.
+BINS = [-10, 0, 10]  # bins for the PCA composite target
 print("BINS: ", BINS)
 model_input["target"], edges = pd.cut(
     model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True