Add PCA for composite target.

master
junos 2023-05-31 21:12:21 +02:00
parent 78807b941c
commit 9cc6bf7c21
1 changed file with 19 additions and 1 deletion

View File

@@ -17,6 +17,8 @@
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from machine_learning.helper import ( from machine_learning.helper import (
impute_encode_categorical_features, impute_encode_categorical_features,
@@ -80,9 +82,25 @@ for target in TARGETS:
) )
print(all_features_cleaned.shape) print(all_features_cleaned.shape)
# %%
pca = PCA(n_components=1)
TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]
pca.fit(all_features_cleaned[TARGETS_PREFIXED])
print(pca.explained_variance_ratio_)
# %%
model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
model_input["target"] = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])
# %%
sns.histplot(data=model_input, x="target")
# %%
model_input.target.quantile(0.6)
# %% jupyter={"outputs_hidden": false, "source_hidden": false} # %% jupyter={"outputs_hidden": false, "source_hidden": false}
# bins = [-10, 0, 10] # bins for z-scored targets # bins = [-10, 0, 10] # bins for z-scored targets
BINS = [-1, 0, 4] # bins for stressfulness (0-4) target BINS = [-10, 0, 10] # bins for stressfulness (0-4) target
print("BINS: ", BINS) print("BINS: ", BINS)
model_input["target"], edges = pd.cut( model_input["target"], edges = pd.cut(
model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True