diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py index 34ab47a..ed69d70 100644 --- a/exploration/ml_pipeline_classification_with_clustering.py +++ b/exploration/ml_pipeline_classification_with_clustering.py @@ -25,9 +25,6 @@ from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_val from machine_learning.classification_models import ClassificationModels -# %% [markdown] -# # RAPIDS models - # %% # ## Set script's parameters N_CLUSTERS = 4 # Number of clusters (could be regarded as a hyperparameter) @@ -75,8 +72,10 @@ print("BINS: " + str(BINS)) model_input[CLUST_COL].describe() -# %% jupyter={"source_hidden": true} +# %% +model_input["target"].value_counts() +# %% jupyter={"source_hidden": true} # Filter-out outlier rows by clust_col # model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)] @@ -92,6 +91,9 @@ uniq["cluster"] = km model_input = model_input.merge(uniq[["pid", "cluster"]]) +# %% +model_input[["cluster", "target"]].value_counts().sort_index() + # %% jupyter={"source_hidden": true} model_input.set_index(index_columns, inplace=True) @@ -107,7 +109,7 @@ for k in range(N_CLUSTERS): model_input_subset.loc[:, "target"], bins=BINS, labels=["low", "high"], - right=False, + right=True, ) # ['low', 'medium', 'high'] model_input_subset["target"].value_counts() # model_input_subset = model_input_subset[model_input_subset["target"] != "medium"] @@ -115,7 +117,7 @@ for k in range(N_CLUSTERS): model_input_subset["target"].astype(str).apply(lambda x: 0 if x == "low" else 1) ) - model_input_subset["target"].value_counts() + print(model_input_subset["target"].value_counts()) if CV_METHOD == "half_logo": model_input_subset["pid_index"] = model_input_subset.groupby("pid").cumcount()