Handle clustering classification the same as other classification models.

master
junos 2023-05-19 02:52:56 +02:00
parent a2401b5e36
commit c51e0da0f7
1 changed files with 120 additions and 97 deletions

View File

@ -6,7 +6,7 @@
# extension: .py # extension: .py
# format_name: percent # format_name: percent
# format_version: '1.3' # format_version: '1.3'
# jupytext_version: 1.13.0 # jupytext_version: 1.14.5
# kernelspec: # kernelspec:
# display_name: straw2analysis # display_name: straw2analysis
# language: python # language: python
@ -14,92 +14,83 @@
# --- # ---
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
# %matplotlib inline from pathlib import Path
import os
import sys
import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np
import pandas as pd import pandas as pd
from scipy import stats from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.cluster import KMeans from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from IPython.core.interactiveshell import InteractiveShell from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
InteractiveShell.ast_node_interactivity = "all" from sklearn.model_selection import train_test_split
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
from machine_learning.classification_models import ClassificationModels from machine_learning.classification_models import ClassificationModels
from machine_learning.helper import impute_encode_categorical_features
# %% [markdown]
# # RAPIDS models
# %% [markdown]
# # Useful method
def treat_categorical_features(input_set):
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in input_set.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
categorical_features = input_set[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
return pd.concat([numerical_features, categorical_features], axis=1)
# %% [markdown] # %% [markdown]
# ## Set script's parameters # ## Set script's parameters
n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter) #
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
# %%
n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
# %%
# Run configuration and input loading for the clustering classification run.
# The input file name is derived from the segment length and the target variable.
PATH_BASE = Path("E:/STRAWresults/20230415")

SEGMENT_TYPE = "period"
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
SEGMENT_LENGTH = "30_minutes_before"
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
TARGET_VARIABLE = "appraisal_stressfulness"
print("TARGET_VARIABLE: " + TARGET_VARIABLE)

# The stressfulness appraisal target gets the segment type appended to its name.
# NOTE(review): the nesting of both statements under this `if` is reconstructed
# (the indentation was lost); for the target configured above the result is the
# same either way -- confirm against the original notebook.
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
    TARGET_VARIABLE += "_"
    TARGET_VARIABLE += SEGMENT_TYPE

PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")

model_input = pd.read_csv(PATH_FULL)

# For daily segments keep only the rows whose segment label contains DAY_LENGTH.
# All three statements belong to the branch: DAY_LENGTH is defined only here.
if SEGMENT_LENGTH == "daily":
    DAY_LENGTH = "daily"  # or "working"
    print(DAY_LENGTH)
    model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv") CLUST_COL = "limesurvey_demand_control_ratio"
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] print("CLUST_COL: " + CLUST_COL)
clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance BINS = [-1, 0, 4]
print("BINS: " + str(BINS))
model_input.columns[list(model_input.columns).index('age'):-1] index_columns = [
"local_segment",
"local_segment_label",
"local_segment_start_datetime",
"local_segment_end_datetime",
]
lime_cols = [col for col in model_input if col.startswith('limesurvey')] model_input[CLUST_COL].describe()
lime_cols
lime_col = 'limesurvey_demand_control_ratio'
clust_col = lime_col
model_input[clust_col].describe()
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
# Filter-out outlier rows by clust_col
model_input = model_input[(np.abs(stats.zscore(model_input[CLUST_COL])) < 3)]
# Filter-out outlier rows by clust_col uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)] plt.bar(uniq["pid"], uniq[CLUST_COL])
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
plt.bar(uniq['pid'], uniq[clust_col])
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
# Get clusters by cluster col and merge the clusters to main df # Get clusters by cluster col and merge the clusters to main df
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid')) km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index("pid"))
np.unique(km, return_counts=True) np.unique(km, return_counts=True)
uniq['cluster'] = km uniq["cluster"] = km
uniq print(uniq)
model_input = model_input.merge(uniq[['pid', 'cluster']]) model_input = model_input.merge(uniq[["pid", "cluster"]])
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
model_input.set_index(index_columns, inplace=True) model_input.set_index(index_columns, inplace=True)
@ -109,50 +100,64 @@ model_input.set_index(index_columns, inplace=True)
cm = ClassificationModels() cm = ClassificationModels()
cmodels = cm.get_cmodels() cmodels = cm.get_cmodels()
# %%
model_input["target"].value_counts()
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
for k in range(n_clusters): for k in range(n_clusters):
model_input_subset = model_input[model_input["cluster"] == k].copy() model_input_subset = model_input[model_input["cluster"] == k].copy()
# Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, separated by z-score of 0.
model_input_subset['numerical_target'] = model_input_subset['target']
bins = [-10, 0, 10] # bins for z-scored targets
model_input_subset.loc[:, 'target'] = \
pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
p15 = np.percentile(model_input_subset['numerical_target'], 15)
p85 = np.percentile(model_input_subset['numerical_target'], 85)
# Treat categorical features
model_input_subset = treat_categorical_features(model_input_subset)
# Split to train, validate, and test subsets
train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
train_set['target'].value_counts() # Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, separated by z-score of 0.
test_set['target'].value_counts() # model_input_subset['numerical_target'] = model_input_subset['target']
model_input_subset.loc[:, "target"] = pd.cut(
model_input_subset.loc[:, "target"], bins=BINS, labels=[0, 1], right=True
)
# p15 = np.percentile(model_input_subset['numerical_target'], 15)
# p85 = np.percentile(model_input_subset['numerical_target'], 85)
# Treat categorical features
model_input_subset = impute_encode_categorical_features(model_input_subset)
# Split to train, validate, and test subsets
# train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
# test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
train_set, test_set = train_test_split(
model_input_subset,
test_size=0.3,
stratify=model_input_subset["pid"],
random_state=42,
)
print(train_set["target"].value_counts())
print(test_set["target"].value_counts())
train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"] train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
validate_x, test_x, validate_y, test_y = \ validate_x, test_x, validate_y, test_y = train_test_split(
train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42) test_set.drop(["target", "pid"], axis=1),
test_set["target"],
test_size=0.50,
random_state=42,
)
# Impute missing values # Impute missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='median') imputer = SimpleImputer(missing_values=np.nan, strategy="median")
train_x = imputer.fit_transform(train_x) train_x = imputer.fit_transform(train_x)
validate_x = imputer.fit_transform(validate_x) validate_x = imputer.fit_transform(validate_x)
test_x = imputer.fit_transform(test_x) test_x = imputer.fit_transform(test_x)
for model_title, model in cmodels.items(): for model_title, model in cmodels.items():
model['model'].fit(train_x, train_y) model["model"].fit(train_x, train_y)
y_pred = model['model'].predict(validate_x) y_pred = model["model"].predict(validate_x)
acc = accuracy_score(validate_y, y_pred) acc = accuracy_score(validate_y, y_pred)
prec = precision_score(validate_y, y_pred) prec = precision_score(validate_y, y_pred)
rec = recall_score(validate_y, y_pred) rec = recall_score(validate_y, y_pred)
f1 = f1_score(validate_y, y_pred) f1 = f1_score(validate_y, y_pred)
print("\n-------------------------------------\n") print("\n-------------------------------------\n")
print("Current cluster:", k, end="\n") print("Current cluster:", k, end="\n")
print("Current model:", model_title, end="\n") print("Current model:", model_title, end="\n")
@ -160,12 +165,30 @@ for k in range(n_clusters):
print("Precision", prec) print("Precision", prec)
print("Recall", rec) print("Recall", rec)
print("F1", f1) print("F1", f1)
cmodels[model_title]['metrics'][0] += acc cmodels[model_title]["metrics"][0] += acc
cmodels[model_title]['metrics'][1] += prec cmodels[model_title]["metrics"][1] += prec
cmodels[model_title]['metrics'][2] += rec cmodels[model_title]["metrics"][2] += rec
cmodels[model_title]['metrics'][3] += f1 cmodels[model_title]["metrics"][3] += f1
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
# Get overall results # Get overall results
cm.get_total_models_scores(n_clusters=n_clusters) scores = cm.get_total_models_scores(n_clusters=n_clusters)
# %%
print(scores)
# %%
# Write the aggregated model scores to a CSV file in the results directory,
# which is given relative to the current working directory.
PATH_OUTPUT = Path("..") / Path("presentation/results")

# The file name encodes the run configuration: target variable, segment length,
# target bins, clustering column, and number of clusters.
path_output_full = PATH_OUTPUT / (
    TARGET_VARIABLE
    + "_"
    + SEGMENT_LENGTH
    + "_classification"
    + str(BINS)
    + "_CLUST_"
    + CLUST_COL
    # Fixed: this term used to read `+ +str(n_clusters)`; the stray unary plus is
    # applied to a str and raises TypeError before the path can be built.
    + str(n_clusters)
    + ".csv"
)

scores.to_csv(path_output_full, index=False)