Return scores for classification.

master
junos 2023-05-10 23:49:56 +02:00
parent f58d20ffc2
commit 055e87dbac
2 changed files with 172 additions and 126 deletions

View File

@ -6,7 +6,7 @@
# extension: .py # extension: .py
# format_name: percent # format_name: percent
# format_version: '1.3' # format_version: '1.3'
# jupytext_version: 1.13.0 # jupytext_version: 1.14.5
# kernelspec: # kernelspec:
# display_name: straw2analysis # display_name: straw2analysis
# language: python # language: python
@ -15,57 +15,45 @@
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
# %matplotlib inline # %matplotlib inline
import datetime
import importlib
import os import os
import sys import sys
import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np
import pandas as pd import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyClassifier
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
import xgboost as xg
from sklearn.cluster import KMeans from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_validate
from IPython.core.interactiveshell import InteractiveShell from machine_learning.classification_models import ClassificationModels
InteractiveShell.ast_node_interactivity = "all"
nb_dir = os.path.split(os.getcwd())[0] nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path: if nb_dir not in sys.path:
sys.path.append(nb_dir) sys.path.append(nb_dir)
import machine_learning.labels
import machine_learning.model
from machine_learning.classification_models import ClassificationModels
# %% [markdown] # %% [markdown]
# # RAPIDS models # # RAPIDS models
# %% [markdown] # %%
# ## Set script's parameters # ## Set script's parameters
n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter) N_CLUSTERS = 4 # Number of clusters (could be regarded as a hyperparameter)
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) CV_METHOD = "logo" # logo, half_logo, 5kfold
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs # Cross-validation method (could be regarded as a hyperparameter)
N_SL = 1 # Number of largest/smallest accuracies (of particular CV) outputs
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv") model_input = pd.read_csv(
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] "E:/STRAWresults/20230415/30_minutes_before/input_PANAS_negative_affect_mean.csv"
)
index_columns = [
"local_segment",
"local_segment_label",
"local_segment_start_datetime",
"local_segment_end_datetime",
]
clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance lime_col = "limesurvey_demand_control_ratio_quartile"
model_input.columns[list(model_input.columns).index('age'):-1]
lime_cols = [col for col in model_input if col.startswith('limesurvey')]
lime_cols
lime_col = 'limesurvey_demand_control_ratio_quartile'
clust_col = lime_col clust_col = lime_col
model_input[clust_col].describe() model_input[clust_col].describe()
@ -73,21 +61,20 @@ model_input[clust_col].describe()
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
# Filter-out outlier rows by clust_col # Filter-out outlier rows by clust_col
#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)] # model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True) uniq = model_input[[clust_col, "pid"]].drop_duplicates().reset_index(drop=True)
uniq = uniq.dropna() uniq = uniq.dropna()
plt.bar(uniq['pid'], uniq[clust_col]) plt.bar(uniq["pid"], uniq[clust_col])
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
# Get clusters by cluster col and merge the clusters to main df # Get clusters by cluster col and merge the clusters to main df
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid')) km = KMeans(n_clusters=N_CLUSTERS).fit_predict(uniq.set_index("pid"))
np.unique(km, return_counts=True) np.unique(km, return_counts=True)
uniq['cluster'] = km uniq["cluster"] = km
uniq
model_input = model_input.merge(uniq[['pid', 'cluster']]) model_input = model_input.merge(uniq[["pid", "cluster"]])
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
model_input.set_index(index_columns, inplace=True) model_input.set_index(index_columns, inplace=True)
@ -98,31 +85,57 @@ cm = ClassificationModels()
cmodels = cm.get_cmodels() cmodels = cm.get_cmodels()
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
for k in range(n_clusters): for k in range(N_CLUSTERS):
model_input_subset = model_input[model_input["cluster"] == k].copy() model_input_subset = model_input[model_input["cluster"] == k].copy()
bins = [-10, -1, 1, 10] # bins for z-scored targets bins = [-10, -1, 1, 10] # bins for z-scored targets
model_input_subset.loc[:, 'target'] = \ model_input_subset.loc[:, "target"] = pd.cut(
pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high'] model_input_subset.loc[:, "target"],
model_input_subset['target'].value_counts() bins=bins,
model_input_subset = model_input_subset[model_input_subset['target'] != "medium"] labels=["low", "medium", "high"],
model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1) right=False,
) # ['low', 'medium', 'high']
model_input_subset["target"].value_counts()
model_input_subset = model_input_subset[model_input_subset["target"] != "medium"]
model_input_subset["target"] = (
model_input_subset["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
)
model_input_subset['target'].value_counts() model_input_subset["target"].value_counts()
if cv_method_str == 'half_logo':
model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round() if CV_METHOD == "half_logo":
model_input_subset["pid_half"] = model_input_subset["pid"] + "_" + model_input_subset["pid_index"].astype(int).astype(str) model_input_subset["pid_index"] = model_input_subset.groupby("pid").cumcount()
model_input_subset["pid_count"] = model_input_subset.groupby("pid")[
"pid"
].transform("count")
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"] model_input_subset["pid_index"] = (
model_input_subset["pid_index"] / model_input_subset["pid_count"] + 1
).round()
model_input_subset["pid_half"] = (
model_input_subset["pid"]
+ "_"
+ model_input_subset["pid_index"].astype(int).astype(str)
)
data_x, data_y, data_groups = (
model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
model_input_subset["target"],
model_input_subset["pid_half"],
)
else: else:
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"] data_x, data_y, data_groups = (
model_input_subset.drop(["target", "pid"], axis=1),
model_input_subset["target"],
model_input_subset["pid"],
)
# Treat categorical features # Treat categorical features
categorical_feature_colnames = ["gender", "startlanguage"] categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col] additional_categorical_features = [
col
for col in data_x.columns
if "mostcommonactivity" in col or "homelabel" in col
]
categorical_feature_colnames += additional_categorical_features categorical_feature_colnames += additional_categorical_features
categorical_features = data_x[categorical_feature_colnames].copy() categorical_features = data_x[categorical_feature_colnames].copy()
@ -132,7 +145,9 @@ for k in range(n_clusters):
categorical_features = categorical_features.fillna(mode_categorical_features) categorical_features = categorical_features.fillna(mode_categorical_features)
# one-hot encoding # one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category")) categorical_features = categorical_features.apply(
lambda col: col.astype("category")
)
if not categorical_features.empty: if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features) categorical_features = pd.get_dummies(categorical_features)
@ -140,8 +155,10 @@ for k in range(n_clusters):
train_x = pd.concat([numerical_features, categorical_features], axis=1) train_x = pd.concat([numerical_features, categorical_features], axis=1)
# Establish cv method # Establish cv method
cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method cv_method = StratifiedKFold(
if cv_method_str == 'logo' or cv_method_str == 'half_logo': n_splits=5, shuffle=True
) # Defaults to 5 k-folds in cross_validate method
if CV_METHOD == "logo" or CV_METHOD == "half_logo":
cv_method = LeaveOneGroupOut() cv_method = LeaveOneGroupOut()
cv_method.get_n_splits( cv_method.get_n_splits(
train_x, train_x,
@ -149,36 +166,41 @@ for k in range(n_clusters):
groups=data_groups, groups=data_groups,
) )
imputer = SimpleImputer(missing_values=np.nan, strategy='median') imputer = SimpleImputer(missing_values=np.nan, strategy="median")
for model_title, model in cmodels.items(): for model_title, model in cmodels.items():
classifier = cross_validate( classifier = cross_validate(
model['model'], model["model"],
X=imputer.fit_transform(train_x), X=imputer.fit_transform(train_x),
y=data_y, y=data_y,
groups=data_groups, groups=data_groups,
cv=cv_method, cv=cv_method,
n_jobs=-1, n_jobs=-1,
error_score='raise', error_score="raise",
scoring=('accuracy', 'precision', 'recall', 'f1') scoring=("accuracy", "precision", "recall", "f1"),
) )
print("\n-------------------------------------\n") print("\n-------------------------------------\n")
print("Current cluster:", k, end="\n") print("Current cluster:", k, end="\n")
print("Current model:", model_title, end="\n") print("Current model:", model_title, end="\n")
print("Acc", np.mean(classifier['test_accuracy'])) print("Acc", np.mean(classifier["test_accuracy"]))
print("Precision", np.mean(classifier['test_precision'])) print("Precision", np.mean(classifier["test_precision"]))
print("Recall", np.mean(classifier['test_recall'])) print("Recall", np.mean(classifier["test_recall"]))
print("F1", np.mean(classifier['test_f1'])) print("F1", np.mean(classifier["test_f1"]))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1]) print(
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl])) f"Largest {N_SL} ACC:",
np.sort(-np.partition(-classifier["test_accuracy"], N_SL)[:N_SL])[::-1],
cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy']) )
cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision']) print(
cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall']) f"Smallest {N_SL} ACC:",
cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1']) np.sort(np.partition(classifier["test_accuracy"], N_SL)[:N_SL]),
)
cmodels[model_title]["metrics"][0] += np.mean(classifier["test_accuracy"])
cmodels[model_title]["metrics"][1] += np.mean(classifier["test_precision"])
cmodels[model_title]["metrics"][2] += np.mean(classifier["test_recall"])
cmodels[model_title]["metrics"][3] += np.mean(classifier["test_f1"])
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
# Get overall results # Get overall results
cm.get_total_models_scores(n_clusters=n_clusters) scores = cm.get_total_models_scores(n_clusters=N_CLUSTERS)

View File

@ -1,71 +1,95 @@
from sklearn.dummy import DummyClassifier import pandas as pd
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble import xgboost as xg
from lightgbm import LGBMClassifier from lightgbm import LGBMClassifier
import xgboost as xg from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
from sklearn.dummy import DummyClassifier
class ClassificationModels:
    """Registry of classifiers together with their accumulated CV metrics.

    Each entry of ``cmodels`` maps a model title to a dict with two keys:

    * ``"model"``: the (unfitted) estimator instance.
    * ``"metrics"``: a four-element list of running sums in the order
      accuracy, precision, recall, F1. Callers add to these in place.
    """

    # (label printed to stdout, metric name stored in the scores table),
    # listed in the same order as the positions of the "metrics" list.
    _METRICS = (
        ("Acc:", "test_accuracy"),
        ("Precision:", "test_precision"),
        ("Recall:", "test_recall"),
        ("F1:", "test_f1"),
    )

    def __init__(self):
        self.cmodels = self.init_classification_models()

    def get_cmodels(self):
        """Return the model registry (the same dict object, not a copy)."""
        return self.cmodels

    def init_classification_models(self):
        """Build a fresh registry with every metric sum reset to zero."""
        estimators = {
            "dummy_classifier": DummyClassifier(strategy="most_frequent"),
            "logistic_regression": linear_model.LogisticRegression(max_iter=1000),
            "support_vector_machine": svm.SVC(),
            "gaussian_naive_bayes": naive_bayes.GaussianNB(),
            "stochastic_gradient_descent_classifier": linear_model.SGDClassifier(),
            "knn": neighbors.KNeighborsClassifier(),
            "decision_tree": tree.DecisionTreeClassifier(),
            "random_forest_classifier": ensemble.RandomForestClassifier(),
            "gradient_boosting_classifier": ensemble.GradientBoostingClassifier(),
            "lgbm_classifier": LGBMClassifier(),
            "XGBoost_classifier": xg.sklearn.XGBClassifier(),
        }
        # Every model gets its own list: the sums are mutated in place.
        return {
            title: {"model": estimator, "metrics": [0, 0, 0, 0]}
            for title, estimator in estimators.items()
        }

    def get_total_models_scores(self, n_clusters=1):
        """Print and return the per-model metrics averaged over clusters.

        Args:
            n_clusters: Number of clusters whose metrics were summed into
                ``"metrics"``; every sum is divided by this value.

        Returns:
            A DataFrame with columns ``method``, ``metric`` and ``mean``,
            holding one row per (model, metric) pair.
        """
        columns = ["method", "metric", "mean"]
        # Rows are collected as plain records and the frame is built once.
        # The previous DataFrame.append calls discarded their result (append
        # is not in-place and no longer exists in pandas >= 2.0), so the
        # returned table never contained any scores.
        records = []
        for model_title, model in self.cmodels.items():
            print("\n************************************\n")
            print("Current model:", model_title, end="\n")
            for position, (label, metric_name) in enumerate(self._METRICS):
                mean = model["metrics"][position] / n_clusters
                print(label, mean)
                records.append(
                    {"method": model_title, "metric": metric_name, "mean": mean}
                )
        return pd.DataFrame(records, columns=columns)