2023-05-10 23:49:56 +02:00
|
|
|
import pandas as pd
|
|
|
|
import xgboost as xg
|
2022-11-25 12:35:45 +01:00
|
|
|
from lightgbm import LGBMClassifier
|
2023-05-10 23:49:56 +02:00
|
|
|
from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
|
|
|
|
from sklearn.dummy import DummyClassifier
|
2022-11-25 12:35:45 +01:00
|
|
|
|
2023-05-10 23:49:56 +02:00
|
|
|
|
|
|
|
class ClassificationModels:
    """Registry of baseline classifiers and their score accumulators.

    Each registry entry maps a model title to a dict with two keys:

    * ``"model"``   -- an unfitted estimator instance.
    * ``"metrics"`` -- a four-element list read positionally as
      accuracy, precision, recall and F1 by ``get_total_models_scores``.
      The values start at 0; they are presumably summed by a caller over
      several runs/clusters (that code is not part of this class) and are
      averaged here by dividing by ``n_clusters``.
    """

    # (console label, value stored in the "metric" column); the position of
    # each pair is the index read from an entry's "metrics" list.
    _METRICS = (
        ("Acc", "test_accuracy"),
        ("Precision", "test_precision"),
        ("Recall", "test_recall"),
        ("F1", "test_f1"),
    )
    _SCORE_COLUMNS = ("method", "metric", "mean")

    def __init__(self):
        """Create the registry with freshly constructed estimators."""
        self.cmodels = self.init_classification_models()

    def get_cmodels(self):
        """Return the registry dict (the live object, not a copy)."""
        return self.cmodels

    def init_classification_models(self):
        """Build and return a new registry of estimators with zeroed metrics.

        Returns:
            dict: ``{title: {"model": estimator, "metrics": [0, 0, 0, 0]}}``.
            Every entry gets its own ``metrics`` list, so updating one
            model's scores never affects another's.
        """
        cmodels = {
            "dummy_classifier": {
                "model": DummyClassifier(strategy="most_frequent"),
                "metrics": [0, 0, 0, 0],
            },
            "logistic_regression": {
                "model": linear_model.LogisticRegression(max_iter=1000),
                "metrics": [0, 0, 0, 0],
            },
            "support_vector_machine": {"model": svm.SVC(), "metrics": [0, 0, 0, 0]},
            "gaussian_naive_bayes": {
                "model": naive_bayes.GaussianNB(),
                "metrics": [0, 0, 0, 0],
            },
            "stochastic_gradient_descent_classifier": {
                "model": linear_model.SGDClassifier(),
                "metrics": [0, 0, 0, 0],
            },
            "knn": {"model": neighbors.KNeighborsClassifier(), "metrics": [0, 0, 0, 0]},
            "decision_tree": {
                "model": tree.DecisionTreeClassifier(),
                "metrics": [0, 0, 0, 0],
            },
            "random_forest_classifier": {
                "model": ensemble.RandomForestClassifier(),
                "metrics": [0, 0, 0, 0],
            },
            "gradient_boosting_classifier": {
                "model": ensemble.GradientBoostingClassifier(),
                "metrics": [0, 0, 0, 0],
            },
            "lgbm_classifier": {"model": LGBMClassifier(), "metrics": [0, 0, 0, 0]},
            "XGBoost_classifier": {
                "model": xg.sklearn.XGBClassifier(),
                "metrics": [0, 0, 0, 0],
            },
        }
        return cmodels

    def get_total_models_scores(self, n_clusters=1):
        """Print and return the per-model metrics divided by ``n_clusters``.

        Args:
            n_clusters: Divisor applied to every stored metric value.
                Passing 0 raises ``ZeroDivisionError`` for plain Python
                numbers.

        Returns:
            pandas.DataFrame: Long-format table with columns ``method``,
            ``metric`` and ``mean``; four rows per model, in registry order.
            The index restarts at 0 for every model (0..3 repeated), which
            is what concatenating the per-model frames without
            ``ignore_index`` yields.
        """
        columns = list(self._SCORE_COLUMNS)
        per_model_frames = []

        for model_title, model in self.cmodels.items():
            print("\n************************************\n")
            print("Current model:", model_title, end="\n")

            rows = []
            for position, (label, metric_name) in enumerate(self._METRICS):
                mean = model["metrics"][position] / n_clusters
                print(f"{label}:", mean)
                rows.append(
                    {"method": model_title, "metric": metric_name, "mean": mean}
                )

            # A list of row dicts is used because pd.DataFrame({...}) with
            # only scalar values raises "you must pass an index".
            per_model_frames.append(pd.DataFrame(rows, columns=columns))

        if not per_model_frames:
            # pd.concat([]) raises; an empty registry yields an empty table.
            return pd.DataFrame(columns=columns)

        return pd.concat(per_model_frames)
|