stress_at_work_analysis/machine_learning/classification_models.py

import pandas as pd
import xgboost as xg
from lightgbm import LGBMClassifier
from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
from sklearn.dummy import DummyClassifier


class ClassificationModels:
    """Registry of baseline classifiers plus a per-model accumulator of evaluation metrics."""

    def __init__(self):
        self.cmodels = self.init_classification_models()

    def get_cmodels(self):
        return self.cmodels

    def init_classification_models(self):
        """Build the model registry; each "metrics" list holds running sums of [accuracy, precision, recall, f1]."""
        cmodels = {
            "dummy_classifier": {
                "model": DummyClassifier(strategy="most_frequent"),
                "metrics": [0, 0, 0, 0],
            },
            "logistic_regression": {
                "model": linear_model.LogisticRegression(max_iter=1000),
                "metrics": [0, 0, 0, 0],
            },
            "support_vector_machine": {"model": svm.SVC(), "metrics": [0, 0, 0, 0]},
            "gaussian_naive_bayes": {
                "model": naive_bayes.GaussianNB(),
                "metrics": [0, 0, 0, 0],
            },
            "stochastic_gradient_descent_classifier": {
                "model": linear_model.SGDClassifier(),
                "metrics": [0, 0, 0, 0],
            },
            "knn": {"model": neighbors.KNeighborsClassifier(), "metrics": [0, 0, 0, 0]},
            "decision_tree": {
                "model": tree.DecisionTreeClassifier(),
                "metrics": [0, 0, 0, 0],
            },
            "random_forest_classifier": {
                "model": ensemble.RandomForestClassifier(),
                "metrics": [0, 0, 0, 0],
            },
            "gradient_boosting_classifier": {
                "model": ensemble.GradientBoostingClassifier(),
                "metrics": [0, 0, 0, 0],
            },
            "lgbm_classifier": {"model": LGBMClassifier(), "metrics": [0, 0, 0, 0]},
            "XGBoost_classifier": {
                "model": xg.XGBClassifier(),
                "metrics": [0, 0, 0, 0],
            },
        }
        return cmodels

    def get_total_models_scores(self, n_clusters=1):
        """Print each model's averaged metrics and return them as a long-format DataFrame."""
        scores = pd.DataFrame(columns=["method", "metric", "mean"])
        for model_title, model in self.cmodels.items():
            print("\n************************************\n")
            print("Current model:", model_title, end="\n")

            # DataFrame.append returned a new frame (and is removed in pandas >= 2.0),
            # so the original calls silently discarded every row; collect the rows in a
            # plain list and build the per-model frame from it instead.
            rows = []

            print("Acc:", model["metrics"][0] / n_clusters)
            rows.append(
                {
                    "method": model_title,
                    "metric": "test_accuracy",
                    "mean": model["metrics"][0] / n_clusters,
                }
            )
            print("Precision:", model["metrics"][1] / n_clusters)
            rows.append(
                {
                    "method": model_title,
                    "metric": "test_precision",
                    "mean": model["metrics"][1] / n_clusters,
                }
            )
            print("Recall:", model["metrics"][2] / n_clusters)
            rows.append(
                {
                    "method": model_title,
                    "metric": "test_recall",
                    "mean": model["metrics"][2] / n_clusters,
                }
            )
            print("F1:", model["metrics"][3] / n_clusters)
            rows.append(
                {
                    "method": model_title,
                    "metric": "test_f1",
                    "mean": model["metrics"][3] / n_clusters,
                }
            )

            scores_df = pd.DataFrame(rows, columns=["method", "metric", "mean"])
            scores = pd.concat([scores, scores_df], ignore_index=True)
        return scores
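

# A minimal, hypothetical usage sketch (not part of the original module): it assumes a
# feature matrix X and binary labels y, runs cross-validation for every registered model,
# accumulates the four metric sums that get_total_models_scores() averages, and prints
# the resulting summary table. The synthetic data and cv settings below are illustrative only.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_validate

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)

    cm = ClassificationModels()
    for name, entry in cm.get_cmodels().items():
        cv = cross_validate(
            entry["model"],
            X,
            y,
            cv=5,
            scoring=["accuracy", "precision", "recall", "f1"],
        )
        # Accumulate mean CV scores in the same [accuracy, precision, recall, f1]
        # order that get_total_models_scores() expects.
        entry["metrics"][0] += cv["test_accuracy"].mean()
        entry["metrics"][1] += cv["test_precision"].mean()
        entry["metrics"][2] += cv["test_recall"].mean()
        entry["metrics"][3] += cv["test_f1"].mean()

    summary = cm.get_total_models_scores(n_clusters=1)
    print(summary)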