# stress_at_work_analysis/machine_learning/classification_models.py
#
# 124 lines, 4.4 KiB, Python

import pandas as pd
import xgboost as xg
from lightgbm import LGBMClassifier
from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
from sklearn.dummy import DummyClassifier
class ClassificationModels:
    """Registry of classifiers paired with their accumulated metrics.

    ``cmodels`` maps a model title to a dict with two entries:

    * ``"model"``   - the estimator instance;
    * ``"metrics"`` - a four-element list, initialised to zeros, that
      ``get_total_models_scores`` reads as accuracy, precision, recall
      and F1, in that order.
    """

    # Slot i of every "metrics" list -> (label printed to stdout,
    # value written to the "metric" column of the report).
    _METRIC_SLOTS = (
        ("Acc", "test_accuracy"),
        ("Precision", "test_precision"),
        ("Recall", "test_recall"),
        ("F1", "test_f1"),
    )
    _SCORE_COLUMNS = ("method", "metric", "mean")

    def __init__(self):
        """Create the registry with a freshly built set of models."""
        self.cmodels = self.init_classification_models()

    def get_cmodels(self):
        """Return the live registry dict (not a copy)."""
        return self.cmodels

    def init_classification_models(self):
        """Build a new registry of estimators with zeroed metrics.

        All estimators use their library defaults, except the dummy
        baseline (``strategy="most_frequent"``) and logistic regression
        (``max_iter=1000``).

        Returns:
            dict: ``{title: {"model": estimator, "metrics": [0, 0, 0, 0]}}``
            in the insertion order listed below.
        """
        estimators = {
            "dummy_classifier": DummyClassifier(strategy="most_frequent"),
            "logistic_regression": linear_model.LogisticRegression(
                max_iter=1000
            ),
            "support_vector_machine": svm.SVC(),
            "gaussian_naive_bayes": naive_bayes.GaussianNB(),
            "stochastic_gradient_descent_classifier": (
                linear_model.SGDClassifier()
            ),
            "knn": neighbors.KNeighborsClassifier(),
            "decision_tree": tree.DecisionTreeClassifier(),
            "random_forest_classifier": ensemble.RandomForestClassifier(),
            "gradient_boosting_classifier": (
                ensemble.GradientBoostingClassifier()
            ),
            "lgbm_classifier": LGBMClassifier(),
            "XGBoost_classifier": xg.sklearn.XGBClassifier(),
        }
        # The list literal sits inside the comprehension so that every
        # model owns a distinct, independently mutable metrics list.
        return {
            title: {"model": estimator, "metrics": [0, 0, 0, 0]}
            for title, estimator in estimators.items()
        }

    def get_total_models_scores(self, n_clusters=1):
        """Print and tabulate each model's metrics divided by ``n_clusters``.

        For every registered model a banner and the four scaled metrics
        are printed, and four rows (one per metric) are added to the
        result.

        Args:
            n_clusters: Divisor applied to every stored metric.
                Presumably the number of clusters/runs whose scores were
                summed into ``metrics`` - confirm against the code that
                fills them. Passing 0 fails on the division itself.

        Returns:
            pandas.DataFrame: columns ``method``, ``metric``, ``mean``.
            Rows are grouped by model in registry order and the index
            restarts at 0 for each model (0..3, 0..3, ...). An empty
            registry yields an empty frame with the same columns.
        """
        columns = list(self._SCORE_COLUMNS)
        per_model_frames = []
        for model_title, model in self.cmodels.items():
            print("\n************************************\n")
            print("Current model:", model_title, end="\n")
            rows = []
            for slot, (label, metric_name) in enumerate(self._METRIC_SLOTS):
                mean = model["metrics"][slot] / n_clusters
                print(label + ":", mean)
                rows.append(
                    {
                        "method": model_title,
                        "metric": metric_name,
                        "mean": mean,
                    }
                )
            per_model_frames.append(pd.DataFrame(rows, columns=columns))

        # pd.concat rejects an empty list, and concatenating onto an
        # empty seed frame is deprecated, so handle "no models" directly.
        if not per_model_frames:
            return pd.DataFrame(columns=columns)
        # One concat at the end (no ignore_index) keeps the per-model
        # 0..3 index and avoids re-copying the accumulated frame per model.
        return pd.concat(per_model_frames)