diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py index 56edc90..4b771b8 100644 --- a/exploration/ml_pipeline_classification_with_clustering.py +++ b/exploration/ml_pipeline_classification_with_clustering.py @@ -26,12 +26,13 @@ import pandas as pd import seaborn as sns from scipy import stats -from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble from sklearn.model_selection import LeaveOneGroupOut, cross_validate -from sklearn.dummy import DummyClassifier from sklearn.impute import SimpleImputer + +from sklearn.dummy import DummyClassifier +from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble from lightgbm import LGBMClassifier -import xgboost as xg +import xgboost as xg from sklearn.cluster import KMeans @@ -44,6 +45,7 @@ if nb_dir not in sys.path: import machine_learning.labels import machine_learning.model +from machine_learning.classification_models import ClassificationModels # %% [markdown] # # RAPIDS models @@ -92,52 +94,8 @@ model_input.set_index(index_columns, inplace=True) # %% jupyter={"source_hidden": true} # Create dict with classification ml models -cmodels = { - 'dummy_classifier': { - 'model': DummyClassifier(strategy="most_frequent"), - 'metrics': [0, 0, 0, 0] - }, - 'logistic_regression': { - 'model': linear_model.LogisticRegression(), - 'metrics': [0, 0, 0, 0] - }, - 'support_vector_machine': { - 'model': svm.SVC(), - 'metrics': [0, 0, 0, 0] - }, - 'gaussian_naive_bayes': { - 'model': naive_bayes.GaussianNB(), - 'metrics': [0, 0, 0, 0] - }, - 'stochastic_gradient_descent_classifier': { - 'model': linear_model.SGDClassifier(), - 'metrics': [0, 0, 0, 0] - }, - 'knn': { - 'model': neighbors.KNeighborsClassifier(), - 'metrics': [0, 0, 0, 0] - }, - 'decision_tree': { - 'model': tree.DecisionTreeClassifier(), - 'metrics': [0, 0, 0, 0] - }, - 'random_forest_classifier': { - 'model': ensemble.RandomForestClassifier(), - 'metrics': [0, 0, 0, 0] - }, - 'gradient_boosting_classifier': { - 'model': ensemble.GradientBoostingClassifier(), - 'metrics': [0, 0, 0, 0] - }, - 'lgbm_classifier': { - 'model': LGBMClassifier(), - 'metrics': [0, 0, 0, 0] - }, - 'XGBoost_classifier': { - 'model': xg.sklearn.XGBClassifier(), - 'metrics': [0, 0, 0, 0] - } -} +cm = ClassificationModels() +cmodels = cm.get_cmodels() # %% jupyter={"source_hidden": true} for k in range(n_clusters): @@ -223,10 +181,4 @@ for k in range(n_clusters): # %% jupyter={"source_hidden": true} # Get overall results -for model_title, model in cmodels.items(): - print("\n************************************\n") - print("Current model:", model_title, end="\n") - print("Acc", model['metrics'][0]/n_clusters) - print("Precision", model['metrics'][1]/n_clusters) - print("Recall", model['metrics'][2]/n_clusters) - print("F1", model['metrics'][3]/n_clusters) +cm.get_total_models_scores(n_clusters=n_clusters) diff --git a/machine_learning/classification_models.py b/machine_learning/classification_models.py new file mode 100644 index 0000000..094e280 --- /dev/null +++ b/machine_learning/classification_models.py @@ -0,0 +1,71 @@ +from sklearn.dummy import DummyClassifier +from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble +from lightgbm import LGBMClassifier +import xgboost as xg + +class ClassificationModels(): + + def __init__(self): + self.cmodels = self.init_classification_models() + + def get_cmodels(self): + return self.cmodels + + def init_classification_models(self): + cmodels = { + 'dummy_classifier': { + 'model': DummyClassifier(strategy="most_frequent"), + 'metrics': [0, 0, 0, 0] + }, + 'logistic_regression': { + 'model': linear_model.LogisticRegression(), + 'metrics': [0, 0, 0, 0] + }, + 'support_vector_machine': { + 'model': svm.SVC(), + 'metrics': [0, 0, 0, 0] + }, + 'gaussian_naive_bayes': { + 'model': naive_bayes.GaussianNB(), + 'metrics': [0, 0, 0, 0] + }, + 'stochastic_gradient_descent_classifier': { + 'model': linear_model.SGDClassifier(), + 'metrics': [0, 0, 0, 0] + }, + 'knn': { + 'model': neighbors.KNeighborsClassifier(), + 'metrics': [0, 0, 0, 0] + }, + 'decision_tree': { + 'model': tree.DecisionTreeClassifier(), + 'metrics': [0, 0, 0, 0] + }, + 'random_forest_classifier': { + 'model': ensemble.RandomForestClassifier(), + 'metrics': [0, 0, 0, 0] + }, + 'gradient_boosting_classifier': { + 'model': ensemble.GradientBoostingClassifier(), + 'metrics': [0, 0, 0, 0] + }, + 'lgbm_classifier': { + 'model': LGBMClassifier(), + 'metrics': [0, 0, 0, 0] + }, + 'XGBoost_classifier': { + 'model': xg.sklearn.XGBClassifier(), + 'metrics': [0, 0, 0, 0] + } + } + + return cmodels + + def get_total_models_scores(self, n_clusters=1): + for model_title, model in self.cmodels.items(): + print("\n************************************\n") + print("Current model:", model_title, end="\n") + print("Acc:", model['metrics'][0]/n_clusters) + print("Precision:", model['metrics'][1]/n_clusters) + print("Recall:", model['metrics'][2]/n_clusters) + print("F1:", model['metrics'][3]/n_clusters) \ No newline at end of file