Create a classification models class and use it in the ml pipeline script.

2022-11-25 12:35:45 +01:00 · 2022-11-25 12:35:45 +01:00 · 98f78d72fc
parent 218b684514
commit 98f78d72fc
2 changed files with 79 additions and 56 deletions
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@ -26,12 +26,13 @@ import pandas as pd
 import seaborn as sns
 from scipy import stats

-from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
 from sklearn.model_selection import LeaveOneGroupOut, cross_validate
-from sklearn.dummy import DummyClassifier
 from sklearn.impute import SimpleImputer
+
+from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
 from lightgbm import LGBMClassifier
-import xgboost as xg
+import xgboost as xg 

 from sklearn.cluster import KMeans

@ -44,6 +45,7 @@ if nb_dir not in sys.path:

 import machine_learning.labels
 import machine_learning.model
+from machine_learning.classification_models import ClassificationModels

 # %% [markdown]
 # # RAPIDS models
@ -92,52 +94,8 @@ model_input.set_index(index_columns, inplace=True)

 # %% jupyter={"source_hidden": true}
 # Create dict with classification ml models
-cmodels = {
-    'dummy_classifier': {
-        'model': DummyClassifier(strategy="most_frequent"),
-        'metrics': [0, 0, 0, 0]
-    },
-    'logistic_regression': {
-        'model': linear_model.LogisticRegression(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'support_vector_machine': {
-        'model': svm.SVC(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'gaussian_naive_bayes': {
-        'model': naive_bayes.GaussianNB(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'stochastic_gradient_descent_classifier': {
-        'model': linear_model.SGDClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'knn': {
-        'model': neighbors.KNeighborsClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'decision_tree': {
-        'model': tree.DecisionTreeClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'random_forest_classifier': {
-        'model': ensemble.RandomForestClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'gradient_boosting_classifier': {
-        'model': ensemble.GradientBoostingClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'lgbm_classifier': {
-        'model': LGBMClassifier(),
-        'metrics': [0, 0, 0, 0]
-    },
-    'XGBoost_classifier': {
-        'model': xg.sklearn.XGBClassifier(),
-        'metrics': [0, 0, 0, 0]
-    }
-}
+cm = ClassificationModels()
+cmodels = cm.get_cmodels()

 # %% jupyter={"source_hidden": true}
 for k in range(n_clusters):
@ -223,10 +181,4 @@ for k in range(n_clusters):

 # %% jupyter={"source_hidden": true}
 # Get overall results
-for model_title, model in cmodels.items():
-    print("\n************************************\n")
-    print("Current model:", model_title, end="\n")
-    print("Acc", model['metrics'][0]/n_clusters)
-    print("Precision", model['metrics'][1]/n_clusters)
-    print("Recall", model['metrics'][2]/n_clusters)
-    print("F1", model['metrics'][3]/n_clusters)
+cm.get_total_models_scores(n_clusters=n_clusters)
--- a/machine_learning/classification_models.py
+++ b/machine_learning/classification_models.py
@ -0,0 +1,71 @@
+from sklearn.dummy import DummyClassifier
+from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
+from lightgbm import LGBMClassifier
+import xgboost as xg 
+
+class ClassificationModels():
+    """Collection of classification models, each paired with a metrics list.
+
+    ``self.cmodels`` maps a model title to a dict with two keys: ``'model'``
+    (a freshly constructed classifier instance) and ``'metrics'`` (a list of
+    four numbers, initially zeros). Judging by the labels printed in
+    ``get_total_models_scores``, the four slots are accuracy, precision,
+    recall and F1, in that order.
+
+    Nothing in this class writes to ``'metrics'``; presumably the calling
+    script adds per-cluster scores into it -- confirm against the caller.
+    """
+    
+    def __init__(self):
+        # Build the model dictionary once, at construction time.
+        self.cmodels = self.init_classification_models()
+        
+    def get_cmodels(self):
+        """Return the model dictionary held by this instance.
+
+        The dictionary itself is returned, not a copy, so changes the caller
+        makes to it (for example to the ``'metrics'`` lists) are visible to
+        ``get_total_models_scores``.
+        """
+        return self.cmodels
+
+    def init_classification_models(self):
+        """Create and return a new dictionary of classifiers with zeroed metrics.
+
+        Every entry is keyed by a model title and holds a new model instance
+        under ``'model'`` and ``[0, 0, 0, 0]`` under ``'metrics'``. Apart from
+        the dummy classifier's ``strategy="most_frequent"``, all models are
+        constructed with their library default arguments.
+        """
+        cmodels = {
+            'dummy_classifier': {
+                'model': DummyClassifier(strategy="most_frequent"),
+                'metrics': [0, 0, 0, 0]
+            },
+            'logistic_regression': {
+                'model': linear_model.LogisticRegression(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'support_vector_machine': {
+                'model': svm.SVC(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'gaussian_naive_bayes': {
+                'model': naive_bayes.GaussianNB(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'stochastic_gradient_descent_classifier': {
+                'model': linear_model.SGDClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'knn': {
+                'model': neighbors.KNeighborsClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'decision_tree': {
+                'model': tree.DecisionTreeClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'random_forest_classifier': {
+                'model': ensemble.RandomForestClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'gradient_boosting_classifier': {
+                'model': ensemble.GradientBoostingClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'lgbm_classifier': {
+                'model': LGBMClassifier(),
+                'metrics': [0, 0, 0, 0]
+            },
+            'XGBoost_classifier': {
+                'model': xg.sklearn.XGBClassifier(),
+                'metrics': [0, 0, 0, 0]
+            }
+        }
+        
+        return cmodels
+    
+    def get_total_models_scores(self, n_clusters=1):
+        """Print each model's four metrics divided by ``n_clusters``.
+
+        Despite the ``get_`` prefix, this method only prints; it has no
+        return statement. ``n_clusters`` is used as the divisor for every
+        metric, so it should be non-zero.
+        """
+        for model_title, model in self.cmodels.items():
+            print("\n************************************\n")
+            print("Current model:", model_title, end="\n")
+            # Metric slots are printed in the fixed order 0..3 with these labels.
+            print("Acc:", model['metrics'][0]/n_clusters)
+            print("Precision:", model['metrics'][1]/n_clusters)
+            print("Recall:", model['metrics'][2]/n_clusters)
+            print("F1:", model['metrics'][3]/n_clusters)