Define classification models method.
parent
509707855e
commit
852e17afbe
|
@ -1,10 +1,12 @@
|
|||
from pathlib import Path
|
||||
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
|
||||
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.dummy import DummyRegressor
|
||||
from xgboost import XGBRegressor
|
||||
from sklearn.dummy import DummyRegressor, DummyClassifier
|
||||
|
||||
from xgboost import XGBRegressor, XGBClassifier
|
||||
import xgboost as xg
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
@ -319,3 +321,139 @@ def run_all_regression_models(input_csv):
|
|||
scores = pd.concat([scores, scores_df])
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
def _summarize_classifier_cv(estimator, label, method, data_x, data_y,
                             data_groups, cv_method, metrics, **cv_kwargs):
    """Cross-validate one classifier and summarise its per-fold test scores.

    Args:
        estimator: Unfitted classifier handed to ``cross_validate``.
        label: Text printed once cross-validation has finished.
        method: Value written into the ``method`` column of the summary.
        data_x: Feature matrix, passed through as ``X``.
        data_y: Target values, passed through as ``y``.
        data_groups: Group labels, passed through as ``groups``.
        cv_method: Cross-validation splitter or strategy, passed as ``cv``.
        metrics: Scorer names; each yields a ``test_<name>`` result column.
        **cv_kwargs: Extra keyword arguments forwarded to ``cross_validate``
            (e.g. ``error_score``).

    Returns:
        A DataFrame with one row per ``test_<metric>`` and the columns
        ``max``, ``mean`` and ``method``.
    """
    fold_scores = cross_validate(
        estimator,
        X=data_x,
        y=data_y,
        groups=data_groups,
        cv=cv_method,
        n_jobs=-1,
        scoring=metrics,
        **cv_kwargs,
    )
    print(label)

    # Keep only the test scores; cross_validate also reports timing columns.
    test_columns = ["test_" + metric for metric in metrics]
    summary = pd.DataFrame(fold_scores)[test_columns]
    # Collapse the folds: rows become metrics, columns become max / mean.
    summary = summary.agg(['max', 'mean']).transpose()
    summary["method"] = method
    return summary


def run_all_classification_models(data_x, data_y, data_groups, cv_method):
    """Cross-validate a fixed set of classifiers and collect their scores.

    The classifiers are, in order: a most-frequent-class dummy baseline,
    logistic regression, an SVC, Gaussian naive Bayes, an SGD classifier,
    a random forest and an XGBoost classifier, all with default
    hyper-parameters. Each one is scored on accuracy, average precision,
    recall and F1, and a progress label is printed after each model.

    Args:
        data_x: Feature matrix.
        data_y: Target labels.
        data_groups: Group labels forwarded to the cross-validation splitter.
        cv_method: Cross-validation splitter or strategy.

    Returns:
        A DataFrame with the columns ``method``, ``max`` and ``mean``,
        indexed by ``test_<metric>``; each model contributes one row per
        metric.
    """
    # NOTE(review): 'average_precision' is the area under the precision-recall
    # curve, not plain precision -- confirm this is the intended metric.
    # NOTE(review): 'recall' and 'f1' are scikit-learn's binary scorers, so
    # data_y is presumably binary -- verify against the caller.
    metrics = ['accuracy', 'average_precision', 'recall', 'f1']

    # (estimator, printed label, "method" column value, extra cv kwargs).
    # Only the dummy baseline raises on a failed fit; the other models keep
    # cross_validate's default error handling.
    candidates = [
        (DummyClassifier(strategy="most_frequent"), "Dummy", "Dummy",
         {"error_score": 'raise'}),
        (linear_model.LogisticRegression(), "Logistic regression",
         "logistic_reg", {}),
        (svm.SVC(), "Support Vector Machine", "svc", {}),
        (naive_bayes.GaussianNB(), "Gaussian Naive Bayes",
         "gaussian_naive_bayes", {}),
        (linear_model.SGDClassifier(), "Stochastic Gradient Descent",
         "stochastic_gradient_descent", {}),
        (ensemble.RandomForestClassifier(), "Random Forest",
         "random_forest", {}),
        (XGBClassifier(), "XGBoost", "xgboost", {}),
    ]

    scores = pd.DataFrame(columns=["method", "max", "mean"])

    for estimator, label, method, cv_kwargs in candidates:
        summary = _summarize_classifier_cv(
            estimator,
            label,
            method,
            data_x,
            data_y,
            data_groups,
            cv_method,
            metrics,
            **cv_kwargs,
        )
        scores = pd.concat([scores, summary])

    return scores
|
||||
|
|
Loading…
Reference in New Issue