Define classification models method.
parent
509707855e
commit
852e17afbe
|
@ -1,10 +1,12 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
|
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
|
||||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
|
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
|
||||||
from sklearn.metrics import mean_squared_error, r2_score
|
from sklearn.metrics import mean_squared_error, r2_score
|
||||||
from sklearn.impute import SimpleImputer
|
from sklearn.impute import SimpleImputer
|
||||||
from sklearn.dummy import DummyRegressor
|
from sklearn.dummy import DummyRegressor, DummyClassifier
|
||||||
from xgboost import XGBRegressor
|
|
||||||
|
from xgboost import XGBRegressor, XGBClassifier
|
||||||
|
import xgboost as xg
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -319,3 +321,139 @@ def run_all_regression_models(input_csv):
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
|
|
||||||
|
def run_all_classification_models(data_x, data_y, data_groups, cv_method):
    """Cross-validate a fixed set of classifiers and summarise their scores.

    Each classifier is evaluated with ``cross_validate`` using the scorers
    ``accuracy``, ``average_precision``, ``recall`` and ``f1``. For every
    classifier the per-fold test scores are reduced to their maximum and
    mean, and the name of the classifier is printed once its evaluation has
    finished.

    Args:
        data_x: Feature data, forwarded to ``cross_validate`` as ``X``.
        data_y: Target labels, forwarded to ``cross_validate`` as ``y``.
        data_groups: Group labels, forwarded to ``cross_validate`` as
            ``groups``.
        cv_method: Cross-validation strategy, forwarded to
            ``cross_validate`` as ``cv``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``method``, ``max`` and
        ``mean``. There is one row per (classifier, metric) pair; the index
        holds the metric key (``test_accuracy``, ``test_average_precision``,
        ``test_recall``, ``test_f1``) and therefore repeats for every
        classifier.
    """
    metrics = ['accuracy', 'average_precision', 'recall', 'f1']
    test_metrics = ["test_" + metric for metric in metrics]

    # One entry per classifier:
    #   (label printed after evaluation, value stored in the "method" column,
    #    estimator instance, extra keyword arguments for cross_validate)
    # Only the baseline run asks cross_validate to raise on a failed fit;
    # the remaining models keep the library's default error handling.
    models = [
        (
            "Dummy",
            "Dummy",
            DummyClassifier(strategy="most_frequent"),
            {"error_score": 'raise'},
        ),
        (
            "Logistic regression",
            "logistic_reg",
            linear_model.LogisticRegression(),
            {},
        ),
        ("Support Vector Machine", "svc", svm.SVC(), {}),
        (
            "Gaussian Naive Bayes",
            "gaussian_naive_bayes",
            naive_bayes.GaussianNB(),
            {},
        ),
        (
            "Stochastic Gradient Descent",
            "stochastic_gradient_descent",
            linear_model.SGDClassifier(),
            {},
        ),
        (
            "Random Forest",
            "random_forest",
            ensemble.RandomForestClassifier(),
            {},
        ),
        ("XGBoost", "xgboost", XGBClassifier(), {}),
    ]

    # Collect the per-model summaries and concatenate once at the end.
    # Growing a frame by repeatedly concatenating onto an empty DataFrame is
    # deprecated in recent pandas and can change the resulting column dtypes.
    summaries = []
    for label, method_name, estimator, extra_kwargs in models:
        cv_scores = cross_validate(
            estimator,
            X=data_x,
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
            scoring=metrics,
            **extra_kwargs,
        )
        print(label)

        # Rows become the metrics, columns become the "max"/"mean" reductions.
        summary = pd.DataFrame(cv_scores)[test_metrics]
        summary = summary.agg(['max', 'mean']).transpose()
        summary["method"] = method_name
        summaries.append(summary)

    # Explicit column selection keeps the established output layout.
    return pd.concat(summaries)[["method", "max", "mean"]]
|
||||||
|
|
Loading…
Reference in New Issue