rapids/src/models/workflow_example/modelling_utils.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import cohen_kappa_score, roc_auc_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler


def getMatchingColNames(operators, features):
    col_names = []
    for col in features.columns:
        if any(operator in col for operator in operators):
            col_names.append(col)
    return col_names


# drop columns with zero variance
def dropZeroVarianceCols(data):
    if not data.empty:
        var_df = data.var()
        keep_col = []
        for col in var_df.index:
            if var_df.loc[col] > 0:
                keep_col.append(col)
        data_drop_cols_var = data.loc[:, keep_col]
    else:
        data_drop_cols_var = data
    return data_drop_cols_var
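
# Sketch of the expected behavior (toy values): a column that is constant across
# all rows has zero variance and is dropped.
#   df = pd.DataFrame({"a": [1, 2, 3], "b": [5, 5, 5]})
#   dropZeroVarianceCols(df)  # keeps only column "a"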


# normalize based on all participants: return fitted scaler
def getNormAllParticipantsScaler(features, scaler_flag):
    # MinMaxScaler
    if scaler_flag == "minmaxscaler":
        scaler = MinMaxScaler()
    # StandardScaler
    elif scaler_flag == "standardscaler":
        scaler = StandardScaler()
    # RobustScaler
    elif scaler_flag == "robustscaler":
        scaler = RobustScaler()
    else:
        raise ValueError("The normalization method is not predefined: please check that PARAMS_FOR_ANALYSIS.NORMALIZED in the config.yaml file is correct.")
    scaler.fit(features)
    return scaler
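
# Minimal usage sketch (train_features and test_features are assumed dataframes):
# fit one scaler on the pooled training features, then reuse it on every split
# so that train and test are normalized consistently.
#   scaler = getNormAllParticipantsScaler(train_features, "robustscaler")
#   train_scaled = scaler.transform(train_features)
#   test_scaled = scaler.transform(test_features)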


# get metrics: accuracy, precision0, recall0, f10, precision1, recall1, f11, f1_macro, auc, kappa
def getMetrics(pred_y, pred_y_proba, true_y):
    metrics = {}
    count = len(np.unique(true_y))
    label = np.unique(true_y)[0]
    # metrics for all categories
    metrics["accuracy"] = accuracy_score(true_y, pred_y)
    metrics["f1_macro"] = f1_score(true_y, pred_y, average="macro") # unweighted mean
    metrics["auc"] = np.nan if count == 1 else roc_auc_score(true_y, pred_y_proba)
    metrics["kappa"] = cohen_kappa_score(true_y, pred_y)
    # metrics for label 0
    metrics["precision0"] = np.nan if (count == 1 and label == 1) else precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[0]
    metrics["recall0"] = np.nan if (count == 1 and label == 1) else recall_score(true_y, pred_y, average=None, labels=[0,1])[0]
    metrics["f10"] = np.nan if (count == 1 and label == 1) else f1_score(true_y, pred_y, average=None, labels=[0,1])[0]
    # metrics for label 1
    metrics["precision1"] = np.nan if (count == 1 and label == 0) else precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[1]
    metrics["recall1"] = np.nan if (count == 1 and label == 0) else recall_score(true_y, pred_y, average=None, labels=[0,1])[1]
    metrics["f11"] = np.nan if (count == 1 and label == 0) else f1_score(true_y, pred_y, average=None, labels=[0,1])[1]
    return metrics
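
# Illustrative call (toy arrays, not real predictions); pred_y_proba is the
# predicted probability of label 1.
#   getMetrics(pred_y=[0, 1, 1], pred_y_proba=[0.2, 0.7, 0.9], true_y=[0, 1, 0])
# returns a dict keyed by "accuracy", "precision0", ..., "auc", "kappa"; the
# label-specific entries are np.nan when that label is absent from true_y.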


# get feature importances
def getFeatureImportances(model, clf, cols):
    if model == "LogReg":
        # Extract the coefficients of the features in the decision function,
        # take their absolute values, and normalize them to sum to 1
        feature_importances = pd.DataFrame(zip(clf.coef_[0], cols), columns=["Value", "Feature"])
        feature_importances["Value"] = feature_importances["Value"].abs() / feature_importances["Value"].abs().sum()
    elif model == "kNN":
        # Feature importance is not defined for kNN classification; return an empty dataframe
        feature_importances = pd.DataFrame(columns=["Value", "Feature"])
    elif model == "SVM":
        # Feature coefficients are only available for the linear kernel
        try:
            # For the linear kernel: extract the coefficients of the features in the
            # decision function, take their absolute values, and normalize them to sum to 1
            feature_importances = pd.DataFrame(zip(clf.coef_[0], cols), columns=["Value", "Feature"])
            feature_importances["Value"] = feature_importances["Value"].abs() / feature_importances["Value"].abs().sum()
        except AttributeError:
            # For nonlinear kernels, coef_ is undefined; return an empty dataframe
            feature_importances = pd.DataFrame(columns=["Value", "Feature"])
    elif model == "LightGBM":
        # Extract feature_importances_ and normalize it to sum to 1
        feature_importances = pd.DataFrame(zip(clf.feature_importances_, cols), columns=["Value", "Feature"])
        feature_importances["Value"] = feature_importances["Value"] / feature_importances["Value"].sum()
    else:
        # For the DT, RF, GB, and XGBoost classifiers, extract feature_importances_,
        # which is already normalized
        feature_importances = pd.DataFrame(zip(clf.feature_importances_, cols), columns=["Value", "Feature"])
    feature_importances = feature_importances.set_index(["Feature"]).T
    return feature_importances
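
# Illustrative call (clf is assumed to be an already-fitted estimator matching
# the model string):
#   getFeatureImportances("RF", fitted_rf, X_train.columns)
# returns a one-row dataframe whose columns are the feature names and whose
# values are the (already normalized) impurity-based importances.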


def createPipeline(model, oversampler_type):
    if oversampler_type == "SMOTE":
        oversampler = SMOTE(sampling_strategy="minority", random_state=0)
    elif oversampler_type == "RandomOverSampler":
        oversampler = RandomOverSampler(sampling_strategy="minority", random_state=0)
    else:
        raise ValueError("The RAPIDS pipeline only supports the 'SMOTE' and 'RandomOverSampler' oversampling methods.")
if model == "LogReg":
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
("sampling", oversampler),
2020-04-30 00:53:54 +02:00
("clf", LogisticRegression(random_state=0))
])
elif model == "kNN":
from sklearn.neighbors import KNeighborsClassifier
pipeline = Pipeline([
("sampling", oversampler),
2020-04-30 00:53:54 +02:00
("clf", KNeighborsClassifier())
])
elif model == "SVM":
from sklearn.svm import SVC
pipeline = Pipeline([
("sampling", oversampler),
2020-04-30 00:53:54 +02:00
("clf", SVC(random_state=0, probability=True))
])
elif model == "DT":
from sklearn.tree import DecisionTreeClassifier
pipeline = Pipeline([
("sampling", oversampler),
2020-04-30 00:53:54 +02:00
("clf", DecisionTreeClassifier(random_state=0))
])
elif model == "RF":
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([
("sampling", oversampler),
2020-04-30 00:53:54 +02:00
("clf", RandomForestClassifier(random_state=0))
])
elif model == "GB":
from sklearn.ensemble import GradientBoostingClassifier
pipeline = Pipeline([
("sampling", oversampler),
2020-04-30 00:53:54 +02:00
("clf", GradientBoostingClassifier(random_state=0))
])
elif model == "XGBoost":
from xgboost import XGBClassifier
pipeline = Pipeline([
("sampling", oversampler),
("clf", XGBClassifier(random_state=0, n_jobs=36))
2020-04-30 00:53:54 +02:00
])
elif model == "LightGBM":
from lightgbm import LGBMClassifier
pipeline = Pipeline([
("sampling", oversampler),
("clf", LGBMClassifier(random_state=0, n_jobs=36))
2020-04-30 00:53:54 +02:00
])
else:
raise ValueError("RAPIDS pipeline only support LogReg, kNN, SVM, DT, RF, GB, XGBoost, and LightGBM algorithms for classification problems.")
return pipeline
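

# Minimal self-check sketch (assumption: synthetic data via sklearn's
# make_classification; this demo is not part of the RAPIDS workflow itself):
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    # imbalanced toy problem so the oversampling step actually does something
    X, y = make_classification(n_samples=100, n_features=5, weights=[0.7, 0.3], random_state=0)
    X = pd.DataFrame(X, columns=["f%d" % i for i in range(5)])
    demo_pipeline = createPipeline("RF", "RandomOverSampler")
    demo_pipeline.fit(X, y)
    # predict_proba[:, 1] is the predicted probability of label 1, as expected by getMetrics
    print(getMetrics(demo_pipeline.predict(X), demo_pipeline.predict_proba(X)[:, 1], y))
    print(getFeatureImportances("RF", demo_pipeline.named_steps["clf"], X.columns))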