parent
570d2eb656
commit
9ab0c8f289
3 changed files with 282 additions and 0 deletions
@ -0,0 +1,105 @@ |
||||
import numpy as np |
||||
import pandas as pd |
||||
from statistics import mean |
||||
from modelling_utils import getMetrics, createPipeline |
||||
from sklearn.model_selection import LeaveOneOut |
||||
|
||||
|
||||
# As we do not have probability of each category, use label to denote the probability directly. |
||||
# The probability will only be used to calculate the AUC value. |
||||
def baselineAccuracyOfMajorityClassClassifier(targets): |
||||
majority_class = targets["target"].value_counts().idxmax() |
||||
pred_y = [majority_class] * targets.shape[0] |
||||
pred_y_proba = pred_y |
||||
metrics = getMetrics(pred_y, pred_y_proba, targets["target"].values.ravel().tolist()) |
||||
return metrics, majority_class |
||||
|
||||
def baselineMetricsOfRandomWeightedClassifier(targets, majority_ratio, majority_class, iter_times): |
||||
metrics_all_iters = {"accuracy": [], "precision0":[], "recall0": [], "f10": [], "precision1": [], "recall1": [], "f11": [], "f1_macro": [], "auc": [], "kappa": []} |
||||
probabilities = [0, 0] |
||||
probabilities[majority_class], probabilities[1 - majority_class] = majority_ratio, 1 - majority_ratio |
||||
for i in range(iter_times): |
||||
pred_y = np.random.RandomState(i).multinomial(1, probabilities, targets.shape[0])[:,1].tolist() |
||||
pred_y_proba = pred_y |
||||
metrics = getMetrics(pred_y, pred_y_proba, targets["target"].values.ravel().tolist()) |
||||
for key in metrics_all_iters.keys(): |
||||
metrics_all_iters[key].append(metrics[key].item()) |
||||
# Calculate average metrics across all iterations |
||||
avg_metrics = {} |
||||
for key in metrics_all_iters.keys(): |
||||
avg_metrics[key] = mean(metrics_all_iters[key]) |
||||
return avg_metrics |
||||
|
||||
def baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x, data_y, oversampler_type): |
||||
pred_y, true_y = [], [] |
||||
for train_index, test_index in cv_method.split(data_x): |
||||
train_x, test_x = data_x.iloc[train_index], data_x.iloc[test_index] |
||||
train_y, test_y = data_y.iloc[train_index], data_y.iloc[test_index] |
||||
clf = createPipeline("DT", oversampler_type) |
||||
clf.fit(train_x, train_y.values.ravel()) |
||||
pred_y = pred_y + clf.predict(test_x).ravel().tolist() |
||||
pred_y_proba = pred_y |
||||
true_y = true_y + test_y.values.ravel().tolist() |
||||
return getMetrics(pred_y, pred_y_proba, true_y) |
||||
|
||||
|
||||
cv_method = globals()[snakemake.params["cv_method"]]() |
||||
colnames_demographic_features = snakemake.params["colnames_demographic_features"] |
||||
|
||||
data = pd.read_csv(snakemake.input[0]) |
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] |
||||
if "pid" in data.columns: |
||||
index_columns.append("pid") |
||||
data.set_index(index_columns, inplace=True) |
||||
|
||||
|
||||
data_x, data_y = data.drop("target", axis=1), data[["target"]] |
||||
targets_value_counts = data_y["target"].value_counts() |
||||
|
||||
baseline_metrics = pd.DataFrame(columns=["method", "fullMethodName", "accuracy", "precision0", "recall0", "f10", "precision1", "recall1", "f11", "f1_macro", "auc", "kappa"]) |
||||
if len(targets_value_counts) < 2: |
||||
fout = open(snakemake.log[0], "w") |
||||
fout.write(targets_value_counts.to_string()) |
||||
fout.close() |
||||
|
||||
else: |
||||
if min(targets_value_counts) >= 6: |
||||
oversampler_type = "SMOTE" |
||||
else: |
||||
oversampler_type = "RandomOverSampler" |
||||
|
||||
# Baseline 1: majority class classifier => predict every sample as majority class |
||||
baseline1_metrics, majority_class = baselineAccuracyOfMajorityClassClassifier(data_y) |
||||
majority_ratio = baseline1_metrics["accuracy"] |
||||
# Baseline 2: random weighted classifier => random classifier with binomial distribution |
||||
baseline2_metrics = baselineMetricsOfRandomWeightedClassifier(data_y, majority_ratio, majority_class, 1000) |
||||
|
||||
if "pid" in index_columns: |
||||
# Baseline 3: decision tree with demographic features |
||||
baseline3_metrics = baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x[colnames_demographic_features], data_y, oversampler_type) |
||||
|
||||
baselines = [baseline1_metrics, baseline2_metrics, baseline3_metrics] |
||||
methods = ["majority", "rwc", "dt"] |
||||
fullMethodNames = ["MajorityClassClassifier", "RandomWeightedClassifier", "DecisionTreeWithDemographicFeatures"] |
||||
|
||||
else: |
||||
# Only have 2 baselines |
||||
baselines = [baseline1_metrics, baseline2_metrics] |
||||
methods = ["majority", "rwc"] |
||||
fullMethodNames = ["MajorityClassClassifier", "RandomWeightedClassifier"] |
||||
|
||||
baseline_metrics = pd.DataFrame({"method": methods, |
||||
"fullMethodName": fullMethodNames, |
||||
"accuracy": [baseline["accuracy"] for baseline in baselines], |
||||
"precision0": [baseline["precision0"] for baseline in baselines], |
||||
"recall0": [baseline["recall0"] for baseline in baselines], |
||||
"f10": [baseline["f10"] for baseline in baselines], |
||||
"precision1": [baseline["precision1"] for baseline in baselines], |
||||
"recall1": [baseline["recall1"] for baseline in baselines], |
||||
"f11": [baseline["f11"] for baseline in baselines], |
||||
"f1_macro": [baseline["f1_macro"] for baseline in baselines], |
||||
"auc": [baseline["auc"] for baseline in baselines], |
||||
"kappa": [baseline["kappa"] for baseline in baselines]}) |
||||
|
||||
|
||||
baseline_metrics.to_csv(snakemake.output[0], index=False) |
@ -0,0 +1,164 @@ |
||||
import pandas as pd |
||||
import numpy as np |
||||
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler |
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix |
||||
from sklearn.metrics import precision_recall_fscore_support |
||||
from sklearn.metrics import cohen_kappa_score, roc_auc_score |
||||
from imblearn.pipeline import Pipeline |
||||
from imblearn.over_sampling import SMOTE, RandomOverSampler |
||||
|
||||
|
||||
def getMatchingColNames(operators, features): |
||||
col_names = [] |
||||
for col in features.columns: |
||||
if any(operator in col for operator in operators): |
||||
col_names.append(col) |
||||
return col_names |
||||
|
||||
# drop columns with zero variance |
||||
def dropZeroVarianceCols(data): |
||||
if not data.empty: |
||||
var_df = data.var() |
||||
keep_col = [] |
||||
for col in var_df.index: |
||||
if var_df.loc[col] > 0: |
||||
keep_col.append(col) |
||||
data_drop_cols_var = data.loc[:, keep_col] |
||||
else: |
||||
data_drop_cols_var = data |
||||
return data_drop_cols_var |
||||
|
||||
# normalize based on all participants: return fitted scaler |
||||
def getNormAllParticipantsScaler(features, scaler_flag): |
||||
# MinMaxScaler |
||||
if scaler_flag == "minmaxscaler": |
||||
scaler = MinMaxScaler() |
||||
# StandardScaler |
||||
elif scaler_flag == "standardscaler": |
||||
scaler = StandardScaler() |
||||
# RobustScaler |
||||
elif scaler_flag == "robustscaler": |
||||
scaler = RobustScaler() |
||||
else: |
||||
# throw exception |
||||
raise ValueError("The normalization method is not predefined, please check if the PARAMS_FOR_ANALYSIS.NORMALIZED in config.yaml file is correct.") |
||||
scaler.fit(features) |
||||
return scaler |
||||
|
||||
# get metrics: accuracy, precision0, recall0, f10, precision1, recall1, f11, f1_macro, auc, kappa |
||||
def getMetrics(pred_y, pred_y_proba, true_y): |
||||
metrics = {} |
||||
count = len(np.unique(true_y)) |
||||
label= np.unique(true_y)[0] |
||||
# metrics for all categories |
||||
metrics["accuracy"] = accuracy_score(true_y, pred_y) |
||||
metrics["f1_macro"] = f1_score(true_y, pred_y, average="macro") # unweighted mean |
||||
metrics["auc"] = np.nan if count == 1 else roc_auc_score(true_y, pred_y_proba) |
||||
metrics["kappa"] = cohen_kappa_score(true_y, pred_y) |
||||
# metrics for label 0 |
||||
metrics["precision0"] = np.nan if (count == 1 and label == 1) else precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[0] |
||||
metrics["recall0"] = np.nan if (count == 1 and label == 1) else recall_score(true_y, pred_y, average=None, labels=[0,1])[0] |
||||
metrics["f10"] = np.nan if (count == 1 and label == 1) else f1_score(true_y, pred_y, average=None, labels=[0,1])[0] |
||||
# metrics for label 1 |
||||
metrics["precision1"] = np.nan if (count == 1 and label == 0) else precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[1] |
||||
metrics["recall1"] = np.nan if (count == 1 and label == 0) else recall_score(true_y, pred_y, average=None, labels=[0,1])[1] |
||||
metrics["f11"] = np.nan if (count == 1 and label == 0) else f1_score(true_y, pred_y, average=None, labels=[0,1])[1] |
||||
|
||||
return metrics |
||||
|
||||
# get feature importances |
||||
def getFeatureImportances(model, clf, cols): |
||||
if model == "LogReg": |
||||
# Extract the coefficient of the features in the decision function |
||||
# Calculate the absolute value |
||||
# Normalize it to sum 1 |
||||
feature_importances = pd.DataFrame(zip(clf.coef_[0],cols), columns=["Value", "Feature"]) |
||||
feature_importances["Value"] = feature_importances["Value"].abs()/feature_importances["Value"].abs().sum() |
||||
elif model == "kNN": |
||||
# Feature importance is not defined for the KNN Classification, return an empty dataframe |
||||
feature_importances = pd.DataFrame(columns=["Value", "Feature"]) |
||||
elif model == "SVM": |
||||
# Coefficient of the features are only available for linear kernel |
||||
try: |
||||
# For linear kernel |
||||
# Extract the coefficient of the features in the decision function |
||||
# Calculate the absolute value |
||||
# Normalize it to sum 1 |
||||
feature_importances = pd.DataFrame(zip(clf.coef_[0],cols), columns=["Value", "Feature"]) |
||||
feature_importances["Value"] = feature_importances["Value"].abs()/feature_importances["Value"].abs().sum() |
||||
except: |
||||
# For nonlinear kernel, return an empty dataframe directly |
||||
feature_importances = pd.DataFrame(columns=["Value", "Feature"]) |
||||
elif model == "LightGBM": |
||||
# Extract feature_importances_ and normalize it to sum 1 |
||||
feature_importances = pd.DataFrame(zip(clf.feature_importances_,cols), columns=["Value", "Feature"]) |
||||
feature_importances["Value"] = feature_importances["Value"]/feature_importances["Value"].sum() |
||||
else: |
||||
# For DT, RF, GB, XGBoost classifier, extract feature_importances_. This field has already been normalized. |
||||
feature_importances = pd.DataFrame(zip(clf.feature_importances_,cols), columns=["Value", "Feature"]) |
||||
|
||||
feature_importances = feature_importances.set_index(["Feature"]).T |
||||
|
||||
return feature_importances |
||||
|
||||
def createPipeline(model, oversampler_type): |
||||
|
||||
if oversampler_type == "SMOTE": |
||||
oversampler = SMOTE(sampling_strategy="minority", random_state=0) |
||||
elif oversampler_type == "RandomOverSampler": |
||||
oversampler = RandomOverSampler(sampling_strategy="minority", random_state=0) |
||||
else: |
||||
raise ValueError("RAPIDS pipeline only support 'SMOTE' and 'RandomOverSampler' oversampling methods.") |
||||
|
||||
if model == "LogReg": |
||||
from sklearn.linear_model import LogisticRegression |
||||
pipeline = Pipeline([ |
||||
("sampling", oversampler), |
||||
("clf", LogisticRegression(random_state=0)) |
||||
]) |
||||
elif model == "kNN": |
||||
from sklearn.neighbors import KNeighborsClassifier |
||||
pipeline = Pipeline([ |
||||
("sampling", oversampler), |
||||
("clf", KNeighborsClassifier()) |
||||
]) |
||||
elif model == "SVM": |
||||
from sklearn.svm import SVC |
||||
pipeline = Pipeline([ |
||||
("sampling", oversampler), |
||||
("clf", SVC(random_state=0, probability=True)) |
||||
]) |
||||
elif model == "DT": |
||||
from sklearn.tree import DecisionTreeClassifier |
||||
pipeline = Pipeline([ |
||||
("sampling", oversampler), |
||||
("clf", DecisionTreeClassifier(random_state=0)) |
||||
]) |
||||
elif model == "RF": |
||||
from sklearn.ensemble import RandomForestClassifier |
||||
pipeline = Pipeline([ |
||||
("sampling", oversampler), |
||||
("clf", RandomForestClassifier(random_state=0)) |
||||
]) |
||||
elif model == "GB": |
||||
from sklearn.ensemble import GradientBoostingClassifier |
||||
pipeline = Pipeline([ |
||||
("sampling", oversampler), |
||||
("clf", GradientBoostingClassifier(random_state=0)) |
||||
]) |
||||
elif model == "XGBoost": |
||||
from xgboost import XGBClassifier |
||||
pipeline = Pipeline([ |
||||
("sampling", oversampler), |
||||
("clf", XGBClassifier(random_state=0, n_jobs=36)) |
||||
]) |
||||
elif model == "LightGBM": |
||||
from lightgbm import LGBMClassifier |
||||
pipeline = Pipeline([ |
||||
("sampling", oversampler), |
||||
("clf", LGBMClassifier(random_state=0, n_jobs=36)) |
||||
]) |
||||
else: |
||||
raise ValueError("RAPIDS pipeline only support LogReg, kNN, SVM, DT, RF, GB, XGBoost, and LightGBM algorithms for classification problems.") |
||||
|
||||
return pipeline |
Loading…
Reference in new issue