diff --git a/rules/models.smk b/rules/models.smk index b2eb03e4..28ee5069 100644 --- a/rules/models.smk +++ b/rules/models.smk @@ -48,3 +48,16 @@ rule merge_features_and_targets_for_population_model: "data/processed/models/population_model/input.csv" script: "../src/models/merge_features_and_targets_for_population_model.py" + +rule model_individual_baselines: + input: + "data/processed/models/individual_model/{pid}/input.csv" + params: + cv_method = "{cv_method}", + colnames_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FEATURES"], + output: + "data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv" + log: + "data/processed/models/individual_model/{pid}/output_{cv_method}/baselines_notes.log" + script: + "../src/models/model_baselines.py" diff --git a/src/models/model_baselines.py b/src/models/model_baselines.py new file mode 100644 index 00000000..9c1ae564 --- /dev/null +++ b/src/models/model_baselines.py @@ -0,0 +1,105 @@ +import numpy as np +import pandas as pd +from statistics import mean +from modelling_utils import getMetrics, createPipeline +from sklearn.model_selection import LeaveOneOut + + +# As we do not have probability of each category, use label to denote the probability directly. +# The probability will only be used to calculate the AUC value. +def baselineAccuracyOfMajorityClassClassifier(targets): + majority_class = targets["target"].value_counts().idxmax() + pred_y = [majority_class] * targets.shape[0] + pred_y_proba = pred_y + metrics = getMetrics(pred_y, pred_y_proba, targets["target"].values.ravel().tolist()) + return metrics, majority_class + +def baselineMetricsOfRandomWeightedClassifier(targets, majority_ratio, majority_class, iter_times): + metrics_all_iters = {"accuracy": [], "precision0":[], "recall0": [], "f10": [], "precision1": [], "recall1": [], "f11": [], "f1_macro": [], "auc": [], "kappa": []} + probabilities = [0, 0] + probabilities[majority_class], probabilities[1 - majority_class] = majority_ratio, 1 - majority_ratio + for i in range(iter_times): + pred_y = np.random.RandomState(i).multinomial(1, probabilities, targets.shape[0])[:,1].tolist() + pred_y_proba = pred_y + metrics = getMetrics(pred_y, pred_y_proba, targets["target"].values.ravel().tolist()) + for key in metrics_all_iters.keys(): + metrics_all_iters[key].append(metrics[key].item()) + # Calculate average metrics across all iterations + avg_metrics = {} + for key in metrics_all_iters.keys(): + avg_metrics[key] = mean(metrics_all_iters[key]) + return avg_metrics + +def baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x, data_y, oversampler_type): + pred_y, true_y = [], [] + for train_index, test_index in cv_method.split(data_x): + train_x, test_x = data_x.iloc[train_index], data_x.iloc[test_index] + train_y, test_y = data_y.iloc[train_index], data_y.iloc[test_index] + clf = createPipeline("DT", oversampler_type) + clf.fit(train_x, train_y.values.ravel()) + pred_y = pred_y + clf.predict(test_x).ravel().tolist() + pred_y_proba = pred_y + true_y = true_y + test_y.values.ravel().tolist() + return getMetrics(pred_y, pred_y_proba, true_y) + + +cv_method = globals()[snakemake.params["cv_method"]]() +colnames_demographic_features = snakemake.params["colnames_demographic_features"] + +data = pd.read_csv(snakemake.input[0]) +index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] +if "pid" in data.columns: + index_columns.append("pid") +data.set_index(index_columns, inplace=True) + + +data_x, data_y = data.drop("target", axis=1), data[["target"]] +targets_value_counts = data_y["target"].value_counts() + +baseline_metrics = pd.DataFrame(columns=["method", "fullMethodName", "accuracy", "precision0", "recall0", "f10", "precision1", "recall1", "f11", "f1_macro", "auc", "kappa"]) +if len(targets_value_counts) < 2: + fout = open(snakemake.log[0], "w") + fout.write(targets_value_counts.to_string()) + fout.close() + +else: + if min(targets_value_counts) >= 6: + oversampler_type = "SMOTE" + else: + oversampler_type = "RandomOverSampler" + + # Baseline 1: majority class classifier => predict every sample as majority class + baseline1_metrics, majority_class = baselineAccuracyOfMajorityClassClassifier(data_y) + majority_ratio = baseline1_metrics["accuracy"] + # Baseline 2: random weighted classifier => random classifier with binomial distribution + baseline2_metrics = baselineMetricsOfRandomWeightedClassifier(data_y, majority_ratio, majority_class, 1000) + + if "pid" in index_columns: + # Baseline 3: decision tree with demographic features + baseline3_metrics = baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x[colnames_demographic_features], data_y, oversampler_type) + + baselines = [baseline1_metrics, baseline2_metrics, baseline3_metrics] + methods = ["majority", "rwc", "dt"] + fullMethodNames = ["MajorityClassClassifier", "RandomWeightedClassifier", "DecisionTreeWithDemographicFeatures"] + + else: + # Only have 2 baselines + baselines = [baseline1_metrics, baseline2_metrics] + methods = ["majority", "rwc"] + fullMethodNames = ["MajorityClassClassifier", "RandomWeightedClassifier"] + + baseline_metrics = pd.DataFrame({"method": methods, + "fullMethodName": fullMethodNames, + "accuracy": [baseline["accuracy"] for baseline in baselines], + "precision0": [baseline["precision0"] for baseline in baselines], + "recall0": [baseline["recall0"] for baseline in baselines], + "f10": [baseline["f10"] for baseline in baselines], + "precision1": [baseline["precision1"] for baseline in baselines], + "recall1": [baseline["recall1"] for baseline in baselines], + "f11": [baseline["f11"] for baseline in baselines], + "f1_macro": [baseline["f1_macro"] for baseline in baselines], + "auc": [baseline["auc"] for baseline in baselines], + "kappa": [baseline["kappa"] for baseline in baselines]}) + + +baseline_metrics.to_csv(snakemake.output[0], index=False) diff --git a/src/models/modelling_utils.py b/src/models/modelling_utils.py new file mode 100644 index 00000000..5d11c995 --- /dev/null +++ b/src/models/modelling_utils.py @@ -0,0 +1,164 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix +from sklearn.metrics import precision_recall_fscore_support +from sklearn.metrics import cohen_kappa_score, roc_auc_score +from imblearn.pipeline import Pipeline +from imblearn.over_sampling import SMOTE, RandomOverSampler + + +def getMatchingColNames(operators, features): + col_names = [] + for col in features.columns: + if any(operator in col for operator in operators): + col_names.append(col) + return col_names + +# drop columns with zero variance +def dropZeroVarianceCols(data): + if not data.empty: + var_df = data.var() + keep_col = [] + for col in var_df.index: + if var_df.loc[col] > 0: + keep_col.append(col) + data_drop_cols_var = data.loc[:, keep_col] + else: + data_drop_cols_var = data + return data_drop_cols_var + +# normalize based on all participants: return fitted scaler +def getNormAllParticipantsScaler(features, scaler_flag): + # MinMaxScaler + if scaler_flag == "minmaxscaler": + scaler = MinMaxScaler() + # StandardScaler + elif scaler_flag == "standardscaler": + scaler = StandardScaler() + # RobustScaler + elif scaler_flag == "robustscaler": + scaler = RobustScaler() + else: + # throw exception + raise ValueError("The normalization method is not predefined, please check if the PARAMS_FOR_ANALYSIS.NORMALIZED in config.yaml file is correct.") + scaler.fit(features) + return scaler + +# get metrics: accuracy, precision0, recall0, f10, precision1, recall1, f11, f1_macro, auc, kappa +def getMetrics(pred_y, pred_y_proba, true_y): + metrics = {} + count = len(np.unique(true_y)) + label= np.unique(true_y)[0] + # metrics for all categories + metrics["accuracy"] = accuracy_score(true_y, pred_y) + metrics["f1_macro"] = f1_score(true_y, pred_y, average="macro") # unweighted mean + metrics["auc"] = np.nan if count == 1 else roc_auc_score(true_y, pred_y_proba) + metrics["kappa"] = cohen_kappa_score(true_y, pred_y) + # metrics for label 0 + metrics["precision0"] = np.nan if (count == 1 and label == 1) else precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[0] + metrics["recall0"] = np.nan if (count == 1 and label == 1) else recall_score(true_y, pred_y, average=None, labels=[0,1])[0] + metrics["f10"] = np.nan if (count == 1 and label == 1) else f1_score(true_y, pred_y, average=None, labels=[0,1])[0] + # metrics for label 1 + metrics["precision1"] = np.nan if (count == 1 and label == 0) else precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[1] + metrics["recall1"] = np.nan if (count == 1 and label == 0) else recall_score(true_y, pred_y, average=None, labels=[0,1])[1] + metrics["f11"] = np.nan if (count == 1 and label == 0) else f1_score(true_y, pred_y, average=None, labels=[0,1])[1] + + return metrics + +# get feature importances +def getFeatureImportances(model, clf, cols): + if model == "LogReg": + # Extract the coefficient of the features in the decision function + # Calculate the absolute value + # Normalize it to sum 1 + feature_importances = pd.DataFrame(zip(clf.coef_[0],cols), columns=["Value", "Feature"]) + feature_importances["Value"] = feature_importances["Value"].abs()/feature_importances["Value"].abs().sum() + elif model == "kNN": + # Feature importance is not defined for the KNN Classification, return an empty dataframe + feature_importances = pd.DataFrame(columns=["Value", "Feature"]) + elif model == "SVM": + # Coefficient of the features are only available for linear kernel + try: + # For linear kernel + # Extract the coefficient of the features in the decision function + # Calculate the absolute value + # Normalize it to sum 1 + feature_importances = pd.DataFrame(zip(clf.coef_[0],cols), columns=["Value", "Feature"]) + feature_importances["Value"] = feature_importances["Value"].abs()/feature_importances["Value"].abs().sum() + except: + # For nonlinear kernel, return an empty dataframe directly + feature_importances = pd.DataFrame(columns=["Value", "Feature"]) + elif model == "LightGBM": + # Extract feature_importances_ and normalize it to sum 1 + feature_importances = pd.DataFrame(zip(clf.feature_importances_,cols), columns=["Value", "Feature"]) + feature_importances["Value"] = feature_importances["Value"]/feature_importances["Value"].sum() + else: + # For DT, RF, GB, XGBoost classifier, extract feature_importances_. This field has already been normalized. + feature_importances = pd.DataFrame(zip(clf.feature_importances_,cols), columns=["Value", "Feature"]) + + feature_importances = feature_importances.set_index(["Feature"]).T + + return feature_importances + +def createPipeline(model, oversampler_type): + + if oversampler_type == "SMOTE": + oversampler = SMOTE(sampling_strategy="minority", random_state=0) + elif oversampler_type == "RandomOverSampler": + oversampler = RandomOverSampler(sampling_strategy="minority", random_state=0) + else: + raise ValueError("RAPIDS pipeline only support 'SMOTE' and 'RandomOverSampler' oversampling methods.") + + if model == "LogReg": + from sklearn.linear_model import LogisticRegression + pipeline = Pipeline([ + ("sampling", oversampler), + ("clf", LogisticRegression(random_state=0)) + ]) + elif model == "kNN": + from sklearn.neighbors import KNeighborsClassifier + pipeline = Pipeline([ + ("sampling", oversampler), + ("clf", KNeighborsClassifier()) + ]) + elif model == "SVM": + from sklearn.svm import SVC + pipeline = Pipeline([ + ("sampling", oversampler), + ("clf", SVC(random_state=0, probability=True)) + ]) + elif model == "DT": + from sklearn.tree import DecisionTreeClassifier + pipeline = Pipeline([ + ("sampling", oversampler), + ("clf", DecisionTreeClassifier(random_state=0)) + ]) + elif model == "RF": + from sklearn.ensemble import RandomForestClassifier + pipeline = Pipeline([ + ("sampling", oversampler), + ("clf", RandomForestClassifier(random_state=0)) + ]) + elif model == "GB": + from sklearn.ensemble import GradientBoostingClassifier + pipeline = Pipeline([ + ("sampling", oversampler), + ("clf", GradientBoostingClassifier(random_state=0)) + ]) + elif model == "XGBoost": + from xgboost import XGBClassifier + pipeline = Pipeline([ + ("sampling", oversampler), + ("clf", XGBClassifier(random_state=0, n_jobs=36)) + ]) + elif model == "LightGBM": + from lightgbm import LGBMClassifier + pipeline = Pipeline([ + ("sampling", oversampler), + ("clf", LGBMClassifier(random_state=0, n_jobs=36)) + ]) + else: + raise ValueError("RAPIDS pipeline only support LogReg, kNN, SVM, DT, RF, GB, XGBoost, and LightGBM algorithms for classification problems.") + + return pipeline