From 9f5edf1c2b4d63bd298c21944eea03ad116a5d75 Mon Sep 17 00:00:00 2001 From: junos Date: Tue, 12 Apr 2022 16:59:42 +0200 Subject: [PATCH] Revert "Add a rule for model baselines." The example was for a classification rather than regression problem. This reverts commit 9ab0c8f289fb296c444dd16b514f9ade7758c54c. # Conflicts: # rules/models.smk --- rules/models.smk | 12 --- src/models/model_baselines.py | 105 ---------------------- src/models/modelling_utils.py | 164 ---------------------------------- 3 files changed, 281 deletions(-) delete mode 100644 src/models/model_baselines.py delete mode 100644 src/models/modelling_utils.py diff --git a/rules/models.smk b/rules/models.smk index 56dd49d4..4ae8eb6d 100644 --- a/rules/models.smk +++ b/rules/models.smk @@ -49,15 +49,3 @@ rule merge_features_and_targets_for_population_model: script: "../src/models/merge_features_and_targets_for_population_model.py" -rule model_individual_baselines: - input: - "data/processed/models/individual_model/{pid}/input.csv" - params: - cv_method = "{cv_method}", - colnames_demographic_features = config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"], - output: - "data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv" - log: - "data/processed/models/individual_model/{pid}/output_{cv_method}/baselines_notes.log" - script: - "../src/models/model_baselines.py" diff --git a/src/models/model_baselines.py b/src/models/model_baselines.py deleted file mode 100644 index 9c1ae564..00000000 --- a/src/models/model_baselines.py +++ /dev/null @@ -1,105 +0,0 @@ -import numpy as np -import pandas as pd -from statistics import mean -from modelling_utils import getMetrics, createPipeline -from sklearn.model_selection import LeaveOneOut - - -# As we do not have probability of each category, use label to denote the probability directly. -# The probability will only be used to calculate the AUC value. -def baselineAccuracyOfMajorityClassClassifier(targets): - majority_class = targets["target"].value_counts().idxmax() - pred_y = [majority_class] * targets.shape[0] - pred_y_proba = pred_y - metrics = getMetrics(pred_y, pred_y_proba, targets["target"].values.ravel().tolist()) - return metrics, majority_class - -def baselineMetricsOfRandomWeightedClassifier(targets, majority_ratio, majority_class, iter_times): - metrics_all_iters = {"accuracy": [], "precision0":[], "recall0": [], "f10": [], "precision1": [], "recall1": [], "f11": [], "f1_macro": [], "auc": [], "kappa": []} - probabilities = [0, 0] - probabilities[majority_class], probabilities[1 - majority_class] = majority_ratio, 1 - majority_ratio - for i in range(iter_times): - pred_y = np.random.RandomState(i).multinomial(1, probabilities, targets.shape[0])[:,1].tolist() - pred_y_proba = pred_y - metrics = getMetrics(pred_y, pred_y_proba, targets["target"].values.ravel().tolist()) - for key in metrics_all_iters.keys(): - metrics_all_iters[key].append(metrics[key].item()) - # Calculate average metrics across all iterations - avg_metrics = {} - for key in metrics_all_iters.keys(): - avg_metrics[key] = mean(metrics_all_iters[key]) - return avg_metrics - -def baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x, data_y, oversampler_type): - pred_y, true_y = [], [] - for train_index, test_index in cv_method.split(data_x): - train_x, test_x = data_x.iloc[train_index], data_x.iloc[test_index] - train_y, test_y = data_y.iloc[train_index], data_y.iloc[test_index] - clf = createPipeline("DT", oversampler_type) - clf.fit(train_x, train_y.values.ravel()) - pred_y = pred_y + clf.predict(test_x).ravel().tolist() - pred_y_proba = pred_y - true_y = true_y + test_y.values.ravel().tolist() - return getMetrics(pred_y, pred_y_proba, true_y) - - -cv_method = globals()[snakemake.params["cv_method"]]() -colnames_demographic_features = snakemake.params["colnames_demographic_features"] - -data = pd.read_csv(snakemake.input[0]) -index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] -if "pid" in data.columns: - index_columns.append("pid") -data.set_index(index_columns, inplace=True) - - -data_x, data_y = data.drop("target", axis=1), data[["target"]] -targets_value_counts = data_y["target"].value_counts() - -baseline_metrics = pd.DataFrame(columns=["method", "fullMethodName", "accuracy", "precision0", "recall0", "f10", "precision1", "recall1", "f11", "f1_macro", "auc", "kappa"]) -if len(targets_value_counts) < 2: - fout = open(snakemake.log[0], "w") - fout.write(targets_value_counts.to_string()) - fout.close() - -else: - if min(targets_value_counts) >= 6: - oversampler_type = "SMOTE" - else: - oversampler_type = "RandomOverSampler" - - # Baseline 1: majority class classifier => predict every sample as majority class - baseline1_metrics, majority_class = baselineAccuracyOfMajorityClassClassifier(data_y) - majority_ratio = baseline1_metrics["accuracy"] - # Baseline 2: random weighted classifier => random classifier with binomial distribution - baseline2_metrics = baselineMetricsOfRandomWeightedClassifier(data_y, majority_ratio, majority_class, 1000) - - if "pid" in index_columns: - # Baseline 3: decision tree with demographic features - baseline3_metrics = baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x[colnames_demographic_features], data_y, oversampler_type) - - baselines = [baseline1_metrics, baseline2_metrics, baseline3_metrics] - methods = ["majority", "rwc", "dt"] - fullMethodNames = ["MajorityClassClassifier", "RandomWeightedClassifier", "DecisionTreeWithDemographicFeatures"] - - else: - # Only have 2 baselines - baselines = [baseline1_metrics, baseline2_metrics] - methods = ["majority", "rwc"] - fullMethodNames = ["MajorityClassClassifier", "RandomWeightedClassifier"] - - baseline_metrics = pd.DataFrame({"method": methods, - "fullMethodName": fullMethodNames, - "accuracy": [baseline["accuracy"] for baseline in baselines], - "precision0": [baseline["precision0"] for baseline in baselines], - "recall0": [baseline["recall0"] for baseline in baselines], - "f10": [baseline["f10"] for baseline in baselines], - "precision1": [baseline["precision1"] for baseline in baselines], - "recall1": [baseline["recall1"] for baseline in baselines], - "f11": [baseline["f11"] for baseline in baselines], - "f1_macro": [baseline["f1_macro"] for baseline in baselines], - "auc": [baseline["auc"] for baseline in baselines], - "kappa": [baseline["kappa"] for baseline in baselines]}) - - -baseline_metrics.to_csv(snakemake.output[0], index=False) diff --git a/src/models/modelling_utils.py b/src/models/modelling_utils.py deleted file mode 100644 index 5d11c995..00000000 --- a/src/models/modelling_utils.py +++ /dev/null @@ -1,164 +0,0 @@ -import pandas as pd -import numpy as np -from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler -from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix -from sklearn.metrics import precision_recall_fscore_support -from sklearn.metrics import cohen_kappa_score, roc_auc_score -from imblearn.pipeline import Pipeline -from imblearn.over_sampling import SMOTE, RandomOverSampler - - -def getMatchingColNames(operators, features): - col_names = [] - for col in features.columns: - if any(operator in col for operator in operators): - col_names.append(col) - return col_names - -# drop columns with zero variance -def dropZeroVarianceCols(data): - if not data.empty: - var_df = data.var() - keep_col = [] - for col in var_df.index: - if var_df.loc[col] > 0: - keep_col.append(col) - data_drop_cols_var = data.loc[:, keep_col] - else: - data_drop_cols_var = data - return data_drop_cols_var - -# normalize based on all participants: return fitted scaler -def getNormAllParticipantsScaler(features, scaler_flag): - # MinMaxScaler - if scaler_flag == "minmaxscaler": - scaler = MinMaxScaler() - # StandardScaler - elif scaler_flag == "standardscaler": - scaler = StandardScaler() - # RobustScaler - elif scaler_flag == "robustscaler": - scaler = RobustScaler() - else: - # throw exception - raise ValueError("The normalization method is not predefined, please check if the PARAMS_FOR_ANALYSIS.NORMALIZED in config.yaml file is correct.") - scaler.fit(features) - return scaler - -# get metrics: accuracy, precision0, recall0, f10, precision1, recall1, f11, f1_macro, auc, kappa -def getMetrics(pred_y, pred_y_proba, true_y): - metrics = {} - count = len(np.unique(true_y)) - label= np.unique(true_y)[0] - # metrics for all categories - metrics["accuracy"] = accuracy_score(true_y, pred_y) - metrics["f1_macro"] = f1_score(true_y, pred_y, average="macro") # unweighted mean - metrics["auc"] = np.nan if count == 1 else roc_auc_score(true_y, pred_y_proba) - metrics["kappa"] = cohen_kappa_score(true_y, pred_y) - # metrics for label 0 - metrics["precision0"] = np.nan if (count == 1 and label == 1) else precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[0] - metrics["recall0"] = np.nan if (count == 1 and label == 1) else recall_score(true_y, pred_y, average=None, labels=[0,1])[0] - metrics["f10"] = np.nan if (count == 1 and label == 1) else f1_score(true_y, pred_y, average=None, labels=[0,1])[0] - # metrics for label 1 - metrics["precision1"] = np.nan if (count == 1 and label == 0) else precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[1] - metrics["recall1"] = np.nan if (count == 1 and label == 0) else recall_score(true_y, pred_y, average=None, labels=[0,1])[1] - metrics["f11"] = np.nan if (count == 1 and label == 0) else f1_score(true_y, pred_y, average=None, labels=[0,1])[1] - - return metrics - -# get feature importances -def getFeatureImportances(model, clf, cols): - if model == "LogReg": - # Extract the coefficient of the features in the decision function - # Calculate the absolute value - # Normalize it to sum 1 - feature_importances = pd.DataFrame(zip(clf.coef_[0],cols), columns=["Value", "Feature"]) - feature_importances["Value"] = feature_importances["Value"].abs()/feature_importances["Value"].abs().sum() - elif model == "kNN": - # Feature importance is not defined for the KNN Classification, return an empty dataframe - feature_importances = pd.DataFrame(columns=["Value", "Feature"]) - elif model == "SVM": - # Coefficient of the features are only available for linear kernel - try: - # For linear kernel - # Extract the coefficient of the features in the decision function - # Calculate the absolute value - # Normalize it to sum 1 - feature_importances = pd.DataFrame(zip(clf.coef_[0],cols), columns=["Value", "Feature"]) - feature_importances["Value"] = feature_importances["Value"].abs()/feature_importances["Value"].abs().sum() - except: - # For nonlinear kernel, return an empty dataframe directly - feature_importances = pd.DataFrame(columns=["Value", "Feature"]) - elif model == "LightGBM": - # Extract feature_importances_ and normalize it to sum 1 - feature_importances = pd.DataFrame(zip(clf.feature_importances_,cols), columns=["Value", "Feature"]) - feature_importances["Value"] = feature_importances["Value"]/feature_importances["Value"].sum() - else: - # For DT, RF, GB, XGBoost classifier, extract feature_importances_. This field has already been normalized. - feature_importances = pd.DataFrame(zip(clf.feature_importances_,cols), columns=["Value", "Feature"]) - - feature_importances = feature_importances.set_index(["Feature"]).T - - return feature_importances - -def createPipeline(model, oversampler_type): - - if oversampler_type == "SMOTE": - oversampler = SMOTE(sampling_strategy="minority", random_state=0) - elif oversampler_type == "RandomOverSampler": - oversampler = RandomOverSampler(sampling_strategy="minority", random_state=0) - else: - raise ValueError("RAPIDS pipeline only support 'SMOTE' and 'RandomOverSampler' oversampling methods.") - - if model == "LogReg": - from sklearn.linear_model import LogisticRegression - pipeline = Pipeline([ - ("sampling", oversampler), - ("clf", LogisticRegression(random_state=0)) - ]) - elif model == "kNN": - from sklearn.neighbors import KNeighborsClassifier - pipeline = Pipeline([ - ("sampling", oversampler), - ("clf", KNeighborsClassifier()) - ]) - elif model == "SVM": - from sklearn.svm import SVC - pipeline = Pipeline([ - ("sampling", oversampler), - ("clf", SVC(random_state=0, probability=True)) - ]) - elif model == "DT": - from sklearn.tree import DecisionTreeClassifier - pipeline = Pipeline([ - ("sampling", oversampler), - ("clf", DecisionTreeClassifier(random_state=0)) - ]) - elif model == "RF": - from sklearn.ensemble import RandomForestClassifier - pipeline = Pipeline([ - ("sampling", oversampler), - ("clf", RandomForestClassifier(random_state=0)) - ]) - elif model == "GB": - from sklearn.ensemble import GradientBoostingClassifier - pipeline = Pipeline([ - ("sampling", oversampler), - ("clf", GradientBoostingClassifier(random_state=0)) - ]) - elif model == "XGBoost": - from xgboost import XGBClassifier - pipeline = Pipeline([ - ("sampling", oversampler), - ("clf", XGBClassifier(random_state=0, n_jobs=36)) - ]) - elif model == "LightGBM": - from lightgbm import LGBMClassifier - pipeline = Pipeline([ - ("sampling", oversampler), - ("clf", LGBMClassifier(random_state=0, n_jobs=36)) - ]) - else: - raise ValueError("RAPIDS pipeline only support LogReg, kNN, SVM, DT, RF, GB, XGBoost, and LightGBM algorithms for classification problems.") - - return pipeline