diff --git a/Snakefile b/Snakefile
index b3b3b2ba..2778cb0b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -6,6 +6,11 @@ include: "rules/models.snakefile"
 include: "rules/reports.snakefile"
 include: "rules/mystudy.snakefile" # You can add snakfiles with rules tailored to your project
+models, scalers = [], []
+for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]:
+    models = models + [model_name] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name])
+    scalers = scalers + config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]
+
 
 rule all:
     input:
         # My study (this is an example of a rule created specifically for a study)
@@ -120,6 +125,16 @@ rule all:
             source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
             day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"],
             summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
+        expand("data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}_{cv_method}_baseline.csv",
+            rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+            cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
+            days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
+            days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
+            cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+            cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"],
+            source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+            day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"],
+            summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
         expand(
             expand("data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{{model}}/{cv_method}/{source}_{day_segment}_{summarised}_{{scaler}}/{result_component}.csv",
                 rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
diff --git a/rules/models.snakefile b/rules/models.snakefile
index d7d7ac9a..d0185b03 100644
--- a/rules/models.snakefile
+++ b/rules/models.snakefile
@@ -110,6 +110,21 @@ rule merge_features_and_targets:
     script:
         "../src/models/merge_features_and_targets.py"
 
+rule baseline:
+    input:
+        "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv"
+    params:
+        cv_method = "{cv_method}",
+        rowsnan_colsnan_days_colsvar_threshold = "{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}",
+        demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC_FEATURES"]
+    output:
+        "data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}_{cv_method}_baseline.csv"
+    log:
+        "data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}_{cv_method}_notes.log"
+    script:
+        "../src/models/baseline.py"
+
+
 rule modeling:
     input:
         data = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv"
diff --git a/src/models/baseline.py b/src/models/baseline.py
new file mode 100644
index 00000000..faa5bfeb
--- /dev/null
+++ b/src/models/baseline.py
@@ -0,0 +1,89 @@
+import numpy as np
+import pandas as pd
+from statistics import mean
+from modeling_utils import getMetrics, createPipeline
+from sklearn.model_selection import LeaveOneOut
+
+
+# As we do not have probability of each category, use label to denote the probability directly.
+# The probability will only be used to calculate the AUC value.
+def baselineAccuracyOfMajorityClassClassifier(targets):
+    majority_class = targets["target"].value_counts().idxmax()
+    pred_y = [majority_class] * targets.shape[0]
+    pred_y_prob = pred_y
+    metrics = getMetrics(pred_y, pred_y_prob, targets["target"].values.ravel().tolist())
+    return metrics, majority_class
+
+def baselineMetricsOfRandomWeightedClassifier(targets, majority_ratio, majority_class, iter_times):
+    metrics_all_iters = {"accuracy": [], "precision0": [], "recall0": [], "f10": [], "precision1": [], "recall1": [], "f11": [], "auc": [], "kappa": []}
+    probabilities = [0, 0]
+    probabilities[majority_class], probabilities[1 - majority_class] = majority_ratio, 1 - majority_ratio
+    for i in range(iter_times):
+        pred_y = np.random.RandomState(i).multinomial(1, probabilities, targets.shape[0])[:,1].tolist()
+        pred_y_prob = pred_y
+        metrics = getMetrics(pred_y, pred_y_prob, targets["target"].values.ravel().tolist())
+        for key in metrics_all_iters.keys():
+            metrics_all_iters[key].append(metrics[key].item())
+    # Calculate average metrics across all iterations
+    avg_metrics = {}
+    for key in metrics_all_iters.keys():
+        avg_metrics[key] = mean(metrics_all_iters[key])
+    return avg_metrics
+
+def baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x, data_y, oversampler_type):
+    pred_y, true_y = [], []
+    for train_index, test_index in cv_method.split(data_x):
+        train_x, test_x = data_x.iloc[train_index], data_x.iloc[test_index]
+        train_y, test_y = data_y.iloc[train_index], data_y.iloc[test_index]
+        clf = createPipeline("DT", oversampler_type)
+        clf.fit(train_x, train_y.values.ravel())
+        pred_y = pred_y + clf.predict(test_x).ravel().tolist()
+        pred_y_prob = pred_y
+        true_y = true_y + test_y.values.ravel().tolist()
+    return getMetrics(pred_y, pred_y_prob, true_y)
+
+
+cv_method = globals()[snakemake.params["cv_method"]]()
+colnames_demographic_features = snakemake.params["demographic_features"]
+rowsnan_colsnan_days_colsvar_threshold = snakemake.params["rowsnan_colsnan_days_colsvar_threshold"]
+
+
+data = pd.read_csv(snakemake.input[0], index_col=["pid"])
+data_x, data_y = data.drop("target", axis=1), data[["target"]]
+targets_value_counts = data_y["target"].value_counts()
+
+baseline_metrics = pd.DataFrame(columns=["method", "fullMethodName", "accuracy", "precision0", "recall0", "f10", "precision1", "recall1", "f11", "auc", "kappa"])
+if len(targets_value_counts) < 2:
+    fout = open(snakemake.log[0], "w")
+    fout.write(targets_value_counts.to_string())
+    fout.close()
+
+else:
+    if min(targets_value_counts) >= 6:
+        oversampler_type = "SMOTE"
+    else:
+        oversampler_type = "RandomOverSampler"
+
+    # Baseline 1: majority class classifier => predict every sample as majority class
+    baseline1_metrics, majority_class = baselineAccuracyOfMajorityClassClassifier(data_y)
+    majority_ratio = baseline1_metrics["accuracy"]
+    # Baseline 2: random weighted classifier => random classifier with binomial distribution
+    baseline2_metrics = baselineMetricsOfRandomWeightedClassifier(data_y, majority_ratio, majority_class, 1000)
+    # Baseline 3: decision tree with demographic features
+    baseline3_metrics = baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x[colnames_demographic_features], data_y, oversampler_type)
+
+    baselines = [baseline1_metrics, baseline2_metrics, baseline3_metrics]
+
+    baseline_metrics = pd.DataFrame({"method": ["majority", "rwc", "dt"],
+                                     "fullMethodName": ["MajorityClassClassifier", "RandomWeightedClassifier", "DecisionTreeWithDemographicFeatures"],
+                                     "accuracy": [baseline["accuracy"] for baseline in baselines],
+                                     "precision0": [baseline["precision0"] for baseline in baselines],
+                                     "recall0": [baseline["recall0"] for baseline in baselines],
+                                     "f10": [baseline["f10"] for baseline in baselines],
+                                     "precision1": [baseline["precision1"] for baseline in baselines],
+                                     "recall1": [baseline["recall1"] for baseline in baselines],
+                                     "f11": [baseline["f11"] for baseline in baselines],
+                                     "auc": [baseline["auc"] for baseline in baselines],
+                                     "kappa": [baseline["kappa"] for baseline in baselines]})
+
+baseline_metrics.to_csv(snakemake.output[0], index=False)
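
A few notes on the patch, for reviewers reading the diff without the repo checked out.

The new loop at the top of the Snakefile flattens the MODEL_NAMES / MODEL_SCALER mapping from config.yaml into two parallel lists, so each model can later be zipped with each of its scalers inside expand(). A quick illustration with made-up config values (the real model names and scalers come from config.yaml):

```python
# Illustrative config fragment; the real values live in config.yaml.
config = {"PARAMS_FOR_ANALYSIS": {
    "MODEL_NAMES": ["LogReg", "DT"],
    "MODEL_SCALER": {"LogReg": ["minmaxscaler", "standardscaler"],
                     "DT": ["notnormalized"]}}}

models, scalers = [], []
for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]:
    # Repeat the model name once per scaler so the two lists stay aligned.
    models = models + [model_name] * len(config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name])
    scalers = scalers + config["PARAMS_FOR_ANALYSIS"]["MODEL_SCALER"][model_name]

print(models)   # ['LogReg', 'LogReg', 'DT']
print(scalers)  # ['minmaxscaler', 'standardscaler', 'notnormalized']
```

zip(models, scalers) then yields every valid (model, scaler) pair without building the full cross product of models and scalers.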
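In baselineMetricsOfRandomWeightedClassifier, the draw np.random.RandomState(i).multinomial(1, probabilities, n) returns n one-hot rows over [class 0, class 1], so taking column [:, 1] yields a 0/1 label sampled with P(label = 1) = probabilities[1]; seeding with the iteration index i keeps all 1000 iterations reproducible. A minimal self-contained check (the values here are illustrative):

```python
import numpy as np

probabilities = [0.7, 0.3]  # [P(class 0), P(class 1)], i.e. class 0 is the majority
draws = np.random.RandomState(0).multinomial(1, probabilities, 5)
print(draws)        # five one-hot rows over [class 0, class 1]
print(draws[:, 1])  # column 1 is the sampled 0/1 label vector
```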
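Finally, baseline.py reuses getMetrics and createPipeline from the existing modeling_utils module (src/models/modeling_utils.py), which this diff does not touch. For orientation only, here is a minimal sketch of the interface the new script assumes; it is not the repository's actual implementation, and the exact metric definitions are assumptions:

```python
# Hypothetical sketch of the modeling_utils interface baseline.py relies on;
# the real module lives in src/models/modeling_utils.py and may differ.
import numpy as np
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             roc_auc_score, cohen_kappa_score)
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler

def getMetrics(pred_y, pred_y_prob, true_y):
    # Returns numpy scalars so callers can use metrics[key].item().
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_y, pred_y, labels=[0, 1], zero_division=0)
    return {"accuracy": np.float64(accuracy_score(true_y, pred_y)),
            "precision0": precision[0], "recall0": recall[0], "f10": f1[0],
            "precision1": precision[1], "recall1": recall[1], "f11": f1[1],
            "auc": np.float64(roc_auc_score(true_y, pred_y_prob)),
            "kappa": np.float64(cohen_kappa_score(true_y, pred_y))}

def createPipeline(model_name, oversampler_type):
    # Oversampling lives inside the pipeline so that, under cross-validation,
    # resampling is fit on each training fold only.
    oversampler = (SMOTE(random_state=0) if oversampler_type == "SMOTE"
                   else RandomOverSampler(random_state=0))
    if model_name == "DT":
        return Pipeline([("sampling", oversampler),
                         ("clf", DecisionTreeClassifier(random_state=0))])
    raise ValueError(f"Unknown model: {model_name}")
```

This also explains the min(targets_value_counts) >= 6 guard in baseline.py: SMOTE with its default k_neighbors = 5 needs at least 6 minority-class samples, so smaller classes fall back to RandomOverSampler.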