Split modeling module into two rules; Add RandomOverSampler for resampling; Add per-run log file; Fix AUC bug

pull/95/head
Meng Li 2020-05-15 18:42:03 -04:00
parent 5fab99d8df
commit 8c8378f74a
6 changed files with 193 additions and 105 deletions

View File

@@ -111,19 +111,31 @@ rule all:
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
        source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
        day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
-   expand("data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/{result_component}.csv",
+   expand("data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv",
+       rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+       cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
+       days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
+       days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
+       cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+       source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+       day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"],
+       summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
+   expand(
+       expand("data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{{model}}/{cv_method}/{source}_{day_segment}_{summarised}_{{scaler}}/{result_component}.csv",
        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
        days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
        days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
-       model = config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"],
        cv_method = config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"],
        source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
        day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"],
        summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"],
-       scaler = config["PARAMS_FOR_ANALYSIS"]["SCALER"],
        result_component = config["PARAMS_FOR_ANALYSIS"]["RESULT_COMPONENTS"]),
+       zip,
+       model = models,
+       scaler = scalers),
    # Vizualisations
    expand("reports/figures/{pid}/{sensor}_heatmap_rows.html", pid=config["PIDS"], sensor=config["SENSORS"]),
    expand("reports/figures/{pid}/compliance_heatmap.html", pid=config["PIDS"]),

View File

@@ -166,26 +166,35 @@ PARAMS_FOR_ANALYSIS:
  PARTICIPANT_DAYS_AFTER_THRESHOLD: 4
  # Extract summarised features from daily features with any of the following substrings
- NUMERICAL_OPERATORS: ["count", "sum", "length", "avg"]
+ NUMERICAL_OPERATORS: ["count", "sum", "length", "avg", "restinghr"]
  CATEGORICAL_OPERATORS: ["mostcommon"]
  MODEL_NAMES: ["LogReg", "kNN", "SVM", "DT", "RF", "GB", "XGBoost", "LightGBM"]
  CV_METHODS: ["LeaveOneOut"]
  SUMMARISED: ["summarised"] # "summarised" or "notsummarised"
- SCALER: ["notnormalized", "minmaxscaler", "standardscaler", "robustscaler"]
  RESULT_COMPONENTS: ["fold_predictions", "fold_metrics", "overall_results", "fold_feature_importances"]
+ MODEL_SCALER:
+   LogReg: ["notnormalized", "minmaxscaler", "standardscaler", "robustscaler"]
+   kNN: ["minmaxscaler", "standardscaler", "robustscaler"]
+   SVM: ["minmaxscaler", "standardscaler", "robustscaler"]
+   DT: ["notnormalized"]
+   RF: ["notnormalized"]
+   GB: ["notnormalized"]
+   XGBoost: ["notnormalized"]
+   LightGBM: ["notnormalized"]
  MODEL_HYPERPARAMS:
    LogReg:
      {"clf__C": [0.01, 0.1, 1, 10, 100], "clf__solver": ["newton-cg", "lbfgs", "liblinear", "saga"], "clf__penalty": ["l2"]}
    kNN:
-     {"clf__n_neighbors": range(1, 21, 2), "clf__weights": ["uniform", "distance"], "clf__metric": ["euclidean", "manhattan", "minkowski"]}
+     {"clf__n_neighbors": [1, 3, 5], "clf__weights": ["uniform", "distance"], "clf__metric": ["euclidean", "manhattan", "minkowski"]}
    SVM:
      {"clf__C": [0.01, 0.1, 1, 10, 100], "clf__gamma": ["scale", "auto"], "clf__kernel": ["rbf", "poly", "sigmoid"]}
    DT:
-     {"clf__criterion": ["gini", "entropy"], "clf__max_depth": [None, 3, 5, 7, 9], "clf__max_features": [None, "auto", "sqrt", "log2"]}
+     {"clf__criterion": ["gini", "entropy"], "clf__max_depth": [null, 3, 5, 7, 9], "clf__max_features": [null, "auto", "sqrt", "log2"]}
    RF:
-     {"clf__n_estimators": [2, 5, 10, 100],"clf__max_depth": [None, 3, 5, 7, 9]}
+     {"clf__n_estimators": [2, 5, 10, 100],"clf__max_depth": [null, 3, 5, 7, 9]}
    GB:
      {"clf__learning_rate": [0.01, 0.1, 1], "clf__n_estimators": [5, 10, 100, 200], "clf__subsample": [0.5, 0.7, 1.0], "clf__max_depth": [3, 5, 7, 9]}
    XGBoost:
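The None-to-null change in the DT and RF grids matters because YAML only maps null to Python's None; a bare None is loaded as the string "None", which is not a valid max_depth. A minimal sketch, assuming PyYAML-style loading (the keys in the snippet are made up):

import yaml  # assuming PyYAML, the usual loader for Snakemake-style config files

# Minimal sketch: YAML null becomes Python None, a bare None stays a string.
snippet = '{"max_depth_before": [None, 3, 5], "max_depth_after": [null, 3, 5]}'
parsed = yaml.safe_load(snippet)
print(parsed["max_depth_before"])  # ['None', 3, 5] -> the string 'None'
print(parsed["max_depth_after"])   # [None, 3, 5]   -> a real None, i.e. no depth limit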

View File

@@ -1,3 +1,5 @@
+ruleorder: nan_cells_ratio_of_cleaned_features > merge_features_and_targets
 def input_merge_features_of_single_participant(wildcards):
     if wildcards.source == "phone_fitbit_features":
         return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"]["PHONE_FEATURES"] + config["PARAMS_FOR_ANALYSIS"]["FITBIT_FEATURES"], day_segment=wildcards.day_segment)
@@ -93,11 +95,24 @@ rule nan_cells_ratio_of_cleaned_features:
    script:
        "../src/models/nan_cells_ratio_of_cleaned_features.py"

-rule modeling:
+rule merge_features_and_targets:
    input:
-       cleaned_features = "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv",
+       cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
        demographic_features = "data/processed/data_for_population_model/demographic_features.csv",
        targets = "data/processed/data_for_population_model/targets_{summarised}.csv",
+   params:
+       summarised = "{summarised}",
+       cols_var_threshold = "{cols_var_threshold}",
+       numerical_operators = config["PARAMS_FOR_ANALYSIS"]["NUMERICAL_OPERATORS"],
+       categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"]
+   output:
+       "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv"
+   script:
+       "../src/models/merge_features_and_targets.py"
+
+rule modeling:
+   input:
+       data = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv"
    params:
        model = "{model}",
        cv_method = "{cv_method}",
@@ -105,16 +120,16 @@ rule modeling:
        day_segment = "{day_segment}",
        summarised = "{summarised}",
        scaler = "{scaler}",
-       cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
-       numerical_operators = config["PARAMS_FOR_ANALYSIS"]["NUMERICAL_OPERATORS"],
        categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"],
        categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_DEMOGRAPHIC_FEATURES"],
        model_hyperparams = config["PARAMS_FOR_ANALYSIS"]["MODEL_HYPERPARAMS"],
-       rowsnan_colsnan_days_colsvar_threshold = "{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}"
+       rowsnan_colsnan_days_colsvar_threshold = "{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}"
    output:
-       fold_predictions = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_predictions.csv",
-       fold_metrics = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_metrics.csv",
-       overall_results = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/overall_results.csv",
-       fold_feature_importances = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_feature_importances.csv"
+       fold_predictions = "data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_predictions.csv",
+       fold_metrics = "data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_metrics.csv",
+       overall_results = "data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/overall_results.csv",
+       fold_feature_importances = "data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_feature_importances.csv"
+   log:
+       "data/processed/output_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/notes.log"
    script:
        "../src/models/modeling.py"

View File

@@ -0,0 +1,50 @@
+import pandas as pd
+import numpy as np
+from modeling_utils import getMatchingColNames, dropZeroVarianceCols
+
+
+def summarisedNumericalFeatures(col_names, features):
+    numerical_features = features.groupby(["pid"])[col_names].var()
+    numerical_features.columns = numerical_features.columns.str.replace("daily", "overallvar")
+    return numerical_features
+
+def summarisedCategoricalFeatures(col_names, features):
+    categorical_features = features.groupby(["pid"])[col_names].agg(lambda x: int(pd.Series.mode(x)[0]))
+    categorical_features.columns = categorical_features.columns.str.replace("daily", "overallmode")
+    return categorical_features
+
+def summariseFeatures(features, numerical_operators, categorical_operators, cols_var_threshold):
+    numerical_col_names = getMatchingColNames(numerical_operators, features)
+    categorical_col_names = getMatchingColNames(categorical_operators, features)
+    numerical_features = summarisedNumericalFeatures(numerical_col_names, features)
+    categorical_features = summarisedCategoricalFeatures(categorical_col_names, features)
+    features = pd.concat([numerical_features, categorical_features], axis=1)
+    if cols_var_threshold == "True": # double check the categorical features
+        features = dropZeroVarianceCols(features)
+    elif cols_var_threshold == "False":
+        pass
+    else:
+        raise ValueError("COLS_VAR_THRESHOLD parameter in config.yaml can only be 'True' or 'False'")
+    return features
+
+
+summarised = snakemake.params["summarised"]
+cols_var_threshold = snakemake.params["cols_var_threshold"]
+numerical_operators = snakemake.params["numerical_operators"]
+categorical_operators = snakemake.params["categorical_operators"]
+
+features = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"])
+demographic_features = pd.read_csv(snakemake.input["demographic_features"], index_col=["pid"])
+targets = pd.read_csv(snakemake.input["targets"], index_col=["pid"])
+
+# Extract summarised features based on daily features:
+# for numerical features: calculate variance across all days
+# for categorical features: calculate mode across all days
+if summarised == "summarised":
+    features = summariseFeatures(features, numerical_operators, categorical_operators, cols_var_threshold)
+
+data = pd.concat([features, demographic_features, targets], axis=1, join="inner")
+data.to_csv(snakemake.output[0], index=True)
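For reference, a tiny self-contained run of the numerical summarisation path. getMatchingColNames is inlined from modeling_utils so the sketch stands alone, and the toy column names and values are made up:

import pandas as pd

# Inlined from modeling_utils.getMatchingColNames so this sketch is self-contained.
def getMatchingColNames(operators, features):
    return [col for col in features.columns if any(op in col for op in operators)]

def summarisedNumericalFeatures(col_names, features):
    numerical_features = features.groupby(["pid"])[col_names].var()
    numerical_features.columns = numerical_features.columns.str.replace("daily", "overallvar")
    return numerical_features

# Toy daily features for two participants (illustrative column names only).
daily = pd.DataFrame({
    "pid": ["p01", "p01", "p02", "p02"],
    "daily_step_count": [1000, 3000, 500, 700],
    "daily_screen_sum": [10.0, 12.0, 3.0, 4.0],
})
numerical_cols = getMatchingColNames(["count", "sum"], daily)
print(summarisedNumericalFeatures(numerical_cols, daily))
# One row per pid, with overallvar_* columns holding the across-day variance.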

View File

@@ -1,34 +1,9 @@
 import pandas as pd
-from modeling_utils import dropZeroVarianceCols, getNormAllParticipantsScaler, getMetrics, getFeatureImportances, createPipeline
+import numpy as np
+from modeling_utils import getMatchingColNames, dropZeroVarianceCols, getNormAllParticipantsScaler, getMetrics, getFeatureImportances, createPipeline
 from sklearn.model_selection import train_test_split, LeaveOneOut, GridSearchCV, cross_val_score, KFold
-
-def getMatchingColNames(operators, features):
-    col_names = []
-    for col in features.columns:
-        if any(operator in col for operator in operators):
-            col_names.append(col)
-    return col_names
-
-def summarisedNumericalFeatures(col_names, features):
-    numerical_features = features.groupby(["pid"])[col_names].var()
-    numerical_features.columns = numerical_features.columns.str.replace("daily", "overallvar")
-    return numerical_features
-
-def summarisedCategoricalFeatures(col_names, features):
-    categorical_features = features.groupby(["pid"])[col_names].agg(lambda x: int(pd.Series.mode(x)[0]))
-    categorical_features.columns = categorical_features.columns.str.replace("daily", "overallmode")
-    return categorical_features
-
-def summariseFeatures(features, numerical_operators, categorical_operators, cols_var_threshold):
-    numerical_col_names = getMatchingColNames(numerical_operators, features)
-    categorical_col_names = getMatchingColNames(categorical_operators, features)
-    numerical_features = summarisedNumericalFeatures(numerical_col_names, features)
-    categorical_features = summarisedCategoricalFeatures(categorical_col_names, features)
-    features = pd.concat([numerical_features, categorical_features], axis=1)
-    if cols_var_threshold: # double check the categorical features
-        features = dropZeroVarianceCols(features)
-    return features

 def preprocessNumericalFeatures(train_numerical_features, test_numerical_features, scaler, flag):
     # fillna with mean
@@ -67,17 +42,15 @@ def preprocesFeatures(train_numerical_features, test_numerical_features, categor
 ##############################################################
 # Summary of the workflow
-# Step 1. Read parameters, features and targets
-# Step 2. Extract summarised features based on daily features
-# Step 3. Create pipeline
-# Step 4. Nested cross validation
-# Step 5. Model evaluation
-# Step 6. Save results, parameters, and metrics to CSV files
+# Step 1. Read parameters and data
+# Step 2. Nested cross validation
+# Step 3. Model evaluation
+# Step 4. Save results, parameters, and metrics to CSV files
 ##############################################################

-# Step 1. Read parameters, features and targets
+# Step 1. Read parameters and data

 # Read parameters
 model = snakemake.params["model"]
 source = snakemake.params["source"]
@@ -85,39 +58,24 @@ summarised = snakemake.params["summarised"]
 day_segment = snakemake.params["day_segment"]
 scaler = snakemake.params["scaler"]
 cv_method = snakemake.params["cv_method"]
-cols_var_threshold = snakemake.params["cols_var_threshold"]
-numerical_operators = snakemake.params["numerical_operators"]
 categorical_operators = snakemake.params["categorical_operators"]
 categorical_colnames_demographic_features = snakemake.params["categorical_demographic_features"]
 model_hyperparams = snakemake.params["model_hyperparams"][model]
 rowsnan_colsnan_days_colsvar_threshold = snakemake.params["rowsnan_colsnan_days_colsvar_threshold"] # thresholds for data cleaning

-# Read features and targets
-demographic_features = pd.read_csv(snakemake.input["demographic_features"], index_col=["pid"])
-targets = pd.read_csv(snakemake.input["targets"], index_col=["pid"])
-features = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"])
-# Compute the proportion of missing value cells among all features
-nan_ratio = features.isnull().sum().sum() / (features.shape[0] * features.shape[1])
-# Step 2. Extract summarised features based on daily features:
-# for categorical features: calculate variance across all days
-# for numerical features: calculate mode across all days
-if summarised == "summarised":
-    features = summariseFeatures(features, numerical_operators, categorical_operators, cols_var_threshold)
-categorical_feature_colnames = categorical_colnames_demographic_features + getMatchingColNames(categorical_operators, features)
-data = pd.concat([features, demographic_features, targets], axis=1, join="inner")
+# Read data and split
+data = pd.read_csv(snakemake.input["data"], index_col=["pid"])
 data_x, data_y = data.drop("target", axis=1), data[["target"]]
+categorical_feature_colnames = categorical_colnames_demographic_features + getMatchingColNames(categorical_operators, data_x)

-# Step 3. Create pipeline
-pipeline = createPipeline(model)
+# Step 2. Nested cross validation
 cv_class = globals()[cv_method]
 inner_cv = cv_class()
 outer_cv = cv_class()

-# Step 4. Nested cross validation
 fold_id, pid, best_params, true_y, pred_y, pred_y_prob = [], [], [], [], [], []
 feature_importances_all_folds = pd.DataFrame()
 fold_count = 1
@@ -143,14 +101,33 @@ for train_index, test_index in outer_cv.split(data_x):
    num_of_participants = train_x.shape[0] + test_x.shape[0]
    num_of_features = train_x.shape[1]

+   targets_value_counts = train_y["target"].value_counts()
+   if len(targets_value_counts) < 2 or max(targets_value_counts) < 5:
+       notes = open(snakemake.log[0], mode="w")
+       notes.write(targets_value_counts.to_string())
+       notes.close()
+       break
+
    # Inner cross validation
-   clf = GridSearchCV(estimator=pipeline, param_grid=model_hyperparams, cv=inner_cv, scoring="f1_micro")
+   if min(targets_value_counts) >= 6:
+       # SMOTE requires n_neighbors <= n_samples, the default value of n_neighbors is 6
+       clf = GridSearchCV(estimator=createPipeline(model, "SMOTE"), param_grid=model_hyperparams, cv=inner_cv, scoring="f1_micro")
+   else:
+       # RandomOverSampler: over-sample the minority class(es) by picking samples at random with replacement.
+       clf = GridSearchCV(estimator=createPipeline(model, "RandomOverSampler"), param_grid=model_hyperparams, cv=inner_cv, scoring="f1_micro")
    clf.fit(train_x, train_y.values.ravel())

    # Collect results and parameters
    best_params = best_params + [clf.best_params_]
-   pred_y = pred_y + clf.predict(test_x).tolist()
-   pred_y_prob = pred_y_prob + clf.predict_proba(test_x)[:, 1].tolist()
+   cur_fold_pred = clf.predict(test_x).tolist()
+   pred_y = pred_y + cur_fold_pred
+   proba_of_two_categories = clf.predict_proba(test_x).tolist()
+   if cur_fold_pred[0]:
+       pred_y_prob = pred_y_prob + [row[proba_of_two_categories[0].index(max(proba_of_two_categories[0]))] for row in proba_of_two_categories]
+   else:
+       pred_y_prob = pred_y_prob + [row[proba_of_two_categories[0].index(min(proba_of_two_categories[0]))] for row in proba_of_two_categories]
    true_y = true_y + test_y.values.ravel().tolist()
    pid = pid + test_y.index.tolist() # each test partition (fold) in the outer cv is a participant (LeaveOneOut cv)
    feature_importances_current_fold = getFeatureImportances(model, clf.best_estimator_.steps[1][1], train_x.columns)
@@ -158,13 +135,16 @@ for train_index, test_index in outer_cv.split(data_x):
    fold_id.append(fold_count)
    fold_count = fold_count + 1

-# Step 5. Model evaluation
-acc, pre1, recall1, f11, auc, kappa = getMetrics(pred_y, pred_y_prob, true_y)
+# Step 3. Model evaluation
+if len(pred_y) > 1:
+   metrics = getMetrics(pred_y, pred_y_prob, true_y)
+else:
+   metrics = {"accuracy": None, "precision0": None, "recall0": None, "f10": None, "precision1": None, "recall1": None, "f11": None, "auc": None, "kappa": None}

-# Step 6. Save results, parameters, and metrics to CSV files
+# Step 4. Save results, parameters, and metrics to CSV files
 fold_predictions = pd.DataFrame({"fold_id": fold_id, "pid": pid, "hyperparameters": best_params, "true_y": true_y, "pred_y": pred_y, "pred_y_prob": pred_y_prob})
-fold_metrics = pd.DataFrame({"fold_id":[], "accuracy":[], "precision1": [], "recall1": [], "f11": [], "auc": [], "kappa": []})
-overall_results = pd.DataFrame({"num_of_participants": [num_of_participants], "num_of_features": [num_of_features], "nan_ratio": [nan_ratio], "rowsnan_colsnan_days_colsvar_threshold": [rowsnan_colsnan_days_colsvar_threshold], "model": [model], "cv_method": [cv_method], "source": [source], "scaler": [scaler], "day_segment": [day_segment], "summarised": [summarised], "accuracy": [acc], "precision1": [pre1], "recall1": [recall1], "f11": [f11], "auc": [auc], "kappa": [kappa]})
+fold_metrics = pd.DataFrame({"fold_id":[], "accuracy":[], "precision0": [], "recall0": [], "f10": [], "precision1": [], "recall1": [], "f11": [], "auc": [], "kappa": []})
+overall_results = pd.DataFrame({"num_of_participants": [num_of_participants], "num_of_features": [num_of_features], "rowsnan_colsnan_days_colsvar_threshold": [rowsnan_colsnan_days_colsvar_threshold], "model": [model], "cv_method": [cv_method], "source": [source], "scaler": [scaler], "day_segment": [day_segment], "summarised": [summarised], "accuracy": [metrics["accuracy"]], "precision0": [metrics["precision0"]], "recall0": [metrics["recall0"]], "f10": [metrics["f10"]], "precision1": [metrics["precision1"]], "recall1": [metrics["recall1"]], "f11": [metrics["f11"]], "auc": [metrics["auc"]], "kappa": [metrics["kappa"]]})
 feature_importances_all_folds.insert(loc=0, column='fold_id', value=fold_id)
 feature_importances_all_folds.insert(loc=1, column='pid', value=pid)
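The AUC fix above recovers the positive-class probability from predict_proba by indexing with the first fold prediction. A more direct way to pick that column (shown only as a hedged, self-contained alternative sketch, not what this commit does) is to look it up through the fitted classes_ attribute:

from sklearn.linear_model import LogisticRegression
import numpy as np

# Hedged alternative sketch: select the predict_proba column for label 1 via classes_.
X = np.array([[0.0], [0.1], [1.0], [1.1], [0.2], [0.9]])
y = np.array([0, 0, 1, 1, 0, 1])
clf = LogisticRegression(random_state=0).fit(X, y)
proba = clf.predict_proba(X)                      # shape (n_samples, n_classes)
positive_col = list(clf.classes_).index(1)        # column holding P(label == 1)
positive_probs = proba[:, positive_col].tolist()  # what roc_auc_score expects as y_score
print(positive_probs)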

View File

@@ -1,11 +1,18 @@
 import pandas as pd
 from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
+from imblearn.over_sampling import SMOTE
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
 from sklearn.metrics import precision_recall_fscore_support
 from sklearn.metrics import cohen_kappa_score, roc_auc_score
 from imblearn.pipeline import Pipeline
-from imblearn.over_sampling import SMOTE
+from imblearn.over_sampling import SMOTE, RandomOverSampler
+
+def getMatchingColNames(operators, features):
+    col_names = []
+    for col in features.columns:
+        if any(operator in col for operator in operators):
+            col_names.append(col)
+    return col_names

 # drop columns with zero variance
 def dropZeroVarianceCols(data):
@@ -39,14 +46,21 @@ def getNormAllParticipantsScaler(features, scaler_flag):
 # get metrics: accuracy, precision1, recall1, f11, auc, kappa
 def getMetrics(pred_y, pred_y_prob, true_y):
-    acc = accuracy_score(true_y, pred_y)
-    pre1 = precision_score(true_y, pred_y, average=None, labels=[0,1])[1]
-    recall1 = recall_score(true_y, pred_y, average=None, labels=[0,1])[1]
-    f11 = f1_score(true_y, pred_y, average=None, labels=[0,1])[1]
-    auc = roc_auc_score(true_y, pred_y_prob)
-    kappa = cohen_kappa_score(true_y, pred_y)
+    metrics = {}
+    # metrics for all categories
+    metrics["accuracy"] = accuracy_score(true_y, pred_y)
+    metrics["auc"] = roc_auc_score(true_y, pred_y_prob)
+    metrics["kappa"] = cohen_kappa_score(true_y, pred_y)
+    # metrics for label 0
+    metrics["precision0"] = precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[0]
+    metrics["recall0"] = recall_score(true_y, pred_y, average=None, labels=[0,1])[0]
+    metrics["f10"] = f1_score(true_y, pred_y, average=None, labels=[0,1])[0]
+    # metrics for label 1
+    metrics["precision1"] = precision_score(true_y, pred_y, average=None, labels=[0,1], zero_division=0)[1]
+    metrics["recall1"] = recall_score(true_y, pred_y, average=None, labels=[0,1])[1]
+    metrics["f11"] = f1_score(true_y, pred_y, average=None, labels=[0,1])[1]

-    return acc, pre1, recall1, f11, auc, kappa
+    return metrics

 # get feature importances
 def getFeatureImportances(model, clf, cols):
@@ -83,54 +97,62 @@ def getFeatureImportances(model, clf, cols):
     return feature_importances

-def createPipeline(model):
+def createPipeline(model, oversampler_type):
+
+    if oversampler_type == "SMOTE":
+        oversampler = SMOTE(sampling_strategy="minority", random_state=0)
+    elif oversampler_type == "RandomOverSampler":
+        oversampler = RandomOverSampler(sampling_strategy="minority", random_state=0)
+    else:
+        raise ValueError("RAPIDS pipeline only support 'SMOTE' and 'RandomOverSampler' oversampling methods.")

     if model == "LogReg":
         from sklearn.linear_model import LogisticRegression
         pipeline = Pipeline([
-            ("sampling", SMOTE(sampling_strategy="minority", random_state=0)),
+            ("sampling", oversampler),
             ("clf", LogisticRegression(random_state=0))
         ])
     elif model == "kNN":
         from sklearn.neighbors import KNeighborsClassifier
         pipeline = Pipeline([
-            ("sampling", SMOTE(sampling_strategy="minority", random_state=0)),
+            ("sampling", oversampler),
             ("clf", KNeighborsClassifier())
         ])
     elif model == "SVM":
         from sklearn.svm import SVC
         pipeline = Pipeline([
-            ("sampling", SMOTE(sampling_strategy="minority", random_state=0)),
+            ("sampling", oversampler),
             ("clf", SVC(random_state=0, probability=True))
         ])
     elif model == "DT":
         from sklearn.tree import DecisionTreeClassifier
         pipeline = Pipeline([
-            ("sampling", SMOTE(sampling_strategy="minority", random_state=0)),
+            ("sampling", oversampler),
             ("clf", DecisionTreeClassifier(random_state=0))
         ])
     elif model == "RF":
         from sklearn.ensemble import RandomForestClassifier
         pipeline = Pipeline([
-            ("sampling", SMOTE(sampling_strategy="minority", random_state=0)),
+            ("sampling", oversampler),
             ("clf", RandomForestClassifier(random_state=0))
         ])
     elif model == "GB":
         from sklearn.ensemble import GradientBoostingClassifier
         pipeline = Pipeline([
-            ("sampling", SMOTE(sampling_strategy="minority", random_state=0)),
+            ("sampling", oversampler),
             ("clf", GradientBoostingClassifier(random_state=0))
         ])
     elif model == "XGBoost":
         from xgboost import XGBClassifier
         pipeline = Pipeline([
-            ("sampling", SMOTE(sampling_strategy="minority", random_state=0)),
-            ("clf", XGBClassifier(random_state=0))
+            ("sampling", oversampler),
+            ("clf", XGBClassifier(random_state=0, n_jobs=36))
         ])
     elif model == "LightGBM":
         from lightgbm import LGBMClassifier
         pipeline = Pipeline([
-            ("sampling", SMOTE(sampling_strategy="minority", random_state=0)),
-            ("clf", LGBMClassifier(random_state=0))
+            ("sampling", oversampler),
+            ("clf", LGBMClassifier(random_state=0, n_jobs=36))
         ])
     else:
         raise ValueError("RAPIDS pipeline only support LogReg, kNN, SVM, DT, RF, GB, XGBoost, and LightGBM algorithms for classification problems.")