2020-04-16 18:38:28 +02:00
|
|
|
def input_merge_features_of_single_participant(wildcards):
    """Return the per-feature CSV paths to merge for one participant.

    The feature names are read from ``config["PARAMS_FOR_ANALYSIS"]``: the
    ``PHONE_FEATURES`` and ``FITBIT_FEATURES`` entries concatenated when the
    ``source`` wildcard equals ``"phone_fitbit_features"``, otherwise the
    entry whose key is the upper-cased ``source`` wildcard.

    Args:
        wildcards: Snakemake wildcards; ``pid``, ``source`` and
            ``day_segment`` are read.

    Returns:
        The result of ``expand`` over the feature names for the given
        participant and day segment.
    """
    analysis_params = config["PARAMS_FOR_ANALYSIS"]

    if wildcards.source == "phone_fitbit_features":
        feature_names = analysis_params["PHONE_FEATURES"] + analysis_params["FITBIT_FEATURES"]
    else:
        feature_names = analysis_params[wildcards.source.upper()]

    return expand(
        "data/processed/{pid}/{features}_{day_segment}.csv",
        pid=wildcards.pid,
        features=feature_names,
        day_segment=wildcards.day_segment,
    )
|
2020-03-09 18:32:14 +01:00
|
|
|
|
2020-03-18 02:15:53 +01:00
|
|
|
def optional_input_days_to_include(wildcards):
    """Return the days-to-analyse file as an optional rule input.

    Args:
        wildcards: Snakemake wildcards; not read by this function. The
            returned path keeps the literal ``{pid}`` placeholder.

    Returns:
        A one-element list holding
        ``data/interim/{pid}/days_to_analyse_<before>_<hospital>_<after>.csv``
        (the three values come from the ``DAYS_TO_ANALYSE`` config section)
        when ``DAYS_TO_ANALYSE["ENABLED"]`` is truthy, otherwise ``[]``.
    """
    days_params = config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]
    if not days_params["ENABLED"]:
        return []

    # This input automatically triggers the rule days_to_analyse in mystudy.snakefile
    suffix = "_".join(
        str(days_params[key])
        for key in ("DAYS_BEFORE_SURGERY", "DAYS_IN_HOSPITAL", "DAYS_AFTER_DISCHARGE")
    )
    return ["data/interim/{pid}/days_to_analyse_" + suffix + ".csv"]
|
|
|
|
|
|
|
|
def optional_input_valid_sensed_days(wildcards):
    """Return the valid-sensed-days file as an optional rule input.

    Args:
        wildcards: Snakemake wildcards; not read by this function. The
            returned path keeps the literal ``{pid}`` placeholder.

    Returns:
        ``["data/interim/{pid}/phone_valid_sensed_days.csv"]`` when
        ``DROP_VALID_SENSED_DAYS["ENABLED"]`` is truthy in the config,
        otherwise ``[]``.
    """
    drop_enabled = config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]
    # This input automatically triggers the rule phone_valid_sensed_days in preprocessing.snakefile
    return ["data/interim/{pid}/phone_valid_sensed_days.csv"] if drop_enabled else []
|
|
|
|
|
2020-04-16 18:38:28 +02:00
|
|
|
# Runs merge_features_for_individual_model.R on one participant's feature
# files for a given {source} and {day_segment}. The two optional inputs are
# empty lists unless their corresponding config switches are enabled.
rule merge_features_for_individual_model:
    input:
        feature_files = input_merge_features_of_single_participant,
        phone_valid_sensed_days = optional_input_valid_sensed_days,
        days_to_include = optional_input_days_to_include,
    output:
        "data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv"
    params:
        # The source wildcard is forwarded to the script as a parameter.
        source = "{source}",
    script:
        "../src/models/merge_features_for_individual_model.R"
|
2020-03-09 18:32:14 +01:00
|
|
|
|
2020-04-16 20:20:16 +02:00
|
|
|
# Runs merge_features_for_population_model.R on the per-participant
# "*_original.csv" files of every participant listed in config["PIDS"].
rule merge_features_for_population_model:
    input:
        # Only {pid} is expanded here; the doubled braces keep {source} and
        # {day_segment} as wildcards resolved from the requested output.
        feature_files = expand(
            "data/processed/{pid}/data_for_individual_model/{{source}}_{{day_segment}}_original.csv",
            pid=config["PIDS"],
        ),
    output:
        "data/processed/data_for_population_model/{source}_{day_segment}_original.csv"
    script:
        "../src/models/merge_features_for_population_model.R"
|
|
|
|
|
|
|
|
# Runs merge_data_for_population_model.py on the demographic_features.csv
# file of every participant listed in config["PIDS"].
rule merge_demographicfeatures_for_population_model:
    input:
        data_files = expand(
            "data/processed/{pid}/demographic_features.csv",
            pid=config["PIDS"],
        ),
    output:
        "data/processed/data_for_population_model/demographic_features.csv"
    script:
        "../src/models/merge_data_for_population_model.py"
|
|
|
|
|
2020-04-16 18:38:28 +02:00
|
|
|
# Runs merge_data_for_population_model.py on the targets_{summarised}.csv
# file of every participant listed in config["PIDS"].
rule merge_targets_for_population_model:
    input:
        # Only {pid} is expanded here; the doubled braces keep {summarised}
        # as a wildcard resolved from the requested output.
        data_files = expand(
            "data/processed/{pid}/targets_{{summarised}}.csv",
            pid=config["PIDS"],
        ),
    output:
        "data/processed/data_for_population_model/targets_{summarised}.csv"
    script:
        "../src/models/merge_data_for_population_model.py"
|
2020-03-18 02:15:53 +01:00
|
|
|
|
2020-04-16 18:38:28 +02:00
|
|
|
# Runs clean_features_for_model.R on the merged per-participant file.
# The five thresholds are wildcards taken from the directory name of the
# requested output and are forwarded to the script as parameters.
rule clean_features_for_individual_model:
    input:
        rules.merge_features_for_individual_model.output
    output:
        "data/processed/{pid}/data_for_individual_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
    params:
        rows_nan_threshold = "{rows_nan_threshold}",
        cols_nan_threshold = "{cols_nan_threshold}",
        days_before_threshold = "{days_before_threshold}",
        days_after_threshold = "{days_after_threshold}",
        cols_var_threshold = "{cols_var_threshold}",
    script:
        "../src/models/clean_features_for_model.R"
|
2020-03-18 02:15:53 +01:00
|
|
|
|
2020-04-16 18:38:28 +02:00
|
|
|
# Runs clean_features_for_model.R on the merged population-level file.
# The five thresholds are wildcards taken from the directory name of the
# requested output and are forwarded to the script as parameters.
rule clean_features_for_population_model:
    input:
        rules.merge_features_for_population_model.output
    output:
        "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
    params:
        rows_nan_threshold = "{rows_nan_threshold}",
        cols_nan_threshold = "{cols_nan_threshold}",
        days_before_threshold = "{days_before_threshold}",
        days_after_threshold = "{days_after_threshold}",
        cols_var_threshold = "{cols_var_threshold}",
    script:
        "../src/models/clean_features_for_model.R"
|
2020-03-18 02:15:53 +01:00
|
|
|
|
2020-04-30 00:53:54 +02:00
|
|
|
# Runs modeling.py on the cleaned population-level features together with
# the merged demographic features and targets, writing four CSV files
# (fold predictions, fold metrics, overall results, fold feature
# importances) into a directory named after the wildcards.
rule modeling:
    input:
        # The cleaned file lives in the threshold-named sub-directory written
        # by clean_features_for_population_model, so the path must include it;
        # without it no rule in this file can produce the requested input.
        cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
        demographic_features = "data/processed/data_for_population_model/demographic_features.csv",
        targets = "data/processed/data_for_population_model/targets_{summarised}.csv",
    params:
        # Wildcards forwarded to the script.
        model = "{model}",
        cv_method = "{cv_method}",
        source = "{source}",
        day_segment = "{day_segment}",
        summarised = "{summarised}",
        scaler = "{scaler}",
        # Values read from the config file.
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
        numerical_operators = config["PARAMS_FOR_ANALYSIS"]["NUMERICAL_OPERATORS"],
        categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"],
        categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_DEMOGRAPHIC_FEATURES"],
        model_hyperparams = config["PARAMS_FOR_ANALYSIS"]["MODEL_HYPERPARAMS"],
        # NOTE(review): this label and the output directories below join the
        # rows/cols NaN thresholds with "_" whereas the cleaning rules use "|".
        # Left unchanged because targets requested elsewhere may depend on
        # these exact paths -- confirm which separator is intended.
        rowsnan_colsnan_days_colsvar_threshold = "{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}"
    output:
        fold_predictions = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_predictions.csv",
        fold_metrics = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_metrics.csv",
        overall_results = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/overall_results.csv",
        fold_feature_importances = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_feature_importances.csv"
    script:
        "../src/models/modeling.py"
|