2020-05-16 00:42:03 +02:00
# Both rules write "{source}_{day_segment}_<suffix>.csv" into the same directory, so a
# request for "..._nancellsratio.csv" also matches merge_features_and_targets (with
# summarised="nancellsratio"). This ordering tells Snakemake to prefer the dedicated rule.
ruleorder: nan_cells_ratio_of_cleaned_features > merge_features_and_targets
2020-07-29 21:25:26 +02:00
# Runs select_days_to_analyse.py on one participant's raw GROUNDTRUTH_TABLE file.
# The three day-window values are wildcards taken from the requested output file name
# and are handed to the script as params.
rule days_to_analyse:
    input:
        participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv"
    params:
        days_before_surgery = "{days_before_surgery}",
        days_in_hospital = "{days_in_hospital}",
        days_after_discharge = "{days_after_discharge}"
    output:
        "data/interim/{pid}/days_to_analyse_{days_before_surgery}_{days_in_hospital}_{days_after_discharge}.csv"
    script:
        "../src/models/select_days_to_analyse.py"
# Produces data/processed/{pid}/targets_{summarised}.csv from the participant's raw
# TARGET_TABLE file by running targets.py. Both wildcards ({pid}, {summarised}) are
# also forwarded to the script as params.
rule targets:
    input:
        participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["TARGET_TABLE"] + "_raw.csv"
    params:
        pid = "{pid}",
        summarised = "{summarised}"
    output:
        "data/processed/{pid}/targets_{summarised}.csv"
    script:
        "../src/models/targets.py"
# Produces data/processed/{pid}/demographic_features.csv from the participant's raw
# GROUNDTRUTH_TABLE file by running demographic_features.py. The list of features comes
# from config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC_FEATURES"].
rule demographic_features:
    input:
        participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv"
    params:
        pid = "{pid}",
        features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC_FEATURES"]
    output:
        "data/processed/{pid}/demographic_features.csv"
    script:
        "../src/features/demographic_features.py"
2020-04-16 18:38:28 +02:00
# Merges one participant's feature files into a single "<source>_<day_segment>_original.csv"
# by running merge_features_for_individual_model.R.
# All three inputs are resolved by input functions that are defined outside this section
# of the file.
rule merge_features_for_individual_model:
    input:
        feature_files = input_merge_features_of_single_participant,
        phone_valid_sensed_days = optional_input_valid_sensed_days,
        days_to_include = optional_input_days_to_include
    params:
        source = "{source}"
    output:
        "data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv"
    script:
        "../src/models/merge_features_for_individual_model.R"
2020-03-09 18:32:14 +01:00
2020-04-16 20:20:16 +02:00
# Combines the per-participant "*_original.csv" files of every pid in config["PIDS"] into
# one population-level file by running merge_features_for_population_model.R.
# In expand(), only {pid} is filled in; the double-braced names stay as wildcards and are
# bound by the requested output path.
rule merge_features_for_population_model:
    input:
        feature_files = expand("data/processed/{pid}/data_for_individual_model/{{min_valid_hours_per_day}}hours_{{min_valid_bins_per_hour}}bins/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
    output:
        "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{source}_{day_segment}_original.csv"
    script:
        "../src/models/merge_features_for_population_model.R"
# Combines the demographic_features.csv of every pid in config["PIDS"] into one
# population-level file by running merge_data_for_population_model.py.
rule merge_demographicfeatures_for_population_model:
    input:
        data_files = expand("data/processed/{pid}/demographic_features.csv", pid=config["PIDS"])
    output:
        "data/processed/data_for_population_model/demographic_features.csv"
    script:
        "../src/models/merge_data_for_population_model.py"
2020-04-16 18:38:28 +02:00
# Combines targets_{summarised}.csv of every pid in config["PIDS"] into one
# population-level file by running merge_data_for_population_model.py.
# {{summarised}} is double-braced so expand() leaves it as a wildcard; only {pid} is expanded.
rule merge_targets_for_population_model:
    input:
        data_files = expand("data/processed/{pid}/targets_{{summarised}}.csv", pid=config["PIDS"])
    output:
        "data/processed/data_for_population_model/targets_{summarised}.csv"
    script:
        "../src/models/merge_data_for_population_model.py"
2020-03-18 02:15:53 +01:00
2020-04-16 18:38:28 +02:00
# Runs clean_features_for_model.R on the output of merge_features_for_individual_model.
# The five threshold values are wildcards encoded in the output directory name
# ("<rows_nan>|<cols_nan>_<days_before>|<days_after>_<cols_var>") and are forwarded to the
# script as params.
# NOTE(review): "|" is a shell metacharacter and is not allowed in Windows file names, so
# these paths need quoting in any shell command and are not portable to Windows.
rule clean_features_for_individual_model:
    input:
        rules.merge_features_for_individual_model.output
    params:
        features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"],
        cols_nan_threshold = "{cols_nan_threshold}",
        cols_var_threshold = "{cols_var_threshold}",
        days_before_threshold = "{days_before_threshold}",
        days_after_threshold = "{days_after_threshold}",
        rows_nan_threshold = "{rows_nan_threshold}",
    output:
        "data/processed/{pid}/data_for_individual_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
    script:
        "../src/models/clean_features_for_model.R"
2020-03-18 02:15:53 +01:00
2020-04-16 18:38:28 +02:00
# Runs clean_features_for_model.R on the output of merge_features_for_population_model.
# The five threshold values are wildcards encoded in the output directory name
# ("<rows_nan>|<cols_nan>_<days_before>|<days_after>_<cols_var>") and are forwarded to the
# script as params.
rule clean_features_for_population_model:
    input:
        rules.merge_features_for_population_model.output
    params:
        features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"],
        cols_nan_threshold = "{cols_nan_threshold}",
        cols_var_threshold = "{cols_var_threshold}",
        days_before_threshold = "{days_before_threshold}",
        days_after_threshold = "{days_after_threshold}",
        rows_nan_threshold = "{rows_nan_threshold}",
    output:
        "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
    script:
        "../src/models/clean_features_for_model.R"
2020-03-18 02:15:53 +01:00
2020-05-16 00:25:07 +02:00
# Runs nan_cells_ratio_of_cleaned_features.py on a cleaned population-level feature file
# and writes "<source>_<day_segment>_nancellsratio.csv" next to it (same directory, same
# wildcards).
rule nan_cells_ratio_of_cleaned_features:
    input:
        cleaned_features = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
    output:
        "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
    script:
        "../src/models/nan_cells_ratio_of_cleaned_features.py"
2020-05-16 00:42:03 +02:00
# Runs merge_features_and_targets.py with three population-level inputs: the cleaned
# sensor features, the demographic features and the targets for the requested
# {summarised} value. The result is written as "<source>_<day_segment>_<summarised>.csv"
# in the same directory as the cleaned features.
rule merge_features_and_targets:
    input:
        cleaned_features = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv",
        demographic_features = "data/processed/data_for_population_model/demographic_features.csv",
        targets = "data/processed/data_for_population_model/targets_{summarised}.csv",
    params:
        summarised = "{summarised}",
        cols_var_threshold = "{cols_var_threshold}",
        numerical_operators = config["PARAMS_FOR_ANALYSIS"]["NUMERICAL_OPERATORS"],
        categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"],
        features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"],
    output:
        "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv"
    script:
        "../src/models/merge_features_and_targets.py"
2020-05-16 00:45:45 +02:00
# Runs baseline.py on a merged features+targets file for a given {cv_method}.
# The result CSV and the "_notes.log" file both go under
# output_population_model/.../baseline/{cv_method}/.
# The threshold directory name is also passed to the script as one combined string param.
rule baseline:
    input:
        "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv"
    params:
        cv_method = "{cv_method}",
        rowsnan_colsnan_days_colsvar_threshold = "{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}",
        demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC_FEATURES"]
    output:
        "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/baseline/{cv_method}/{source}_{day_segment}_{summarised}.csv"
    log:
        "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/baseline/{cv_method}/{source}_{day_segment}_{summarised}_notes.log"
    script:
        "../src/models/baseline.py"
2020-05-16 00:42:03 +02:00
# Runs modeling.py on a merged features+targets file for one combination of
# {model}, {cv_method} and {scaler}. It declares four CSV outputs (fold predictions,
# fold metrics, overall results, fold feature importances) plus notes.log, all in
# output_population_model/.../{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/.
# Wildcard values and the config entries listed below are forwarded to the script as params.
rule modeling:
    input:
        data = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv"
    params:
        model = "{model}",
        cv_method = "{cv_method}",
        source = "{source}",
        day_segment = "{day_segment}",
        summarised = "{summarised}",
        scaler = "{scaler}",
        categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"],
        categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_DEMOGRAPHIC_FEATURES"],
        model_hyperparams = config["PARAMS_FOR_ANALYSIS"]["MODEL_HYPERPARAMS"],
        rowsnan_colsnan_days_colsvar_threshold = "{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}"
    output:
        fold_predictions = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_predictions.csv",
        fold_metrics = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_metrics.csv",
        overall_results = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/overall_results.csv",
        fold_feature_importances = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_feature_importances.csv"
    log:
        "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/notes.log"
    script:
        "../src/models/modeling.py"
2020-05-16 00:49:14 +02:00
# Runs merge_population_model_results.py with three inputs that share the same wildcards:
# the model's overall_results.csv, the nancellsratio.csv of the cleaned features, and the
# baseline result for the same {cv_method}. It writes merged_population_model_results.csv
# into the model's output directory.
rule merge_population_model_results:
    input:
        overall_results = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/overall_results.csv",
        nan_cells_ratio = "data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv",
        baseline = "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/baseline/{cv_method}/{source}_{day_segment}_{summarised}.csv"
    output:
        "data/processed/output_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/merged_population_model_results.csv"
    script:
        "../src/models/merge_population_model_results.py"