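# Snakemake rules for the example analysis workflow: demographic/target data preparation, feature cleaning, baselines and modelling.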
# Runs download_demographic_data.R once per participant ({pid}).
# Input: the participant's YAML file. The SOURCE and TABLE settings are read
# from config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"] and handed to the script.
# Output: data/raw/{pid}/participant_info_raw.csv.
rule download_demographic_data:
    input:
        participant_file = "data/external/participant_files/{pid}.yaml"
    params:
        source = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["SOURCE"],
        table = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["TABLE"],
    output:
        "data/raw/{pid}/participant_info_raw.csv"
    script:
        "../src/data/workflow_example/download_demographic_data.R"

# Runs demographic_features.py once per participant ({pid}).
# Input: the raw participant info CSV (the output path of
# download_demographic_data). The {pid} wildcard and the configured
# DEMOGRAPHIC FEATURES setting are passed to the script as params.
# Output: data/processed/features/{pid}/demographic_features.csv.
rule demographic_features:
    input:
        participant_info = "data/raw/{pid}/participant_info_raw.csv"
    params:
        pid = "{pid}",
        features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FEATURES"]
    output:
        "data/processed/features/{pid}/demographic_features.csv"
    script:
        "../src/features/workflow_example/demographic_features.py"

# Runs download_target_data.R once per participant ({pid}).
# Input: the participant's YAML file. The SOURCE and TABLE settings are read
# from config["PARAMS_FOR_ANALYSIS"]["TARGET"] and handed to the script.
# Output: data/raw/{pid}/participant_target_raw.csv.
rule download_target_data:
    input:
        participant_file = "data/external/participant_files/{pid}.yaml"
    params:
        source = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["SOURCE"],
        table = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["TABLE"],
    output:
        "data/raw/{pid}/participant_target_raw.csv"
    script:
        "../src/data/workflow_example/download_target_data.R"

# Runs the shared readable_datetime.R script on a participant's raw target data.
# Inputs: the raw target CSV and the participant's time segments file.
# Params: the TIMEZONE entry of the TARGET SOURCE config plus the
# TIME_SEGMENTS TYPE and INCLUDE_PAST_PERIODIC_SEGMENTS settings.
# Output: data/raw/{pid}/participant_target_with_datetime.csv.
rule target_readable_datetime:
    input:
        sensor_input = "data/raw/{pid}/participant_target_raw.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv"
    params:
        fixed_timezone = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["SOURCE"]["TIMEZONE"],
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/raw/{pid}/participant_target_with_datetime.csv"
    script:
        "../src/data/readable_datetime.R"

# Runs parse_targets.py once per participant ({pid}).
# Inputs: the target CSV produced by target_readable_datetime and the
# participant's time segment labels file.
# Output: data/processed/targets/{pid}/parsed_targets.csv.
rule parse_targets:
    input:
        targets = "data/raw/{pid}/participant_target_with_datetime.csv",
        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
    output:
        "data/processed/targets/{pid}/parsed_targets.csv"
    script:
        "../src/models/workflow_example/parse_targets.py"

# Runs clean_sensor_features.R on one participant's merged sensor features.
# Input: the output of merge_sensor_features_for_individual_participants
# (that rule is defined outside this section and must be declared before
# this one so `rules.<name>.output` resolves).
# Params: the NaN/variance/yield thresholds from config["PARAMS_FOR_ANALYSIS"].
# Output: data/processed/features/{pid}/all_sensor_features_cleaned.csv.
rule clean_sensor_features_for_individual_participants:
    input:
        rules.merge_sensor_features_for_individual_participants.output
    params:
        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
        data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
    output:
        "data/processed/features/{pid}/all_sensor_features_cleaned.csv"
    script:
        "../src/models/workflow_example/clean_sensor_features.R"

# Runs clean_sensor_features.R on the sensor features merged across all
# participants. Same script and thresholds as the per-participant rule.
# Input: the output of merge_sensor_features_for_all_participants (that rule
# is defined outside this section and must be declared before this one).
# Output: data/processed/features/all_participants/all_sensor_features_cleaned.csv.
rule clean_sensor_features_for_all_participants:
    input:
        rules.merge_sensor_features_for_all_participants.output
    params:
        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
        data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
    output:
        "data/processed/features/all_participants/all_sensor_features_cleaned.csv"
    script:
        "../src/models/workflow_example/clean_sensor_features.R"

# Runs merge_features_and_targets_for_individual_model.py once per
# participant ({pid}).
# Inputs: the participant's cleaned sensor features and parsed targets.
# Output: data/processed/models/individual_model/{pid}/input.csv.
rule merge_features_and_targets_for_individual_model:
    input:
        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv",
        targets = "data/processed/targets/{pid}/parsed_targets.csv",
    output:
        "data/processed/models/individual_model/{pid}/input.csv"
    script:
        "../src/models/workflow_example/merge_features_and_targets_for_individual_model.py"

# Runs merge_features_and_targets_for_population_model.py once for the whole
# cohort (no wildcards in the output).
# Inputs: the all-participants cleaned sensor features, plus the demographic
# features and parsed targets of every pid listed in config["PIDS"].
# Output: data/processed/models/population_model/input.csv.
rule merge_features_and_targets_for_population_model:
    input:
        cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned.csv",
        demographic_features = expand("data/processed/features/{pid}/demographic_features.csv", pid=config["PIDS"]),
        targets = expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]),
    output:
        "data/processed/models/population_model/input.csv"
    script:
        "../src/models/workflow_example/merge_features_and_targets_for_population_model.py"

# Runs baselines.py on one participant's model input for a given {cv_method}.
# Params: the {cv_method} wildcard and the configured DEMOGRAPHIC FEATURES
# setting (passed as colnames_demographic_features).
# Output: .../individual_model/{pid}/output_{cv_method}/baselines.csv,
# with a log file (baselines_notes.log) in the same directory.
rule baselines_for_individual_model:
    input:
        "data/processed/models/individual_model/{pid}/input.csv"
    params:
        cv_method = "{cv_method}",
        colnames_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FEATURES"],
    output:
        "data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv"
    log:
        "data/processed/models/individual_model/{pid}/output_{cv_method}/baselines_notes.log"
    script:
        "../src/models/workflow_example/baselines.py"

# Runs baselines.py on the population model input for a given {cv_method}.
# Same script and params as the per-participant baselines rule.
# Output: .../population_model/output_{cv_method}/baselines.csv,
# with a log file (baselines_notes.log) in the same directory.
rule baselines_for_population_model:
    input:
        "data/processed/models/population_model/input.csv"
    params:
        cv_method = "{cv_method}",
        colnames_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FEATURES"],
    output:
        "data/processed/models/population_model/output_{cv_method}/baselines.csv"
    log:
        "data/processed/models/population_model/output_{cv_method}/baselines_notes.log"
    script:
        "../src/models/workflow_example/baselines.py"

# Runs modelling.py on one participant's model input for each requested
# combination of {cv_method}, {model} and {scaler}.
# Params: those three wildcards plus the CATEGORICAL_OPERATORS, DEMOGRAPHIC
# CATEGORICAL_FEATURES and MODEL_HYPERPARAMS settings from
# config["PARAMS_FOR_ANALYSIS"].
# Outputs: four CSVs (fold predictions, fold metrics, overall results, fold
# feature importances) under
# .../individual_model/{pid}/output_{cv_method}/{model}/{scaler}/,
# with notes.log in the same directory.
rule modelling_for_individual_participants:
    input:
        data = "data/processed/models/individual_model/{pid}/input.csv"
    params:
        model = "{model}",
        cv_method = "{cv_method}",
        scaler = "{scaler}",
        categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"],
        categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["CATEGORICAL_FEATURES"],
        model_hyperparams = config["PARAMS_FOR_ANALYSIS"]["MODEL_HYPERPARAMS"],
    output:
        fold_predictions = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/fold_predictions.csv",
        fold_metrics = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/fold_metrics.csv",
        overall_results = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/overall_results.csv",
        fold_feature_importances = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/fold_feature_importances.csv"
    log:
        "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/notes.log"
    script:
        "../src/models/workflow_example/modelling.py"

# Runs modelling.py on the population model input for each requested
# combination of {cv_method}, {model} and {scaler}.
# Same script and params as the per-participant modelling rule.
# Outputs: four CSVs (fold predictions, fold metrics, overall results, fold
# feature importances) under
# .../population_model/output_{cv_method}/{model}/{scaler}/,
# with notes.log in the same directory.
rule modelling_for_all_participants:
    input:
        data = "data/processed/models/population_model/input.csv"
    params:
        model = "{model}",
        cv_method = "{cv_method}",
        scaler = "{scaler}",
        categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"],
        categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["CATEGORICAL_FEATURES"],
        model_hyperparams = config["PARAMS_FOR_ANALYSIS"]["MODEL_HYPERPARAMS"],
    output:
        fold_predictions = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/fold_predictions.csv",
        fold_metrics = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/fold_metrics.csv",
        overall_results = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/overall_results.csv",
        fold_feature_importances = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/fold_feature_importances.csv"
    log:
        "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/notes.log"
    script:
        "../src/models/workflow_example/modelling.py"