# rapids/rules/models.smk
# (Repository viewer metadata: 166 lines, 7.7 KiB, plaintext.)

# Runs the workflow-example R script that writes the raw participant-info CSV
# for a single participant, identified by the {pid} wildcard.
rule download_demographic_data:
    input:
        # Per-participant YAML file; the {pid} wildcard is resolved from the output path.
        participant_file = "data/external/participant_files/{pid}.yaml"
    params:
        # Both values are passed through unchanged from the DEMOGRAPHIC section of the config.
        source = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["SOURCE"],
        table = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["TABLE"],
    output:
        "data/raw/{pid}/participant_info_raw.csv"
    script:
        "../src/data/workflow_example/download_demographic_data.R"
# Turns one participant's raw participant-info CSV into a demographic feature
# CSV by running the workflow-example Python script.
rule demographic_features:
    input:
        # Same path that download_demographic_data declares as its output.
        participant_info = "data/raw/{pid}/participant_info_raw.csv"
    params:
        # The participant id is forwarded to the script as a plain parameter.
        pid = "{pid}",
        # Feature list taken from the DEMOGRAPHIC section of the config.
        features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FEATURES"]
    output:
        "data/processed/features/{pid}/demographic_features.csv"
    script:
        "../src/features/workflow_example/demographic_features.py"
# Runs the workflow-example R script that writes the raw target CSV for a
# single participant, identified by the {pid} wildcard.
rule download_target_data:
    input:
        # Per-participant YAML file; the {pid} wildcard is resolved from the output path.
        participant_file = "data/external/participant_files/{pid}.yaml"
    params:
        # Both values are passed through unchanged from the TARGET section of the config.
        source = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["SOURCE"],
        table = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["TABLE"],
    output:
        "data/raw/{pid}/participant_target_raw.csv"
    script:
        "../src/data/workflow_example/download_target_data.R"
# Feeds one participant's raw target CSV and time-segments file to the shared
# readable_datetime.R script, producing the "with_datetime" target CSV.
rule target_readable_datetime:
    input:
        # Same path that download_target_data declares as its output.
        sensor_input = "data/raw/{pid}/participant_target_raw.csv",
        time_segments = "data/interim/time_segments/{pid}_time_segments.csv"
    params:
        # SOURCE is indexed as a mapping here, whereas download_target_data
        # forwards the whole SOURCE entry to its script.
        fixed_timezone = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["SOURCE"]["TIMEZONE"],
        # Time-segment settings come from the top-level TIME_SEGMENTS config section.
        time_segments_type = config["TIME_SEGMENTS"]["TYPE"],
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/raw/{pid}/participant_target_with_datetime.csv"
    script:
        "../src/data/readable_datetime.R"
# Runs the workflow-example parsing script on one participant's datetime-annotated
# targets together with the time-segment labels file.
rule parse_targets:
    input:
        # Same path that target_readable_datetime declares as its output.
        targets = "data/raw/{pid}/participant_target_with_datetime.csv",
        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
    output:
        "data/processed/targets/{pid}/parsed_targets.csv"
    script:
        "../src/models/workflow_example/parse_targets.py"
# Per-participant cleaning step: runs clean_sensor_features.R on the merged
# sensor features of one participant.
rule clean_sensor_features_for_individual_participants:
    input:
        # Output of a rule declared outside this section of the file.
        rules.merge_sensor_features_for_individual_participants.output
    params:
        # All four thresholds are read from PARAMS_FOR_ANALYSIS; the population-level
        # cleaning rule uses the same keys and the same script.
        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
        data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
    output:
        "data/processed/features/{pid}/all_sensor_features_cleaned.csv"
    script:
        "../src/models/workflow_example/clean_sensor_features.R"
# Population-level cleaning step: runs clean_sensor_features.R on the sensor
# features merged across all participants.
rule clean_sensor_features_for_all_participants:
    input:
        # Output of a rule declared outside this section of the file.
        rules.merge_sensor_features_for_all_participants.output
    params:
        # Same threshold keys (and same script) as the per-participant cleaning rule.
        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
        data_yielded_hours_ratio_threshold = config["PARAMS_FOR_ANALYSIS"]["DATA_YIELDED_HOURS_RATIO_THRESHOLD"],
    output:
        "data/processed/features/all_participants/all_sensor_features_cleaned.csv"
    script:
        "../src/models/workflow_example/clean_sensor_features.R"
# Builds the input table for one participant's individual model from that
# participant's cleaned sensor features and parsed targets.
rule merge_features_and_targets_for_individual_model:
    input:
        # Both paths are outputs of rules in this file (per-participant cleaning
        # and parse_targets). No demographic features are listed for this rule.
        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned.csv",
        targets = "data/processed/targets/{pid}/parsed_targets.csv",
    output:
        "data/processed/models/individual_model/{pid}/input.csv"
    script:
        "../src/models/workflow_example/merge_features_and_targets_for_individual_model.py"
# Builds the single input table for the population model from the all-participant
# cleaned sensor features plus every participant's demographic features and targets.
rule merge_features_and_targets_for_population_model:
    input:
        cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned.csv",
        # expand() yields one path per entry of config["PIDS"], so this rule has no
        # {pid} wildcard of its own.
        demographic_features = expand("data/processed/features/{pid}/demographic_features.csv", pid=config["PIDS"]),
        targets = expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]),
    output:
        "data/processed/models/population_model/input.csv"
    script:
        "../src/models/workflow_example/merge_features_and_targets_for_population_model.py"
# Runs the shared baselines.py script on one participant's model input for a
# given cross-validation method.
rule baselines_for_individual_model:
    input:
        # Same path that merge_features_and_targets_for_individual_model outputs.
        "data/processed/models/individual_model/{pid}/input.csv"
    params:
        # {cv_method} is a wildcard taken from the requested output path.
        cv_method = "{cv_method}",
        colnames_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FEATURES"],
    output:
        "data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv"
    log:
        # Log file lives next to the baselines CSV.
        "data/processed/models/individual_model/{pid}/output_{cv_method}/baselines_notes.log"
    script:
        "../src/models/workflow_example/baselines.py"
# Runs the shared baselines.py script on the population-model input for a
# given cross-validation method.
rule baselines_for_population_model:
    input:
        # Same path that merge_features_and_targets_for_population_model outputs.
        "data/processed/models/population_model/input.csv"
    params:
        # {cv_method} is a wildcard taken from the requested output path.
        cv_method = "{cv_method}",
        colnames_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FEATURES"],
    output:
        "data/processed/models/population_model/output_{cv_method}/baselines.csv"
    log:
        # Log file lives next to the baselines CSV.
        "data/processed/models/population_model/output_{cv_method}/baselines_notes.log"
    script:
        "../src/models/workflow_example/baselines.py"
# Runs the shared modelling.py script on one participant's model input. The
# model, cross-validation method and scaler are wildcards taken from the
# requested output path.
rule modelling_for_individual_participants:
    input:
        # Same path that merge_features_and_targets_for_individual_model outputs.
        data = "data/processed/models/individual_model/{pid}/input.csv"
    params:
        # Wildcard values forwarded to the script as plain parameters.
        model = "{model}",
        cv_method = "{cv_method}",
        scaler = "{scaler}",
        # Remaining settings are read from PARAMS_FOR_ANALYSIS in the config.
        categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"],
        categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["CATEGORICAL_FEATURES"],
        model_hyperparams = config["PARAMS_FOR_ANALYSIS"]["MODEL_HYPERPARAMS"],
    output:
        # Four CSVs, all under the same {cv_method}/{model}/{scaler} directory.
        fold_predictions = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/fold_predictions.csv",
        fold_metrics = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/fold_metrics.csv",
        overall_results = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/overall_results.csv",
        fold_feature_importances = "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/fold_feature_importances.csv"
    log:
        "data/processed/models/individual_model/{pid}/output_{cv_method}/{model}/{scaler}/notes.log"
    script:
        "../src/models/workflow_example/modelling.py"
# Runs the shared modelling.py script on the population-model input. The
# model, cross-validation method and scaler are wildcards taken from the
# requested output path.
rule modelling_for_all_participants:
    input:
        # Same path that merge_features_and_targets_for_population_model outputs.
        data = "data/processed/models/population_model/input.csv"
    params:
        # Wildcard values forwarded to the script as plain parameters.
        model = "{model}",
        cv_method = "{cv_method}",
        scaler = "{scaler}",
        # Remaining settings are read from PARAMS_FOR_ANALYSIS in the config.
        categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"],
        categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["CATEGORICAL_FEATURES"],
        model_hyperparams = config["PARAMS_FOR_ANALYSIS"]["MODEL_HYPERPARAMS"],
    output:
        # Four CSVs, all under the same {cv_method}/{model}/{scaler} directory.
        fold_predictions = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/fold_predictions.csv",
        fold_metrics = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/fold_metrics.csv",
        overall_results = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/overall_results.csv",
        fold_feature_importances = "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/fold_feature_importances.csv"
    log:
        "data/processed/models/population_model/output_{cv_method}/{model}/{scaler}/notes.log"
    script:
        "../src/models/workflow_example/modelling.py"