rapids/rules/models.snakefile

def input_merge_features_of_single_participant(wildcards):
    """Return the per-feature CSV paths to merge for one participant.

    The feature names are read from config["PARAMS_FOR_ANALYSIS"]:
    - when wildcards.source is "phone_fitbit_features", the PHONE_FEATURES
      and FITBIT_FEATURES lists are concatenated (phone first);
    - otherwise the entry whose key is the upper-cased source name is used.

    Each feature name is expanded into
    data/processed/<pid>/<feature>_<day_segment>.csv using the pid and
    day_segment wildcards of the requesting job.
    """
    analysis_params = config["PARAMS_FOR_ANALYSIS"]

    if wildcards.source == "phone_fitbit_features":
        feature_names = analysis_params["PHONE_FEATURES"] + analysis_params["FITBIT_FEATURES"]
    else:
        # e.g. source "phone_features" looks up the "PHONE_FEATURES" key
        feature_names = analysis_params[wildcards.source.upper()]

    return expand(
        "data/processed/{pid}/{features}_{day_segment}.csv",
        pid=wildcards.pid,
        features=feature_names,
        day_segment=wildcards.day_segment,
    )

def optional_input_days_to_include(wildcards):
    """Return the optional days_to_analyse input as a zero- or one-item list.

    When config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"] is
    falsy, an empty list is returned so the rule gets no extra input.
    Otherwise the list holds one path of the form
    data/interim/{pid}/days_to_analyse_<before>_<in_hospital>_<after>.csv,
    where the three numbers come from the same config section. The {pid}
    placeholder is returned verbatim (presumably resolved by Snakemake
    against the job's wildcards -- confirm).
    """
    days_cfg = config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]
    if not days_cfg["ENABLED"]:
        return []

    # Requesting this file automatically triggers the rule days_to_analyse
    # in mystudy.snakefile.
    window_keys = ("DAYS_BEFORE_SURGERY", "DAYS_IN_HOSPITAL", "DAYS_AFTER_DISCHARGE")
    suffix = "".join("_" + str(days_cfg[key]) for key in window_keys)
    return ["data/interim/{pid}/days_to_analyse" + suffix + ".csv"]

def optional_input_valid_sensed_days(wildcards):
    """Return the optional valid-sensed-days input as a zero- or one-item list.

    The single path data/interim/{pid}/phone_valid_sensed_days.csv is
    returned only when
    config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"] is
    truthy; otherwise the list is empty. The {pid} placeholder is returned
    verbatim (presumably resolved by Snakemake against the job's
    wildcards -- confirm).
    """
    drop_enabled = config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]
    # Requesting this file automatically triggers the rule
    # phone_valid_sensed_days in preprocessing.snakefile.
    return ["data/interim/{pid}/phone_valid_sensed_days.csv"] if drop_enabled else []

# Produce one "<source>_<day_segment>_original.csv" per participant by handing
# that participant's feature files to merge_features_for_individual_model.R.
# What the R script does with the inputs is not visible from this file.
rule merge_features_for_individual_model:
    input:
        # Feature CSVs chosen by source (see input_merge_features_of_single_participant)
        feature_files = input_merge_features_of_single_participant,
        # Empty unless DROP_VALID_SENSED_DAYS is enabled in the config
        phone_valid_sensed_days = optional_input_valid_sensed_days,
        # Empty unless DAYS_TO_ANALYSE is enabled in the config
        days_to_include = optional_input_days_to_include
    params:
        # Passes the source wildcard on to the script
        source = "{source}"
    output:
        "data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv"
    script:
        "../src/models/merge_features_for_individual_model.R"

# Combine the per-participant "_original" files of every pid in config["PIDS"]
# into a single population-level file for the same source and day_segment.
rule merge_features_for_population_model:
    input:
        # Double braces keep {source} and {day_segment} as rule wildcards;
        # only {pid} is expanded here, once per entry in config["PIDS"].
        feature_files = expand("data/processed/{pid}/data_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
    output:
        "data/processed/data_for_population_model/{source}_{day_segment}_original.csv"
    script:
        "../src/models/merge_features_for_population_model.R"

# Combine demographic_features.csv of every pid in config["PIDS"] into one
# population-level file. Uses the same script as merge_targets_for_population_model.
rule merge_demographicfeatures_for_population_model:
    input:
        # One demographic_features.csv per participant
        data_files = expand("data/processed/{pid}/demographic_features.csv", pid=config["PIDS"])
    output:
        "data/processed/data_for_population_model/demographic_features.csv"
    script:
        "../src/models/merge_data_for_population_model.py"

# Combine targets_<summarised>.csv of every pid in config["PIDS"] into one
# population-level file. Uses the same script as
# merge_demographicfeatures_for_population_model.
rule merge_targets_for_population_model:
    input:
        # Double braces keep {summarised} as a rule wildcard; only {pid} is expanded.
        data_files = expand("data/processed/{pid}/targets_{{summarised}}.csv", pid=config["PIDS"])
    output:
        "data/processed/data_for_population_model/targets_{summarised}.csv"
    script:
        "../src/models/merge_data_for_population_model.py"

# Turn a participant's "_original" merged feature file into a "_clean" one by
# running clean_features_for_model.R with the thresholds below.
# The same script and parameters are used by clean_features_for_population_model.
rule clean_features_for_individual_model:
    input:
        # Output of the per-participant merge rule (same pid/source/day_segment)
        rules.merge_features_for_individual_model.output
    params:
        # All thresholds are read from config["PARAMS_FOR_ANALYSIS"]; how each
        # one is applied is decided inside the R script (not visible here).
        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
        days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
        days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"]
    output:
        "data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_clean.csv"
    script:
        "../src/models/clean_features_for_model.R"

# Turn the population-level "_original" merged feature file into a "_clean"
# one by running clean_features_for_model.R with the thresholds below.
# The same script and parameters are used by clean_features_for_individual_model.
rule clean_features_for_population_model:
    input:
        # Output of the population merge rule (same source/day_segment)
        rules.merge_features_for_population_model.output
    params:
        # All thresholds are read from config["PARAMS_FOR_ANALYSIS"]; how each
        # one is applied is decided inside the R script (not visible here).
        cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
        rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
        days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
        days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"]
    output:
        "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv"
    script:
        "../src/models/clean_features_for_model.R"

# Run modeling.py on the population-level cleaned features, demographic
# features and targets, writing four result CSVs into a directory whose path
# encodes the cleaning thresholds, model, CV method, source, day segment,
# target summarisation and scaler.
#
# NOTE(review): the threshold wildcards (rows_nan_threshold, cols_nan_threshold,
# days_before_threshold, days_after_threshold, cols_var_threshold) appear only
# in the output paths and in the rowsnan_colsnan_days_colsvar_threshold param;
# none of the input paths contains them, so the cleaned input file is the same
# whatever threshold values are requested in the output path -- confirm this
# is intended.
# NOTE(review): the output directory name contains a literal "|" character.
rule modeling:
    input:
        cleaned_features = "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv",
        demographic_features = "data/processed/data_for_population_model/demographic_features.csv",
        targets = "data/processed/data_for_population_model/targets_{summarised}.csv",
    params:
        # Wildcard values forwarded to the script as strings
        model = "{model}",
        cv_method = "{cv_method}",
        source = "{source}",
        day_segment = "{day_segment}",
        summarised = "{summarised}",
        scaler = "{scaler}",
        # Read from the config, not from the {cols_var_threshold} wildcard of the output path
        cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
        numerical_operators = config["PARAMS_FOR_ANALYSIS"]["NUMERICAL_OPERATORS"],
        categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"],
        categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_DEMOGRAPHIC_FEATURES"],
        model_hyperparams = config["PARAMS_FOR_ANALYSIS"]["MODEL_HYPERPARAMS"],
        # Same threshold segment as the first directory level of the output paths below
        rowsnan_colsnan_days_colsvar_threshold = "{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}"
    output:
        # All four files share one directory:
        # output_population_model/<thresholds>/<model>/<cv_method>/<source>_<day_segment>_<summarised>_<scaler>/
        fold_predictions = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_predictions.csv",
        fold_metrics = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_metrics.csv",
        overall_results = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/overall_results.csv",
        fold_feature_importances = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_feature_importances.csv"
    script:
        "../src/models/modeling.py"
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`def input_merge_features_of_single_participant(wildcards):`
			`if wildcards.source == "phone_fitbit_features":`
			`return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"]["PHONE_FEATURES"] + config["PARAMS_FOR_ANALYSIS"]["FITBIT_FEATURES"], day_segment=wildcards.day_segment)`
Add merge metrics module for analysis rules 2020-03-09 18:32:14 +01:00			`else:`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`return expand("data/processed/{pid}/{features}_{day_segment}.csv", pid=wildcards.pid, features=config["PARAMS_FOR_ANALYSIS"][wildcards.source.upper()], day_segment=wildcards.day_segment)`
Add merge metrics module for analysis rules 2020-03-09 18:32:14 +01:00
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`def optional_input_days_to_include(wildcards):`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`if config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["ENABLED"]:`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`# This input automatically trigers the rule days_to_analyse in mystudy.snakefile`
			`return ["data/interim/{pid}/days_to_analyse" + \`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`"_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_BEFORE_SURGERY"]) + \`
			`"_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_IN_HOSPITAL"]) + \`
			`"_" + str(config["PARAMS_FOR_ANALYSIS"]["DAYS_TO_ANALYSE"]["DAYS_AFTER_DISCHARGE"]) + ".csv"]`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`else:`
			`return []`

			`def optional_input_valid_sensed_days(wildcards):`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`if config["PARAMS_FOR_ANALYSIS"]["DROP_VALID_SENSED_DAYS"]["ENABLED"]:`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`# This input automatically trigers the rule phone_valid_sensed_days in preprocessing.snakefile`
			`return ["data/interim/{pid}/phone_valid_sensed_days.csv"]`
			`else:`
			`return []`

Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`rule merge_features_for_individual_model:`
Add merge metrics module for analysis rules 2020-03-09 18:32:14 +01:00			`input:`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`feature_files = input_merge_features_of_single_participant,`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`phone_valid_sensed_days = optional_input_valid_sensed_days,`
			`days_to_include = optional_input_days_to_include`
Rename merge metrics for models and add filter valid sensed days Co-authored-by: Meng Li <AnnieLM1996@gmail.com> 2020-03-12 22:31:46 +01:00			`params:`
			`source = "{source}"`
Add merge metrics module for analysis rules 2020-03-09 18:32:14 +01:00			`output:`
Add merge module for demographic features and target 2020-04-16 20:20:16 +02:00			`"data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv"`
Add merge metrics module for analysis rules 2020-03-09 18:32:14 +01:00			`script:`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`"../src/models/merge_features_for_individual_model.R"`
Add merge metrics module for analysis rules 2020-03-09 18:32:14 +01:00
Add merge module for demographic features and target 2020-04-16 20:20:16 +02:00			`rule merge_features_for_population_model:`
			`input:`
			`feature_files = expand("data/processed/{pid}/data_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])`
			`output:`
			`"data/processed/data_for_population_model/{source}_{day_segment}_original.csv"`
			`script:`
			`"../src/models/merge_features_for_population_model.R"`

			`rule merge_demographicfeatures_for_population_model:`
			`input:`
			`data_files = expand("data/processed/{pid}/demographic_features.csv", pid=config["PIDS"])`
			`output:`
			`"data/processed/data_for_population_model/demographic_features.csv"`
			`script:`
			`"../src/models/merge_data_for_population_model.py"`

Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`rule merge_targets_for_population_model:`
Add merge metrics module for analysis rules 2020-03-09 18:32:14 +01:00			`input:`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`data_files = expand("data/processed/{pid}/targets_{{summarised}}.csv", pid=config["PIDS"])`
Add merge metrics module for analysis rules 2020-03-09 18:32:14 +01:00			`output:`
Add merge module for demographic features and target 2020-04-16 20:20:16 +02:00			`"data/processed/data_for_population_model/targets_{summarised}.csv"`
Add merge metrics module for analysis rules 2020-03-09 18:32:14 +01:00			`script:`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`"../src/models/merge_data_for_population_model.py"`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`rule clean_features_for_individual_model:`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`input:`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`rules.merge_features_for_individual_model.output`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`params:`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],`
			`cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],`
			`rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],`
Split days threshold of data cleaning into days_before_surgery and days_after_discharge Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-29 20:37:40 +02:00			`days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],`
			`days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"]`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`output:`
Add merge module for demographic features and target 2020-04-16 20:20:16 +02:00			`"data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_clean.csv"`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`script:`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`"../src/models/clean_features_for_model.R"`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`rule clean_features_for_population_model:`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`input:`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`rules.merge_features_for_population_model.output`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`params:`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],`
			`cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],`
			`rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],`
Split days threshold of data cleaning into days_before_surgery and days_after_discharge Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-29 20:37:40 +02:00			`days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],`
			`days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"]`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`output:`
Add merge module for demographic features and target 2020-04-16 20:20:16 +02:00			`"data/processed/data_for_population_model/{source}_{day_segment}_clean.csv"`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00			`script:`
Add demographic_features and targets module; refactor analysis code Co-authored-by: JulioV <juliovhz@gmail.com> 2020-04-16 18:38:28 +02:00			`"../src/models/clean_features_for_model.R"`
Refactor select_days_to_analyse, fix merge bugs, add clean metrics for model 2020-03-18 02:15:53 +01:00
Add modeling module 2020-04-30 00:53:54 +02:00			`rule modeling:`
			`input:`
			`cleaned_features = "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv",`
			`demographic_features = "data/processed/data_for_population_model/demographic_features.csv",`
			`targets = "data/processed/data_for_population_model/targets_{summarised}.csv",`
			`params:`
			`model = "{model}",`
			`cv_method = "{cv_method}",`
			`source = "{source}",`
			`day_segment = "{day_segment}",`
			`summarised = "{summarised}",`
			`scaler = "{scaler}",`
			`cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],`
			`numerical_operators = config["PARAMS_FOR_ANALYSIS"]["NUMERICAL_OPERATORS"],`
			`categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"],`
			`categorical_demographic_features = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_DEMOGRAPHIC_FEATURES"],`
			`model_hyperparams = config["PARAMS_FOR_ANALYSIS"]["MODEL_HYPERPARAMS"],`
			`rowsnan_colsnan_days_colsvar_threshold = "{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}\|{days_after_threshold}_{cols_var_threshold}"`
			`output:`
			`fold_predictions = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}\|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_predictions.csv",`
			`fold_metrics = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}\|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_metrics.csv",`
			`overall_results = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}\|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/overall_results.csv",`
			`fold_feature_importances = "data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}\|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/fold_feature_importances.csv"`
			`script:`
			`"../src/models/modeling.py"`