Rename the features_for_{individual,population}_model output directories to
data_for_{individual,population}_model, add population-level merge rules for
sensor features, demographic features and targets, add the SUMMARISED analysis
parameter, and add the generic per-participant CSV merge script.

diff --git a/Snakefile b/Snakefile
index 9cea55e8..87ccc6d3 100644
--- a/Snakefile
+++ b/Snakefile
@@ -72,20 +72,23 @@ rule all:
                             pid=config["PIDS"],
                             segment = config["WIFI"]["DAY_SEGMENTS"]),
         # Models
-        expand("data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_original.csv",
+        expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv",
                                 pid = config["PIDS"],
                                 source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
                                 day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
-        expand("data/processed/features_for_population_model/{source}_{day_segment}_original.csv",
+        expand("data/processed/data_for_population_model/{source}_{day_segment}_original.csv",
                                 source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
                                 day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
-        expand("data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_clean.csv",
+        expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_clean.csv",
                                 pid = config["PIDS"],
                                 source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
                                 day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
-        expand("data/processed/features_for_population_model/{source}_{day_segment}_clean.csv",
+        expand("data/processed/data_for_population_model/{source}_{day_segment}_clean.csv",
                                 source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
                                 day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
+        expand("data/processed/data_for_population_model/demographic_features.csv"),
+        expand("data/processed/data_for_population_model/targets_{summarised}.csv",
+                                summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
         # Vizualisations
         expand("reports/figures/{pid}/{sensor}_heatmap_rows.html", pid=config["PIDS"], sensor=config["SENSORS"]),
         expand("reports/figures/{pid}/compliance_heatmap.html", pid=config["PIDS"]),
diff --git a/config.yaml b/config.yaml
index 8be8aaf8..cede7342 100644
--- a/config.yaml
+++ b/config.yaml
@@ -156,6 +156,8 @@ PARAMS_FOR_ANALYSIS:
   ROWS_NAN_THRESHOLD: 0.5
   PARTICIPANTS_DAY_THRESHOLD: 7
 
+  SUMMARISED: ["summarised"] # "summarised" or "notsummarised"
+
   # Target Settings:
   # 1 => TARGETS_RATIO_THRESHOLD (ceiling) or more of available CESD scores were TARGETS_VALUE_THRESHOLD or higher; 0 => otherwise
   TARGETS_RATIO_THRESHOLD: 0.5
diff --git a/rules/models.snakefile b/rules/models.snakefile
index 7c0b209b..3108e7ab 100644
--- a/rules/models.snakefile
+++ b/rules/models.snakefile
@@ -29,15 +29,31 @@ rule merge_features_for_individual_model:
     params:
         source = "{source}"
     output:
-        "data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_original.csv"
+        "data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv"
     script:
         "../src/models/merge_features_for_individual_model.R"
 
+rule merge_features_for_population_model:
+    input:
+        feature_files = expand("data/processed/{pid}/data_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
+    output:
+        "data/processed/data_for_population_model/{source}_{day_segment}_original.csv"
+    script:
+        "../src/models/merge_features_for_population_model.R"
+
+rule merge_demographicfeatures_for_population_model:
+    input:
+        data_files = expand("data/processed/{pid}/demographic_features.csv", pid=config["PIDS"])
+    output:
+        "data/processed/data_for_population_model/demographic_features.csv"
+    script:
+        "../src/models/merge_data_for_population_model.py"
+
 rule merge_targets_for_population_model:
     input:
         data_files = expand("data/processed/{pid}/targets_{{summarised}}.csv", pid=config["PIDS"])
     output:
-        "data/processed/features_for_population_model/targets_{summarised}.csv"
+        "data/processed/data_for_population_model/targets_{summarised}.csv"
     script:
         "../src/models/merge_data_for_population_model.py"
 
@@ -50,7 +66,7 @@ rule clean_features_for_individual_model:
         rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
         participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"]
     output:
-        "data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_clean.csv"
+        "data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_clean.csv"
     script:
         "../src/models/clean_features_for_model.R"
 
@@ -63,7 +79,7 @@ rule clean_features_for_population_model:
         rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
         participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"]
     output:
-        "data/processed/features_for_population_model/{source}_{day_segment}_clean.csv"
+        "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv"
     script:
         "../src/models/clean_features_for_model.R"
 
diff --git a/src/models/merge_data_for_population_model.py b/src/models/merge_data_for_population_model.py
new file mode 100644
index 00000000..376b43a8
--- /dev/null
+++ b/src/models/merge_data_for_population_model.py
@@ -0,0 +1,15 @@
+"""Concatenate per-participant CSV files into a single population-level CSV.
+
+Executed by Snakemake: reads every path in snakemake.input["data_files"] and
+writes their row-wise concatenation to snakemake.output[0].
+"""
+import pandas as pd
+
+# Read every file first and concatenate once: growing a DataFrame inside the
+# loop re-copies all previously accumulated rows on each iteration.
+frames = [pd.read_csv(data_file) for data_file in snakemake.input["data_files"]]
+
+# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
+data_all_participants = pd.concat(frames, axis=0) if frames else pd.DataFrame()
+
+data_all_participants.to_csv(snakemake.output[0], index=False)