Add merge module for demographic features and target

pull/95/head
Meng Li 2020-04-16 14:20:16 -04:00
parent eac721de84
commit 5696b4f6d4
4 changed files with 37 additions and 8 deletions

View File

@ -72,20 +72,23 @@ rule all:
pid=config["PIDS"],
segment = config["WIFI"]["DAY_SEGMENTS"]),
# Models
expand("data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_original.csv",
expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv",
pid = config["PIDS"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
expand("data/processed/features_for_population_model/{source}_{day_segment}_original.csv",
expand("data/processed/data_for_population_model/{source}_{day_segment}_original.csv",
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
expand("data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_clean.csv",
expand("data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_clean.csv",
pid = config["PIDS"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
expand("data/processed/features_for_population_model/{source}_{day_segment}_clean.csv",
expand("data/processed/data_for_population_model/{source}_{day_segment}_clean.csv",
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
expand("data/processed/data_for_population_model/demographic_features.csv"),
expand("data/processed/data_for_population_model/targets_{summarised}.csv",
summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
# Visualizations
expand("reports/figures/{pid}/{sensor}_heatmap_rows.html", pid=config["PIDS"], sensor=config["SENSORS"]),
expand("reports/figures/{pid}/compliance_heatmap.html", pid=config["PIDS"]),

View File

@ -156,6 +156,8 @@ PARAMS_FOR_ANALYSIS:
ROWS_NAN_THRESHOLD: 0.5
PARTICIPANTS_DAY_THRESHOLD: 7
SUMMARISED: ["summarised"] # "summarised" or "notsummarised"
# Target Settings:
# 1 => TARGETS_RATIO_THRESHOLD (ceiling) or more of available CESD scores were TARGETS_VALUE_THRESHOLD or higher; 0 => otherwise
TARGETS_RATIO_THRESHOLD: 0.5

View File

@ -29,15 +29,31 @@ rule merge_features_for_individual_model:
params:
source = "{source}"
output:
"data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_original.csv"
"data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_original.csv"
script:
"../src/models/merge_features_for_individual_model.R"
# Runs merge_features_for_population_model.R on the per-participant
# "{source}_{day_segment}_original.csv" files (one per pid in config["PIDS"])
# to produce a single population-level file for the same source/day_segment.
rule merge_features_for_population_model:
input:
# expand() substitutes only {pid}; the doubled braces keep {source} and
# {day_segment} as wildcards that are resolved from the requested output.
feature_files = expand("data/processed/{pid}/data_for_individual_model/{{source}}_{{day_segment}}_original.csv", pid=config["PIDS"])
output:
"data/processed/data_for_population_model/{source}_{day_segment}_original.csv"
script:
"../src/models/merge_features_for_population_model.R"
# Runs merge_data_for_population_model.py on every participant's
# demographic_features.csv (one per pid in config["PIDS"]) to produce one
# population-level demographic_features.csv.
rule merge_demographicfeatures_for_population_model:
input:
# Exposed to the script under the named input "data_files".
data_files = expand("data/processed/{pid}/demographic_features.csv", pid=config["PIDS"])
output:
"data/processed/data_for_population_model/demographic_features.csv"
script:
"../src/models/merge_data_for_population_model.py"
# Runs merge_data_for_population_model.py on every participant's
# targets_{summarised}.csv (one per pid in config["PIDS"]) to produce one
# population-level targets file for the same {summarised} value.
rule merge_targets_for_population_model:
input:
# expand() substitutes only {pid}; the doubled braces keep {summarised}
# as a wildcard resolved from the requested output.
data_files = expand("data/processed/{pid}/targets_{{summarised}}.csv", pid=config["PIDS"])
output:
# NOTE(review): two adjacent path strings follow. The second one matches the
# data_for_population_model directory used by the neighbouring rules; the
# first looks like a leftover pre-rename path - confirm only one is intended.
"data/processed/features_for_population_model/targets_{summarised}.csv"
"data/processed/data_for_population_model/targets_{summarised}.csv"
script:
"../src/models/merge_data_for_population_model.py"
@ -50,7 +66,7 @@ rule clean_features_for_individual_model:
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"]
output:
"data/processed/{pid}/features_for_individual_model/{source}_{day_segment}_clean.csv"
"data/processed/{pid}/data_for_individual_model/{source}_{day_segment}_clean.csv"
script:
"../src/models/clean_features_for_model.R"
@ -63,7 +79,7 @@ rule clean_features_for_population_model:
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
participants_day_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANTS_DAY_THRESHOLD"]
output:
"data/processed/features_for_population_model/{source}_{day_segment}_clean.csv"
"data/processed/data_for_population_model/{source}_{day_segment}_clean.csv"
script:
"../src/models/clean_features_for_model.R"

View File

@ -0,0 +1,8 @@
import pandas as pd
# Merge the per-participant CSV files listed under the "data_files" input into
# one CSV written to the first output path.
# `snakemake` is not defined in this file; it is expected to be provided by the
# caller that runs this script (presumably Snakemake's `script:` directive).

# Read every file first and concatenate once. Growing a DataFrame inside the
# loop re-copies all accumulated rows on each iteration, and seeding the result
# with an empty DataFrame() puts an empty frame into pd.concat.
participant_frames = [pd.read_csv(data_file) for data_file in snakemake.input["data_files"]]

if participant_frames:
    data_all_participants = pd.concat(participant_frames, axis=0)
else:
    # pd.concat raises ValueError on an empty list; keep writing an empty
    # frame when there are no input files, as before.
    data_all_participants = pd.DataFrame()

# The row index is not written, so the repeated per-file index values do not
# appear in the output.
data_all_participants.to_csv(snakemake.output[0], index=False)