From f5688f615456eaa2fbd494bfdde22b7ff49e184b Mon Sep 17 00:00:00 2001 From: junos Date: Fri, 8 Apr 2022 15:42:04 +0200 Subject: [PATCH] Add a rule to merge sensor and baseline features. And select target as before. --- config.yaml | 2 +- rules/models.smk | 11 ++++++++++ ...atures_and_targets_for_population_model.py | 20 +++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 src/models/merge_features_and_targets_for_population_model.py diff --git a/config.yaml b/config.yaml index f5a62762..dcea2843 100644 --- a/config.yaml +++ b/config.yaml @@ -645,7 +645,7 @@ PARAMS_FOR_ANALYSIS: results-survey413767_final.csv # Belgium 2 ] QUESTION_LIST: survey637813+question_text.csv - FEATURES: [age, gender, startlanguage, demand, control, demand_control_ratio] + FEATURES: [age, gender, startlanguage, limesurvey_demand, limesurvey_control, limesurvey_demand_control_ratio, limesurvey_demand_control_ratio_quartile] CATEGORICAL_FEATURES: [gender] TARGET: diff --git a/rules/models.smk b/rules/models.smk index 7910e1d6..b2eb03e4 100644 --- a/rules/models.smk +++ b/rules/models.smk @@ -37,3 +37,14 @@ rule select_target: "data/processed/models/individual_model/{pid}/input.csv" script: "../src/models/select_targets.py" + +rule merge_features_and_targets_for_population_model: + input: + cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv", + demographic_features = expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]), + params: + target_variable=config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"] + output: + "data/processed/models/population_model/input.csv" + script: + "../src/models/merge_features_and_targets_for_population_model.py" diff --git a/src/models/merge_features_and_targets_for_population_model.py b/src/models/merge_features_and_targets_for_population_model.py new file mode 100644 index 00000000..f9e9acd2 --- /dev/null +++ 
"""Build the input table for the population model.

This script is run by the Snakemake rule
``merge_features_and_targets_for_population_model``; the ``snakemake`` object
used below is injected by Snakemake at run time and is not defined here.

Steps:
1. Read the cleaned sensor features of all participants.
2. Read every participant's baseline features file and tag its rows with the
   participant id (``pid``) taken from the name of the file's directory.
3. Left-merge the baseline features onto the sensor features on ``pid``.
4. Hand the merged table and the configured target variable name to
   ``helper.retain_target_column`` and write the result as CSV.
"""
from pathlib import Path

import pandas as pd

from helper import retain_target_column

sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])

# The rule declares these inputs as ".../{pid}/baseline_features.csv", so the
# parent directory name is the participant id. Using the parent directory
# (instead of a fixed position in the split path) keeps this correct if the
# data directory is ever given with a different prefix or as an absolute path.
baseline_frames = [
    pd.read_csv(baseline_features_path).assign(
        pid=Path(baseline_features_path).parent.name
    )
    for baseline_features_path in snakemake.input["demographic_features"]
]

# Concatenate once: growing a DataFrame inside the loop copies all previously
# accumulated rows again for every participant.
all_baseline_features = pd.concat(baseline_frames, axis=0)

# Merge sensor features and baseline features. A left merge keeps every
# sensor-feature row, including those without matching baseline features.
# NOTE(review): assumes the "pid" column of the sensor features holds the same
# strings as the participant directory names - confirm against the data.
features = sensor_features.merge(all_baseline_features, on="pid", how="left")

# Target selection is delegated to the project helper (defined outside this
# file); only the target variable name from the rule's params is passed in.
target_variable_name = snakemake.params["target_variable"]
model_input = retain_target_column(features, target_variable_name)

model_input.to_csv(snakemake.output[0], index=False)