From 50c0defca7d494e42c0bf1ae08d1ed95c838a230 Mon Sep 17 00:00:00 2001
From: junos
Date: Wed, 6 Apr 2022 18:16:49 +0200
Subject: [PATCH] Select target columns (no parsing necessary).

---
 Snakefile                    |  2 +-
 config.yaml                  |  2 +-
 rules/models.smk             | 11 ++++++-----
 src/models/parse_targets.py  |  6 ------
 src/models/select_targets.py | 20 ++++++++++++++++++++
 5 files changed, 28 insertions(+), 13 deletions(-)
 delete mode 100644 src/models/parse_targets.py
 create mode 100644 src/models/select_targets.py

diff --git a/Snakefile b/Snakefile
index a769e831..c702bcdb 100644
--- a/Snakefile
+++ b/Snakefile
@@ -422,7 +422,7 @@ if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
 
 # Targets (labels)
 if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
-    files_to_compute.extend(expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]))
+    files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
 
 rule all:
     input:
diff --git a/config.yaml b/config.yaml
index 14fc5d27..f5a62762 100644
--- a/config.yaml
+++ b/config.yaml
@@ -650,5 +650,5 @@ PARAMS_FOR_ANALYSIS:
 
   TARGET:
     COMPUTE: True
-    SCALE: [positive_affect, negative_affect]
+    LABEL: PANAS_negative_affect_mean
 
diff --git a/rules/models.smk b/rules/models.smk
index b7ee6bc6..7910e1d6 100644
--- a/rules/models.smk
+++ b/rules/models.smk
@@ -28,11 +28,12 @@ rule baseline_features:
     script:
         "../src/data/baseline_features.py"
 
-rule parse_targets:
+rule select_target:
     input:
-        targets = "data/processed/features/{pid}/phone_esm.csv",
-        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
+        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_rapids.csv"
+    params:
+        target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
     output:
-        "data/processed/targets/{pid}/parsed_targets.csv"
+        "data/processed/models/individual_model/{pid}/input.csv"
     script:
-        "../src/models/parse_targets.py"
+        "../src/models/select_targets.py"
diff --git a/src/models/parse_targets.py b/src/models/parse_targets.py
deleted file mode 100644
index 222981ea..00000000
--- a/src/models/parse_targets.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import pandas as pd
-
-
-targets = pd.read_csv(snakemake.input["targets"])
-
-targets.to_csv(snakemake.output[0], index=False)
diff --git a/src/models/select_targets.py b/src/models/select_targets.py
new file mode 100644
index 00000000..69e70570
--- /dev/null
+++ b/src/models/select_targets.py
@@ -0,0 +1,20 @@
+import pandas as pd
+
+
+cleaned_sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])
+
+column_names = cleaned_sensor_features.columns
+esm_names_index = column_names.str.startswith("phone_esm_straw")
+# Find all columns coming from phone_esm, since these are not features for our purposes and we will drop them.
+esm_names = column_names[esm_names_index]
+# Boolean mask over esm_names: True where the column name contains the configured target label.
+target_variable_name = esm_names.str.contains(snakemake.params["target_variable"])
+if not target_variable_name.any():
+    raise ValueError(f"The requested target ({snakemake.params['target_variable']}) cannot be found in the dataset. "
+                     "Please check the names of phone_esm_ columns in all_sensor_features_cleaned_rapids.csv")
+esm_names = esm_names[~target_variable_name]
+# esm_names now lists only the non-target phone_esm columns; the column(s) matching the target are kept.
+
+model_input = cleaned_sensor_features.drop(esm_names, axis=1)
+
+model_input.to_csv(snakemake.output[0], index=False)