From 50c0defca7d494e42c0bf1ae08d1ed95c838a230 Mon Sep 17 00:00:00 2001
From: junos <junos.lukan@ijs.si>
Date: Wed, 6 Apr 2022 18:16:49 +0200
Subject: [PATCH] Select target columns (no parsing necessary).

---
 Snakefile                    |  2 +-
 config.yaml                  |  2 +-
 rules/models.smk             | 11 ++++++-----
 src/models/parse_targets.py  |  6 ------
 src/models/select_targets.py | 20 ++++++++++++++++++++
 5 files changed, 28 insertions(+), 13 deletions(-)
 delete mode 100644 src/models/parse_targets.py
 create mode 100644 src/models/select_targets.py

diff --git a/Snakefile b/Snakefile
index a769e831..c702bcdb 100644
--- a/Snakefile
+++ b/Snakefile
@@ -422,7 +422,7 @@ if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
 
 # Targets (labels)
 if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
-    files_to_compute.extend(expand("data/processed/targets/{pid}/parsed_targets.csv", pid=config["PIDS"]))
+    files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
 
 rule all:
     input:
diff --git a/config.yaml b/config.yaml
index 14fc5d27..f5a62762 100644
--- a/config.yaml
+++ b/config.yaml
@@ -650,5 +650,5 @@ PARAMS_FOR_ANALYSIS:
 
   TARGET:
     COMPUTE: True
-    SCALE: [positive_affect, negative_affect]
+    LABEL: PANAS_negative_affect_mean
 
diff --git a/rules/models.smk b/rules/models.smk
index b7ee6bc6..7910e1d6 100644
--- a/rules/models.smk
+++ b/rules/models.smk
@@ -28,11 +28,12 @@ rule baseline_features:
     script:
         "../src/data/baseline_features.py"
 
-rule parse_targets:
+rule select_target:
     input:
-        targets = "data/processed/features/{pid}/phone_esm.csv",
-        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
+        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_rapids.csv"
+    params:
+        target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
     output:
-        "data/processed/targets/{pid}/parsed_targets.csv"
+        "data/processed/models/individual_model/{pid}/input.csv"
     script:
-        "../src/models/parse_targets.py"
+        "../src/models/select_targets.py"
diff --git a/src/models/parse_targets.py b/src/models/parse_targets.py
deleted file mode 100644
index 222981ea..00000000
--- a/src/models/parse_targets.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import pandas as pd
-
-
-targets = pd.read_csv(snakemake.input["targets"])
-
-targets.to_csv(snakemake.output[0], index=False)
diff --git a/src/models/select_targets.py b/src/models/select_targets.py
new file mode 100644
index 00000000..69e70570
--- /dev/null
+++ b/src/models/select_targets.py
@@ -0,0 +1,20 @@
+import pandas as pd
+
+
+cleaned_sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])
+target_variable = snakemake.params["target_variable"]
+
+# Find all columns coming from phone_esm, since these are not features for our purposes and we will drop them.
+column_names = cleaned_sensor_features.columns
+esm_names = column_names[column_names.str.startswith("phone_esm_straw")]
+
+# Match the label as a literal substring (regex=False), so special characters in the config value are not interpreted.
+is_target = esm_names.str.contains(target_variable, regex=False)
+if not is_target.any():
+    raise ValueError(f"The requested target ({target_variable}) cannot be found in the dataset. "
+                     "Please check the names of phone_esm_ columns in all_sensor_features_cleaned_rapids.csv")
+
+# Keep only the phone_esm column(s) whose name contains the target variable; drop all the other phone_esm columns.
+model_input = cleaned_sensor_features.drop(columns=esm_names[~is_target])
+
+model_input.to_csv(snakemake.output[0], index=False)