From 2fed9626444aad9220415126fda32d6d1ca0f59b Mon Sep 17 00:00:00 2001
From: junos <junos.lukan@ijs.si>
Date: Mon, 28 Feb 2022 18:30:41 +0100
Subject: [PATCH] Calculate JCQ demand score.

Hardcode the IDs of the questions whose scores need to be reversed.
---
 src/data/baseline_features.py | 57 +++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 10 deletions(-)

diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py
index 4978e122..b9a601c1 100644
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@@ -5,13 +5,26 @@ requested_features = snakemake.params["features"]
 baseline_features = pd.DataFrame(columns=requested_features)
 question_filename = snakemake.params["question_filename"]
 
+JCQ_DEMAND = "JobEisen"
+JCQ_CONTROL = "JobControle"
+
 dict_JCQ_demand_control_reverse = {
-    "demand_0": " [Od mene se ne zahteva,",
-    "demand_1": " [Imam dovolj časa, da končam",
-    "demand_2": " [Pri svojem delu se ne srečujem s konfliktnimi"
+    JCQ_DEMAND: {
+        3: " [Od mene se ne zahteva,",
+        4: " [Imam dovolj časa, da končam",
+        5: " [Pri svojem delu se ne srečujem s konfliktnimi",
+    },
+    JCQ_CONTROL: {
+        2: " |Moje delo vključuje veliko ponavljajočega",
+        6: " [Pri svojem delu imam zelo malo svobode",
+    },
 }
 
+LIMESURVEY_JCQ_MIN = 1
+LIMESURVEY_JCQ_MAX = 4
+
 participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
+
 if not participant_info.empty:
     if "age" in requested_features:
         now = pd.Timestamp("now")
@@ -25,13 +38,37 @@ if not participant_info.empty:
             0, "startlanguage"
         ]
     if "demand" in requested_features:
-        limesurvey_questions = pd.read_csv(question_filename, header=None).T
-        limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(r"\.\s", expand=True, n=1)
-        demand_reverse_lime_rows = limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_0"]) | \
-                                   limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_1"]) | \
-                                   limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_2"])
-        demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows]
-        demand_reverse_lime.loc[:, "qid"] = demand_reverse_lime["code"].str.extract(r"\[(\d+)\]")
+        participant_info_t = participant_info.T
+        rows_baseline = participant_info_t.index
+        # Find questions about demand, but disregard entries ending in "Time" (the duration of filling in the questionnaire)
+        rows_demand = rows_baseline.str.startswith(
+            JCQ_DEMAND
+        ) & ~rows_baseline.str.endswith("Time")
+        limesurvey_control = (
+            participant_info_t[rows_demand]
+            .reset_index()
+            .rename(columns={"index": "question", 0: "score_original"})
+        )
+        # Extract question IDs from names such as JobEisen[3]
+        limesurvey_control.loc[:, "qid"] = (
+            limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
+        )
+        limesurvey_control["score"] = limesurvey_control["score_original"]
+        # Identify rows that include questions to be reversed.
+        rows_demand_reverse = limesurvey_control["qid"].isin(
+            dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
+        )
+        # Reverse the score (max + min - original), so that the maximum value becomes the minimum and vice versa.
+        limesurvey_control.loc[rows_demand_reverse, "score"] = (
+            LIMESURVEY_JCQ_MAX
+            + LIMESURVEY_JCQ_MIN
+            - limesurvey_control.loc[rows_demand_reverse, "score_original"]
+        )
+        # TODO Write to data/interim
+        baseline_features.loc[0, "limesurvey_demand"] = limesurvey_control[
+            "score"
+        ].sum()
+
 
 baseline_features.to_csv(
     snakemake.output[0], index=False, encoding="utf-8",