From 9a74e74d084e3bc49316414ee63c4f2bb77d2542 Mon Sep 17 00:00:00 2001
From: junos <junos.lukan@ijs.si>
Date: Wed, 23 Feb 2022 18:15:26 +0100
Subject: [PATCH 1/5] Add the baseline features rule to snakefile.

Fix the age calculation to operate on a single Timedelta value (`.days`) instead of a Series (`.dt.days`).
---
 Snakefile                     | 3 ++-
 src/data/baseline_features.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 01af47dd..9e0efdcc 100644
--- a/Snakefile
+++ b/Snakefile
@@ -403,9 +403,10 @@ for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
     if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
         files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
 
-# Demographic features
+# Baseline features
 files_to_compute.extend(expand("data/raw/baseline_merged.csv"))
 files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
+files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]))
 
 rule all:
     input:
diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py
index 60671911..599dab4c 100644
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@@ -10,7 +10,7 @@ if not participant_info.empty:
         now = pd.Timestamp("now")
         baseline_features.loc[0, "age"] = (
             now - participant_info.loc[0, "date_of_birth"]
-        ).dt.days / 365.25245
+        ).days / 365.25245
     if "gender" in requested_features:
         baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
     if "startlanguage" in requested_features:

From 30ac8b1cd5da922336774fb682a7ed4ffe4c12a7 Mon Sep 17 00:00:00 2001
From: junos <junos.lukan@ijs.si>
Date: Wed, 23 Feb 2022 19:08:10 +0100
Subject: [PATCH 2/5] Start calculating demand control features.

---
 config.yaml                   |  1 +
 rules/models.smk              |  3 ++-
 src/data/baseline_features.py | 15 +++++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/config.yaml b/config.yaml
index f14e3bfd..6a267081 100644
--- a/config.yaml
+++ b/config.yaml
@@ -634,5 +634,6 @@ PARAMS_FOR_ANALYSIS:
                 results-survey358134_final.csv,  # Belgium 1
                 results-survey413767_final.csv  # Belgium 2
     ]
+    QUESTION_LIST: survey637813+question_text.csv
     FEATURES: [age, gender, startlanguage]
     CATEGORICAL_FEATURES: [gender]
diff --git a/rules/models.smk b/rules/models.smk
index 94df273a..92b4a935 100644
--- a/rules/models.smk
+++ b/rules/models.smk
@@ -20,7 +20,8 @@ rule baseline_features:
         "data/raw/{pid}/participant_baseline_raw.csv"
     params:
         pid="{pid}",
-        features=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"]
+        features=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"],
+        question_filename=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["QUESTION_LIST"]
     output:
         "data/processed/features/{pid}/baseline_features.csv"
     script:
diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py
index 599dab4c..4978e122 100644
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@@ -3,6 +3,13 @@ import pandas as pd
 pid = snakemake.params["pid"]
 requested_features = snakemake.params["features"]
 baseline_features = pd.DataFrame(columns=requested_features)
+question_filename = snakemake.params["question_filename"]
+
+dict_JCQ_demand_control_reverse = {
+    "demand_0": " [Od mene se ne zahteva,",
+    "demand_1": " [Imam dovolj časa, da končam",
+    "demand_2": " [Pri svojem delu se ne srečujem s konfliktnimi"
+}
 
 participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
 if not participant_info.empty:
@@ -17,6 +24,14 @@ if not participant_info.empty:
         baseline_features.loc[0, "startlanguage"] = participant_info.loc[
             0, "startlanguage"
         ]
+    if "demand" in requested_features:
+        limesurvey_questions = pd.read_csv(question_filename, header=None).T
+        limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(r"\.\s", expand=True, n=1)
+        demand_reverse_lime_rows = limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_0"]) | \
+                                   limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_1"]) | \
+                                   limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_2"])
+        demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows]
+        demand_reverse_lime.loc[:, "qid"] = demand_reverse_lime["code"].str.extract(r"\[(\d+)\]")
 
 baseline_features.to_csv(
     snakemake.output[0], index=False, encoding="utf-8",

From 2fed9626444aad9220415126fda32d6d1ca0f59b Mon Sep 17 00:00:00 2001
From: junos <junos.lukan@ijs.si>
Date: Mon, 28 Feb 2022 18:30:41 +0100
Subject: [PATCH 3/5] Calculate JCQ demand score.

Hardcode question IDs to be reversed.
---
 src/data/baseline_features.py | 57 +++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 10 deletions(-)

diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py
index 4978e122..b9a601c1 100644
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@@ -5,13 +5,26 @@ requested_features = snakemake.params["features"]
 baseline_features = pd.DataFrame(columns=requested_features)
 question_filename = snakemake.params["question_filename"]
 
+JCQ_DEMAND = "JobEisen"
+JCQ_CONTROL = "JobControle"
+
 dict_JCQ_demand_control_reverse = {
-    "demand_0": " [Od mene se ne zahteva,",
-    "demand_1": " [Imam dovolj časa, da končam",
-    "demand_2": " [Pri svojem delu se ne srečujem s konfliktnimi"
+    JCQ_DEMAND: {
+        3: " [Od mene se ne zahteva,",
+        4: " [Imam dovolj časa, da končam",
+        5: " [Pri svojem delu se ne srečujem s konfliktnimi",
+    },
+    JCQ_CONTROL: {
+        2: " |Moje delo vključuje veliko ponavljajočega",
+        6: " [Pri svojem delu imam zelo malo svobode",
+    },
 }
 
+LIMESURVEY_JCQ_MIN = 1
+LIMESURVEY_JCQ_MAX = 4
+
 participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
+
 if not participant_info.empty:
     if "age" in requested_features:
         now = pd.Timestamp("now")
@@ -25,13 +38,37 @@ if not participant_info.empty:
             0, "startlanguage"
         ]
     if "demand" in requested_features:
-        limesurvey_questions = pd.read_csv(question_filename, header=None).T
-        limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(r"\.\s", expand=True, n=1)
-        demand_reverse_lime_rows = limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_0"]) | \
-                                   limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_1"]) | \
-                                   limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_2"])
-        demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows]
-        demand_reverse_lime.loc[:, "qid"] = demand_reverse_lime["code"].str.extract(r"\[(\d+)\]")
+        participant_info_t = participant_info.T
+        rows_baseline = participant_info_t.index
+        # Find questions about demand, but disregard time (duration of filling in questionnaire)
+        rows_demand = rows_baseline.str.startswith(
+            JCQ_DEMAND
+        ) & ~rows_baseline.str.endswith("Time")
+        limesurvey_control = (
+            participant_info_t[rows_demand]
+            .reset_index()
+            .rename(columns={"index": "question", 0: "score_original"})
+        )
+        # Extract question IDs from names such as JobEisen[3]
+        limesurvey_control.loc[:, "qid"] = (
+            limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
+        )
+        limesurvey_control["score"] = limesurvey_control["score_original"]
+        # Identify rows that include questions to be reversed.
+        rows_demand_reverse = limesurvey_control["qid"].isin(
+            dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
+        )
+        # Reverse the score, so that the maximum value becomes the minimum etc.
+        limesurvey_control.loc[rows_demand_reverse, "score"] = (
+            LIMESURVEY_JCQ_MAX
+            + LIMESURVEY_JCQ_MIN
+            - limesurvey_control.loc[rows_demand_reverse, "score_original"]
+        )
+        # TODO Write to data/interim
+        baseline_features.loc[0, "limesurvey_demand"] = limesurvey_control[
+            "score"
+        ].sum()
+
 
 baseline_features.to_csv(
     snakemake.output[0], index=False, encoding="utf-8",

From b5a6317f4b0d09fdf62151212bdfc294222233b2 Mon Sep 17 00:00:00 2001
From: junos <junos.lukan@ijs.si>
Date: Mon, 28 Feb 2022 18:51:47 +0100
Subject: [PATCH 4/5] Calculate JCQ control and demand control ratio.

Include norms and corresponding quartile.
---
 src/data/baseline_features.py | 158 +++++++++++++++++++++++++++-------
 1 file changed, 129 insertions(+), 29 deletions(-)

diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py
index b9a601c1..116607c8 100644
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 
 pid = snakemake.params["pid"]
@@ -23,6 +24,26 @@ dict_JCQ_demand_control_reverse = {
 LIMESURVEY_JCQ_MIN = 1
 LIMESURVEY_JCQ_MAX = 4
 
+DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
+DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
+
+JCQ_NORMS = {
+    "F": {
+        0: DEMAND_CONTROL_RATIO_MIN,
+        1: 0.45,
+        2: 0.52,
+        3: 0.62,
+        4: DEMAND_CONTROL_RATIO_MAX,
+    },
+    "M": {
+        0: DEMAND_CONTROL_RATIO_MIN,
+        1: 0.41,
+        2: 0.48,
+        3: 0.56,
+        4: DEMAND_CONTROL_RATIO_MAX,
+    },
+}
+
 participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
 
 if not participant_info.empty:
@@ -37,38 +58,117 @@ if not participant_info.empty:
         baseline_features.loc[0, "startlanguage"] = participant_info.loc[
             0, "startlanguage"
         ]
-    if "demand" in requested_features:
+    if (
+        ("demand" in requested_features)
+        or ("control" in requested_features)
+        or ("demand_control_ratio" in requested_features)
+    ):
         participant_info_t = participant_info.T
         rows_baseline = participant_info_t.index
-        # Find questions about demand, but disregard time (duration of filling in questionnaire)
-        rows_demand = rows_baseline.str.startswith(
-            JCQ_DEMAND
-        ) & ~rows_baseline.str.endswith("Time")
-        limesurvey_control = (
-            participant_info_t[rows_demand]
-            .reset_index()
-            .rename(columns={"index": "question", 0: "score_original"})
-        )
-        # Extract question IDs from names such as JobEisen[3]
-        limesurvey_control.loc[:, "qid"] = (
-            limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
-        )
-        limesurvey_control["score"] = limesurvey_control["score_original"]
-        # Identify rows that include questions to be reversed.
-        rows_demand_reverse = limesurvey_control["qid"].isin(
-            dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
-        )
-        # Reverse the score, so that the maximum value becomes the minimum etc.
-        limesurvey_control.loc[rows_demand_reverse, "score"] = (
-            LIMESURVEY_JCQ_MAX
-            + LIMESURVEY_JCQ_MIN
-            - limesurvey_control.loc[rows_demand_reverse, "score_original"]
-        )
-        # TODO Write to data/interim
-        baseline_features.loc[0, "limesurvey_demand"] = limesurvey_control[
-            "score"
-        ].sum()
 
+        if ("demand" in requested_features) or (
+            "demand_control_ratio" in requested_features
+        ):
+            # Find questions about demand, but disregard time (duration of filling in questionnaire)
+            rows_demand = rows_baseline.str.startswith(
+                JCQ_DEMAND
+            ) & ~rows_baseline.str.endswith("Time")
+            limesurvey_demand = (
+                participant_info_t[rows_demand]
+                .reset_index()
+                .rename(columns={"index": "question", 0: "score_original"})
+            )
+            # Extract question IDs from names such as JobEisen[3]
+            limesurvey_demand.loc[:, "qid"] = (
+                limesurvey_demand["question"].str.extract(r"\[(\d+)\]").astype(int)
+            )
+            limesurvey_demand["score"] = limesurvey_demand["score_original"]
+            # Identify rows that include questions to be reversed.
+            rows_demand_reverse = limesurvey_demand["qid"].isin(
+                dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
+            )
+            # Reverse the score, so that the maximum value becomes the minimum etc.
+            limesurvey_demand.loc[rows_demand_reverse, "score"] = (
+                LIMESURVEY_JCQ_MAX
+                + LIMESURVEY_JCQ_MIN
+                - limesurvey_demand.loc[rows_demand_reverse, "score_original"]
+            )
+            # TODO Write to data/interim
+            if "demand" in requested_features:
+                baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
+                    "score"
+                ].sum()
+
+        if ("control" in requested_features) or (
+            "demand_control_ratio" in requested_features
+        ):
+            # Find questions about control, but disregard time (duration of filling in questionnaire)
+            rows_control = rows_baseline.str.startswith(
+                JCQ_CONTROL
+            ) & ~rows_baseline.str.endswith("Time")
+            limesurvey_control = (
+                participant_info_t[rows_control]
+                .reset_index()
+                .rename(columns={"index": "question", 0: "score_original"})
+            )
+            # Extract question IDs from names such as JobControle[3]
+            limesurvey_control.loc[:, "qid"] = (
+                limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
+            )
+            limesurvey_control["score"] = limesurvey_control["score_original"]
+            # Identify rows that include questions to be reversed.
+            rows_control_reverse = limesurvey_control["qid"].isin(
+                dict_JCQ_demand_control_reverse[JCQ_CONTROL].keys()
+            )
+            # Reverse the score, so that the maximum value becomes the minimum etc.
+            limesurvey_control.loc[rows_control_reverse, "score"] = (
+                LIMESURVEY_JCQ_MAX
+                + LIMESURVEY_JCQ_MIN
+                - limesurvey_control.loc[rows_control_reverse, "score_original"]
+            )
+            # TODO Write to data/interim
+            if "control" in requested_features:
+                baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
+                    "score"
+                ].sum()
+
+        if "demand_control_ratio" in requested_features:
+            limesurvey_demand_control_ratio = (
+                limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
+            )
+            if (
+                JCQ_NORMS[participant_info.loc[0, "gender"]][0]
+                <= limesurvey_demand_control_ratio
+                < JCQ_NORMS[participant_info.loc[0, "gender"]][1]
+            ):
+                limesurvey_quartile = 1
+            elif (
+                JCQ_NORMS[participant_info.loc[0, "gender"]][1]
+                <= limesurvey_demand_control_ratio
+                < JCQ_NORMS[participant_info.loc[0, "gender"]][2]
+            ):
+                limesurvey_quartile = 2
+            elif (
+                JCQ_NORMS[participant_info.loc[0, "gender"]][2]
+                <= limesurvey_demand_control_ratio
+                < JCQ_NORMS[participant_info.loc[0, "gender"]][3]
+            ):
+                limesurvey_quartile = 3
+            elif (
+                JCQ_NORMS[participant_info.loc[0, "gender"]][3]
+                <= limesurvey_demand_control_ratio
+                <= JCQ_NORMS[participant_info.loc[0, "gender"]][4]  # inclusive: the maximum ratio belongs to quartile 4
+            ):
+                limesurvey_quartile = 4
+            else:
+                limesurvey_quartile = np.nan
+
+            baseline_features.loc[
+                0, "limesurvey_demand_control_ratio"
+            ] = limesurvey_demand_control_ratio
+            baseline_features.loc[
+                0, "limesurvey_demand_control_ratio_quartile"
+            ] = limesurvey_quartile
 
 baseline_features.to_csv(
     snakemake.output[0], index=False, encoding="utf-8",

From f13a91044d884a690d162ce5828fa8dfcfdfeb2c Mon Sep 17 00:00:00 2001
From: junos <junos.lukan@ijs.si>
Date: Tue, 1 Mar 2022 11:39:58 +0100
Subject: [PATCH 5/5] Write questionnaire data to data/interim.

---
 rules/models.smk              |  3 ++-
 src/data/baseline_features.py | 12 +++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/rules/models.smk b/rules/models.smk
index 92b4a935..6d4b0bb8 100644
--- a/rules/models.smk
+++ b/rules/models.smk
@@ -23,6 +23,7 @@ rule baseline_features:
         features=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"],
         question_filename=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["QUESTION_LIST"]
     output:
-        "data/processed/features/{pid}/baseline_features.csv"
+        interim="data/interim/{pid}/baseline_questionnaires.csv",
+        features="data/processed/features/{pid}/baseline_features.csv"
     script:
         "../src/data/baseline_features.py"
diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py
index 116607c8..61ea6eb7 100644
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@@ -3,6 +3,7 @@ import pandas as pd
 
 pid = snakemake.params["pid"]
 requested_features = snakemake.params["features"]
+baseline_interim = pd.DataFrame(columns=["qid", "question", "score_original", "score"])
 baseline_features = pd.DataFrame(columns=requested_features)
 question_filename = snakemake.params["question_filename"]
 
@@ -93,7 +94,7 @@ if not participant_info.empty:
                 + LIMESURVEY_JCQ_MIN
                 - limesurvey_demand.loc[rows_demand_reverse, "score_original"]
             )
-            # TODO Write to data/interim
+            baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)  # concat returns a new frame
             if "demand" in requested_features:
                 baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
                     "score"
@@ -126,7 +127,7 @@ if not participant_info.empty:
                 + LIMESURVEY_JCQ_MIN
                 - limesurvey_control.loc[rows_control_reverse, "score_original"]
             )
-            # TODO Write to data/interim
+            baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True)  # concat returns a new frame
             if "control" in requested_features:
                 baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
                     "score"
@@ -170,6 +171,7 @@ if not participant_info.empty:
                 0, "limesurvey_demand_control_ratio_quartile"
             ] = limesurvey_quartile
 
-baseline_features.to_csv(
-    snakemake.output[0], index=False, encoding="utf-8",
-)
+# Always write the interim file (header-only when empty): Snakemake requires every declared output to exist.
+baseline_interim.to_csv(snakemake.output["interim"], index=False, encoding="utf-8")
+
+baseline_features.to_csv(snakemake.output["features"], index=False, encoding="utf-8")