Calculate JCQ control and demand control ratio.

Include norms and corresponding quartile.
2022-02-28 18:51:47 +01:00 · 2022-02-28 18:51:47 +01:00 · b5a6317f4b
parent 2fed962644
commit b5a6317f4b
1 changed file with 129 additions and 29 deletions
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@ -1,3 +1,4 @@
 import numpy as np
 import pandas as pd
 pid = snakemake.params["pid"]
@ -23,6 +24,26 @@ dict_JCQ_demand_control_reverse = {
 # Lowest and highest value a single JCQ item score can take. Their sum is also
 # used further down to reverse-score items (MAX + MIN - score).
 LIMESURVEY_JCQ_MIN = 1
 LIMESURVEY_JCQ_MAX = 4
 # Outer bounds of the demand/control ratio.
 # NOTE(review): the literals presumably encode 5 demand items and 9 control
 # items, each scored 1-4, i.e. min = (5 * 1) / (9 * 4) and
 # max = (5 * 4) / (9 * 1) - confirm the item counts against the questionnaire.
 DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
 DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
 # Demand/control ratio boundaries, keyed first by the participant's "gender"
 # value ("F" or "M") and then by boundary index 0-4. Boundaries 0 and 4 are the
 # outer ratio bounds defined above; a participant's quartile is found by
 # locating the ratio between two consecutive boundaries.
 # NOTE(review): the provenance of the inner cut-offs (1-3) is not shown here -
 # add a reference to the norm data they were taken from.
 JCQ_NORMS = {
    "F": {
        0: DEMAND_CONTROL_RATIO_MIN,
        1: 0.45,
        2: 0.52,
        3: 0.62,
        4: DEMAND_CONTROL_RATIO_MAX,
    },
    "M": {
        0: DEMAND_CONTROL_RATIO_MIN,
        1: 0.41,
        2: 0.48,
        3: 0.56,
        4: DEMAND_CONTROL_RATIO_MAX,
    },
 }
 participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
 if not participant_info.empty:
@ -37,38 +58,117 @@ if not participant_info.empty:
        baseline_features.loc[0, "startlanguage"] = participant_info.loc[
            0, "startlanguage"
        ]
-    if "demand" in requested_features:
+    if (
        ("demand" in requested_features)
        or ("control" in requested_features)
        or ("demand_control_ratio" in requested_features)
    ):
        # Transpose the participant frame so that every questionnaire column
        # becomes a row labelled by its column name; column 0 of the transposed
        # frame then holds the answers of the first participant row.
        participant_info_t = participant_info.T
        rows_baseline = participant_info_t.index
        # NOTE(review): everything from here to the end of this section scores
        # the *demand* items, yet keeps them in a frame named
        # `limesurvey_control` and writes `limesurvey_demand` without checking
        # that "demand" was requested. The statements that follow this section
        # repeat the same computation under the proper name and guard. In this
        # diff view these lines look like the pre-change body that the commit
        # removes - confirm, and drop them if they are still present in the file.
        # Find questions about demand, but disregard time (duration of filling in questionnaire)
        rows_demand = rows_baseline.str.startswith(
            JCQ_DEMAND
        ) & ~rows_baseline.str.endswith("Time")
        limesurvey_control = (
            participant_info_t[rows_demand]
            .reset_index()
            .rename(columns={"index": "question", 0: "score_original"})
        )
        # Extract question IDs from names such as JobEisen[3]
        limesurvey_control.loc[:, "qid"] = (
            limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
        )
        # Start from the raw answers; only the reversed items are overwritten below.
        limesurvey_control["score"] = limesurvey_control["score_original"]
        # Identify rows that include questions to be reversed.
        rows_demand_reverse = limesurvey_control["qid"].isin(
            dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
        )
        # Reverse the score, so that the maximum value becomes the minimum etc.
        limesurvey_control.loc[rows_demand_reverse, "score"] = (
            LIMESURVEY_JCQ_MAX
            + LIMESURVEY_JCQ_MIN
            - limesurvey_control.loc[rows_demand_reverse, "score_original"]
        )
        # TODO Write to data/interim
        # The demand feature is the sum of all (reverse-corrected) item scores.
        baseline_features.loc[0, "limesurvey_demand"] = limesurvey_control[
            "score"
        ].sum()
        if any(
            feature in requested_features
            for feature in ("demand", "demand_control_ratio")
        ):
            # Select the demand items. Columns ending in "Time" hold the
            # duration of filling in the questionnaire and are left out.
            is_demand_item = rows_baseline.str.startswith(JCQ_DEMAND)
            is_duration = rows_baseline.str.endswith("Time")
            rows_demand = is_demand_item & ~is_duration
            limesurvey_demand = participant_info_t[rows_demand].reset_index()
            limesurvey_demand = limesurvey_demand.rename(
                columns={"index": "question", 0: "score_original"}
            )
            # Question IDs are the bracketed numbers in names such as JobEisen[3].
            demand_qids = limesurvey_demand["question"].str.extract(r"\[(\d+)\]")
            limesurvey_demand.loc[:, "qid"] = demand_qids.astype(int)
            # Start from the raw answers and overwrite only the reversed items.
            limesurvey_demand["score"] = limesurvey_demand["score_original"]
            demand_qids_to_reverse = dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
            rows_demand_reverse = limesurvey_demand["qid"].isin(demand_qids_to_reverse)
            # Mirror the score within the scale: the maximum becomes the minimum etc.
            limesurvey_demand.loc[rows_demand_reverse, "score"] = (
                LIMESURVEY_JCQ_MAX + LIMESURVEY_JCQ_MIN
            ) - limesurvey_demand.loc[rows_demand_reverse, "score_original"]
            # TODO Write to data/interim
            if "demand" in requested_features:
                # The demand feature is the sum over all demand item scores.
                demand_total = limesurvey_demand["score"].sum()
                baseline_features.loc[0, "limesurvey_demand"] = demand_total
        if any(
            feature in requested_features
            for feature in ("control", "demand_control_ratio")
        ):
            # Select the control items. Columns ending in "Time" hold the
            # duration of filling in the questionnaire and are left out.
            is_control_item = rows_baseline.str.startswith(JCQ_CONTROL)
            is_duration = rows_baseline.str.endswith("Time")
            rows_control = is_control_item & ~is_duration
            limesurvey_control = participant_info_t[rows_control].reset_index()
            limesurvey_control = limesurvey_control.rename(
                columns={"index": "question", 0: "score_original"}
            )
            # Question IDs are the bracketed numbers in names such as JobControle[3].
            control_qids = limesurvey_control["question"].str.extract(r"\[(\d+)\]")
            limesurvey_control.loc[:, "qid"] = control_qids.astype(int)
            # Start from the raw answers and overwrite only the reversed items.
            limesurvey_control["score"] = limesurvey_control["score_original"]
            control_qids_to_reverse = dict_JCQ_demand_control_reverse[
                JCQ_CONTROL
            ].keys()
            rows_control_reverse = limesurvey_control["qid"].isin(
                control_qids_to_reverse
            )
            # Mirror the score within the scale: the maximum becomes the minimum etc.
            limesurvey_control.loc[rows_control_reverse, "score"] = (
                LIMESURVEY_JCQ_MAX + LIMESURVEY_JCQ_MIN
            ) - limesurvey_control.loc[rows_control_reverse, "score_original"]
            # TODO Write to data/interim
            if "control" in requested_features:
                # The control feature is the sum over all control item scores.
                control_total = limesurvey_control["score"].sum()
                baseline_features.loc[0, "limesurvey_control"] = control_total
        if "demand_control_ratio" in requested_features:
            # Ratio of the summed (reverse-corrected) demand item scores to the
            # summed control item scores.
            limesurvey_demand_control_ratio = (
                limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
            )
            # Boundaries for this participant's gender. `None` when JCQ_NORMS
            # has no entry for the value (missing gender or a category other
            # than the ones listed), in which case no quartile can be assigned.
            gender_norms = JCQ_NORMS.get(participant_info.loc[0, "gender"])
            # NaN marks "no quartile": no norms for the gender, a NaN ratio, or
            # a ratio outside the range covered by the norms.
            limesurvey_quartile = np.nan
            if gender_norms is not None:
                top_quartile = len(gender_norms) - 1
                for quartile in range(1, top_quartile + 1):
                    lower_bound = gender_norms[quartile - 1]
                    upper_bound = gender_norms[quartile]
                    # Quartiles are half-open intervals [lower, upper). The
                    # last one also includes its upper bound, so that the
                    # highest attainable ratio lands in the top quartile
                    # instead of being reported as NaN.
                    in_interval = (
                        lower_bound <= limesurvey_demand_control_ratio < upper_bound
                    )
                    at_maximum = (
                        quartile == top_quartile
                        and limesurvey_demand_control_ratio == upper_bound
                    )
                    if in_interval or at_maximum:
                        limesurvey_quartile = quartile
                        break
            baseline_features.loc[
                0, "limesurvey_demand_control_ratio"
            ] = limesurvey_demand_control_ratio
            baseline_features.loc[
                0, "limesurvey_demand_control_ratio_quartile"
            ] = limesurvey_quartile
 baseline_features.to_csv(
    snakemake.output[0], index=False, encoding="utf-8",