rapids/src/data/baseline_features.py

import numpy as np
import pandas as pd

# Parameters handed over by the Snakemake rule that runs this script.
pid = snakemake.params["pid"]
requested_features = snakemake.params["features"]
question_filename = snakemake.params["question_filename"]

# Empty result containers:
# - baseline_features gets one column per requested feature,
# - baseline_interim is meant to hold the per-question questionnaire scores.
baseline_features = pd.DataFrame(columns=requested_features)
baseline_interim = pd.DataFrame(
    columns=["qid", "question", "score_original", "score"]
)

# Column-name prefixes used to pick out the questions of each JCQ scale.
JCQ_DEMAND = "JobEisen"
JCQ_CONTROL = "JobControle"

# Question IDs that are reverse-scored, per scale. Only the keys are consulted
# when scoring; the values appear to be the opening words of the question
# texts and serve as a reminder of which item each ID refers to.
dict_JCQ_demand_control_reverse = {}
dict_JCQ_demand_control_reverse[JCQ_DEMAND] = {
    3: " [Od mene se ne zahteva,",
    4: " [Imam dovolj časa, da končam",
    5: " [Pri svojem delu se ne srečujem s konfliktnimi",
}
dict_JCQ_demand_control_reverse[JCQ_CONTROL] = {
    2: " |Moje delo vključuje veliko ponavljajočega",
    6: " [Pri svojem delu imam zelo malo svobode",
}

# Lowest and highest answer value of a single question; a reversed score is
# computed as MAX + MIN - original.
LIMESURVEY_JCQ_MIN = 1
LIMESURVEY_JCQ_MAX = 4

# Outermost edges of the demand/control ratio range.
# NOTE(review): presumably 5 demand items over 9 control items, each answered
# on the 1-4 scale above -- confirm against the questionnaire.
DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9

# Interior quartile cut-points of the demand/control ratio, per gender.
_JCQ_QUARTILE_CUTS = {
    "F": (0.45, 0.52, 0.62),
    "M": (0.41, 0.48, 0.56),
}

# Edge index (0..4) -> ratio threshold, per gender. Edges 0 and 4 are the
# overall minimum and maximum; edges 1-3 are the cut-points listed above.
JCQ_NORMS = {
    gender: dict(
        enumerate((DEMAND_CONTROL_RATIO_MIN, *cuts, DEMAND_CONTROL_RATIO_MAX))
    )
    for gender, cuts in _JCQ_QUARTILE_CUTS.items()
}

def _score_jcq_scale(participant_info_t, scale_prefix):
    """Return the per-question scores of one JCQ scale.

    Parameters
    ----------
    participant_info_t : pandas.DataFrame
        Transposed participant table: one row per original column name, with
        the values of participant row 0 in column ``0``.
    scale_prefix : str
        Column-name prefix of the scale (``JCQ_DEMAND`` or ``JCQ_CONTROL``).
        It is also the key into ``dict_JCQ_demand_control_reverse``.

    Returns
    -------
    pandas.DataFrame
        One row per question with the columns ``question`` (original column
        name), ``score_original``, ``qid`` and ``score`` (reversed where the
        question ID is listed as reverse-scored).
    """
    column_names = participant_info_t.index
    # Keep the scale's questions, but disregard time (duration of filling in
    # the questionnaire).
    is_scale_question = column_names.str.startswith(
        scale_prefix
    ) & ~column_names.str.endswith("Time")
    scale = (
        participant_info_t[is_scale_question]
        .reset_index()
        .rename(columns={"index": "question", 0: "score_original"})
    )
    # Extract question IDs from names such as JobEisen[3]. expand=False yields
    # a Series, so a plain column assignment is enough.
    scale["qid"] = (
        scale["question"].str.extract(r"\[(\d+)\]", expand=False).astype(int)
    )
    scale["score"] = scale["score_original"]
    # Reverse the listed questions, so that the maximum value becomes the
    # minimum etc.
    is_reversed = scale["qid"].isin(
        list(dict_JCQ_demand_control_reverse[scale_prefix])
    )
    scale.loc[is_reversed, "score"] = (
        LIMESURVEY_JCQ_MAX
        + LIMESURVEY_JCQ_MIN
        - scale.loc[is_reversed, "score_original"]
    )
    return scale


def _demand_control_quartile(ratio, norms):
    """Map a demand/control ratio onto its quartile (1-4).

    ``norms`` maps the edge indices 0..4 to ratio thresholds; quartile ``q``
    covers ``norms[q - 1] <= ratio < norms[q]``. The topmost edge is treated
    as inclusive so that the largest possible ratio lands in quartile 4
    instead of falling through. Returns NaN when the ratio lies outside the
    edges or is itself NaN.
    """
    for quartile in range(1, 5):
        if norms[quartile - 1] <= ratio < norms[quartile]:
            return quartile
    if ratio == norms[4]:
        return 4
    return np.nan


participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])

if not participant_info.empty:
    # Only the first row of the participant file is used throughout.
    if "age" in requested_features:
        now = pd.Timestamp("now")
        # Age in years at the time the script runs.
        baseline_features.loc[0, "age"] = (
            now - participant_info.loc[0, "date_of_birth"]
        ).days / 365.25245
    if "gender" in requested_features:
        baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
    if "startlanguage" in requested_features:
        baseline_features.loc[0, "startlanguage"] = participant_info.loc[
            0, "startlanguage"
        ]

    wants_demand = "demand" in requested_features
    wants_control = "control" in requested_features
    # The ratio needs both scale totals, so it triggers both computations.
    wants_ratio = "demand_control_ratio" in requested_features

    if wants_demand or wants_control or wants_ratio:
        participant_info_t = participant_info.T

        if wants_demand or wants_ratio:
            limesurvey_demand = _score_jcq_scale(participant_info_t, JCQ_DEMAND)
            # pd.concat returns a new frame; the result has to be assigned
            # back, otherwise the interim data is silently dropped.
            baseline_interim = pd.concat(
                [baseline_interim, limesurvey_demand], axis=0, ignore_index=True
            )
            if wants_demand:
                baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
                    "score"
                ].sum()

        if wants_control or wants_ratio:
            limesurvey_control = _score_jcq_scale(participant_info_t, JCQ_CONTROL)
            baseline_interim = pd.concat(
                [baseline_interim, limesurvey_control], axis=0, ignore_index=True
            )
            if wants_control:
                baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
                    "score"
                ].sum()

        if wants_ratio:
            demand_total = limesurvey_demand["score"].sum()
            control_total = limesurvey_control["score"].sum()
            # A zero total means no control questions were found; report the
            # ratio as missing rather than dividing by zero.
            limesurvey_demand_control_ratio = (
                demand_total / control_total if control_total else np.nan
            )
            # Norms are gender specific; a gender without an entry in
            # JCQ_NORMS raises KeyError here.
            limesurvey_quartile = _demand_control_quartile(
                limesurvey_demand_control_ratio,
                JCQ_NORMS[participant_info.loc[0, "gender"]],
            )

            baseline_features.loc[
                0, "limesurvey_demand_control_ratio"
            ] = limesurvey_demand_control_ratio
            baseline_features.loc[
                0, "limesurvey_demand_control_ratio_quartile"
            ] = limesurvey_quartile

# NOTE(review): the interim file is only written when questionnaire rows were
# collected -- confirm the Snakemake rule tolerates this output being absent.
if not baseline_interim.empty:
    baseline_interim.to_csv(snakemake.output["interim"], index=False, encoding="utf-8")

baseline_features.to_csv(snakemake.output["features"], index=False, encoding="utf-8")
Calculate JCQ control and demand control ratio. Include norms and corresponding quartile. 2022-02-28 18:51:47 +01:00			`import numpy as np`
Prepare baseline feature rule. 2022-02-23 11:09:33 +01:00			`import pandas as pd`

			`pid = snakemake.params["pid"]`
			`requested_features = snakemake.params["features"]`
Write questionnaire data to data/interim. 2022-03-01 11:39:58 +01:00			`baseline_interim = pd.DataFrame(columns=["qid", "question", "score_original", "score"])`
Prepare baseline feature rule. 2022-02-23 11:09:33 +01:00			`baseline_features = pd.DataFrame(columns=requested_features)`
Start calculating demand control features. 2022-02-23 19:08:10 +01:00			`question_filename = snakemake.params["question_filename"]`

Calculate JCQ demand score. Hardcode question IDs to be reversed. 2022-02-28 18:30:41 +01:00			`JCQ_DEMAND = "JobEisen"`
			`JCQ_CONTROL = "JobControle"`

Start calculating demand control features. 2022-02-23 19:08:10 +01:00			`dict_JCQ_demand_control_reverse = {`
Calculate JCQ demand score. Hardcode question IDs to be reversed. 2022-02-28 18:30:41 +01:00			`JCQ_DEMAND: {`
			`3: " [Od mene se ne zahteva,",`
			`4: " [Imam dovolj časa, da končam",`
			`5: " [Pri svojem delu se ne srečujem s konfliktnimi",`
			`},`
			`JCQ_CONTROL: {`
			`2: " \|Moje delo vključuje veliko ponavljajočega",`
			`6: " [Pri svojem delu imam zelo malo svobode",`
			`},`
Start calculating demand control features. 2022-02-23 19:08:10 +01:00			`}`
Prepare baseline feature rule. 2022-02-23 11:09:33 +01:00
Calculate JCQ demand score. Hardcode question IDs to be reversed. 2022-02-28 18:30:41 +01:00			`LIMESURVEY_JCQ_MIN = 1`
			`LIMESURVEY_JCQ_MAX = 4`

Calculate JCQ control and demand control ratio. Include norms and corresponding quartile. 2022-02-28 18:51:47 +01:00			`DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)`
			`DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9`

			`JCQ_NORMS = {`
			`"F": {`
			`0: DEMAND_CONTROL_RATIO_MIN,`
			`1: 0.45,`
			`2: 0.52,`
			`3: 0.62,`
			`4: DEMAND_CONTROL_RATIO_MAX,`
			`},`
			`"M": {`
			`0: DEMAND_CONTROL_RATIO_MIN,`
			`1: 0.41,`
			`2: 0.48,`
			`3: 0.56,`
			`4: DEMAND_CONTROL_RATIO_MAX,`
			`},`
			`}`

Add age, gender, and language as features. Move calculation of age from merge_baseline_data.py to baseline_features.py. 2022-02-23 18:05:23 +01:00			`participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])`
Calculate JCQ demand score. Hardcode question IDs to be reversed. 2022-02-28 18:30:41 +01:00
Add age, gender, and language as features. Move calculation of age from merge_baseline_data.py to baseline_features.py. 2022-02-23 18:05:23 +01:00			`if not participant_info.empty:`
			`if "age" in requested_features:`
			`now = pd.Timestamp("now")`
			`baseline_features.loc[0, "age"] = (`
			`now - participant_info.loc[0, "date_of_birth"]`
Add the baseline features rule to snakefile. Correct age calculation for a single value instead of dataframe. 2022-02-23 18:15:26 +01:00			`).days / 365.25245`
Add age, gender, and language as features. Move calculation of age from merge_baseline_data.py to baseline_features.py. 2022-02-23 18:05:23 +01:00			`if "gender" in requested_features:`
			`baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]`
			`if "startlanguage" in requested_features:`
			`baseline_features.loc[0, "startlanguage"] = participant_info.loc[`
			`0, "startlanguage"`
			`]`
Calculate JCQ control and demand control ratio. Include norms and corresponding quartile. 2022-02-28 18:51:47 +01:00			`if (`
			`("demand" in requested_features)`
			`or ("control" in requested_features)`
			`or ("demand_control_ratio" in requested_features)`
			`):`
Calculate JCQ demand score. Hardcode question IDs to be reversed. 2022-02-28 18:30:41 +01:00			`participant_info_t = participant_info.T`
			`rows_baseline = participant_info_t.index`

Calculate JCQ control and demand control ratio. Include norms and corresponding quartile. 2022-02-28 18:51:47 +01:00			`if ("demand" in requested_features) or (`
			`"demand_control_ratio" in requested_features`
			`):`
			`# Find questions about demand, but disregard time (duration of filling in questionnaire)`
			`rows_demand = rows_baseline.str.startswith(`
			`JCQ_DEMAND`
			`) & ~rows_baseline.str.endswith("Time")`
			`limesurvey_demand = (`
			`participant_info_t[rows_demand]`
			`.reset_index()`
			`.rename(columns={"index": "question", 0: "score_original"})`
			`)`
			`# Extract question IDs from names such as JobEisen[3]`
			`limesurvey_demand.loc[:, "qid"] = (`
			`limesurvey_demand["question"].str.extract(r"\[(\d+)\]").astype(int)`
			`)`
			`limesurvey_demand["score"] = limesurvey_demand["score_original"]`
			`# Identify rows that include questions to be reversed.`
			`rows_demand_reverse = limesurvey_demand["qid"].isin(`
			`dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()`
			`)`
			`# Reverse the score, so that the maximum value becomes the minimum etc.`
			`limesurvey_demand.loc[rows_demand_reverse, "score"] = (`
			`LIMESURVEY_JCQ_MAX`
			`+ LIMESURVEY_JCQ_MIN`
			`- limesurvey_demand.loc[rows_demand_reverse, "score_original"]`
			`)`
Write questionnaire data to data/interim. 2022-03-01 11:39:58 +01:00			`pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)`
Calculate JCQ control and demand control ratio. Include norms and corresponding quartile. 2022-02-28 18:51:47 +01:00			`if "demand" in requested_features:`
			`baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[`
			`"score"`
			`].sum()`

			`if ("control" in requested_features) or (`
			`"demand_control_ratio" in requested_features`
			`):`
			`# Find questions about control, but disregard time (duration of filling in questionnaire)`
			`rows_control = rows_baseline.str.startswith(`
			`JCQ_CONTROL`
			`) & ~rows_baseline.str.endswith("Time")`
			`limesurvey_control = (`
			`participant_info_t[rows_control]`
			`.reset_index()`
			`.rename(columns={"index": "question", 0: "score_original"})`
			`)`
			`# Extract question IDs from names such as JobControle[3]`
			`limesurvey_control.loc[:, "qid"] = (`
			`limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)`
			`)`
			`limesurvey_control["score"] = limesurvey_control["score_original"]`
			`# Identify rows that include questions to be reversed.`
			`rows_control_reverse = limesurvey_control["qid"].isin(`
			`dict_JCQ_demand_control_reverse[JCQ_CONTROL].keys()`
			`)`
			`# Reverse the score, so that the maximum value becomes the minimum etc.`
			`limesurvey_control.loc[rows_control_reverse, "score"] = (`
			`LIMESURVEY_JCQ_MAX`
			`+ LIMESURVEY_JCQ_MIN`
			`- limesurvey_control.loc[rows_control_reverse, "score_original"]`
			`)`
Write questionnaire data to data/interim. 2022-03-01 11:39:58 +01:00			`pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True)`
Calculate JCQ control and demand control ratio. Include norms and corresponding quartile. 2022-02-28 18:51:47 +01:00			`if "control" in requested_features:`
			`baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[`
			`"score"`
			`].sum()`

			`if "demand_control_ratio" in requested_features:`
			`limesurvey_demand_control_ratio = (`
			`limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()`
			`)`
			`if (`
			`JCQ_NORMS[participant_info.loc[0, "gender"]][0]`
			`<= limesurvey_demand_control_ratio`
			`< JCQ_NORMS[participant_info.loc[0, "gender"]][1]`
			`):`
			`limesurvey_quartile = 1`
			`elif (`
			`JCQ_NORMS[participant_info.loc[0, "gender"]][1]`
			`<= limesurvey_demand_control_ratio`
			`< JCQ_NORMS[participant_info.loc[0, "gender"]][2]`
			`):`
			`limesurvey_quartile = 2`
			`elif (`
			`JCQ_NORMS[participant_info.loc[0, "gender"]][2]`
			`<= limesurvey_demand_control_ratio`
			`< JCQ_NORMS[participant_info.loc[0, "gender"]][3]`
			`):`
			`limesurvey_quartile = 3`
			`elif (`
			`JCQ_NORMS[participant_info.loc[0, "gender"]][3]`
			`<= limesurvey_demand_control_ratio`
			`< JCQ_NORMS[participant_info.loc[0, "gender"]][4]`
			`):`
			`limesurvey_quartile = 4`
			`else:`
			`limesurvey_quartile = np.nan`

			`baseline_features.loc[`
			`0, "limesurvey_demand_control_ratio"`
			`] = limesurvey_demand_control_ratio`
			`baseline_features.loc[`
			`0, "limesurvey_demand_control_ratio_quartile"`
			`] = limesurvey_quartile`
Add age, gender, and language as features. Move calculation of age from merge_baseline_data.py to baseline_features.py. 2022-02-23 18:05:23 +01:00
Write questionnaire data to data/interim. 2022-03-01 11:39:58 +01:00			`if not baseline_interim.empty:`
			`baseline_interim.to_csv(snakemake.output["interim"], index=False, encoding="utf-8")`

			`baseline_features.to_csv(snakemake.output["features"], index=False, encoding="utf-8")`