# rapids/src/data/baseline_features.py

import pandas as pd

# Settings handed over by the Snakemake rule that executes this script.
rule_params = snakemake.params
pid = rule_params["pid"]
question_filename = rule_params["question_filename"]
requested_features = rule_params["features"]

# Output table with one column per requested feature; values are filled in
# further down, and the table is written to snakemake.output[0] at the end.
baseline_features = pd.DataFrame(columns=requested_features)

# Column-name prefixes used to pick questionnaire items out of the baseline
# export (columns are matched with str.startswith on these values).
JCQ_DEMAND = "JobEisen"
JCQ_CONTROL = "JobControle"

# Lowest and highest answer value; a reversed item is scored as
# LIMESURVEY_JCQ_MAX + LIMESURVEY_JCQ_MIN - original.
LIMESURVEY_JCQ_MIN = 1
LIMESURVEY_JCQ_MAX = 4

# Items that are reverse-scored, keyed by question ID. Only the keys are read
# by this script; the values appear to be the beginning of each question's
# wording and serve as documentation.
_reversed_demand_items = {
    3: " [Od mene se ne zahteva,",
    4: " [Imam dovolj časa, da končam",
    5: " [Pri svojem delu se ne srečujem s konfliktnimi",
}
# NOTE(review): item 2 starts with " |" while every other entry starts with
# " [" - possibly a typo; left untouched because the text is never matched.
_reversed_control_items = {
    2: " |Moje delo vključuje veliko ponavljajočega",
    6: " [Pri svojem delu imam zelo malo svobode",
}

dict_JCQ_demand_control_reverse = {
    JCQ_DEMAND: _reversed_demand_items,
    JCQ_CONTROL: _reversed_control_items,
}

# Baseline questionnaire export; only its first row (index 0) is used below.
participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])

if not participant_info.empty:
    if "age" in requested_features:
        # Age in years, measured at the moment this script runs.
        # NOTE(review): the year length 365.25245 is kept from the original
        # code - confirm it is the intended constant.
        now = pd.Timestamp("now")
        baseline_features.loc[0, "age"] = (
            now - participant_info.loc[0, "date_of_birth"]
        ).days / 365.25245
    if "gender" in requested_features:
        baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
    if "startlanguage" in requested_features:
        baseline_features.loc[0, "startlanguage"] = participant_info.loc[
            0, "startlanguage"
        ]
    # The demand score is stored in the "limesurvey_demand" column, so asking
    # for it under that name has to trigger the computation too. The older
    # "demand" key is still accepted so existing configurations keep working.
    if (
        "demand" in requested_features
        or "limesurvey_demand" in requested_features
    ):
        # Transpose so that every questionnaire column becomes a row.
        participant_info_t = participant_info.T
        rows_baseline = participant_info_t.index
        # Find questions about demand, but disregard time (duration of filling in questionnaire)
        rows_demand = rows_baseline.str.startswith(
            JCQ_DEMAND
        ) & ~rows_baseline.str.endswith("Time")
        limesurvey_demand = (
            participant_info_t[rows_demand]
            .reset_index()
            .rename(columns={"index": "question", 0: "score_original"})
        )
        # Extract question IDs from names such as JobEisen[3].
        # expand=False yields a Series, which can be assigned as a column
        # directly instead of pushing a one-column DataFrame through .loc.
        limesurvey_demand["qid"] = (
            limesurvey_demand["question"]
            .str.extract(r"\[(\d+)\]", expand=False)
            .astype(int)
        )
        limesurvey_demand["score"] = limesurvey_demand["score_original"]
        # Identify rows that include questions to be reversed.
        rows_demand_reverse = limesurvey_demand["qid"].isin(
            list(dict_JCQ_demand_control_reverse[JCQ_DEMAND])
        )
        # Reverse the score, so that the maximum value becomes the minimum etc.
        limesurvey_demand.loc[rows_demand_reverse, "score"] = (
            LIMESURVEY_JCQ_MAX
            + LIMESURVEY_JCQ_MIN
            - limesurvey_demand.loc[rows_demand_reverse, "score_original"]
        )
        # TODO Write to data/interim
        baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
            "score"
        ].sum()


# Write the feature table (header only if no participant data was available)
# to the first output file declared by the Snakemake rule.
output_path = snakemake.output[0]
baseline_features.to_csv(output_path, encoding="utf-8", index=False)