rapids/src/data/baseline_features.py

183 lines
7.2 KiB
Python

import numpy as np
import pandas as pd
pid = snakemake.params["pid"]
requested_features = snakemake.params["features"]
baseline_interim = pd.DataFrame(columns=["qid", "question", "score_original", "score"])
baseline_features = pd.DataFrame(columns=requested_features)
question_filename = snakemake.params["question_filename"]
JCQ_DEMAND = "JobEisen"
JCQ_CONTROL = "JobControle"
dict_JCQ_demand_control_reverse = {
JCQ_DEMAND: {
3: " [Od mene se ne zahteva,",
4: " [Imam dovolj časa, da končam",
5: " [Pri svojem delu se ne srečujem s konfliktnimi",
},
JCQ_CONTROL: {
2: " |Moje delo vključuje veliko ponavljajočega",
6: " [Pri svojem delu imam zelo malo svobode",
},
}
LIMESURVEY_JCQ_MIN = 1
LIMESURVEY_JCQ_MAX = 4
DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
JCQ_NORMS = {
"F": {
0: DEMAND_CONTROL_RATIO_MIN,
1: 0.45,
2: 0.52,
3: 0.62,
4: DEMAND_CONTROL_RATIO_MAX,
},
"M": {
0: DEMAND_CONTROL_RATIO_MIN,
1: 0.41,
2: 0.48,
3: 0.56,
4: DEMAND_CONTROL_RATIO_MAX,
},
}
participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
if not participant_info.empty:
if "age" in requested_features:
now = pd.Timestamp("now")
baseline_features.loc[0, "age"] = (
now - participant_info.loc[0, "date_of_birth"]
).days / 365.25245
if "gender" in requested_features:
baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
if "startlanguage" in requested_features:
baseline_features.loc[0, "startlanguage"] = participant_info.loc[
0, "startlanguage"
]
if (
("limesurvey_demand" in requested_features)
or ("limesurvey_control" in requested_features)
or ("limesurvey_demand_control_ratio" in requested_features)
):
participant_info_t = participant_info.T
rows_baseline = participant_info_t.index
if ("limesurvey_demand" in requested_features) or (
"limesurvey_demand_control_ratio" in requested_features
):
# Find questions about demand, but disregard time (duration of filling in questionnaire)
rows_demand = rows_baseline.str.startswith(
JCQ_DEMAND
) & ~rows_baseline.str.endswith("Time")
limesurvey_demand = (
participant_info_t[rows_demand]
.reset_index()
.rename(columns={"index": "question", 0: "score_original"})
)
# Extract question IDs from names such as JobEisen[3]
limesurvey_demand["qid"] = (
limesurvey_demand["question"].str.extract(r"\[(\d+)\]").astype(int)
)
limesurvey_demand["score"] = limesurvey_demand["score_original"]
# Identify rows that include questions to be reversed.
rows_demand_reverse = limesurvey_demand["qid"].isin(
dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
)
# Reverse the score, so that the maximum value becomes the minimum etc.
limesurvey_demand.loc[rows_demand_reverse, "score"] = (
LIMESURVEY_JCQ_MAX
+ LIMESURVEY_JCQ_MIN
- limesurvey_demand.loc[rows_demand_reverse, "score_original"]
)
baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)
if "limesurvey_demand" in requested_features:
baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
"score"
].sum()
if ("limesurvey_control" in requested_features) or (
"limesurvey_demand_control_ratio" in requested_features
):
# Find questions about control, but disregard time (duration of filling in questionnaire)
rows_control = rows_baseline.str.startswith(
JCQ_CONTROL
) & ~rows_baseline.str.endswith("Time")
limesurvey_control = (
participant_info_t[rows_control]
.reset_index()
.rename(columns={"index": "question", 0: "score_original"})
)
# Extract question IDs from names such as JobControle[3]
limesurvey_control["qid"] = (
limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
)
limesurvey_control["score"] = limesurvey_control["score_original"]
# Identify rows that include questions to be reversed.
rows_control_reverse = limesurvey_control["qid"].isin(
dict_JCQ_demand_control_reverse[JCQ_CONTROL].keys()
)
# Reverse the score, so that the maximum value becomes the minimum etc.
limesurvey_control.loc[rows_control_reverse, "score"] = (
LIMESURVEY_JCQ_MAX
+ LIMESURVEY_JCQ_MIN
- limesurvey_control.loc[rows_control_reverse, "score_original"]
)
baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True)
if "limesurvey_control" in requested_features:
baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
"score"
].sum()
if "limesurvey_demand_control_ratio" in requested_features:
if limesurvey_control["score"].sum():
limesurvey_demand_control_ratio = (
limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
)
else:
limesurvey_demand_control_ratio = 0
if (
JCQ_NORMS[participant_info.loc[0, "gender"]][0]
<= limesurvey_demand_control_ratio
< JCQ_NORMS[participant_info.loc[0, "gender"]][1]
):
limesurvey_quartile = 1
elif (
JCQ_NORMS[participant_info.loc[0, "gender"]][1]
<= limesurvey_demand_control_ratio
< JCQ_NORMS[participant_info.loc[0, "gender"]][2]
):
limesurvey_quartile = 2
elif (
JCQ_NORMS[participant_info.loc[0, "gender"]][2]
<= limesurvey_demand_control_ratio
< JCQ_NORMS[participant_info.loc[0, "gender"]][3]
):
limesurvey_quartile = 3
elif (
JCQ_NORMS[participant_info.loc[0, "gender"]][3]
<= limesurvey_demand_control_ratio
< JCQ_NORMS[participant_info.loc[0, "gender"]][4]
):
limesurvey_quartile = 4
else:
limesurvey_quartile = np.nan
baseline_features.loc[
0, "limesurvey_demand_control_ratio"
] = limesurvey_demand_control_ratio
baseline_features.loc[
0, "limesurvey_demand_control_ratio_quartile"
] = limesurvey_quartile
if not baseline_interim.empty:
baseline_interim.to_csv(snakemake.output["interim"], index=False, encoding="utf-8")
baseline_features.to_csv(snakemake.output["features"], index=False, encoding="utf-8")