Merge branch 'master' into run_test_participant
commit
70e077f6ab
|
@ -403,9 +403,10 @@ for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
|
||||||
if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
|
if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||||
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
|
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
|
||||||
|
|
||||||
# Demographic features
|
# Baseline features
|
||||||
files_to_compute.extend(expand("data/raw/baseline_merged.csv"))
|
files_to_compute.extend(expand("data/raw/baseline_merged.csv"))
|
||||||
files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
|
||||||
|
files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]))
|
||||||
|
|
||||||
rule all:
|
rule all:
|
||||||
input:
|
input:
|
||||||
|
|
|
@ -634,5 +634,6 @@ PARAMS_FOR_ANALYSIS:
|
||||||
results-survey358134_final.csv, # Belgium 1
|
results-survey358134_final.csv, # Belgium 1
|
||||||
results-survey413767_final.csv # Belgium 2
|
results-survey413767_final.csv # Belgium 2
|
||||||
]
|
]
|
||||||
|
QUESTION_LIST: survey637813+question_text.csv
|
||||||
FEATURES: [age, gender, startlanguage]
|
FEATURES: [age, gender, startlanguage]
|
||||||
CATEGORICAL_FEATURES: [gender]
|
CATEGORICAL_FEATURES: [gender]
|
||||||
|
|
|
@ -20,8 +20,10 @@ rule baseline_features:
|
||||||
"data/raw/{pid}/participant_baseline_raw.csv"
|
"data/raw/{pid}/participant_baseline_raw.csv"
|
||||||
params:
|
params:
|
||||||
pid="{pid}",
|
pid="{pid}",
|
||||||
features=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"]
|
features=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"],
|
||||||
|
question_filename=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["QUESTION_LIST"]
|
||||||
output:
|
output:
|
||||||
"data/processed/features/{pid}/baseline_features.csv"
|
interim="data/interim/{pid}/baseline_questionnaires.csv",
|
||||||
|
features="data/processed/features/{pid}/baseline_features.csv"
|
||||||
script:
|
script:
|
||||||
"../src/data/baseline_features.py"
|
"../src/data/baseline_features.py"
|
||||||
|
|
|
@ -1,23 +1,177 @@
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
pid = snakemake.params["pid"]
|
pid = snakemake.params["pid"]
|
||||||
requested_features = snakemake.params["features"]
|
requested_features = snakemake.params["features"]
|
||||||
|
baseline_interim = pd.DataFrame(columns=["qid", "question", "score_original", "score"])
|
||||||
baseline_features = pd.DataFrame(columns=requested_features)
|
baseline_features = pd.DataFrame(columns=requested_features)
|
||||||
|
question_filename = snakemake.params["question_filename"]
|
||||||
|
|
||||||
|
JCQ_DEMAND = "JobEisen"
|
||||||
|
JCQ_CONTROL = "JobControle"
|
||||||
|
|
||||||
|
dict_JCQ_demand_control_reverse = {
|
||||||
|
JCQ_DEMAND: {
|
||||||
|
3: " [Od mene se ne zahteva,",
|
||||||
|
4: " [Imam dovolj časa, da končam",
|
||||||
|
5: " [Pri svojem delu se ne srečujem s konfliktnimi",
|
||||||
|
},
|
||||||
|
JCQ_CONTROL: {
|
||||||
|
2: " |Moje delo vključuje veliko ponavljajočega",
|
||||||
|
6: " [Pri svojem delu imam zelo malo svobode",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
LIMESURVEY_JCQ_MIN = 1
|
||||||
|
LIMESURVEY_JCQ_MAX = 4
|
||||||
|
|
||||||
|
DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
|
||||||
|
DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
|
||||||
|
|
||||||
|
JCQ_NORMS = {
|
||||||
|
"F": {
|
||||||
|
0: DEMAND_CONTROL_RATIO_MIN,
|
||||||
|
1: 0.45,
|
||||||
|
2: 0.52,
|
||||||
|
3: 0.62,
|
||||||
|
4: DEMAND_CONTROL_RATIO_MAX,
|
||||||
|
},
|
||||||
|
"M": {
|
||||||
|
0: DEMAND_CONTROL_RATIO_MIN,
|
||||||
|
1: 0.41,
|
||||||
|
2: 0.48,
|
||||||
|
3: 0.56,
|
||||||
|
4: DEMAND_CONTROL_RATIO_MAX,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
|
participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
|
||||||
|
|
||||||
if not participant_info.empty:
|
if not participant_info.empty:
|
||||||
if "age" in requested_features:
|
if "age" in requested_features:
|
||||||
now = pd.Timestamp("now")
|
now = pd.Timestamp("now")
|
||||||
baseline_features.loc[0, "age"] = (
|
baseline_features.loc[0, "age"] = (
|
||||||
now - participant_info.loc[0, "date_of_birth"]
|
now - participant_info.loc[0, "date_of_birth"]
|
||||||
).dt.days / 365.25245
|
).days / 365.25245
|
||||||
if "gender" in requested_features:
|
if "gender" in requested_features:
|
||||||
baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
|
baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
|
||||||
if "startlanguage" in requested_features:
|
if "startlanguage" in requested_features:
|
||||||
baseline_features.loc[0, "startlanguage"] = participant_info.loc[
|
baseline_features.loc[0, "startlanguage"] = participant_info.loc[
|
||||||
0, "startlanguage"
|
0, "startlanguage"
|
||||||
]
|
]
|
||||||
|
if (
|
||||||
|
("demand" in requested_features)
|
||||||
|
or ("control" in requested_features)
|
||||||
|
or ("demand_control_ratio" in requested_features)
|
||||||
|
):
|
||||||
|
participant_info_t = participant_info.T
|
||||||
|
rows_baseline = participant_info_t.index
|
||||||
|
|
||||||
baseline_features.to_csv(
|
if ("demand" in requested_features) or (
|
||||||
snakemake.output[0], index=False, encoding="utf-8",
|
"demand_control_ratio" in requested_features
|
||||||
)
|
):
|
||||||
|
# Find questions about demand, but disregard time (duration of filling in questionnaire)
|
||||||
|
rows_demand = rows_baseline.str.startswith(
|
||||||
|
JCQ_DEMAND
|
||||||
|
) & ~rows_baseline.str.endswith("Time")
|
||||||
|
limesurvey_demand = (
|
||||||
|
participant_info_t[rows_demand]
|
||||||
|
.reset_index()
|
||||||
|
.rename(columns={"index": "question", 0: "score_original"})
|
||||||
|
)
|
||||||
|
# Extract question IDs from names such as JobEisen[3]
|
||||||
|
limesurvey_demand.loc[:, "qid"] = (
|
||||||
|
limesurvey_demand["question"].str.extract(r"\[(\d+)\]").astype(int)
|
||||||
|
)
|
||||||
|
limesurvey_demand["score"] = limesurvey_demand["score_original"]
|
||||||
|
# Identify rows that include questions to be reversed.
|
||||||
|
rows_demand_reverse = limesurvey_demand["qid"].isin(
|
||||||
|
dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
|
||||||
|
)
|
||||||
|
# Reverse the score, so that the maximum value becomes the minimum etc.
|
||||||
|
limesurvey_demand.loc[rows_demand_reverse, "score"] = (
|
||||||
|
LIMESURVEY_JCQ_MAX
|
||||||
|
+ LIMESURVEY_JCQ_MIN
|
||||||
|
- limesurvey_demand.loc[rows_demand_reverse, "score_original"]
|
||||||
|
)
|
||||||
|
# pd.concat returns a new DataFrame and does not modify its inputs, so the
# result must be assigned back for the demand items to be kept in the interim table.
baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)
|
||||||
|
if "demand" in requested_features:
|
||||||
|
baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
|
||||||
|
"score"
|
||||||
|
].sum()
|
||||||
|
|
||||||
|
if ("control" in requested_features) or (
|
||||||
|
"demand_control_ratio" in requested_features
|
||||||
|
):
|
||||||
|
# Find questions about control, but disregard time (duration of filling in questionnaire)
|
||||||
|
rows_control = rows_baseline.str.startswith(
|
||||||
|
JCQ_CONTROL
|
||||||
|
) & ~rows_baseline.str.endswith("Time")
|
||||||
|
limesurvey_control = (
|
||||||
|
participant_info_t[rows_control]
|
||||||
|
.reset_index()
|
||||||
|
.rename(columns={"index": "question", 0: "score_original"})
|
||||||
|
)
|
||||||
|
# Extract question IDs from names such as JobControle[3]
|
||||||
|
limesurvey_control.loc[:, "qid"] = (
|
||||||
|
limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
|
||||||
|
)
|
||||||
|
limesurvey_control["score"] = limesurvey_control["score_original"]
|
||||||
|
# Identify rows that include questions to be reversed.
|
||||||
|
rows_control_reverse = limesurvey_control["qid"].isin(
|
||||||
|
dict_JCQ_demand_control_reverse[JCQ_CONTROL].keys()
|
||||||
|
)
|
||||||
|
# Reverse the score, so that the maximum value becomes the minimum etc.
|
||||||
|
limesurvey_control.loc[rows_control_reverse, "score"] = (
|
||||||
|
LIMESURVEY_JCQ_MAX
|
||||||
|
+ LIMESURVEY_JCQ_MIN
|
||||||
|
- limesurvey_control.loc[rows_control_reverse, "score_original"]
|
||||||
|
)
|
||||||
|
# pd.concat returns a new DataFrame and does not modify its inputs, so the
# result must be assigned back for the control items to be kept in the interim table.
baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True)
|
||||||
|
if "control" in requested_features:
|
||||||
|
baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
|
||||||
|
"score"
|
||||||
|
].sum()
|
||||||
|
|
||||||
|
if "demand_control_ratio" in requested_features:
|
||||||
|
limesurvey_demand_control_ratio = (
|
||||||
|
limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
JCQ_NORMS[participant_info.loc[0, "gender"]][0]
|
||||||
|
<= limesurvey_demand_control_ratio
|
||||||
|
< JCQ_NORMS[participant_info.loc[0, "gender"]][1]
|
||||||
|
):
|
||||||
|
limesurvey_quartile = 1
|
||||||
|
elif (
|
||||||
|
JCQ_NORMS[participant_info.loc[0, "gender"]][1]
|
||||||
|
<= limesurvey_demand_control_ratio
|
||||||
|
< JCQ_NORMS[participant_info.loc[0, "gender"]][2]
|
||||||
|
):
|
||||||
|
limesurvey_quartile = 2
|
||||||
|
elif (
|
||||||
|
JCQ_NORMS[participant_info.loc[0, "gender"]][2]
|
||||||
|
<= limesurvey_demand_control_ratio
|
||||||
|
< JCQ_NORMS[participant_info.loc[0, "gender"]][3]
|
||||||
|
):
|
||||||
|
limesurvey_quartile = 3
|
||||||
|
elif (
|
||||||
|
JCQ_NORMS[participant_info.loc[0, "gender"]][3]
|
||||||
|
<= limesurvey_demand_control_ratio
|
||||||
|
< JCQ_NORMS[participant_info.loc[0, "gender"]][4]
|
||||||
|
):
|
||||||
|
limesurvey_quartile = 4
|
||||||
|
else:
|
||||||
|
limesurvey_quartile = np.nan
|
||||||
|
|
||||||
|
baseline_features.loc[
|
||||||
|
0, "limesurvey_demand_control_ratio"
|
||||||
|
] = limesurvey_demand_control_ratio
|
||||||
|
baseline_features.loc[
|
||||||
|
0, "limesurvey_demand_control_ratio_quartile"
|
||||||
|
] = limesurvey_quartile
|
||||||
|
|
||||||
|
# Write the interim table unconditionally: the rule declares it as an output,
# so the file has to exist even when no questionnaire items were collected
# (an empty frame is written as a header-only CSV).
baseline_interim.to_csv(snakemake.output["interim"], index=False, encoding="utf-8")
|
||||||
|
|
||||||
|
baseline_features.to_csv(snakemake.output["features"], index=False, encoding="utf-8")
|
||||||
|
|
Loading…
Reference in New Issue