From 9a74e74d084e3bc49316414ee63c4f2bb77d2542 Mon Sep 17 00:00:00 2001 From: junos Date: Wed, 23 Feb 2022 18:15:26 +0100 Subject: [PATCH 1/5] Add the baseline features rule to snakefile. Correct age calculation for a single value instead of dataframe. --- Snakefile | 3 ++- src/data/baseline_features.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 01af47dd..9e0efdcc 100644 --- a/Snakefile +++ b/Snakefile @@ -403,9 +403,10 @@ for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys(): if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv")) -# Demographic features +# Baseline features files_to_compute.extend(expand("data/raw/baseline_merged.csv")) files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"])) +files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"])) rule all: input: diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py index 60671911..599dab4c 100644 --- a/src/data/baseline_features.py +++ b/src/data/baseline_features.py @@ -10,7 +10,7 @@ if not participant_info.empty: now = pd.Timestamp("now") baseline_features.loc[0, "age"] = ( now - participant_info.loc[0, "date_of_birth"] - ).dt.days / 365.25245 + ).days / 365.25245 if "gender" in requested_features: baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"] if "startlanguage" in requested_features: From 30ac8b1cd5da922336774fb682a7ed4ffe4c12a7 Mon Sep 17 00:00:00 2001 From: junos Date: Wed, 23 Feb 2022 19:08:10 +0100 Subject: [PATCH 2/5] Start calculating demand control features. 
--- config.yaml | 1 + rules/models.smk | 3 ++- src/data/baseline_features.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/config.yaml b/config.yaml index f14e3bfd..6a267081 100644 --- a/config.yaml +++ b/config.yaml @@ -634,5 +634,6 @@ PARAMS_FOR_ANALYSIS: results-survey358134_final.csv, # Belgium 1 results-survey413767_final.csv # Belgium 2 ] + QUESTION_LIST: survey637813+question_text.csv FEATURES: [age, gender, startlanguage] CATEGORICAL_FEATURES: [gender] diff --git a/rules/models.smk b/rules/models.smk index 94df273a..92b4a935 100644 --- a/rules/models.smk +++ b/rules/models.smk @@ -20,7 +20,8 @@ rule baseline_features: "data/raw/{pid}/participant_baseline_raw.csv" params: pid="{pid}", - features=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"] + features=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"], + question_filename=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["QUESTION_LIST"] output: "data/processed/features/{pid}/baseline_features.csv" script: diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py index 599dab4c..4978e122 100644 --- a/src/data/baseline_features.py +++ b/src/data/baseline_features.py @@ -3,6 +3,13 @@ import pandas as pd pid = snakemake.params["pid"] requested_features = snakemake.params["features"] baseline_features = pd.DataFrame(columns=requested_features) +question_filename = snakemake.params["question_filename"] + +dict_JCQ_demand_control_reverse = { + "demand_0": " [Od mene se ne zahteva,", + "demand_1": " [Imam dovolj časa, da končam", + "demand_2": " [Pri svojem delu se ne srečujem s konfliktnimi" +} participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"]) if not participant_info.empty: @@ -17,6 +24,14 @@ if not participant_info.empty: baseline_features.loc[0, "startlanguage"] = participant_info.loc[ 0, "startlanguage" ] + if "demand" in requested_features: + limesurvey_questions 
= pd.read_csv(question_filename, header=None).T + limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(r"\.\s", expand=True, n=1) + demand_reverse_lime_rows = limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_0"]) | \ + limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_1"]) | \ + limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_2"]) + demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows] + demand_reverse_lime.loc[:, "qid"] = demand_reverse_lime["code"].str.extract(r"\[(\d+)\]") baseline_features.to_csv( snakemake.output[0], index=False, encoding="utf-8", From 2fed9626444aad9220415126fda32d6d1ca0f59b Mon Sep 17 00:00:00 2001 From: junos Date: Mon, 28 Feb 2022 18:30:41 +0100 Subject: [PATCH 3/5] Calculate JCQ demand score. Hardcode question IDs to be reversed. --- src/data/baseline_features.py | 57 +++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py index 4978e122..b9a601c1 100644 --- a/src/data/baseline_features.py +++ b/src/data/baseline_features.py @@ -5,13 +5,26 @@ requested_features = snakemake.params["features"] baseline_features = pd.DataFrame(columns=requested_features) question_filename = snakemake.params["question_filename"] +JCQ_DEMAND = "JobEisen" +JCQ_CONTROL = "JobControle" + dict_JCQ_demand_control_reverse = { - "demand_0": " [Od mene se ne zahteva,", - "demand_1": " [Imam dovolj časa, da končam", - "demand_2": " [Pri svojem delu se ne srečujem s konfliktnimi" + JCQ_DEMAND: { + 3: " [Od mene se ne zahteva,", + 4: " [Imam dovolj časa, da končam", + 5: " [Pri svojem delu se ne srečujem s konfliktnimi", + }, + JCQ_CONTROL: { + 2: " |Moje delo vključuje veliko ponavljajočega", + 6: " [Pri svojem delu imam zelo malo svobode", + }, } +LIMESURVEY_JCQ_MIN = 1 +LIMESURVEY_JCQ_MAX = 4 + participant_info = 
pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"]) + if not participant_info.empty: if "age" in requested_features: now = pd.Timestamp("now") @@ -25,13 +38,37 @@ if not participant_info.empty: 0, "startlanguage" ] if "demand" in requested_features: - limesurvey_questions = pd.read_csv(question_filename, header=None).T - limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(r"\.\s", expand=True, n=1) - demand_reverse_lime_rows = limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_0"]) | \ - limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_1"]) | \ - limesurvey_questions["text"].str.startswith(dict_JCQ_demand_control_reverse["demand_2"]) - demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows] - demand_reverse_lime.loc[:, "qid"] = demand_reverse_lime["code"].str.extract(r"\[(\d+)\]") + participant_info_t = participant_info.T + rows_baseline = participant_info_t.index + # Find questions about demand, but disregard time (duration of filling in questionnaire) + rows_demand = rows_baseline.str.startswith( + JCQ_DEMAND + ) & ~rows_baseline.str.endswith("Time") + limesurvey_control = ( + participant_info_t[rows_demand] + .reset_index() + .rename(columns={"index": "question", 0: "score_original"}) + ) + # Extract question IDs from names such as JobEisen[3] + limesurvey_control.loc[:, "qid"] = ( + limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int) + ) + limesurvey_control["score"] = limesurvey_control["score_original"] + # Identify rows that include questions to be reversed. + rows_demand_reverse = limesurvey_control["qid"].isin( + dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys() + ) + # Reverse the score, so that the maximum value becomes the minimum etc. 
+ limesurvey_control.loc[rows_demand_reverse, "score"] = ( + LIMESURVEY_JCQ_MAX + + LIMESURVEY_JCQ_MIN + - limesurvey_control.loc[rows_demand_reverse, "score_original"] + ) + # TODO Write to data/interim + baseline_features.loc[0, "limesurvey_demand"] = limesurvey_control[ + "score" + ].sum() + baseline_features.to_csv( snakemake.output[0], index=False, encoding="utf-8", From b5a6317f4b0d09fdf62151212bdfc294222233b2 Mon Sep 17 00:00:00 2001 From: junos Date: Mon, 28 Feb 2022 18:51:47 +0100 Subject: [PATCH 4/5] Calculate JCQ control and demand control ratio. Include norms and corresponding quartile. --- src/data/baseline_features.py | 158 +++++++++++++++++++++++++++------- 1 file changed, 129 insertions(+), 29 deletions(-) diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py index b9a601c1..116607c8 100644 --- a/src/data/baseline_features.py +++ b/src/data/baseline_features.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd pid = snakemake.params["pid"] @@ -23,6 +24,26 @@ dict_JCQ_demand_control_reverse = { LIMESURVEY_JCQ_MIN = 1 LIMESURVEY_JCQ_MAX = 4 +DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4) +DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9 + +JCQ_NORMS = { + "F": { + 0: DEMAND_CONTROL_RATIO_MIN, + 1: 0.45, + 2: 0.52, + 3: 0.62, + 4: DEMAND_CONTROL_RATIO_MAX, + }, + "M": { + 0: DEMAND_CONTROL_RATIO_MIN, + 1: 0.41, + 2: 0.48, + 3: 0.56, + 4: DEMAND_CONTROL_RATIO_MAX, + }, +} + participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"]) if not participant_info.empty: @@ -37,38 +58,117 @@ if not participant_info.empty: baseline_features.loc[0, "startlanguage"] = participant_info.loc[ 0, "startlanguage" ] - if "demand" in requested_features: + if ( + ("demand" in requested_features) + or ("control" in requested_features) + or ("demand_control_ratio" in requested_features) + ): participant_info_t = participant_info.T rows_baseline = participant_info_t.index - # Find questions about demand, but disregard time (duration of filling 
in questionnaire) - rows_demand = rows_baseline.str.startswith( - JCQ_DEMAND - ) & ~rows_baseline.str.endswith("Time") - limesurvey_control = ( - participant_info_t[rows_demand] - .reset_index() - .rename(columns={"index": "question", 0: "score_original"}) - ) - # Extract question IDs from names such as JobEisen[3] - limesurvey_control.loc[:, "qid"] = ( - limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int) - ) - limesurvey_control["score"] = limesurvey_control["score_original"] - # Identify rows that include questions to be reversed. - rows_demand_reverse = limesurvey_control["qid"].isin( - dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys() - ) - # Reverse the score, so that the maximum value becomes the minimum etc. - limesurvey_control.loc[rows_demand_reverse, "score"] = ( - LIMESURVEY_JCQ_MAX - + LIMESURVEY_JCQ_MIN - - limesurvey_control.loc[rows_demand_reverse, "score_original"] - ) - # TODO Write to data/interim - baseline_features.loc[0, "limesurvey_demand"] = limesurvey_control[ - "score" - ].sum() + if ("demand" in requested_features) or ( + "demand_control_ratio" in requested_features + ): + # Find questions about demand, but disregard time (duration of filling in questionnaire) + rows_demand = rows_baseline.str.startswith( + JCQ_DEMAND + ) & ~rows_baseline.str.endswith("Time") + limesurvey_demand = ( + participant_info_t[rows_demand] + .reset_index() + .rename(columns={"index": "question", 0: "score_original"}) + ) + # Extract question IDs from names such as JobEisen[3] + limesurvey_demand.loc[:, "qid"] = ( + limesurvey_demand["question"].str.extract(r"\[(\d+)\]").astype(int) + ) + limesurvey_demand["score"] = limesurvey_demand["score_original"] + # Identify rows that include questions to be reversed. + rows_demand_reverse = limesurvey_demand["qid"].isin( + dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys() + ) + # Reverse the score, so that the maximum value becomes the minimum etc. 
+ limesurvey_demand.loc[rows_demand_reverse, "score"] = ( + LIMESURVEY_JCQ_MAX + + LIMESURVEY_JCQ_MIN + - limesurvey_demand.loc[rows_demand_reverse, "score_original"] + ) + # TODO Write to data/interim + if "demand" in requested_features: + baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[ + "score" + ].sum() + + if ("control" in requested_features) or ( + "demand_control_ratio" in requested_features + ): + # Find questions about control, but disregard time (duration of filling in questionnaire) + rows_control = rows_baseline.str.startswith( + JCQ_CONTROL + ) & ~rows_baseline.str.endswith("Time") + limesurvey_control = ( + participant_info_t[rows_control] + .reset_index() + .rename(columns={"index": "question", 0: "score_original"}) + ) + # Extract question IDs from names such as JobControle[3] + limesurvey_control.loc[:, "qid"] = ( + limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int) + ) + limesurvey_control["score"] = limesurvey_control["score_original"] + # Identify rows that include questions to be reversed. + rows_control_reverse = limesurvey_control["qid"].isin( + dict_JCQ_demand_control_reverse[JCQ_CONTROL].keys() + ) + # Reverse the score, so that the maximum value becomes the minimum etc. 
+ limesurvey_control.loc[rows_control_reverse, "score"] = ( + LIMESURVEY_JCQ_MAX + + LIMESURVEY_JCQ_MIN + - limesurvey_control.loc[rows_control_reverse, "score_original"] + ) + # TODO Write to data/interim + if "control" in requested_features: + baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[ + "score" + ].sum() + + if "demand_control_ratio" in requested_features: + limesurvey_demand_control_ratio = ( + limesurvey_demand["score"].sum() / limesurvey_control["score"].sum() + ) + if ( + JCQ_NORMS[participant_info.loc[0, "gender"]][0] + <= limesurvey_demand_control_ratio + < JCQ_NORMS[participant_info.loc[0, "gender"]][1] + ): + limesurvey_quartile = 1 + elif ( + JCQ_NORMS[participant_info.loc[0, "gender"]][1] + <= limesurvey_demand_control_ratio + < JCQ_NORMS[participant_info.loc[0, "gender"]][2] + ): + limesurvey_quartile = 2 + elif ( + JCQ_NORMS[participant_info.loc[0, "gender"]][2] + <= limesurvey_demand_control_ratio + < JCQ_NORMS[participant_info.loc[0, "gender"]][3] + ): + limesurvey_quartile = 3 + elif ( + JCQ_NORMS[participant_info.loc[0, "gender"]][3] + <= limesurvey_demand_control_ratio + < JCQ_NORMS[participant_info.loc[0, "gender"]][4] + ): + limesurvey_quartile = 4 + else: + limesurvey_quartile = np.nan + + baseline_features.loc[ + 0, "limesurvey_demand_control_ratio" + ] = limesurvey_demand_control_ratio + baseline_features.loc[ + 0, "limesurvey_demand_control_ratio_quartile" + ] = limesurvey_quartile baseline_features.to_csv( snakemake.output[0], index=False, encoding="utf-8", From f13a91044d884a690d162ce5828fa8dfcfdfeb2c Mon Sep 17 00:00:00 2001 From: junos Date: Tue, 1 Mar 2022 11:39:58 +0100 Subject: [PATCH 5/5] Write questionnaire data to data/interim. 
--- rules/models.smk | 3 ++- src/data/baseline_features.py | 12 +++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/rules/models.smk b/rules/models.smk index 92b4a935..6d4b0bb8 100644 --- a/rules/models.smk +++ b/rules/models.smk @@ -23,6 +23,7 @@ rule baseline_features: features=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"], question_filename=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["QUESTION_LIST"] output: - "data/processed/features/{pid}/baseline_features.csv" + interim="data/interim/{pid}/baseline_questionnaires.csv", + features="data/processed/features/{pid}/baseline_features.csv" script: "../src/data/baseline_features.py" diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py index 116607c8..61ea6eb7 100644 --- a/src/data/baseline_features.py +++ b/src/data/baseline_features.py @@ -3,6 +3,7 @@ import pandas as pd pid = snakemake.params["pid"] requested_features = snakemake.params["features"] +baseline_interim = pd.DataFrame(columns=["qid", "question", "score_original", "score"]) baseline_features = pd.DataFrame(columns=requested_features) question_filename = snakemake.params["question_filename"] @@ -93,7 +94,7 @@ if not participant_info.empty: + LIMESURVEY_JCQ_MIN - limesurvey_demand.loc[rows_demand_reverse, "score_original"] ) - # TODO Write to data/interim + baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True) if "demand" in requested_features: baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[ "score" @@ -126,7 +127,7 @@ if not participant_info.empty: + LIMESURVEY_JCQ_MIN - limesurvey_control.loc[rows_control_reverse, "score_original"] ) - # TODO Write to data/interim + baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True) if "control" in requested_features: baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[ "score" @@ -170,6 +171,7 @@ if not participant_info.empty: 
0, "limesurvey_demand_control_ratio_quartile" ] = limesurvey_quartile -baseline_features.to_csv( - snakemake.output[0], index=False, encoding="utf-8", -) +if not baseline_interim.empty: + baseline_interim.to_csv(snakemake.output["interim"], index=False, encoding="utf-8") + +baseline_features.to_csv(snakemake.output["features"], index=False, encoding="utf-8")