"""Derive baseline features for one participant from a LimeSurvey export.

The script is meant to be run by Snakemake, which injects the ``snakemake``
object. It reads the participant's baseline CSV (``snakemake.input[0]``),
computes the features listed in ``snakemake.params["features"]`` and writes

* ``snakemake.output["interim"]``: per-question scores (original and
  reversed) for the JCQ demand/control items, only when non-empty;
* ``snakemake.output["features"]``: a single-row table of features.
"""

import numpy as np
import pandas as pd

# Column-name prefixes of the JCQ demand and control items in the export.
JCQ_DEMAND = "JobEisen"
JCQ_CONTROL = "JobControle"

# Question IDs (per scale) whose score has to be reversed. Only the keys are
# used by the code; the values are the beginnings of the question texts.
dict_JCQ_demand_control_reverse = {
    JCQ_DEMAND: {
        3: " [Od mene se ne zahteva,",
        4: " [Imam dovolj časa, da končam",
        5: " [Pri svojem delu se ne srečujem s konfliktnimi",
    },
    JCQ_CONTROL: {
        2: " |Moje delo vključuje veliko ponavljajočega",
        6: " [Pri svojem delu imam zelo malo svobode",
    },
}

# Range of a single item score; used to mirror reversed items.
LIMESURVEY_JCQ_MIN = 1
LIMESURVEY_JCQ_MAX = 4

# Outer bounds of the demand/control ratio.
# NOTE(review): presumably 5 demand items and 9 control items, each scored
# 1-4 - confirm against the questionnaire.
DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9

# Gender-specific quartile edges for the demand/control ratio: quartile q
# spans [edge[q - 1], edge[q]).
JCQ_NORMS = {
    "F": {
        0: DEMAND_CONTROL_RATIO_MIN,
        1: 0.45,
        2: 0.52,
        3: 0.62,
        4: DEMAND_CONTROL_RATIO_MAX,
    },
    "M": {
        0: DEMAND_CONTROL_RATIO_MIN,
        1: 0.41,
        2: 0.48,
        3: 0.56,
        4: DEMAND_CONTROL_RATIO_MAX,
    },
}

INTERIM_COLUMNS = ["qid", "question", "score_original", "score"]


def score_jcq_scale(participant_info_t: pd.DataFrame, scale_prefix: str) -> pd.DataFrame:
    """Return per-question scores for one JCQ scale.

    Args:
        participant_info_t: The transposed participant table, i.e. one row
            per original column, with the participant's values in column 0.
        scale_prefix: ``JCQ_DEMAND`` or ``JCQ_CONTROL``; also the key into
            ``dict_JCQ_demand_control_reverse``.

    Returns:
        A frame with the columns ``question``, ``score_original``, ``qid``
        and ``score``, where ``score`` is mirrored for reversed items.

    Raises:
        ValueError: If a selected column name carries no ``[<number>]`` part
            (the missing ID cannot be converted to ``int``).
    """
    rows_baseline = participant_info_t.index
    # Select the scale's questions, but disregard the "...Time" column
    # (duration of filling in the questionnaire).
    rows_scale = rows_baseline.str.startswith(
        scale_prefix
    ) & ~rows_baseline.str.endswith("Time")
    scale = (
        participant_info_t[rows_scale]
        .reset_index()
        .rename(columns={"index": "question", 0: "score_original"})
    )
    # Extract question IDs from names such as JobEisen[3].
    scale["qid"] = (
        scale["question"].str.extract(r"\[(\d+)\]", expand=False).astype(int)
    )
    scale["score"] = scale["score_original"]
    rows_reverse = scale["qid"].isin(
        list(dict_JCQ_demand_control_reverse[scale_prefix])
    )
    # Reverse the score, so that the maximum value becomes the minimum etc.
    scale.loc[rows_reverse, "score"] = (
        LIMESURVEY_JCQ_MAX
        + LIMESURVEY_JCQ_MIN
        - scale.loc[rows_reverse, "score_original"]
    )
    return scale


def demand_control_quartile(ratio, gender):
    """Classify a demand/control ratio into a quartile (1-4) using JCQ_NORMS.

    Args:
        ratio: The demand/control ratio.
        gender: Key into ``JCQ_NORMS`` (``"F"`` or ``"M"``).

    Returns:
        The quartile number, or ``np.nan`` when the ratio lies outside the
        norm range or no norms exist for ``gender``.
    """
    norms = JCQ_NORMS.get(gender)
    if norms is None:
        # No norms for this gender value: the quartile cannot be determined.
        return np.nan
    for quartile in range(1, 5):
        if norms[quartile - 1] <= ratio < norms[quartile]:
            return quartile
    # The bins are half-open, so the upper bound itself would fall through;
    # it belongs to the top quartile.
    if ratio == norms[4]:
        return 4
    return np.nan


def compute_baseline_features(participant_info: pd.DataFrame, requested_features):
    """Compute the interim and feature tables for one participant.

    Args:
        participant_info: The baseline table; only the row labelled 0 is
            used. ``date_of_birth`` must already be parsed as a timestamp
            when ``"age"`` is requested.
        requested_features: Names of the features to compute.

    Returns:
        A tuple ``(baseline_interim, baseline_features)``. Both are empty
        (columns only) when ``participant_info`` is empty.
    """
    baseline_interim = pd.DataFrame(columns=INTERIM_COLUMNS)
    baseline_features = pd.DataFrame(columns=requested_features)
    if participant_info.empty:
        return baseline_interim, baseline_features

    if "age" in requested_features:
        now = pd.Timestamp("now")
        baseline_features.loc[0, "age"] = (
            now - participant_info.loc[0, "date_of_birth"]
        ).days / 365.25245
    if "gender" in requested_features:
        baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
    if "startlanguage" in requested_features:
        baseline_features.loc[0, "startlanguage"] = participant_info.loc[
            0, "startlanguage"
        ]

    want_demand = "limesurvey_demand" in requested_features
    want_control = "limesurvey_control" in requested_features
    want_ratio = "limesurvey_demand_control_ratio" in requested_features
    if not (want_demand or want_control or want_ratio):
        return baseline_interim, baseline_features

    participant_info_t = participant_info.T

    # The ratio needs both scales, so each one is scored when either it or
    # the ratio is requested.
    if want_demand or want_ratio:
        limesurvey_demand = score_jcq_scale(participant_info_t, JCQ_DEMAND)
        baseline_interim = pd.concat(
            [baseline_interim, limesurvey_demand], axis=0, ignore_index=True
        )
        # Previously tested for "demand", which never matched the feature
        # name used everywhere else, so the sum was silently left out.
        if want_demand:
            baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
                "score"
            ].sum()

    if want_control or want_ratio:
        limesurvey_control = score_jcq_scale(participant_info_t, JCQ_CONTROL)
        baseline_interim = pd.concat(
            [baseline_interim, limesurvey_control], axis=0, ignore_index=True
        )
        if want_control:
            baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
                "score"
            ].sum()

    if want_ratio:
        limesurvey_demand_control_ratio = (
            limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
        )
        limesurvey_quartile = demand_control_quartile(
            limesurvey_demand_control_ratio, participant_info.loc[0, "gender"]
        )
        baseline_features.loc[
            0, "limesurvey_demand_control_ratio"
        ] = limesurvey_demand_control_ratio
        baseline_features.loc[
            0, "limesurvey_demand_control_ratio_quartile"
        ] = limesurvey_quartile

    return baseline_interim, baseline_features


def main():
    """Read the Snakemake inputs, compute the features and write the outputs."""
    # Looked up but not used below; kept so that the set of params this
    # script requires from the rule stays the same.
    pid = snakemake.params["pid"]  # noqa: F841
    question_filename = snakemake.params["question_filename"]  # noqa: F841
    requested_features = snakemake.params["features"]

    participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
    baseline_interim, baseline_features = compute_baseline_features(
        participant_info, requested_features
    )

    if not baseline_interim.empty:
        baseline_interim.to_csv(
            snakemake.output["interim"], index=False, encoding="utf-8"
        )
    # NOTE(review): assumed to be written unconditionally - confirm against
    # the original (un-collapsed) layout of this script.
    baseline_features.to_csv(snakemake.output["features"], index=False, encoding="utf-8")


# The script is executed as the main module by Snakemake; the guard only
# keeps the I/O from running when the helpers are imported.
if __name__ == "__main__":
    main()