Write per-participant baseline questionnaire scores to data/interim.

The baseline_features rule gets a second, named output ("interim") next to
the existing feature file ("features"), and the script collects the
demand/control questionnaire rows into baseline_interim before writing it.

Review notes on this revision of the patch:
  * pd.concat returns a new DataFrame and does not modify its inputs, so
    its result is now assigned back to baseline_interim. Previously the
    result was discarded and baseline_interim stayed empty.
  * The interim CSV is now written unconditionally (header-only when no
    rows were collected), the same way baseline_features is. The rule
    declares it as an output, so skipping the write for participants
    without data would leave a declared output missing.
  * NOTE(review): the whitespace of the context lines was reconstructed
    from a copy of this patch that had lost its line breaks; confirm the
    indentation against the working tree (or apply with
    --ignore-whitespace). The "index" lines keep the blob ids of the
    earlier revision of this patch.

diff --git a/Snakefile b/Snakefile
index 9e0efdcc..4ea33827 100644
--- a/Snakefile
+++ b/Snakefile
@@ -406,6 +406,7 @@ for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
 # Baseline features
 files_to_compute.extend(expand("data/raw/baseline_merged.csv"))
 files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
+files_to_compute.extend(expand("data/interim/{pid}/baseline_questionnaires.csv", pid=config["PIDS"]))
 files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]))
 
 rule all:
diff --git a/config.yaml b/config.yaml
index 6a267081..4d7a699c 100644
--- a/config.yaml
+++ b/config.yaml
@@ -635,5 +635,5 @@ PARAMS_FOR_ANALYSIS:
       results-survey413767_final.csv # Belgium 2
     ]
     QUESTION_LIST: survey637813+question_text.csv
-    FEATURES: [age, gender, startlanguage]
+    FEATURES: [age, gender, startlanguage, demand, control, demand_control_ratio]
     CATEGORICAL_FEATURES: [gender]
diff --git a/rules/models.smk b/rules/models.smk
index 92b4a935..6d4b0bb8 100644
--- a/rules/models.smk
+++ b/rules/models.smk
@@ -23,6 +23,7 @@ rule baseline_features:
         features=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FEATURES"],
         question_filename=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FOLDER"] + "/" + config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["QUESTION_LIST"]
     output:
-        "data/processed/features/{pid}/baseline_features.csv"
+        interim="data/interim/{pid}/baseline_questionnaires.csv",
+        features="data/processed/features/{pid}/baseline_features.csv"
     script:
         "../src/data/baseline_features.py"
diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py
index 116607c8..61ea6eb7 100644
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@@ -3,6 +3,7 @@ import pandas as pd
 
 pid = snakemake.params["pid"]
 requested_features = snakemake.params["features"]
+baseline_interim = pd.DataFrame(columns=["qid", "question", "score_original", "score"])
 baseline_features = pd.DataFrame(columns=requested_features)
 question_filename = snakemake.params["question_filename"]
 
@@ -93,7 +94,7 @@ if not participant_info.empty:
         + LIMESURVEY_JCQ_MIN
         - limesurvey_demand.loc[rows_demand_reverse, "score_original"]
     )
-    # TODO Write to data/interim
+    baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)
     if "demand" in requested_features:
         baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
             "score"
@@ -126,7 +127,7 @@ if not participant_info.empty:
         + LIMESURVEY_JCQ_MIN
         - limesurvey_control.loc[rows_control_reverse, "score_original"]
     )
-    # TODO Write to data/interim
+    baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True)
     if "control" in requested_features:
         baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
             "score"
@@ -170,6 +171,6 @@ if not participant_info.empty:
             0, "limesurvey_demand_control_ratio_quartile"
         ] = limesurvey_quartile
 
-baseline_features.to_csv(
-    snakemake.output[0], index=False, encoding="utf-8",
-)
+baseline_interim.to_csv(snakemake.output["interim"], index=False, encoding="utf-8")
+
+baseline_features.to_csv(snakemake.output["features"], index=False, encoding="utf-8")