diff --git a/Snakefile b/Snakefile index ab4f1aa8..01af47dd 100644 --- a/Snakefile +++ b/Snakefile @@ -405,7 +405,7 @@ for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys(): # Demographic features files_to_compute.extend(expand("data/raw/baseline_merged.csv")) -#files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"])) +files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"])) rule all: input: diff --git a/rules/models.smk b/rules/models.smk index 18c6b777..de7381aa 100644 --- a/rules/models.smk +++ b/rules/models.smk @@ -6,3 +6,11 @@ rule merge_baseline_data: script: "../src/data/merge_baseline_data.py" +rule download_baseline_data: + input: + participant_file = "data/external/participant_files/{pid}.yaml", + data = "data/raw/baseline_merged.csv" + output: + "data/raw/{pid}/participant_baseline_raw.csv" + script: + "../src/data/download_baseline_data.py" diff --git a/src/data/download_baseline_data.py b/src/data/download_baseline_data.py new file mode 100644 index 00000000..3663a1fd --- /dev/null +++ b/src/data/download_baseline_data.py @@ -0,0 +1,14 @@ +import pandas as pd +import yaml + +filename = snakemake.input["data"] +baseline = pd.read_csv(filename) + +with open(snakemake.input["participant_file"], "r") as file: + participant = yaml.safe_load(file) + +username = participant["PHONE"]["LABEL"] + +baseline[baseline["username"] == username].to_csv(snakemake.output[0], + index=False, + encoding="utf-8",) diff --git a/src/data/merge_baseline_data.py b/src/data/merge_baseline_data.py index 8e5f2576..6eb91cd5 100644 --- a/src/data/merge_baseline_data.py +++ b/src/data/merge_baseline_data.py @@ -11,7 +11,11 @@ filenames = snakemake.input["data"] baseline_dfs = [] for fn in filenames: - baseline_dfs.append(pd.read_csv(fn)) + baseline_dfs.append(pd.read_csv(fn, + parse_dates=["Geboortedatum"], + infer_datetime_format=True, + cache_dates=True, + )) baseline = ( pd.concat(baseline_dfs, join="inner") @@ -22,8 +26,9 @@ baseline = ( baseline.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True) now = pd.Timestamp("now") baseline = baseline.assign( - date_of_birth=lambda x: pd.to_datetime(x.date_of_birth), age=lambda x: (now - x.date_of_birth).dt.days / 365.25245, ) -baseline.to_csv(snakemake.output[0]) +baseline.to_csv(snakemake.output[0], + index=False, + encoding="utf-8",)