Split baseline data by participant.

Also adjust some CSV I/O settings.
labels
junos 2022-02-04 18:37:57 +01:00
parent 16e608db74
commit bf9c764c97
4 changed files with 31 additions and 4 deletions

View File

@ -405,7 +405,7 @@ for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
# Demographic features # Demographic features
files_to_compute.extend(expand("data/raw/baseline_merged.csv")) files_to_compute.extend(expand("data/raw/baseline_merged.csv"))
#files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
rule all: rule all:
input: input:

View File

@ -6,3 +6,11 @@ rule merge_baseline_data:
script: script:
"../src/data/merge_baseline_data.py" "../src/data/merge_baseline_data.py"
# Produce one baseline CSV per participant: takes the participant's YAML file
# and the merged baseline table as inputs and runs download_baseline_data.py.
# The {pid} wildcard links the participant file to the output path.
rule download_baseline_data:
input:
participant_file = "data/external/participant_files/{pid}.yaml",
data = "data/raw/baseline_merged.csv"
output:
"data/raw/{pid}/participant_baseline_raw.csv"
script:
"../src/data/download_baseline_data.py"

View File

@ -0,0 +1,14 @@
import pandas as pd
import yaml
# Write one participant's rows of the merged baseline table to their own CSV.
# `snakemake` is not defined in this file; it is expected to be supplied by
# the workflow runner when this file is executed as a rule's script.
merged_baseline = pd.read_csv(snakemake.input["data"])

with open(snakemake.input["participant_file"], "r") as participant_handle:
    participant_config = yaml.safe_load(participant_handle)

# The participant's phone label is compared against the "username" column.
participant_label = participant_config["PHONE"]["LABEL"]

is_this_participant = merged_baseline["username"] == participant_label
participant_rows = merged_baseline[is_this_participant]
participant_rows.to_csv(
    snakemake.output[0],
    index=False,
    encoding="utf-8",
)

View File

@ -11,7 +11,11 @@ filenames = snakemake.input["data"]
baseline_dfs = [] baseline_dfs = []
for fn in filenames: for fn in filenames:
baseline_dfs.append(pd.read_csv(fn)) baseline_dfs.append(pd.read_csv(fn,
parse_dates=["Geboortedatum"],
infer_datetime_format=True,
cache_dates=True,
))
baseline = ( baseline = (
pd.concat(baseline_dfs, join="inner") pd.concat(baseline_dfs, join="inner")
@ -22,8 +26,9 @@ baseline = (
baseline.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True) baseline.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)
now = pd.Timestamp("now") now = pd.Timestamp("now")
baseline = baseline.assign( baseline = baseline.assign(
date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
age=lambda x: (now - x.date_of_birth).dt.days / 365.25245, age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
) )
baseline.to_csv(snakemake.output[0]) baseline.to_csv(snakemake.output[0],
index=False,
encoding="utf-8",)