rapids/src/data/merge_baseline_data.py

35 lines
890 B
Python

import pandas as pd
# Merge several baseline CSV exports into a single table, translate the Dutch
# column headers to English, derive each participant's age, and write the
# result to the rule's first output file.

# Dutch column header -> English column name used in the merged output.
VARIABLES_TO_TRANSLATE = {
    "Gebruikersnaam": "username",
    "Geslacht": "gender",
    "Geboortedatum": "date_of_birth",
}

# `snakemake` is not defined in this file; presumably it is the object that
# Snakemake injects when the file is run through a rule's `script:` directive.
filenames = snakemake.input["data"]

# Read every input file, parsing the date-of-birth column on load.
# NOTE: `infer_datetime_format` is deprecated since pandas 2.0 (strict format
# inference became the default there); it is kept so that parsing behaves the
# same on older pandas versions.
baseline_dfs = [
    pd.read_csv(
        fn,
        parse_dates=["Geboortedatum"],
        infer_datetime_format=True,
        cache_dates=True,
    )
    for fn in filenames
]

# join="inner" keeps only the columns present in every input file.
# ignore_index=True renumbers the rows 0..n-1 directly.  The previous
# reset_index().drop(columns="index") round trip misbehaved when an input
# already contained a column named "index": reset_index() then inserts the
# old index as "level_0", so the drop removed the real data column and left
# the stray "level_0" column in the output.
baseline = pd.concat(baseline_dfs, join="inner", ignore_index=True).rename(
    columns=VARIABLES_TO_TRANSLATE
)

# Age in (fractional) years at the moment the script runs.
# NOTE(review): 365.25245 is neither the Julian (365.25) nor the mean
# Gregorian (365.2425) year length -- confirm the intended divisor.
now = pd.Timestamp("now")
baseline = baseline.assign(
    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
)

baseline.to_csv(snakemake.output[0], index=False, encoding="utf-8")