From 07da6be3984c93df9b8e4b295c22e21730b1847a Mon Sep 17 00:00:00 2001
From: junos
Date: Wed, 23 Feb 2022 18:05:23 +0100
Subject: [PATCH] Add age, gender, and language as features.

Move calculation of age from merge_baseline_data.py to baseline_features.py.
---
 config.yaml                     |  2 +-
 src/data/baseline_features.py   | 21 +++++++++++++++++++++
 src/data/merge_baseline_data.py |  4 ----
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/config.yaml b/config.yaml
index 5b8992b4..f14e3bfd 100644
--- a/config.yaml
+++ b/config.yaml
@@ -634,5 +634,5 @@ PARAMS_FOR_ANALYSIS:
       results-survey358134_final.csv, # Belgium 1
       results-survey413767_final.csv # Belgium 2
     ]
-    FEATURES: [age, gender]
+    FEATURES: [age, gender, startlanguage]
     CATEGORICAL_FEATURES: [gender]
diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py
index 2541e641..60671911 100644
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@@ -4,3 +4,24 @@ pid = snakemake.params["pid"]
 requested_features = snakemake.params["features"]
 
 baseline_features = pd.DataFrame(columns=requested_features)
+# Read the participant info table, parsing date_of_birth as a datetime so that
+# age can be computed by subtracting it from the current timestamp.
+participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
+if not participant_info.empty:
+    if "age" in requested_features:
+        now = pd.Timestamp("now")
+        # Both operands are scalar Timestamps, so the difference is a single
+        # Timedelta: use .days directly (the .dt accessor exists only on Series).
+        baseline_features.loc[0, "age"] = (
+            now - participant_info.loc[0, "date_of_birth"]
+        ).days / 365.25245
+    if "gender" in requested_features:
+        baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
+    if "startlanguage" in requested_features:
+        baseline_features.loc[0, "startlanguage"] = participant_info.loc[
+            0, "startlanguage"
+        ]
+
+baseline_features.to_csv(
+    snakemake.output[0], index=False, encoding="utf-8",
+)
diff --git a/src/data/merge_baseline_data.py b/src/data/merge_baseline_data.py
index 6eb91cd5..c0abcb5e 100644
--- a/src/data/merge_baseline_data.py
+++ b/src/data/merge_baseline_data.py
@@ -24,10 +24,6 @@ baseline = (
 )
 baseline.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)
 
-now = pd.Timestamp("now")
-baseline = baseline.assign(
-    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
-)
 baseline.to_csv(snakemake.output[0], index=False,