diff --git a/config.yaml b/config.yaml
index 5b8992b4..f14e3bfd 100644
--- a/config.yaml
+++ b/config.yaml
@@ -634,5 +634,5 @@ PARAMS_FOR_ANALYSIS:
       results-survey358134_final.csv, # Belgium 1
       results-survey413767_final.csv # Belgium 2
     ]
-    FEATURES: [age, gender]
+    FEATURES: [age, gender, startlanguage]
     CATEGORICAL_FEATURES: [gender]
diff --git a/src/data/baseline_features.py b/src/data/baseline_features.py
index 2541e641..60671911 100644
--- a/src/data/baseline_features.py
+++ b/src/data/baseline_features.py
@@ -4,3 +4,23 @@ pid = snakemake.params["pid"]
 requested_features = snakemake.params["features"]
 baseline_features = pd.DataFrame(columns=requested_features)
 
+# Only the first row (index 0) of the participant file is read below.
+participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
+if not participant_info.empty:
+    if "age" in requested_features:
+        now = pd.Timestamp("now")
+        # Subtracting two scalar Timestamps yields a scalar Timedelta, which
+        # exposes `.days` directly; the `.dt` accessor exists only on Series.
+        baseline_features.loc[0, "age"] = (
+            now - participant_info.loc[0, "date_of_birth"]
+        ).days / 365.25245
+    if "gender" in requested_features:
+        baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
+    if "startlanguage" in requested_features:
+        baseline_features.loc[0, "startlanguage"] = participant_info.loc[
+            0, "startlanguage"
+        ]
+
+baseline_features.to_csv(
+    snakemake.output[0], index=False, encoding="utf-8",
+)
diff --git a/src/data/merge_baseline_data.py b/src/data/merge_baseline_data.py
index 6eb91cd5..c0abcb5e 100644
--- a/src/data/merge_baseline_data.py
+++ b/src/data/merge_baseline_data.py
@@ -24,10 +24,6 @@ baseline = (
 )
 baseline.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)
 
-now = pd.Timestamp("now")
-baseline = baseline.assign(
-    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
-)
 baseline.to_csv(snakemake.output[0], index=False,