Add age, gender, and language as features.

Move calculation of age from merge_baseline_data.py to baseline_features.py.
labels
junos 2022-02-23 18:05:23 +01:00
parent 176367631b
commit 07da6be398
3 changed files with 18 additions and 5 deletions

View File

@ -634,5 +634,5 @@ PARAMS_FOR_ANALYSIS:
results-survey358134_final.csv, # Belgium 1
results-survey413767_final.csv # Belgium 2
]
FEATURES: [age, gender]
FEATURES: [age, gender, startlanguage]
CATEGORICAL_FEATURES: [gender]

View File

@ -4,3 +4,20 @@ pid = snakemake.params["pid"]
# Build the baseline (demographic) feature table for one participant and write
# it to the Snakemake output as CSV.
requested_features = snakemake.params["features"]

# Output frame with one column per requested feature. It keeps zero rows when
# the participant info file has no records.
baseline_features = pd.DataFrame(columns=requested_features)

participant_info = pd.read_csv(snakemake.input[0], parse_dates=["date_of_birth"])
if not participant_info.empty:
    if "age" in requested_features:
        now = pd.Timestamp("now")
        # The difference of two scalar timestamps is a single Timedelta, which
        # exposes `.days` directly; the `.dt` accessor exists only on Series
        # and would raise AttributeError here.
        baseline_features.loc[0, "age"] = (
            now - participant_info.loc[0, "date_of_birth"]
        ).days / 365.25245
    if "gender" in requested_features:
        baseline_features.loc[0, "gender"] = participant_info.loc[0, "gender"]
    if "startlanguage" in requested_features:
        baseline_features.loc[0, "startlanguage"] = participant_info.loc[
            0, "startlanguage"
        ]

baseline_features.to_csv(
    snakemake.output[0], index=False, encoding="utf-8",
)

View File

@ -24,10 +24,6 @@ baseline = (
)
baseline.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)
now = pd.Timestamp("now")
baseline = baseline.assign(
age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
)
baseline.to_csv(snakemake.output[0],
index=False,