From 16e608db746f50d4cfb8fb9d44bca6b412d617aa Mon Sep 17 00:00:00 2001
From: junos
Date: Fri, 4 Feb 2022 18:21:42 +0100
Subject: [PATCH] First merge baseline datasets.

---
Review notes (commentary only; text between the three-dash marker above and
the first "diff --git" line is not part of the commit message or the change):

* This patch was recovered from a copy whose line breaks had been collapsed.
  Line boundaries follow the hunk headers and the diffstat; leading
  indentation and the exact position of blank context lines are a
  best-effort reconstruction. Run `git apply --check` (or apply with
  the --ignore-whitespace option) before relying on it.
* merge_baseline_data.py concatenates with join="inner" before renaming the
  Dutch column names. An inner join keeps only the columns present in every
  input file, so if the survey exports do not all share these headers the
  columns are dropped before the rename and `x.date_of_birth` fails.
  Presumably the rename should happen per file, before concatenating;
  confirm against the actual CSV headers.
* `age` is measured against pd.Timestamp("now"), so the output depends on
  the day the rule runs; the divisor 365.25245 is not explained in the code.
* `.reset_index().drop(columns="index")` appears to only renumber the rows;
  `pd.concat(..., ignore_index=True)` would do that in one step.
* to_csv() is called with its default index=True, so the row index is
  written as an extra unnamed first column.
* The Snakefile hunk keeps the previous per-participant target as
  commented-out code.

 Snakefile                             |  3 ++-
 config.yaml                           |  2 +-
 rules/models.smk                      |  9 ++++-----
 src/data/download_demographic_data.py | 14 -------------
 src/data/merge_baseline_data.py       | 29 +++++++++++++++++++++++++++
 5 files changed, 36 insertions(+), 21 deletions(-)
 delete mode 100644 src/data/download_demographic_data.py
 create mode 100644 src/data/merge_baseline_data.py

diff --git a/Snakefile b/Snakefile
index 70a44b22..ab4f1aa8 100644
--- a/Snakefile
+++ b/Snakefile
@@ -404,7 +404,8 @@ for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
     files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
 
 # Demographic features
-files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
+files_to_compute.extend(expand("data/raw/baseline_merged.csv"))
+#files_to_compute.extend(expand("data/raw/{pid}/participant_baseline_raw.csv", pid=config["PIDS"]))
 
 rule all:
     input:
diff --git a/config.yaml b/config.yaml
index 559396f9..5b8992b4 100644
--- a/config.yaml
+++ b/config.yaml
@@ -628,7 +628,7 @@ ALL_CLEANING_OVERALL:
 ########################################################################################################################
 
 PARAMS_FOR_ANALYSIS:
-  DEMOGRAPHIC:
+  BASELINE:
     FOLDER: data/external/baseline
     CONTAINER: [results-survey637813_final.csv, # Slovenia
                 results-survey358134_final.csv, # Belgium 1
diff --git a/rules/models.smk b/rules/models.smk
index 3dc2ce61..18c6b777 100644
--- a/rules/models.smk
+++ b/rules/models.smk
@@ -1,9 +1,8 @@
-rule download_demographic_data:
+rule merge_baseline_data:
     input:
-        participant_file = "data/external/participant_files/{pid}.yaml",
-        data = expand(config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["FOLDER"] + "/{container}", container=config["PARAMS_FOR_ANALYSIS"]["DEMOGRAPHIC"]["CONTAINER"])
+        data = expand(config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["FOLDER"] + "/{container}", container=config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["CONTAINER"])
     output:
-        "data/raw/{pid}/participant_baseline_raw.csv"
+        "data/raw/baseline_merged.csv"
     script:
-        "../src/data/download_demographic_data.py"
+        "../src/data/merge_baseline_data.py"
 
diff --git a/src/data/download_demographic_data.py b/src/data/download_demographic_data.py
deleted file mode 100644
index af7eb0b5..00000000
--- a/src/data/download_demographic_data.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import pandas as pd
-
-filenames = snakemake.input["data"]
-
-baseline_dfs = []
-
-for fn in filenames:
-    baseline_dfs.append(pd.read_csv(fn))
-
-baseline = (
-    pd.concat(baseline_dfs, join="inner")
-    .reset_index()
-    .drop(columns="index")
-)
diff --git a/src/data/merge_baseline_data.py b/src/data/merge_baseline_data.py
new file mode 100644
index 00000000..8e5f2576
--- /dev/null
+++ b/src/data/merge_baseline_data.py
@@ -0,0 +1,29 @@
+import pandas as pd
+
+VARIABLES_TO_TRANSLATE = {
+    "Gebruikersnaam": "username",
+    "Geslacht": "gender",
+    "Geboortedatum": "date_of_birth",
+}
+
+filenames = snakemake.input["data"]
+
+baseline_dfs = []
+
+for fn in filenames:
+    baseline_dfs.append(pd.read_csv(fn))
+
+baseline = (
+    pd.concat(baseline_dfs, join="inner")
+    .reset_index()
+    .drop(columns="index")
+)
+
+baseline.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)
+now = pd.Timestamp("now")
+baseline = baseline.assign(
+    date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
+    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
+)
+
+baseline.to_csv(snakemake.output[0])