From 5fab99d8df54e543f6a11bea7302bcd1783e97c5 Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Fri, 15 May 2020 18:25:07 -0400
Subject: [PATCH] Add one rule to calculate the ratio of cells with missing values for cleaned features

---
Notes:
    Reviewer commentary only; nothing in this section becomes part of the
    commit.

    What the patch does
      * rules/models.snakefile gains the rule
        nan_cells_ratio_of_cleaned_features. It takes one
        "{source}_{day_segment}_clean.csv" as input and produces
        "{source}_{day_segment}_nancellsratio.csv" in the same threshold
        directory by running
        src/models/nan_cells_ratio_of_cleaned_features.py.
      * Snakefile: "rule all" now requests that output for every combination
        of ROWS_NAN_THRESHOLD, COLS_NAN_THRESHOLD,
        PARTICIPANT_DAYS_BEFORE_THRESHOLD, PARTICIPANT_DAYS_AFTER_THRESHOLD,
        COLS_VAR_THRESHOLD, SOURCES and DAY_SEGMENTS read from
        config["PARAMS_FOR_ANALYSIS"].
      * The new script loads the cleaned CSV (parsing "local_date" as dates),
        divides the total number of null cells by rows * columns, and writes
        a one-row CSV with a single "nan_cells_ratio" column and no index.

    Points to confirm
      * NOTE(review): the denominator is rows * columns of the whole frame,
        so "local_date" (and any other non-feature column in the file) is
        counted as well. Confirm that is the intended meaning of "among all
        features".
      * NOTE(review): the new paths join the first two thresholds with "|",
        while the output_population_model path visible in the Snakefile
        context joins them with "_", and the "rule modeling" context reads
        "{source}_{day_segment}_clean.csv" with no threshold directory. The
        output of clean_features_for_population_model is not visible in this
        patch; verify the new input path matches it.
      * NOTE(review): a cleaned file with a header but zero rows makes the
        denominator 0; presumably the ratio comes out as NaN rather than
        raising. Confirm that downstream readers accept an empty cell.
      * NOTE(review): leading whitespace inside the hunks below is a
        reconstruction; check it against the repository, or apply with
        "git apply --ignore-whitespace".

 Snakefile                                         | 8 ++++++++
 rules/models.snakefile                            | 8 ++++++++
 src/models/nan_cells_ratio_of_cleaned_features.py | 8 ++++++++
 3 files changed, 24 insertions(+)
 create mode 100644 src/models/nan_cells_ratio_of_cleaned_features.py

diff --git a/Snakefile b/Snakefile
index 8cdde971..5cfca3f5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -103,6 +103,14 @@ rule all:
         expand("data/processed/data_for_population_model/demographic_features.csv"),
         expand("data/processed/data_for_population_model/targets_{summarised}.csv",
                 summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
+        expand("data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv",
+                rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+                cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
+                days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
+                days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
+                cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+                source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
         expand("data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/{result_component}.csv",
                 rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
                 cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
diff --git a/rules/models.snakefile b/rules/models.snakefile
index 500903ea..9ac182ba 100644
--- a/rules/models.snakefile
+++ b/rules/models.snakefile
@@ -85,6 +85,14 @@ rule clean_features_for_population_model:
     script:
         "../src/models/clean_features_for_model.R"
 
+rule nan_cells_ratio_of_cleaned_features:
+    input:
+        cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
+    output:
+        "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
+    script:
+        "../src/models/nan_cells_ratio_of_cleaned_features.py"
+
 rule modeling:
     input:
         cleaned_features = "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv",
diff --git a/src/models/nan_cells_ratio_of_cleaned_features.py b/src/models/nan_cells_ratio_of_cleaned_features.py
new file mode 100644
index 00000000..de06a0c2
--- /dev/null
+++ b/src/models/nan_cells_ratio_of_cleaned_features.py
@@ -0,0 +1,8 @@
+import pandas as pd
+
+features = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"])
+
+# Compute the proportion of missing value cells among all features
+nan_cells_ratio = features.isnull().sum().sum() / (features.shape[0] * features.shape[1])
+
+pd.DataFrame({"nan_cells_ratio": [nan_cells_ratio]}).to_csv(snakemake.output[0], index=False)
\ No newline at end of file