From 5fab99d8df54e543f6a11bea7302bcd1783e97c5 Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Fri, 15 May 2020 18:25:07 -0400
Subject: [PATCH] Add one rule to calculate the ratio of cells with missing
 values for cleaned features

---
 Snakefile                                         | 8 ++++++++
 rules/models.snakefile                            | 8 ++++++++
 src/models/nan_cells_ratio_of_cleaned_features.py | 8 ++++++++
 3 files changed, 24 insertions(+)
 create mode 100644 src/models/nan_cells_ratio_of_cleaned_features.py

diff --git a/Snakefile b/Snakefile
index 8cdde971..5cfca3f5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -103,6 +103,14 @@ rule all:
         expand("data/processed/data_for_population_model/demographic_features.csv"),
         expand("data/processed/data_for_population_model/targets_{summarised}.csv",
                             summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
+        expand("data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv",
+                            rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+                            cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
+                            days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
+                            days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
+                            cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+                            source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                            day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
         expand("data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/{result_component}.csv",
                             rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
                             cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
diff --git a/rules/models.snakefile b/rules/models.snakefile
index 500903ea..9ac182ba 100644
--- a/rules/models.snakefile
+++ b/rules/models.snakefile
@@ -85,6 +85,14 @@ rule clean_features_for_population_model:
     script:
         "../src/models/clean_features_for_model.R"
 
+rule nan_cells_ratio_of_cleaned_features:
+    input:
+        cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
+    output:
+        "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
+    script:
+        "../src/models/nan_cells_ratio_of_cleaned_features.py"
+ 
 rule modeling:
     input:
         cleaned_features = "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv",
diff --git a/src/models/nan_cells_ratio_of_cleaned_features.py b/src/models/nan_cells_ratio_of_cleaned_features.py
new file mode 100644
index 00000000..de06a0c2
--- /dev/null
+++ b/src/models/nan_cells_ratio_of_cleaned_features.py
@@ -0,0 +1,8 @@
+import pandas as pd
+
+features = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"])
+
+# Compute the fraction of missing (NaN) cells over all cells of the cleaned table (every column, local_date included)
+nan_cells_ratio = features.isnull().sum().sum() / (features.shape[0] * features.shape[1])
+
+pd.DataFrame({"nan_cells_ratio": [nan_cells_ratio]}).to_csv(snakemake.output[0], index=False)
\ No newline at end of file