Add one rule to calculate the ratio of cells with missing values for cleaned features

2020-05-15 18:25:07 -04:00 · 2020-05-15 18:25:07 -04:00 · 5fab99d8df
parent 12302a9486
commit 5fab99d8df
3 changed files with 24 additions and 0 deletions
--- a/8
+++ b/8
@ -103,6 +103,14 @@ rule all:
        expand("data/processed/data_for_population_model/demographic_features.csv"),
        expand("data/processed/data_for_population_model/targets_{summarised}.csv",
                            summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
+        expand("data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv",
+                            rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
+                            cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
+                            days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
+                            days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
+                            cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
+                            source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
+                            day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
        expand("data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/{result_component}.csv",
                            rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
                            cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
--- a/rules/models.snakefile
+++ b/rules/models.snakefile
@ -85,6 +85,14 @@ rule clean_features_for_population_model:
    script:
        "../src/models/clean_features_for_model.R"

+# Compute the ratio of cells with missing values for one cleaned feature file.
+# The output path reuses every wildcard of the input path (thresholds, source,
+# day_segment), so one "_nancellsratio.csv" is written next to each "_clean.csv".
+# The script reads the file via snakemake.input["cleaned_features"] and writes
+# its result to snakemake.output[0].
+rule nan_cells_ratio_of_cleaned_features:
+    input:
+        cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
+    output:
+        "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
+    script:
+        "../src/models/nan_cells_ratio_of_cleaned_features.py"
+ 
 rule modeling:
    input:
        cleaned_features = "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv",
--- a/src/models/nan_cells_ratio_of_cleaned_features.py
+++ b/src/models/nan_cells_ratio_of_cleaned_features.py
@ -0,0 +1,8 @@
+import pandas as pd
+
+features = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"])
+
+# Compute the proportion of missing value cells among all features
+nan_cells_ratio = features.isnull().sum().sum() / (features.shape[0] * features.shape[1])
+
+pd.DataFrame({"nan_cells_ratio": [nan_cells_ratio]}).to_csv(snakemake.output[0], index=False)