Add one rule to calculate the ratio of cells with missing values for cleaned features

pull/95/head
Meng Li 2020-05-15 18:25:07 -04:00
parent 12302a9486
commit 5fab99d8df
3 changed files with 24 additions and 0 deletions

View File

@ -103,6 +103,14 @@ rule all:
expand("data/processed/data_for_population_model/demographic_features.csv"), expand("data/processed/data_for_population_model/demographic_features.csv"),
expand("data/processed/data_for_population_model/targets_{summarised}.csv", expand("data/processed/data_for_population_model/targets_{summarised}.csv",
summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]), summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
expand("data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv",
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
expand("data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/{result_component}.csv", expand("data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/{result_component}.csv",
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"], rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"], cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],

View File

@ -85,6 +85,14 @@ rule clean_features_for_population_model:
script: script:
"../src/models/clean_features_for_model.R" "../src/models/clean_features_for_model.R"
rule nan_cells_ratio_of_cleaned_features:
    # Produces one "*_nancellsratio.csv" per cleaned feature table. The output
    # sits in the same threshold-parameterised directory as the "*_clean.csv"
    # input, so every wildcard in the output path is resolved from the input path.
    input:
        # Named input: the script looks it up by the key "cleaned_features".
        cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
    output:
        # Positional output: the script refers to it by index (output[0]).
        "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
    script:
        "../src/models/nan_cells_ratio_of_cleaned_features.py"
rule modeling: rule modeling:
input: input:
cleaned_features = "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv", cleaned_features = "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv",

View File

@ -0,0 +1,8 @@
import pandas as pd
# Load the cleaned feature table supplied by the Snakemake rule; the
# "local_date" column is parsed as dates while reading.
cleaned = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"])

# Missing cells are counted over every column of the loaded table, and the
# denominator is the total number of cells (rows * columns).
missing_cells = cleaned.isna().sum().sum()
total_cells = cleaned.size

# Write a single-row, single-column CSV holding the ratio.
ratio_table = pd.DataFrame({"nan_cells_ratio": [missing_cells / total_cells]})
ratio_table.to_csv(snakemake.output[0], index=False)