Add one rule to calculate the ratio of cells with missing values for cleaned features
parent
12302a9486
commit
5fab99d8df
|
@ -103,6 +103,14 @@ rule all:
|
|||
expand("data/processed/data_for_population_model/demographic_features.csv"),
|
||||
expand("data/processed/data_for_population_model/targets_{summarised}.csv",
|
||||
summarised = config["PARAMS_FOR_ANALYSIS"]["SUMMARISED"]),
|
||||
expand("data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv",
|
||||
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
||||
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
|
||||
days_before_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_BEFORE_THRESHOLD"],
|
||||
days_after_threshold = config["PARAMS_FOR_ANALYSIS"]["PARTICIPANT_DAYS_AFTER_THRESHOLD"],
|
||||
cols_var_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_VAR_THRESHOLD"],
|
||||
source = config["PARAMS_FOR_ANALYSIS"]["SOURCES"],
|
||||
day_segment = config["PARAMS_FOR_ANALYSIS"]["DAY_SEGMENTS"]),
|
||||
expand("data/processed/output_population_model/{rows_nan_threshold}_{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{model}/{cv_method}/{source}_{day_segment}_{summarised}_{scaler}/{result_component}.csv",
|
||||
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"],
|
||||
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"],
|
||||
|
|
|
@ -85,6 +85,14 @@ rule clean_features_for_population_model:
|
|||
script:
|
||||
"../src/models/clean_features_for_model.R"
|
||||
|
||||
# Produce a "<source>_<day_segment>_nancellsratio.csv" file alongside each
# "<source>_<day_segment>_clean.csv" file. The input and output paths share
# the same wildcards, so every cleaned table maps to exactly one ratio file
# in the same threshold-specific directory.
rule nan_cells_ratio_of_cleaned_features:
    input:
        # Named input; presumably read by the script as
        # snakemake.input["cleaned_features"] (confirm against the script).
        cleaned_features = "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_clean.csv"
    output:
        # Single positional output (snakemake.output[0] inside the script).
        "data/processed/data_for_population_model/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_nancellsratio.csv"
    script:
        "../src/models/nan_cells_ratio_of_cleaned_features.py"
|
||||
|
||||
rule modeling:
|
||||
input:
|
||||
cleaned_features = "data/processed/data_for_population_model/{source}_{day_segment}_clean.csv",
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
import pandas as pd
|
||||
|
||||
# Read the cleaned-features CSV handed over by Snakemake; "local_date" is
# parsed as a datetime column.
cleaned = pd.read_csv(
    snakemake.input["cleaned_features"],
    parse_dates=["local_date"],
)

# Share of missing cells over the whole table: every column that was read
# (including "local_date") counts towards both numerator and denominator.
missing_cells = cleaned.isnull().values.sum()
total_cells = cleaned.size  # number of rows * number of columns
nan_cells_ratio = missing_cells / total_cells

# Persist the result as a one-row CSV with a single "nan_cells_ratio" column.
summary = pd.DataFrame({"nan_cells_ratio": [nan_cells_ratio]})
summary.to_csv(snakemake.output[0], index=False)
|
Loading…
Reference in New Issue