Change the method of computing missing value cells

pull/95/head
Meng Li 2020-04-30 15:40:55 -04:00
parent 0463558aee
commit 6ed52e7d1a
1 changed file with 4 additions and 2 deletions

@@ -95,6 +95,9 @@ rowsnan_colsnan_days_colsvar_threshold = snakemake.params["rowsnan_colsnan_days_
demographic_features = pd.read_csv(snakemake.input["demographic_features"], index_col=["pid"])
targets = pd.read_csv(snakemake.input["targets"], index_col=["pid"])
features = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"])
+# Compute the proportion of missing value cells among all features
+nan_ratio = features.isnull().sum().sum() / (features.shape[0] * features.shape[1])
# Step 2. Extract summarised features based on daily features:
# for categorical features: calculate variance across all days
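For context, the two added lines compute a single missing-cell ratio over the entire cleaned feature matrix before any train/test splitting, rather than per fold. A minimal sketch of the same calculation on a toy DataFrame (the column names and values are illustrative, not from the repository):

import numpy as np
import pandas as pd

# Toy feature matrix with two missing cells (illustrative only)
features = pd.DataFrame({"step_count": [1200.0, np.nan, 3400.0],
                         "screen_minutes": [55.0, 70.5, np.nan]})

# Same formula as the added lines: missing cells divided by total cells
nan_ratio = features.isnull().sum().sum() / (features.shape[0] * features.shape[1])
print(nan_ratio)  # 2 missing out of 6 cells -> 0.333...

With this change, the ratio is taken from the cleaned features before the per-fold preprocessing, instead of from the aligned train/test matrices inside the cross-validation loop.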
@@ -134,12 +137,11 @@ for train_index, test_index in outer_cv.split(data_x):
    test_x = preprocesFeatures(train_numerical_features, test_numerical_features, test_categorical_features, mode_categorical_features, scaler, "test")
    train_x, test_x = train_x.align(test_x, join='outer', axis=1, fill_value=0) # in case we get rid of categorical columns
-    # Compute number of participants, features and proportion of missing value cells among all features,
+    # Compute number of participants and features
    # values do not change between folds
    if fold_count == 1:
        num_of_participants = train_x.shape[0] + test_x.shape[0]
        num_of_features = train_x.shape[1]
-        nan_ratio = (train_x.isnull().sum().sum() + test_x.isnull().sum().sum()) / ((train_x.shape[0] + test_x.shape[0]) * train_x.shape[1])
    # Inner cross validation
    clf = GridSearchCV(estimator=pipeline, param_grid=model_hyperparams, cv=inner_cv, scoring="f1_micro")
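For context, the last line above sets up the inner cross-validation as an exhaustive grid search scored with micro-averaged F1. A minimal runnable sketch with a hypothetical pipeline, hyperparameter grid, and synthetic data (the estimator, parameters, and data are assumptions, not the repository's configuration):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in for the preprocessed feature matrix and targets
data_x, data_y = make_classification(n_samples=60, n_features=8, random_state=0)

# Hypothetical pipeline and grid; the real script builds these from its own parameters
pipeline = Pipeline([("scaler", StandardScaler()),
                     ("clf", LogisticRegression(max_iter=500))])
model_hyperparams = {"clf__C": [0.1, 1.0, 10.0]}
inner_cv = StratifiedKFold(n_splits=3)

# Inner cross validation: fit the grid search on the training data, scored with f1_micro
clf = GridSearchCV(estimator=pipeline, param_grid=model_hyperparams, cv=inner_cv, scoring="f1_micro")
clf.fit(data_x, data_y)
print(clf.best_params_)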