kNN imputation relocation and execution only on specific columns.
parent
e3b78c8a85
commit
0ce6da5444
|
@ -101,14 +101,6 @@ def straw_cleaning(sensor_data_files, provider):
|
|||
sns.heatmap(features.isna(), cbar=False)
|
||||
plt.savefig(f'features_nans_bf_knn.png', bbox_inches='tight')
|
||||
|
||||
# KNN IMPUTATION
|
||||
features = impute(features, method="knn")
|
||||
|
||||
sns.set(rc={"figure.figsize":(16, 8)})
|
||||
sns.heatmap(features.isna(), cbar=False)
|
||||
plt.savefig(f'features_nans_af_knn.png', bbox_inches='tight')
|
||||
|
||||
|
||||
## STANDARDIZATION - should it happen before or after kNN imputation?
|
||||
# TODO: check if there are additional columns that need to be excluded from the standardization
|
||||
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
|
||||
|
@ -116,6 +108,15 @@ def straw_cleaning(sensor_data_files, provider):
|
|||
|
||||
features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)])
|
||||
|
||||
# KNN IMPUTATION
|
||||
# Impute only the feature columns. Reuse `excluded_columns` (the same list the
# standardization step above uses) so the two steps cannot drift apart when
# further columns are added to the exclusion list.
impute_cols = [col for col in features.columns if col not in excluded_columns]
features[impute_cols] = impute(features[impute_cols], method="knn")
|
||||
|
||||
|
||||
sns.set(rc={"figure.figsize":(16, 8)})
|
||||
sns.heatmap(features.isna(), cbar=False)
|
||||
plt.savefig(f'features_nans_af_knn.png', bbox_inches='tight')
|
||||
|
||||
# VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
|
||||
# `isna` has to be *called*: `features.isna.any()` looks up `.any` on the bound
# method object and raises AttributeError, so the check itself never ran.
columns_with_nans = features.isna().any()
if columns_with_nans.any():
    # Same exception type as before; the message names the offending columns.
    raise ValueError(
        "NaN values remain in the cleaned features; affected columns: "
        f"{list(columns_with_nans[columns_with_nans].index)}"
    )
|
||||
|
|
Loading…
Reference in New Issue