Small changes in cleaning overall
parent
001d400729
commit
2dc89c083c
|
@ -25,7 +25,6 @@ def straw_cleaning(sensor_data_files, provider, target):
|
|||
|
||||
# (1) FILTER OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
|
||||
if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
|
||||
# target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
|
||||
features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
|
||||
|
||||
graph_bf_af(features, "2target_rows_after")
|
||||
|
@ -96,21 +95,7 @@ def straw_cleaning(sensor_data_files, provider, target):
|
|||
features[impute_zero] = impute(features[impute_zero], method="zero")
|
||||
|
||||
graph_bf_af(features, "5zero_imp")
|
||||
|
||||
# Impute phone locations with median - should this rather be imputed at kNN step??
|
||||
# impute_locations = [col for col in features.columns if "phone_locations_" in col]
|
||||
|
||||
# # features[impute_locations] = features[impute_locations].mask(np.random.random(features[impute_locations].shape) < .1)
|
||||
|
||||
# features.at[0,'pid'] = "p01"
|
||||
# features.at[1,'pid'] = "p01"
|
||||
# features.at[2,'pid'] = "p02"
|
||||
# features.at[3,'pid'] = "p02"
|
||||
|
||||
# graph_bf_af(features[impute_locations], "phoneloc_before")
|
||||
|
||||
# features[impute_locations] = features[impute_locations + ["pid"]].groupby("pid").transform(lambda x: x.fillna(x.median()))[impute_locations]
|
||||
|
||||
|
||||
# (4) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (the comparison should be <= if even all-NaN columns must be preserved; as written, columns in which every row is NaN are dropped)
|
||||
esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns
|
||||
|
||||
|
|
Loading…
Reference in New Issue