Small changes in cleaning overall

notes
Primoz 2022-10-07 08:52:12 +00:00
parent 001d400729
commit 2dc89c083c
1 changed file with 1 addition and 16 deletions

View File

@@ -25,7 +25,6 @@ def straw_cleaning(sensor_data_files, provider, target):
# (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE # (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']: if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
# target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True) features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
graph_bf_af(features, "2target_rows_after") graph_bf_af(features, "2target_rows_after")
@@ -97,20 +96,6 @@ def straw_cleaning(sensor_data_files, provider, target):
graph_bf_af(features, "5zero_imp") graph_bf_af(features, "5zero_imp")
# Impute phone locations with median - should this rather be imputed at kNN step??
# impute_locations = [col for col in features.columns if "phone_locations_" in col]
# # features[impute_locations] = features[impute_locations].mask(np.random.random(features[impute_locations].shape) < .1)
# features.at[0,'pid'] = "p01"
# features.at[1,'pid'] = "p01"
# features.at[2,'pid'] = "p02"
# features.at[3,'pid'] = "p02"
# graph_bf_af(features[impute_locations], "phoneloc_before")
# features[impute_locations] = features[impute_locations + ["pid"]].groupby("pid").transform(lambda x: x.fillna(x.median()))[impute_locations]
# (4) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows) # (4) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns