From 2dc89c083c99d43da5e1eb9cbc1c07d39901f27d Mon Sep 17 00:00:00 2001 From: Primoz Date: Fri, 7 Oct 2022 08:52:12 +0000 Subject: [PATCH] Small changes in cleaning overall --- src/features/all_cleaning_overall/straw/main.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/src/features/all_cleaning_overall/straw/main.py b/src/features/all_cleaning_overall/straw/main.py index 71151608..0583705c 100644 --- a/src/features/all_cleaning_overall/straw/main.py +++ b/src/features/all_cleaning_overall/straw/main.py @@ -25,7 +25,6 @@ def straw_cleaning(sensor_data_files, provider, target): # (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']: - # target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True) graph_bf_af(features, "2target_rows_after") @@ -96,21 +95,7 @@ def straw_cleaning(sensor_data_files, provider, target): features[impute_zero] = impute(features[impute_zero], method="zero") graph_bf_af(features, "5zero_imp") - - # Impute phone locations with median - should this rather be imputed at kNN step?? - # impute_locations = [col for col in features.columns if "phone_locations_" in col] - - # # features[impute_locations] = features[impute_locations].mask(np.random.random(features[impute_locations].shape) < .1) - - # features.at[0,'pid'] = "p01" - # features.at[1,'pid'] = "p01" - # features.at[2,'pid'] = "p02" - # features.at[3,'pid'] = "p02" - - # graph_bf_af(features[impute_locations], "phoneloc_before") - - # features[impute_locations] = features[impute_locations + ["pid"]].groupby("pid").transform(lambda x: x.fillna(x.median()))[impute_locations] - + # (4) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows) esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns