Small changes in cleaning overall

2022-10-07 08:52:12 +00:00 · 2022-10-07 08:52:12 +00:00 · 2dc89c083c
parent 001d400729
commit 2dc89c083c
1 changed files with 1 additions and 16 deletions
--- a/src/features/all_cleaning_overall/straw/main.py
+++ b/src/features/all_cleaning_overall/straw/main.py
@@ -25,7 +25,6 @@ def straw_cleaning(sensor_data_files, provider, target):

    # (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
    if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
-        # target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
        features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)

    graph_bf_af(features, "2target_rows_after")
@@ -96,21 +95,7 @@ def straw_cleaning(sensor_data_files, provider, target):
    features[impute_zero] = impute(features[impute_zero], method="zero")

    graph_bf_af(features, "5zero_imp")
-
-    # Impute phone locations with median - should this rather be imputed at kNN step??
-    # impute_locations = [col for col in features.columns if "phone_locations_" in col]
-
-    # # features[impute_locations] = features[impute_locations].mask(np.random.random(features[impute_locations].shape) < .1)
-
-    # features.at[0,'pid'] = "p01"
-    # features.at[1,'pid'] = "p01"
-    # features.at[2,'pid'] = "p02"
-    # features.at[3,'pid'] = "p02"
-
-    # graph_bf_af(features[impute_locations], "phoneloc_before")
-
-    # features[impute_locations] = features[impute_locations + ["pid"]].groupby("pid").transform(lambda x: x.fillna(x.median()))[impute_locations]
-    
+ 
    # (4) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns