From 2dc89c083c99d43da5e1eb9cbc1c07d39901f27d Mon Sep 17 00:00:00 2001
From: Primoz <sisko.primoz@gmail.com>
Date: Fri, 7 Oct 2022 08:52:12 +0000
Subject: [PATCH] Remove commented-out code from overall straw cleaning

---
 src/features/all_cleaning_overall/straw/main.py | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/src/features/all_cleaning_overall/straw/main.py b/src/features/all_cleaning_overall/straw/main.py
index 71151608..0583705c 100644
--- a/src/features/all_cleaning_overall/straw/main.py
+++ b/src/features/all_cleaning_overall/straw/main.py
@@ -25,7 +25,6 @@ def straw_cleaning(sensor_data_files, provider, target):
 
     # (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
     if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
-        # target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
         features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
 
     graph_bf_af(features, "2target_rows_after")
@@ -96,21 +95,7 @@ def straw_cleaning(sensor_data_files, provider, target):
     features[impute_zero] = impute(features[impute_zero], method="zero")
 
     graph_bf_af(features, "5zero_imp")
-
-    # Impute phone locations with median - should this rather be imputed at kNN step??
-    # impute_locations = [col for col in features.columns if "phone_locations_" in col]
-
-    # # features[impute_locations] = features[impute_locations].mask(np.random.random(features[impute_locations].shape) < .1)
-
-    # features.at[0,'pid'] = "p01"
-    # features.at[1,'pid'] = "p01"
-    # features.at[2,'pid'] = "p02"
-    # features.at[3,'pid'] = "p02"
-
-    # graph_bf_af(features[impute_locations], "phoneloc_before")
-
-    # features[impute_locations] = features[impute_locations + ["pid"]].groupby("pid").transform(lambda x: x.fillna(x.median()))[impute_locations]
-    
+ 
     # (4) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
     esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns