From 1ad25bb5727d30affd2ee063386f2f5ca52e0d63 Mon Sep 17 00:00:00 2001 From: Primoz Date: Tue, 11 Oct 2022 08:26:17 +0000 Subject: [PATCH] Few modifications of some imputation values in cleaning script and feature extraction. --- src/features/all_cleaning_individual/straw/main.py | 9 +++++++-- src/features/all_cleaning_overall/straw/main.py | 9 +++++---- .../phone_applications_foreground/rapids/main.py | 4 ++-- src/features/phone_calls/rapids/main.R | 2 +- src/features/phone_messages/rapids/main.R | 2 +- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/features/all_cleaning_individual/straw/main.py b/src/features/all_cleaning_individual/straw/main.py index 6ac97cf7..8a222491 100644 --- a/src/features/all_cleaning_individual/straw/main.py +++ b/src/features/all_cleaning_individual/straw/main.py @@ -75,7 +75,8 @@ def straw_cleaning(sensor_data_files, provider): "firstuseafter" in col or "timefirstmessages" in col or "timelastmessages" in col] - features[impute_w_hn] = impute(features[impute_w_hn], method="high_number") + features[impute_w_hn] = features[impute_w_hn].fillna(1500) + # Impute special case (mostcommonactivity) and (homelabel) impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col] @@ -84,6 +85,10 @@ def straw_cleaning(sensor_data_files, provider): impute_w_sn2 = [col for col in features.columns if "homelabel" in col] features[impute_w_sn2] = features[impute_w_sn2].fillna(1) # Special case of imputation - nominal/ordinal value + impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col] + features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000) # Special case of imputation - loglocation + + # Impute selected phone features with 0 impute_zero = [col for col in features if \ col.startswith('phone_applications_foreground_rapids_') or @@ -151,7 +156,7 @@ def impute(df, method='zero'): return { 'zero': df.fillna(0), - 'high_number': df.fillna(1000000), + 'high_number': 
df.fillna(1500), 'mean': df.fillna(df.mean()), 'median': df.fillna(df.median()), 'knn': k_nearest(df) diff --git a/src/features/all_cleaning_overall/straw/main.py b/src/features/all_cleaning_overall/straw/main.py index 0583705c..b0baa760 100644 --- a/src/features/all_cleaning_overall/straw/main.py +++ b/src/features/all_cleaning_overall/straw/main.py @@ -71,9 +71,7 @@ def straw_cleaning(sensor_data_files, provider, target): "firstuseafter" in col or "timefirstmessages" in col or "timelastmessages" in col] - features[impute_w_hn] = impute(features[impute_w_hn], method="high_number") - - graph_bf_af(features, "4high_number_imp") + features[impute_w_hn] = features[impute_w_hn].fillna(1500) # Impute special case (mostcommonactivity) and (homelabel) impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col] @@ -82,6 +80,9 @@ def straw_cleaning(sensor_data_files, provider, target): impute_w_sn2 = [col for col in features.columns if "homelabel" in col] features[impute_w_sn2] = features[impute_w_sn2].fillna(1) # Special case of imputation - nominal/ordinal value + impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col] + features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000) # Special case of imputation - loglocation + # Impute selected phone features with 0 impute_zero = [col for col in features if \ col.startswith('phone_applications_foreground_rapids_') or @@ -189,7 +190,7 @@ def impute(df, method='zero'): return { 'zero': df.fillna(0), - 'high_number': df.fillna(1000000), + 'high_number': df.fillna(1500), 'mean': df.fillna(df.mean()), 'median': df.fillna(df.median()), 'knn': k_nearest(df) diff --git a/src/features/phone_applications_foreground/rapids/main.py b/src/features/phone_applications_foreground/rapids/main.py index 4d7814a4..d9204547 100644 --- a/src/features/phone_applications_foreground/rapids/main.py +++ b/src/features/phone_applications_foreground/rapids/main.py @@ -9,13 +9,13 @@ def 
compute_features(filtered_data, apps_type, requested_features, apps_features if "timeoffirstuse" in requested_features: time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment") if time_first_event.empty: - apps_features["timeoffirstuse" + apps_type] = 1000000 # np.nan + apps_features["timeoffirstuse" + apps_type] = 1500 # np.nan else: apps_features["timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"] if "timeoflastuse" in requested_features: time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment") if time_last_event.empty: - apps_features["timeoflastuse" + apps_type] = 1000000 # np.nan + apps_features["timeoflastuse" + apps_type] = 1500 # np.nan else: apps_features["timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"] if "frequencyentropy" in requested_features: diff --git a/src/features/phone_calls/rapids/main.R b/src/features/phone_calls/rapids/main.R index 5b75e2ea..d6c8ab88 100644 --- a/src/features/phone_calls/rapids/main.R +++ b/src/features/phone_calls/rapids/main.R @@ -94,7 +94,7 @@ rapids_features <- function(sensor_data_files, time_segment, provider){ colnames(.) call_features <- call_features %>% - mutate_at(., time_cols, ~replace(., is.na(.), 1000000)) + mutate_at(., time_cols, ~replace(., is.na(.), 1500)) # Fill NA values with 0 call_features <- call_features %>% mutate_all(~replace(., is.na(.), 0)) diff --git a/src/features/phone_messages/rapids/main.R b/src/features/phone_messages/rapids/main.R index 817b5e49..d5dddc73 100644 --- a/src/features/phone_messages/rapids/main.R +++ b/src/features/phone_messages/rapids/main.R @@ -70,7 +70,7 @@ rapids_features <- function(sensor_data_files, time_segment, provider){ colnames(.) 
messages_features <- messages_features %>% - mutate_at(., time_cols, ~replace(., is.na(.), 1000000)) + mutate_at(., time_cols, ~replace(., is.na(.), 1500)) # Fill NA values with 0 messages_features <- messages_features %>% mutate_all(~replace(., is.na(.), 0))