Phone wifi visible inspection (WIP)

2022-09-16 13:24:21 +00:00 · 2022-09-16 13:24:21 +00:00 · 62982866cd
parent 0ce6da5444
commit 62982866cd
2 changed files with 15 additions and 11 deletions
--- a/src/features/all_cleaning_individual/straw/main.py
+++ b/src/features/all_cleaning_individual/straw/main.py
@@ -22,8 +22,6 @@ def straw_cleaning(sensor_data_files, provider):
        target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
        features = features[features['phone_esm_straw_' + target].notna()].reset_index()

-    test_cols = [col for col in features.columns if 'phone_calls' in col or 'phone_messages' in col]
-
    # TODO: reorder the cleaning steps so it makes sense for the analysis
    # TODO: add conditions that differentiates cleaning steps for standardized and nonstandardized features, for this
    # the snakemake rules will also have to come with additional parameter (in rules/features.smk)
@@ -69,7 +67,7 @@ def straw_cleaning(sensor_data_files, provider):
    # (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved)
    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]

-    # (4) REMOVE COLS WHERE VARIANCE IS 0
+    # (4) REMOVE COLS WHERE VARIANCE IS 0 TODO: check for local_segment columns
    if provider["COLS_VAR_THRESHOLD"]:
        features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
    
@@ -104,12 +102,10 @@ def straw_cleaning(sensor_data_files, provider):
    ## STANDARDIZATION - should it happen before or after kNN imputation?
    # TODO: check if there are additional columns that need to be excluded from the standardization
    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
-    excluded_columns += [col for col in features.columns if "level_1" in col]
-    
    features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)])

    # KNN IMPUTATION
-    impute_cols = [col for col in features.columns if col not in ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']]
+    impute_cols = [col for col in features.columns if col not in excluded_columns]
    features[impute_cols] = impute(features[impute_cols], method="knn")


--- a/src/features/phone_wifi_visible/rapids/main.R
+++ b/src/features/phone_wifi_visible/rapids/main.R
@@ -9,9 +9,11 @@ compute_wifi_feature <- function(data, feature, time_segment){
              "countscans" = data %>% summarise(!!feature := n()),
              "uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
    return(data)
+
   } else if(feature == "countscansmostuniquedevice"){
     # Get the most scanned device
-    mostuniquedevice <- data %>% 
+    mostuniquedevice <- data %>%
+      filter(bssid != "") %>% 
      group_by(bssid) %>% 
      mutate(N=n()) %>% 
      ungroup() %>%
@@ -21,12 +23,19 @@ compute_wifi_feature <- function(data, feature, time_segment){

    data <- data %>% filter_data_by_segment(time_segment)

+    print(data %>% 
+             filter(bssid == mostuniquedevice) %>%
+             group_by(local_segment) %>% 
+             summarise(!!feature := n()))
+
+    raise
+
    return(data %>% 
             filter(bssid == mostuniquedevice) %>%
             group_by(local_segment) %>% 
-             summarise(!!feature := n()) %>%
-             mutate_all(~replace(., is.na(.), 0))
+             summarise(!!feature := n())
    )
+
  }
 }

@@ -46,7 +55,6 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
    feature <- compute_wifi_feature(wifi_data, feature_name, time_segment)
    features <- merge(features, feature, by="local_segment", all = TRUE)
  }
-
-  features <- features %>% mutate_all(~replace(., is.na(.), 0))
+  # features <- features %>% mutate_all(~replace(., is.na(.), 0))
  return(features)
 }