diff --git a/src/features/all_cleaning_individual/straw/main.py b/src/features/all_cleaning_individual/straw/main.py index 5a2bca7d..380301e5 100644 --- a/src/features/all_cleaning_individual/straw/main.py +++ b/src/features/all_cleaning_individual/straw/main.py @@ -22,8 +22,6 @@ def straw_cleaning(sensor_data_files, provider): target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config features = features[features['phone_esm_straw_' + target].notna()].reset_index() - test_cols = [col for col in features.columns if 'phone_calls' in col or 'phone_messages' in col] - # TODO: reorder the cleaning steps so it makes sense for the analysis # TODO: add conditions that differentiates cleaning steps for standardized and nonstandardized features, for this # the snakemake rules will also have to come with additional parameter (in rules/features.smk) @@ -69,7 +67,7 @@ def straw_cleaning(sensor_data_files, provider): # (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved) features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]] - # (4) REMOVE COLS WHERE VARIANCE IS 0 + # (4) REMOVE COLS WHERE VARIANCE IS 0 TODO: check this for the local_segment columns if provider["COLS_VAR_THRESHOLD"]: features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True) @@ -104,12 +102,10 @@ def straw_cleaning(sensor_data_files, provider): ## STANDARDIZATION - should it happen before or after kNN imputation? 
# TODO: check if there are additional columns that need to be excluded from the standardization excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime'] - excluded_columns += [col for col in features.columns if "level_1" in col] - features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)]) # KNN IMPUTATION - impute_cols = [col for col in features.columns if col not in ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']] + impute_cols = [col for col in features.columns if col not in excluded_columns] features[impute_cols] = impute(features[impute_cols], method="knn") diff --git a/src/features/phone_wifi_visible/rapids/main.R b/src/features/phone_wifi_visible/rapids/main.R index fc914397..03221d3a 100644 --- a/src/features/phone_wifi_visible/rapids/main.R +++ b/src/features/phone_wifi_visible/rapids/main.R @@ -9,9 +9,11 @@ compute_wifi_feature <- function(data, feature, time_segment){ "countscans" = data %>% summarise(!!feature := n()), "uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid))) return(data) + } else if(feature == "countscansmostuniquedevice"){ # Get the most scanned device - mostuniquedevice <- data %>% + mostuniquedevice <- data %>% + filter(bssid != "") %>% group_by(bssid) %>% mutate(N=n()) %>% ungroup() %>% @@ -21,12 +23,19 @@ compute_wifi_feature <- function(data, feature, time_segment){ data <- data %>% filter_data_by_segment(time_segment) + print(data %>% + filter(bssid == mostuniquedevice) %>% + group_by(local_segment) %>% + summarise(!!feature := n())) + + raise + return(data %>% filter(bssid == mostuniquedevice) %>% group_by(local_segment) %>% - summarise(!!feature := n()) %>% - mutate_all(~replace(., is.na(.), 0)) + summarise(!!feature := n()) ) + } } @@ -46,7 +55,6 @@ rapids_features <- function(sensor_data_files, 
time_segment, provider){ feature <- compute_wifi_feature(wifi_data, feature_name, time_segment) features <- merge(features, feature, by="local_segment", all = TRUE) } - - features <- features %>% mutate_all(~replace(., is.na(.), 0)) + # features <- features %>% mutate_all(~replace(., is.na(.), 0)) return(features) }