Phone wifi visible inspection (WIP)
parent
0ce6da5444
commit
62982866cd
|
@ -22,8 +22,6 @@ def straw_cleaning(sensor_data_files, provider):
|
|||
target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
|
||||
features = features[features['phone_esm_straw_' + target].notna()].reset_index()
|
||||
|
||||
test_cols = [col for col in features.columns if 'phone_calls' in col or 'phone_messages' in col]
|
||||
|
||||
# TODO: reorder the cleaning steps so it makes sense for the analysis
|
||||
# TODO: add conditions that differentiate the cleaning steps for standardized and nonstandardized features; for this,
|
||||
# the snakemake rules will also have to come with additional parameter (in rules/features.smk)
|
||||
|
@ -69,7 +67,7 @@ def straw_cleaning(sensor_data_files, provider):
|
|||
# (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (use <= instead of < if even all-NaN columns must be preserved)
|
||||
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
|
||||
|
||||
# (4) REMOVE COLS WHERE VARIANCE IS 0
|
||||
# (4) REMOVE COLS WHERE VARIANCE IS 0 TODO: check this for the local_segment columns
|
||||
if provider["COLS_VAR_THRESHOLD"]:
|
||||
features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
|
||||
|
||||
|
@ -104,12 +102,10 @@ def straw_cleaning(sensor_data_files, provider):
|
|||
## STANDARDIZATION - should it happen before or after kNN imputation?
|
||||
# TODO: check if there are additional columns that need to be excluded from the standardization
|
||||
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
|
||||
excluded_columns += [col for col in features.columns if "level_1" in col]
|
||||
|
||||
features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)])
|
||||
|
||||
# KNN IMPUTATION
|
||||
impute_cols = [col for col in features.columns if col not in ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']]
|
||||
impute_cols = [col for col in features.columns if col not in excluded_columns]
|
||||
features[impute_cols] = impute(features[impute_cols], method="knn")
|
||||
|
||||
|
||||
|
|
|
@ -9,9 +9,11 @@ compute_wifi_feature <- function(data, feature, time_segment){
|
|||
"countscans" = data %>% summarise(!!feature := n()),
|
||||
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
|
||||
return(data)
|
||||
|
||||
} else if(feature == "countscansmostuniquedevice"){
|
||||
# Get the most scanned device
|
||||
mostuniquedevice <- data %>%
|
||||
mostuniquedevice <- data %>%
|
||||
filter(bssid != "") %>%
|
||||
group_by(bssid) %>%
|
||||
mutate(N=n()) %>%
|
||||
ungroup() %>%
|
||||
|
@ -21,12 +23,19 @@ compute_wifi_feature <- function(data, feature, time_segment){
|
|||
|
||||
data <- data %>% filter_data_by_segment(time_segment)
|
||||
|
||||
print(data %>%
|
||||
filter(bssid == mostuniquedevice) %>%
|
||||
group_by(local_segment) %>%
|
||||
summarise(!!feature := n()))
|
||||
|
||||
raise
|
||||
|
||||
return(data %>%
|
||||
filter(bssid == mostuniquedevice) %>%
|
||||
group_by(local_segment) %>%
|
||||
summarise(!!feature := n()) %>%
|
||||
mutate_all(~replace(., is.na(.), 0))
|
||||
summarise(!!feature := n())
|
||||
)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -46,7 +55,6 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
|||
feature <- compute_wifi_feature(wifi_data, feature_name, time_segment)
|
||||
features <- merge(features, feature, by="local_segment", all = TRUE)
|
||||
}
|
||||
|
||||
features <- features %>% mutate_all(~replace(., is.na(.), 0))
|
||||
# features <- features %>% mutate_all(~replace(., is.na(.), 0))
|
||||
return(features)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue