Phone wifi visible inspection (WIP)

notes
Primoz 2022-09-16 13:24:21 +00:00
parent 0ce6da5444
commit 62982866cd
2 changed files with 15 additions and 11 deletions

View File

@ -22,8 +22,6 @@ def straw_cleaning(sensor_data_files, provider):
target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
features = features[features['phone_esm_straw_' + target].notna()].reset_index()
test_cols = [col for col in features.columns if 'phone_calls' in col or 'phone_messages' in col]
# TODO: reorder the cleaning steps so it makes sense for the analysis
# TODO: add conditions that differentiate the cleaning steps for standardized and nonstandardized features; for this,
# the snakemake rules will also need an additional parameter (in rules/features.smk)
@ -69,7 +67,7 @@ def straw_cleaning(sensor_data_files, provider):
# (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (the comparison should be <= instead of < if even all-NaN columns must be preserved)
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
# (4) REMOVE COLS WHERE VARIANCE IS 0
# (4) REMOVE COLS WHERE VARIANCE IS 0 TODO: check this for the local_segment columns
if provider["COLS_VAR_THRESHOLD"]:
features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
@ -104,12 +102,10 @@ def straw_cleaning(sensor_data_files, provider):
## STANDARDIZATION - should it happen before or after kNN imputation?
# TODO: check if there are additional columns that need to be excluded from the standardization
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
excluded_columns += [col for col in features.columns if "level_1" in col]
features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)])
# KNN IMPUTATION
impute_cols = [col for col in features.columns if col not in ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']]
impute_cols = [col for col in features.columns if col not in excluded_columns]
features[impute_cols] = impute(features[impute_cols], method="knn")

View File

@ -9,9 +9,11 @@ compute_wifi_feature <- function(data, feature, time_segment){
"countscans" = data %>% summarise(!!feature := n()),
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
return(data)
} else if(feature == "countscansmostuniquedevice"){
# Get the most scanned device
mostuniquedevice <- data %>%
filter(bssid != "") %>%
group_by(bssid) %>%
mutate(N=n()) %>%
ungroup() %>%
@ -21,12 +23,19 @@ compute_wifi_feature <- function(data, feature, time_segment){
data <- data %>% filter_data_by_segment(time_segment)
print(data %>%
filter(bssid == mostuniquedevice) %>%
group_by(local_segment) %>%
summarise(!!feature := n()))
raise
return(data %>%
filter(bssid == mostuniquedevice) %>%
group_by(local_segment) %>%
summarise(!!feature := n()) %>%
mutate_all(~replace(., is.na(.), 0))
summarise(!!feature := n())
)
}
}
@ -46,7 +55,6 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
feature <- compute_wifi_feature(wifi_data, feature_name, time_segment)
features <- merge(features, feature, by="local_segment", all = TRUE)
}
features <- features %>% mutate_all(~replace(., is.na(.), 0))
# features <- features %>% mutate_all(~replace(., is.na(.), 0))
return(features)
}