Phone wifi visible inspection (WIP)

notes
Primoz 2022-09-16 13:24:21 +00:00
parent 0ce6da5444
commit 62982866cd
2 changed files with 15 additions and 11 deletions

View File

@ -22,8 +22,6 @@ def straw_cleaning(sensor_data_files, provider):
target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
features = features[features['phone_esm_straw_' + target].notna()].reset_index() features = features[features['phone_esm_straw_' + target].notna()].reset_index()
test_cols = [col for col in features.columns if 'phone_calls' in col or 'phone_messages' in col]
# TODO: reorder the cleaning steps so it makes sense for the analysis # TODO: reorder the cleaning steps so it makes sense for the analysis
# TODO: add conditions that differentiates cleaning steps for standardized and nonstandardized features, for this # TODO: add conditions that differentiates cleaning steps for standardized and nonstandardized features, for this
# the snakemake rules will also have to come with additional parameter (in rules/features.smk) # the snakemake rules will also have to come with additional parameter (in rules/features.smk)
@ -69,7 +67,7 @@ def straw_cleaning(sensor_data_files, provider):
# (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved) # (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved)
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]] features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
# (4) REMOVE COLS WHERE VARIANCE IS 0 # (4) REMOVE COLS WHERE VARIANCE IS 0 TODO: check this for the local_segment columns
if provider["COLS_VAR_THRESHOLD"]: if provider["COLS_VAR_THRESHOLD"]:
features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True) features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
@ -104,12 +102,10 @@ def straw_cleaning(sensor_data_files, provider):
## STANDARDIZATION - should it happen before or after kNN imputation? ## STANDARDIZATION - should it happen before or after kNN imputation?
# TODO: check if there are additional columns that need to be excluded from the standardization # TODO: check if there are additional columns that need to be excluded from the standardization
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime'] excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
excluded_columns += [col for col in features.columns if "level_1" in col]
features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)]) features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)])
# KNN IMPUTATION # KNN IMPUTATION
impute_cols = [col for col in features.columns if col not in ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']] impute_cols = [col for col in features.columns if col not in excluded_columns]
features[impute_cols] = impute(features[impute_cols], method="knn") features[impute_cols] = impute(features[impute_cols], method="knn")

View File

@ -9,9 +9,11 @@ compute_wifi_feature <- function(data, feature, time_segment){
"countscans" = data %>% summarise(!!feature := n()), "countscans" = data %>% summarise(!!feature := n()),
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid))) "uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
return(data) return(data)
} else if(feature == "countscansmostuniquedevice"){ } else if(feature == "countscansmostuniquedevice"){
# Get the most scanned device # Get the most scanned device
mostuniquedevice <- data %>% mostuniquedevice <- data %>%
filter(bssid != "") %>%
group_by(bssid) %>% group_by(bssid) %>%
mutate(N=n()) %>% mutate(N=n()) %>%
ungroup() %>% ungroup() %>%
@ -21,12 +23,19 @@ compute_wifi_feature <- function(data, feature, time_segment){
data <- data %>% filter_data_by_segment(time_segment) data <- data %>% filter_data_by_segment(time_segment)
print(data %>%
filter(bssid == mostuniquedevice) %>%
group_by(local_segment) %>%
summarise(!!feature := n()))
raise
return(data %>% return(data %>%
filter(bssid == mostuniquedevice) %>% filter(bssid == mostuniquedevice) %>%
group_by(local_segment) %>% group_by(local_segment) %>%
summarise(!!feature := n()) %>% summarise(!!feature := n())
mutate_all(~replace(., is.na(.), 0))
) )
} }
} }
@ -46,7 +55,6 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
feature <- compute_wifi_feature(wifi_data, feature_name, time_segment) feature <- compute_wifi_feature(wifi_data, feature_name, time_segment)
features <- merge(features, feature, by="local_segment", all = TRUE) features <- merge(features, feature, by="local_segment", all = TRUE)
} }
# features <- features %>% mutate_all(~replace(., is.na(.), 0))
features <- features %>% mutate_all(~replace(., is.na(.), 0))
return(features) return(features)
} }