Phone wifi visible inspection (WIP)
parent
0ce6da5444
commit
62982866cd
|
@ -22,8 +22,6 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
|
target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
|
||||||
features = features[features['phone_esm_straw_' + target].notna()].reset_index()
|
features = features[features['phone_esm_straw_' + target].notna()].reset_index()
|
||||||
|
|
||||||
test_cols = [col for col in features.columns if 'phone_calls' in col or 'phone_messages' in col]
|
|
||||||
|
|
||||||
# TODO: reorder the cleaning steps so it makes sense for the analysis
|
# TODO: reorder the cleaning steps so it makes sense for the analysis
|
||||||
# TODO: add conditions that differentiate cleaning steps for standardized and nonstandardized features, for this
|
# TODO: add conditions that differentiate cleaning steps for standardized and nonstandardized features, for this
|
||||||
# the snakemake rules will also have to come with an additional parameter (in rules/features.smk)
|
# the snakemake rules will also have to come with an additional parameter (in rules/features.smk)
|
||||||
|
@ -69,7 +67,7 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
# (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved)
|
# (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved)
|
||||||
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
|
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
|
||||||
|
|
||||||
# (4) REMOVE COLS WHERE VARIANCE IS 0
|
# (4) REMOVE COLS WHERE VARIANCE IS 0 TODO: check this for the local_segment columns
|
||||||
if provider["COLS_VAR_THRESHOLD"]:
|
if provider["COLS_VAR_THRESHOLD"]:
|
||||||
features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
|
features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
|
||||||
|
|
||||||
|
@ -104,12 +102,10 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
## STANDARDIZATION - should it happen before or after kNN imputation?
|
## STANDARDIZATION - should it happen before or after kNN imputation?
|
||||||
# TODO: check if there are additional columns that need to be excluded from the standardization
|
# TODO: check if there are additional columns that need to be excluded from the standardization
|
||||||
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
|
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
|
||||||
excluded_columns += [col for col in features.columns if "level_1" in col]
|
|
||||||
|
|
||||||
features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)])
|
features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)])
|
||||||
|
|
||||||
# KNN IMPUTATION
|
# KNN IMPUTATION
|
||||||
impute_cols = [col for col in features.columns if col not in ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']]
|
impute_cols = [col for col in features.columns if col not in excluded_columns]
|
||||||
features[impute_cols] = impute(features[impute_cols], method="knn")
|
features[impute_cols] = impute(features[impute_cols], method="knn")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -9,9 +9,11 @@ compute_wifi_feature <- function(data, feature, time_segment){
|
||||||
"countscans" = data %>% summarise(!!feature := n()),
|
"countscans" = data %>% summarise(!!feature := n()),
|
||||||
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
|
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
|
||||||
return(data)
|
return(data)
|
||||||
|
|
||||||
} else if(feature == "countscansmostuniquedevice"){
|
} else if(feature == "countscansmostuniquedevice"){
|
||||||
# Get the most scanned device
|
# Get the most scanned device
|
||||||
mostuniquedevice <- data %>%
|
mostuniquedevice <- data %>%
|
||||||
|
filter(bssid != "") %>%
|
||||||
group_by(bssid) %>%
|
group_by(bssid) %>%
|
||||||
mutate(N=n()) %>%
|
mutate(N=n()) %>%
|
||||||
ungroup() %>%
|
ungroup() %>%
|
||||||
|
@ -21,12 +23,19 @@ compute_wifi_feature <- function(data, feature, time_segment){
|
||||||
|
|
||||||
data <- data %>% filter_data_by_segment(time_segment)
|
data <- data %>% filter_data_by_segment(time_segment)
|
||||||
|
|
||||||
|
print(data %>%
|
||||||
|
filter(bssid == mostuniquedevice) %>%
|
||||||
|
group_by(local_segment) %>%
|
||||||
|
summarise(!!feature := n()))
|
||||||
|
|
||||||
|
raise
|
||||||
|
|
||||||
return(data %>%
|
return(data %>%
|
||||||
filter(bssid == mostuniquedevice) %>%
|
filter(bssid == mostuniquedevice) %>%
|
||||||
group_by(local_segment) %>%
|
group_by(local_segment) %>%
|
||||||
summarise(!!feature := n()) %>%
|
summarise(!!feature := n())
|
||||||
mutate_all(~replace(., is.na(.), 0))
|
|
||||||
)
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -46,7 +55,6 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
||||||
feature <- compute_wifi_feature(wifi_data, feature_name, time_segment)
|
feature <- compute_wifi_feature(wifi_data, feature_name, time_segment)
|
||||||
features <- merge(features, feature, by="local_segment", all = TRUE)
|
features <- merge(features, feature, by="local_segment", all = TRUE)
|
||||||
}
|
}
|
||||||
|
# features <- features %>% mutate_all(~replace(., is.na(.), 0))
|
||||||
features <- features %>% mutate_all(~replace(., is.na(.), 0))
|
|
||||||
return(features)
|
return(features)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue