From eda58d9d2c62b30536b03f43c0437a2cebcecd9a Mon Sep 17 00:00:00 2001 From: JulioV Date: Mon, 31 Aug 2020 18:51:06 -0400 Subject: [PATCH] Migrate wifi to new file structure --- Snakefile | 24 ++++++++++------- config.yaml | 10 ++++--- rules/features.smk | 27 ++++++++++++++----- rules/preprocessing.smk | 8 ++++++ src/data/join_visible_and_connected_wifi.R | 18 +++++++++++++ src/features/utils/utils.R | 2 +- .../wifi/{wifi_base.R => rapids/main.R} | 27 ++++++++----------- src/features/wifi/wifi_entry.R | 13 +++++++++ src/features/wifi/wifi_entry.py | 18 +++++++++++++ 9 files changed, 109 insertions(+), 38 deletions(-) create mode 100644 src/data/join_visible_and_connected_wifi.R rename src/features/wifi/{wifi_base.R => rapids/main.R} (56%) create mode 100644 src/features/wifi/wifi_entry.R create mode 100644 src/features/wifi/wifi_entry.py diff --git a/Snakefile b/Snakefile index 4b4542f9..b7364f84 100644 --- a/Snakefile +++ b/Snakefile @@ -99,16 +99,20 @@ if config["APPLICATIONS_FOREGROUND"]["COMPUTE"]: files_to_compute.extend(expand("data/interim/{pid}/{sensor}_with_datetime_with_genre.csv", pid=config["PIDS"], sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])) files_to_compute.extend(expand("data/processed/{pid}/applications_foreground_{day_segment}.csv", pid = config["PIDS"], day_segment = config["APPLICATIONS_FOREGROUND"]["DAY_SEGMENTS"])) -if config["WIFI"]["COMPUTE"]: - if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0: - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/processed/{pid}/wifi_{day_segment}.csv", pid = config["PIDS"], day_segment = config["WIFI"]["DAY_SEGMENTS"])) - - if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0: - 
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])) - files_to_compute.extend(expand("data/processed/{pid}/wifi_{day_segment}.csv", pid = config["PIDS"], day_segment = config["WIFI"]["DAY_SEGMENTS"])) +for provider in config["WIFI"]["PROVIDERS"].keys(): + if config["WIFI"]["PROVIDERS"][provider]["COMPUTE"]: + if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0: + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])) + files_to_compute.extend(expand("data/raw/{pid}/{sensor_key}_with_datetime_visibleandconnected.csv", pid=config["PIDS"], sensor_key="WIFI".lower())) + files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["WIFI"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="WIFI".lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="WIFI".lower())) + if len(config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"]) > 0: + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["WIFI"]["DB_TABLE"]["CONNECTED_ACCESS_POINTS"])) + files_to_compute.extend(expand("data/raw/{pid}/{sensor_key}_with_datetime_visibleandconnected.csv", pid=config["PIDS"], 
sensor_key="WIFI".lower())) + files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["WIFI"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="WIFI".lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="WIFI".lower())) if config["HEARTRATE"]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["HEARTRATE"]["DB_TABLE"])) diff --git a/config.yaml b/config.yaml index 2483aa77..5011e834 100644 --- a/config.yaml +++ b/config.yaml @@ -106,7 +106,6 @@ BLUETOOTH: PROVIDERS: RAPIDS: COMPUTE: False - DAY_SEGMENTS: *day_segments FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] SRC_FOLDER: "rapids" # inside src/features/bluetooth SRC_LANGUAGE: "r" @@ -197,12 +196,15 @@ SLEEP: SUMMARY_FEATURES: ["sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgefficiency", "countepisode"] WIFI: - COMPUTE: False DB_TABLE: VISIBLE_ACCESS_POINTS: "wifi" # if you only have a CONNECTED_ACCESS_POINTS table, set this value to "" CONNECTED_ACCESS_POINTS: "sensor_wifi" # if you only have a VISIBLE_ACCESS_POINTS table, set this value to "" - DAY_SEGMENTS: *day_segments - FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] + SRC_FOLDER: "rapids" # inside src/features/wifi + SRC_LANGUAGE: "r" CONVERSATION: COMPUTE: False diff --git a/rules/features.smk b/rules/features.smk index 8e7625af..22bfd035 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -227,16 +227,29 @@ rule applications_foreground_features: script: "../src/features/applications_foreground_features.py" -rule wifi_features: - input: - 
expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["WIFI"]["DB_TABLE"]), - day_segments = expand("data/interim/{sensor}_day_segments.csv", sensor=config["WIFI"]["DB_TABLE"]) +rule wifi_r_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower()), + day_segments_labels = "data/interim/day_segments_labels.csv" params: - features = config["WIFI"]["FEATURES"] + provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}" output: - "data/processed/{pid}/wifi_features.csv" + "data/interim/{pid}/wifi_features/wifi_r_{provider_key}.csv" script: - "../src/features/wifi_features.R" + "../src/features/wifi/wifi_entry.R" + +rule wifi_python_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower()), + day_segments_labels = "data/interim/day_segments_labels.csv" + params: + provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}" + output: + "data/interim/{pid}/wifi_features/wifi_python_{provider_key}.csv" + script: + "../src/features/wifi/wifi_entry.py" rule fitbit_heartrate_features: input: diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index c8f98132..e85cbdb4 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -172,3 +172,11 @@ rule fitbit_sleep_with_datetime: intraday_data = "data/raw/{pid}/fitbit_sleep_intraday_with_datetime.csv" script: "../src/data/fitbit_readable_datetime.py" + +rule join_wifi_tables: + input: + unpack(optional_wifi_input) + output: + "data/raw/{pid}/wifi_with_datetime_visibleandconnected.csv" + script: + "../src/data/join_visible_and_connected_wifi.R" \ No newline at end of file diff --git a/src/data/join_visible_and_connected_wifi.R b/src/data/join_visible_and_connected_wifi.R new file mode 100644 index 00000000..87c7a511 --- 
/dev/null +++ b/src/data/join_visible_and_connected_wifi.R @@ -0,0 +1,18 @@ +source("renv/activate.R") +library("dplyr") + +if(!is.null(snakemake@input[["visible_access_points"]]) && is.null(snakemake@input[["connected_access_points"]])){ + wifi_data <- read.csv(snakemake@input[["visible_access_points"]], stringsAsFactors = FALSE) + wifi_data <- wifi_data %>% mutate(connected = 0) +} else if(is.null(snakemake@input[["visible_access_points"]]) && !is.null(snakemake@input[["connected_access_points"]])){ + wifi_data <- read.csv(snakemake@input[["connected_access_points"]], stringsAsFactors = FALSE) + wifi_data <- wifi_data %>% mutate(connected = 1) +} else if(!is.null(snakemake@input[["visible_access_points"]]) && !is.null(snakemake@input[["connected_access_points"]])){ + visible_access_points <- read.csv(snakemake@input[["visible_access_points"]], stringsAsFactors = FALSE) + visible_access_points <- visible_access_points %>% mutate(connected = 0) + connected_access_points <- read.csv(snakemake@input[["connected_access_points"]], stringsAsFactors = FALSE) + connected_access_points <- connected_access_points %>% mutate(connected = 1) + wifi_data <- bind_rows(visible_access_points, connected_access_points) %>% arrange(timestamp) +} + +write.csv(wifi_data, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file diff --git a/src/features/utils/utils.R b/src/features/utils/utils.R index 645ef78c..922b79e7 100644 --- a/src/features/utils/utils.R +++ b/src/features/utils/utils.R @@ -20,7 +20,7 @@ fetch_provider_features <- function(provider, provider_key, config_key, sensor_d day_segments_labels <- read.csv(day_segments_file, stringsAsFactors = FALSE) if(!"FEATURES" %in% names(provider)) - stop(paste0("Provider config[CALLS][PROVIDERS][", provider_key,"] is missing a FEATURES attribute in config.yaml")) + stop(paste0("Provider config[", config_key,"][PROVIDERS][", provider_key,"] is missing a FEATURES attribute in config.yaml")) if(provider[["COMPUTE"]] == 
TRUE){ code_path <- paste0("src/features/", config_key,"/", provider[["SRC_FOLDER"]], "/main.R") diff --git a/src/features/wifi/wifi_base.R b/src/features/wifi/rapids/main.R similarity index 56% rename from src/features/wifi/wifi_base.R rename to src/features/wifi/rapids/main.R index 67d5cf08..ab61e5d5 100644 --- a/src/features/wifi/wifi_base.R +++ b/src/features/wifi/rapids/main.R @@ -1,18 +1,12 @@ library(dplyr) -filter_by_day_segment <- function(data, day_segment) { - if(day_segment != "daily") - data <- data %>% filter(local_day_segment == day_segment) - - return(data %>% group_by(local_date)) -} - compute_wifi_feature <- function(data, feature, day_segment){ - data <- data %>% filter_by_day_segment(day_segment) + data <- data %>% filter_data_by_segment(day_segment) if(feature %in% c("countscans", "uniquedevices")){ + data <- data %>% group_by(local_segment) data <- switch(feature, - "countscans" = data %>% summarise(!!paste("wifi", day_segment, feature, sep = "_") := n()), - "uniquedevices" = data %>% summarise(!!paste("wifi", day_segment, feature, sep = "_") := n_distinct(bssid))) + "countscans" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n()), + "uniquedevices" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n_distinct(bssid))) return(data) } else if(feature == "countscansmostuniquedevice"){ # Get the most scanned device @@ -25,15 +19,16 @@ compute_wifi_feature <- function(data, feature, day_segment){ pull(bssid) return(data %>% filter(bssid == mostuniquedevice) %>% - group_by(local_date) %>% - summarise(!!paste("wifi", day_segment, feature, sep = "_") := n()) %>% + group_by(local_segment) %>% + summarise(!!paste("wifi_rapids", feature, sep = "_") := n()) %>% replace(is.na(.), 0)) } } -base_wifi_features <- function(wifi_data, day_segment, requested_features){ +rapids_features <- function(wifi_data, day_segment, provider){ + requested_features <- provider[["FEATURES"]] # Output dataframe - features = 
data.frame(local_date = character(), stringsAsFactors = FALSE) + features = data.frame(local_segment = character(), stringsAsFactors = FALSE) # The name of the features this function can compute base_features_names <- c("countscans", "uniquedevices", "countscansmostuniquedevice") @@ -42,8 +37,8 @@ base_wifi_features <- function(wifi_data, day_segment, requested_features){ features_to_compute <- intersect(base_features_names, requested_features) for(feature_name in features_to_compute){ - feature <- compute_wifi_feature(wifi_data, feature_name, day_segment) - features <- merge(features, feature, by="local_date", all = TRUE) + feature <- compute_wifi_feature(wifi_data, feature_name, day_segment) + features <- merge(features, feature, by="local_segment", all = TRUE) } return(features) diff --git a/src/features/wifi/wifi_entry.R b/src/features/wifi/wifi_entry.R new file mode 100644 index 00000000..1a825360 --- /dev/null +++ b/src/features/wifi/wifi_entry.R @@ -0,0 +1,13 @@ +source("renv/activate.R") +source("src/features/utils/utils.R") +library("dplyr") +library("tidyr") + +sensor_data_file <- snakemake@input[["sensor_data"]] +day_segments_file <- snakemake@input[["day_segments_labels"]] +provider <- snakemake@params["provider"][["provider"]] +provider_key <- snakemake@params["provider_key"] + +sensor_features <- fetch_provider_features(provider, provider_key, "wifi", sensor_data_file, day_segments_file) + +write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/wifi/wifi_entry.py b/src/features/wifi/wifi_entry.py new file mode 100644 index 00000000..ffe8bb2f --- /dev/null +++ b/src/features/wifi/wifi_entry.py @@ -0,0 +1,18 @@ +import pandas as pd +from importlib import import_module, util +from pathlib import Path + +# import fetch_provider_features from src/features/utils/utils.py +spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) +mod = util.module_from_spec(spec) 
+spec.loader.exec_module(mod) +fetch_provider_features = getattr(mod, "fetch_provider_features") + +sensor_data_file = snakemake.input["sensor_data"][0] +day_segments_file = snakemake.input["day_segments_labels"] +provider = snakemake.params["provider"] +provider_key = snakemake.params["provider_key"] + +sensor_features = fetch_provider_features(provider, provider_key, "wifi", sensor_data_file, day_segments_file) + +sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file