Update R feature scripts to add sensor and provider names automatically

pull/103/head
JulioV 2020-11-25 14:49:42 -05:00
parent f02ca2624d
commit ced3305ddb
12 changed files with 69 additions and 82 deletions

View File

@@ -66,10 +66,10 @@ for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys():
for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]:
if "PHONE_SCREEN" in config["PHONE_DATA_YIELD"]["SENSORS"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
else:
raise ValueError("Error: Add PHONE_SCREEN (and as many phone sensors as you have in your database) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data)")
# if "PHONE_SCREEN" in config["PHONE_DATA_YIELD"]["SENSORS"]:# not used for now because we took episodepersensedminutes out of the list of supported features
# files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"]))
# else:
# raise ValueError("Error: Add PHONE_SCREEN (and as many PHONE_SENSORS as you have in your database) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data)")
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime_unified.csv", pid=config["PIDS"]))
@@ -127,9 +127,9 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
if config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]:
if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] == "FUSED_RESAMPLED":
if "PHONE_LOCATIONS" in config["PHONE_DATA_YIELD"]["SENSORS"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"]))
else:
raise ValueError("Error: Add PHONE_LOCATIONS (and as many SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"]))

View File

@@ -55,10 +55,10 @@ DEVICE_DATA:
################################################################################
PHONE_DATA_YIELD:
SENSORS: [PHONE_MESSAGES, PHONE_CALLS, PHONE_ACCELEROMETER]
SENSORS: []
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1 representing the number of minutes with at least
SRC_LANGUAGE: "r"

View File

@@ -139,7 +139,7 @@ The code to extract your behavioral features should be implemented in your provi
- One row per day segment instance (e.g. 14 for our `p01`'s `my_days` example)
- The `local_segment` column added by `filter_data_by_segment()`
- One column per feature. Your feature columns should be named `SENSOR_PROVIDER_FEATURE`, for example `accelerometer_vega_feature1`
- One column per feature. By convention the name of your features should only contain letters or numbers (`feature1`). RAPIDS will automatically add the right sensor and provider prefix (`accelerometer_vega_`)
??? example "`PHONE_ACCELEROMETER` Provider Example"
For your reference, this is a short example of our own provider (`RAPIDS`) for `PHONE_ACCELEROMETER` that computes five acceleration features

View File

@@ -80,6 +80,8 @@ rule phone_readable_datetime:
rule phone_yielded_timestamps:
input:
all_sensors = expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor = map(str.lower, config["PHONE_DATA_YIELD"]["SENSORS"]))
params:
sensors = config["PHONE_DATA_YIELD"]["SENSORS"] # not used but needed so the rule is triggered if this array changes
output:
"data/interim/{pid}/phone_yielded_timestamps.csv"
script:
@@ -99,18 +101,6 @@ rule phone_yielded_timestamps_with_datetime:
script:
"../src/data/readable_datetime.R"
rule phone_valid_sensed_days:
input:
phone_sensed_bins = "data/interim/{pid}/phone_sensed_bins.csv"
params:
min_valid_hours_per_day = "{min_valid_hours_per_day}",
min_valid_bins_per_hour = "{min_valid_bins_per_hour}"
output:
"data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"
script:
"../src/data/phone_valid_sensed_days.R"
rule unify_ios_android:
input:
sensor_data = "data/raw/{pid}/{sensor}_with_datetime.csv",
@@ -125,7 +115,7 @@ rule unify_ios_android:
rule process_phone_locations_types:
input:
locations = "data/raw/{pid}/phone_locations_raw.csv",
phone_sensed_timestamps = "data/interim/{pid}/phone_sensed_timestamps.csv",
phone_sensed_timestamps = "data/interim/{pid}/phone_yielded_timestamps.csv",
params:
consecutive_threshold = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"],
time_since_valid_location = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"],

View File

@@ -6,8 +6,8 @@ compute_bluetooth_feature <- function(data, feature, day_segment){
if(feature %in% c("countscans", "uniquedevices")){
data <- data %>% group_by(local_segment)
data <- switch(feature,
"countscans" = data %>% summarise(!!paste("bluetooth_rapids", feature, sep = "_") := n()),
"uniquedevices" = data %>% summarise(!!paste("bluetooth_rapids", feature, sep = "_") := n_distinct(bt_address)))
"countscans" = data %>% summarise(!!feature := n()),
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bt_address)))
return(data)
} else if(feature == "countscansmostuniquedevice"){
# Get the most scanned device
@@ -22,7 +22,7 @@ compute_bluetooth_feature <- function(data, feature, day_segment){
return(data %>%
filter(bt_address == mostuniquedevice) %>%
group_by(local_segment) %>%
summarise(!!paste("bluetooth_rapids", feature, sep = "_") := n()) %>%
summarise(!!feature := n()) %>%
replace(is.na(.), 0))
}
}
@@ -46,7 +46,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){
features <- merge(features, feature, by="local_segment", all = TRUE)
}
features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0)))
features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0))) %>% select(-local_segment)
return(features)
}

View File

@@ -20,7 +20,7 @@ call_features_of_type <- function(calls, call_type, day_segment, requested_featu
if(length(features_to_compute) == 0)
return(features)
if(nrow(calls) < 1)
return(cbind(features, read.csv(text = paste(paste("calls_rapids", call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
return(cbind(features, read.csv(text = paste(paste(call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
for(feature_name in features_to_compute){
if(feature_name == "countmostfrequentcontact"){
@@ -34,24 +34,24 @@ call_features_of_type <- function(calls, call_type, day_segment, requested_featu
pull(trace)
feature <- calls %>%
group_by(local_segment) %>%
summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := sum(trace == mostfrequentcontact))
summarise(!!paste(call_type, feature_name, sep = "_") := sum(trace == mostfrequentcontact))
features <- merge(features, feature, by="local_segment", all = TRUE)
} else {
feature <- calls %>%
group_by(local_segment)
feature <- switch(feature_name,
"count" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := n()),
"distinctcontacts" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := n_distinct(trace)),
"meanduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := mean(call_duration)),
"sumduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := sum(call_duration)),
"minduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := min(call_duration)),
"maxduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := max(call_duration)),
"stdduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := sd(call_duration)),
"modeduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := Mode(call_duration)),
"entropyduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := entropy.MillerMadow(call_duration)),
"timefirstcall" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
"timelastcall" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
"count" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := n()),
"distinctcontacts" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := n_distinct(trace)),
"meanduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := mean(call_duration)),
"sumduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := sum(call_duration)),
"minduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := min(call_duration)),
"maxduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := max(call_duration)),
"stdduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := sd(call_duration)),
"modeduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := Mode(call_duration)),
"entropyduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := entropy.MillerMadow(call_duration)),
"timefirstcall" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
"timelastcall" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
features <- merge(features, feature, by="local_segment", all = TRUE)
}

View File

@@ -19,8 +19,8 @@ compute_data_yield_features <- function(data, feature_name, day_segment, provide
valid_yielded_hours = sum(valid_hour == TRUE) / 1.0,
duration_minutes = first(duration_minutes),
duration_hours = duration_minutes / 60.0,
phone_data_yield_rapids_ratiovalidyieldedminutes = valid_yielded_minutes / duration_minutes,
phone_data_yield_rapids_ratiovalidyieldedhours = if_else(duration_hours > 1, valid_yielded_hours / duration_hours, valid_yielded_hours))
ratiovalidyieldedminutes = valid_yielded_minutes / duration_minutes,
ratiovalidyieldedhours = if_else(duration_hours > 1, valid_yielded_hours / duration_hours, valid_yielded_hours))
return(features)
}
@@ -41,7 +41,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){
features_to_compute <- intersect(base_features_names, requested_features)
features <- compute_data_yield_features(yield_data, feature_name, day_segment, provider) %>%
select(c("local_segment", paste0("phone_data_yield_rapids_", features_to_compute)))
select(c("local_segment", features_to_compute))
return(features)
}

View File

@@ -8,22 +8,22 @@ sapply(file.sources,source,.GlobalEnv)
create_empty_file <- function(requested_features){
return(data.frame(local_segment= character(),
locations_barnett_hometime= numeric(),
locations_barnett_disttravelled= numeric(),
locations_barnett_rog= numeric(),
locations_barnett_maxdiam= numeric(),
locations_barnett_maxhomedist= numeric(),
locations_barnett_siglocsvisited= numeric(),
locations_barnett_avgflightlen= numeric(),
locations_barnett_stdflightlen= numeric(),
locations_barnett_avgflightdur= numeric(),
locations_barnett_stdflightdur= numeric(),
locations_barnett_probpause= numeric(),
locations_barnett_siglocentropy= numeric(),
locations_barnett_minsmissing= numeric(),
locations_barnett_circdnrtn= numeric(),
locations_barnett_wkenddayrtn= numeric(),
locations_barnett_minutes_data_used= numeric()
hometime= numeric(),
disttravelled= numeric(),
rog= numeric(),
maxdiam= numeric(),
maxhomedist= numeric(),
siglocsvisited= numeric(),
avgflightlen= numeric(),
stdflightlen= numeric(),
avgflightdur= numeric(),
stdflightdur= numeric(),
probpause= numeric(),
siglocentropy= numeric(),
minsmissing= numeric(),
circdnrtn= numeric(),
wkenddayrtn= numeric(),
minutes_data_used= numeric()
) %>% select(all_of(requested_features)))
}
@@ -41,9 +41,9 @@ barnett_features <- function(sensor_data_files, day_segment, params){
available_features <- c("hometime","disttravelled","rog","maxdiam", "maxhomedist","siglocsvisited","avgflightlen", "stdflightlen",
"avgflightdur","stdflightdur", "probpause","siglocentropy","minsmissing", "circdnrtn","wkenddayrtn")
requested_features <- intersect(unlist(params["FEATURES"], use.names = F), available_features)
requested_features <- c("local_segment", paste("locations_barnett", requested_features, sep = "_"))
requested_features <- c("local_segment", requested_features)
if(minutes_data_used)
requested_features <- c(requested_features, "locations_barnett_minutes_data_used")
requested_features <- c(requested_features, "minutes_data_used")
# Excludes datasets with less than 24 hours of data
if(max(location$timestamp) - min(location$timestamp) < 86400000)
@@ -67,8 +67,8 @@ barnett_features <- function(sensor_data_files, day_segment, params){
group_by(local_date, local_hour) %>%
summarise(n_minutes = n_distinct(local_minute)) %>%
group_by(local_date) %>%
summarise(locations_barnett_minutes_data_used = sum(n_minutes)) %>%
select(local_date, locations_barnett_minutes_data_used)
summarise(minutes_data_used = sum(n_minutes)) %>%
select(local_date, minutes_data_used)
# Save day segment to attach it later
location_dates_segments <- location %>% select(local_date, local_segment) %>% distinct(local_date, .keep_all = TRUE)
@@ -89,7 +89,7 @@ barnett_features <- function(sensor_data_files, day_segment, params){
features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg)
features <- as.data.frame(features)
features[-1] <- lapply(lapply(features[-1], as.character), as.numeric)
colnames(features)=c("local_date",tolower(paste("locations_barnett", colnames(outputMobility$featavg), sep = "_")))
colnames(features)=c("local_date",tolower(colnames(outputMobility$featavg)))
# Add the minute count column
features <- left_join(features, location_minutes_used, by = "local_date")
# Add the day segment column for consistency

View File

@@ -15,7 +15,7 @@ message_features_of_type <- function(messages, messages_type, day_segment, reque
if(length(features_to_compute) == 0)
return(features)
if(nrow(messages) < 1)
return(cbind(features, read.csv(text = paste(paste("messages_rapids", messages_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
return(cbind(features, read.csv(text = paste(paste(messages_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
for(feature_name in features_to_compute){
if(feature_name == "countmostfrequentcontact"){
@@ -29,17 +29,17 @@ message_features_of_type <- function(messages, messages_type, day_segment, reque
pull(trace)
feature <- messages %>%
group_by(local_segment) %>%
summarise(!!paste("messages_rapids", messages_type, feature_name, sep = "_") := sum(trace == mostfrequentcontact))
summarise(!!paste(messages_type, feature_name, sep = "_") := sum(trace == mostfrequentcontact))
features <- merge(features, feature, by="local_segment", all = TRUE)
} else {
feature <- messages %>%
group_by(local_segment)
feature <- switch(feature_name,
"count" = feature %>% summarise(!!paste("messages_rapids", messages_type, feature_name, sep = "_") := n()),
"distinctcontacts" = feature %>% summarise(!!paste("messages_rapids", messages_type, feature_name, sep = "_") := n_distinct(trace)),
"timefirstmessage" = feature %>% summarise(!!paste("messages_rapids", messages_type, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
"timelastmessage" = feature %>% summarise(!!paste("messages_rapids", messages_type, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
"count" = feature %>% summarise(!!paste(messages_type, feature_name, sep = "_") := n()),
"distinctcontacts" = feature %>% summarise(!!paste(messages_type, feature_name, sep = "_") := n_distinct(trace)),
"timefirstmessage" = feature %>% summarise(!!paste(messages_type, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
"timelastmessage" = feature %>% summarise(!!paste(messages_type, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
features <- merge(features, feature, by="local_segment", all = TRUE)
}
@@ -57,7 +57,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){
# Filter rows that belong to the message type and day segment of interest
message_type_label = ifelse(message_type == "received", "1", ifelse(message_type == "sent", "2", NA))
if(is.na(message_type_label))
stop(paste("Message type can only be received or sent but instead you typed: ", message_type, " in config[MESSAGES][MESSAGES_TYPES]"))
stop(paste("Message type can only be received or sent but instead you typed: ", message_type, " in config[PHONE_MESSAGES][MESSAGES_TYPES]"))
requested_features <- provider[["FEATURES"]][[message_type]]
messages_of_type <- messages_data %>% filter(message_type == message_type_label)

View File

@@ -5,8 +5,8 @@ compute_wifi_feature <- function(data, feature, day_segment){
if(feature %in% c("countscans", "uniquedevices")){
data <- data %>% group_by(local_segment)
data <- switch(feature,
"countscans" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n()),
"uniquedevices" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n_distinct(bssid)))
"countscans" = data %>% summarise(!!feature := n()),
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
return(data)
} else if(feature == "countscansmostuniquedevice"){
# Get the most scanned device
@@ -20,7 +20,7 @@ compute_wifi_feature <- function(data, feature, day_segment){
return(data %>%
filter(bssid == mostuniquedevice) %>%
group_by(local_segment) %>%
summarise(!!paste("wifi_rapids", feature, sep = "_") := n()) %>%
summarise(!!feature := n()) %>%
replace(is.na(.), 0))
}
}

View File

@@ -5,8 +5,8 @@ compute_wifi_feature <- function(data, feature, day_segment){
if(feature %in% c("countscans", "uniquedevices")){
data <- data %>% group_by(local_segment)
data <- switch(feature,
"countscans" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n()),
"uniquedevices" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n_distinct(bssid)))
"countscans" = data %>% summarise(!!feature := n()),
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
return(data)
} else if(feature == "countscansmostuniquedevice"){
# Get the most scanned device
@@ -20,7 +20,7 @@ compute_wifi_feature <- function(data, feature, day_segment){
return(data %>%
filter(bssid == mostuniquedevice) %>%
group_by(local_segment) %>%
summarise(!!paste("wifi_rapids", feature, sep = "_") := n()) %>%
summarise(!!feature := n()) %>%
replace(is.na(.), 0))
}
}

View File

@@ -61,12 +61,9 @@ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_d
print(paste(rapids_log_tag,"Processing", sensor_key, provider_key, day_segment))
features <- features_function(sensor_data_files, day_segment, provider)
# Check all features names contain the provider key so they are unique
features_names <- colnames(features %>% select(-local_segment))
if(any(!grepl(paste0(".*(",str_to_lower(provider_key),").*"), features_names)))
stop(paste("The name of all calls features of", provider_key," must contain its name in lower case but the following don't [", paste(features_names[!grepl(paste0(".*(",str_to_lower(provider_key),").*"), features_names)], collapse = ", "), "]"))
if(!"local_segment" %in% colnames(features))
stop(paste0("The dataframe returned by the ",sensor_key," provider '", provider_key,"' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (", code_path,")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different day segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)"))
features <- features %>% rename_at(vars(!matches("local_segment")), ~ paste(sensor_key, provider_key, ., sep = "_"))
sensor_features <- merge(sensor_features, features, all = TRUE)
}
} else { # This is redundant, if COMPUTE is FALSE this script will be never executed