Update R feature scripts to add sensor and provider names automatically
parent
f02ca2624d
commit
ced3305ddb
12
Snakefile
12
Snakefile
|
@ -66,10 +66,10 @@ for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys():
|
||||||
|
|
||||||
for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
|
for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
|
||||||
if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]:
|
if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||||
if "PHONE_SCREEN" in config["PHONE_DATA_YIELD"]["SENSORS"]:
|
# if "PHONE_SCREEN" in config["PHONE_DATA_YIELD"]["SENSORS"]:# not used for now because we took episodepersensedminutes out of the list of supported features
|
||||||
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
|
# files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"]))
|
||||||
else:
|
# else:
|
||||||
raise ValueError("Error: Add PHONE_SCREEN (and as many phone sensor as you have in your database) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data)")
|
# raise ValueError("Error: Add PHONE_SCREEN (and as many PHONE_SENSORS as you have in your database) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data)")
|
||||||
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_raw.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_raw.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime_unified.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/raw/{pid}/phone_screen_with_datetime_unified.csv", pid=config["PIDS"]))
|
||||||
|
@ -127,9 +127,9 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
|
||||||
if config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]:
|
if config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||||
if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] == "FUSED_RESAMPLED":
|
if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] == "FUSED_RESAMPLED":
|
||||||
if "PHONE_LOCATIONS" in config["PHONE_DATA_YIELD"]["SENSORS"]:
|
if "PHONE_LOCATIONS" in config["PHONE_DATA_YIELD"]["SENSORS"]:
|
||||||
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/interim/{pid}/phone_yielded_timestamps.csv", pid=config["PIDS"]))
|
||||||
else:
|
else:
|
||||||
raise ValueError("Error: Add PHONE_LOCATIONS (and as many SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
|
raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)")
|
||||||
|
|
||||||
files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"]))
|
||||||
|
|
|
@ -55,10 +55,10 @@ DEVICE_DATA:
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
PHONE_DATA_YIELD:
|
PHONE_DATA_YIELD:
|
||||||
SENSORS: [PHONE_MESSAGES, PHONE_CALLS, PHONE_ACCELEROMETER]
|
SENSORS: []
|
||||||
PROVIDERS:
|
PROVIDERS:
|
||||||
RAPIDS:
|
RAPIDS:
|
||||||
COMPUTE: True
|
COMPUTE: False
|
||||||
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
|
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
|
||||||
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1 representing the number of minutes with at least
|
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1 representing the number of minutes with at least
|
||||||
SRC_LANGUAGE: "r"
|
SRC_LANGUAGE: "r"
|
||||||
|
|
|
@ -139,7 +139,7 @@ The code to extract your behavioral features should be implemented in your provi
|
||||||
|
|
||||||
- One row per day segment instance (e.g. 14 our `p01`'s `my_days` example)
|
- One row per day segment instance (e.g. 14 our `p01`'s `my_days` example)
|
||||||
- The `local_segment` column added by `filter_data_by_segment()`
|
- The `local_segment` column added by `filter_data_by_segment()`
|
||||||
- One column per feature. Your feature columns should be named `SENSOR_PROVIDER_FEATURE`, for example `accelerometr_vega_feature1`
|
- One column per feature. By convention the name of your features should only contain letters or numbers (`feature1`). RAPIDS will automatically add the right sensor and provider prefix (`accelerometr_vega_`)
|
||||||
|
|
||||||
??? example "`PHONE_ACCELEROMETER` Provider Example"
|
??? example "`PHONE_ACCELEROMETER` Provider Example"
|
||||||
For your reference, this a short example of our own provider (`RAPIDS`) for `PHONE_ACCELEROMETER` that computes five acceleration features
|
For your reference, this a short example of our own provider (`RAPIDS`) for `PHONE_ACCELEROMETER` that computes five acceleration features
|
||||||
|
|
|
@ -80,6 +80,8 @@ rule phone_readable_datetime:
|
||||||
rule phone_yielded_timestamps:
|
rule phone_yielded_timestamps:
|
||||||
input:
|
input:
|
||||||
all_sensors = expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor = map(str.lower, config["PHONE_DATA_YIELD"]["SENSORS"]))
|
all_sensors = expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor = map(str.lower, config["PHONE_DATA_YIELD"]["SENSORS"]))
|
||||||
|
params:
|
||||||
|
sensors = config["PHONE_DATA_YIELD"]["SENSORS"] # not used but needed so the rule is triggered if this array changes
|
||||||
output:
|
output:
|
||||||
"data/interim/{pid}/phone_yielded_timestamps.csv"
|
"data/interim/{pid}/phone_yielded_timestamps.csv"
|
||||||
script:
|
script:
|
||||||
|
@ -99,18 +101,6 @@ rule phone_yielded_timestamps_with_datetime:
|
||||||
script:
|
script:
|
||||||
"../src/data/readable_datetime.R"
|
"../src/data/readable_datetime.R"
|
||||||
|
|
||||||
rule phone_valid_sensed_days:
|
|
||||||
input:
|
|
||||||
phone_sensed_bins = "data/interim/{pid}/phone_sensed_bins.csv"
|
|
||||||
params:
|
|
||||||
min_valid_hours_per_day = "{min_valid_hours_per_day}",
|
|
||||||
min_valid_bins_per_hour = "{min_valid_bins_per_hour}"
|
|
||||||
output:
|
|
||||||
"data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"
|
|
||||||
script:
|
|
||||||
"../src/data/phone_valid_sensed_days.R"
|
|
||||||
|
|
||||||
|
|
||||||
rule unify_ios_android:
|
rule unify_ios_android:
|
||||||
input:
|
input:
|
||||||
sensor_data = "data/raw/{pid}/{sensor}_with_datetime.csv",
|
sensor_data = "data/raw/{pid}/{sensor}_with_datetime.csv",
|
||||||
|
@ -125,7 +115,7 @@ rule unify_ios_android:
|
||||||
rule process_phone_locations_types:
|
rule process_phone_locations_types:
|
||||||
input:
|
input:
|
||||||
locations = "data/raw/{pid}/phone_locations_raw.csv",
|
locations = "data/raw/{pid}/phone_locations_raw.csv",
|
||||||
phone_sensed_timestamps = "data/interim/{pid}/phone_sensed_timestamps.csv",
|
phone_sensed_timestamps = "data/interim/{pid}/phone_yielded_timestamps.csv",
|
||||||
params:
|
params:
|
||||||
consecutive_threshold = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"],
|
consecutive_threshold = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"],
|
||||||
time_since_valid_location = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"],
|
time_since_valid_location = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"],
|
||||||
|
|
|
@ -6,8 +6,8 @@ compute_bluetooth_feature <- function(data, feature, day_segment){
|
||||||
if(feature %in% c("countscans", "uniquedevices")){
|
if(feature %in% c("countscans", "uniquedevices")){
|
||||||
data <- data %>% group_by(local_segment)
|
data <- data %>% group_by(local_segment)
|
||||||
data <- switch(feature,
|
data <- switch(feature,
|
||||||
"countscans" = data %>% summarise(!!paste("bluetooth_rapids", feature, sep = "_") := n()),
|
"countscans" = data %>% summarise(!!feature := n()),
|
||||||
"uniquedevices" = data %>% summarise(!!paste("bluetooth_rapids", feature, sep = "_") := n_distinct(bt_address)))
|
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bt_address)))
|
||||||
return(data)
|
return(data)
|
||||||
} else if(feature == "countscansmostuniquedevice"){
|
} else if(feature == "countscansmostuniquedevice"){
|
||||||
# Get the most scanned device
|
# Get the most scanned device
|
||||||
|
@ -22,7 +22,7 @@ compute_bluetooth_feature <- function(data, feature, day_segment){
|
||||||
return(data %>%
|
return(data %>%
|
||||||
filter(bt_address == mostuniquedevice) %>%
|
filter(bt_address == mostuniquedevice) %>%
|
||||||
group_by(local_segment) %>%
|
group_by(local_segment) %>%
|
||||||
summarise(!!paste("bluetooth_rapids", feature, sep = "_") := n()) %>%
|
summarise(!!feature := n()) %>%
|
||||||
replace(is.na(.), 0))
|
replace(is.na(.), 0))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -46,7 +46,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){
|
||||||
features <- merge(features, feature, by="local_segment", all = TRUE)
|
features <- merge(features, feature, by="local_segment", all = TRUE)
|
||||||
}
|
}
|
||||||
|
|
||||||
features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0)))
|
features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0))) %>% select(-local_segment)
|
||||||
|
|
||||||
return(features)
|
return(features)
|
||||||
}
|
}
|
|
@ -20,7 +20,7 @@ call_features_of_type <- function(calls, call_type, day_segment, requested_featu
|
||||||
if(length(features_to_compute) == 0)
|
if(length(features_to_compute) == 0)
|
||||||
return(features)
|
return(features)
|
||||||
if(nrow(calls) < 1)
|
if(nrow(calls) < 1)
|
||||||
return(cbind(features, read.csv(text = paste(paste("calls_rapids", call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
|
return(cbind(features, read.csv(text = paste(paste(call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
|
||||||
|
|
||||||
for(feature_name in features_to_compute){
|
for(feature_name in features_to_compute){
|
||||||
if(feature_name == "countmostfrequentcontact"){
|
if(feature_name == "countmostfrequentcontact"){
|
||||||
|
@ -34,24 +34,24 @@ call_features_of_type <- function(calls, call_type, day_segment, requested_featu
|
||||||
pull(trace)
|
pull(trace)
|
||||||
feature <- calls %>%
|
feature <- calls %>%
|
||||||
group_by(local_segment) %>%
|
group_by(local_segment) %>%
|
||||||
summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := sum(trace == mostfrequentcontact))
|
summarise(!!paste(call_type, feature_name, sep = "_") := sum(trace == mostfrequentcontact))
|
||||||
features <- merge(features, feature, by="local_segment", all = TRUE)
|
features <- merge(features, feature, by="local_segment", all = TRUE)
|
||||||
} else {
|
} else {
|
||||||
feature <- calls %>%
|
feature <- calls %>%
|
||||||
group_by(local_segment)
|
group_by(local_segment)
|
||||||
|
|
||||||
feature <- switch(feature_name,
|
feature <- switch(feature_name,
|
||||||
"count" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := n()),
|
"count" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := n()),
|
||||||
"distinctcontacts" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := n_distinct(trace)),
|
"distinctcontacts" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := n_distinct(trace)),
|
||||||
"meanduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := mean(call_duration)),
|
"meanduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := mean(call_duration)),
|
||||||
"sumduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := sum(call_duration)),
|
"sumduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := sum(call_duration)),
|
||||||
"minduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := min(call_duration)),
|
"minduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := min(call_duration)),
|
||||||
"maxduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := max(call_duration)),
|
"maxduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := max(call_duration)),
|
||||||
"stdduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := sd(call_duration)),
|
"stdduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := sd(call_duration)),
|
||||||
"modeduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := Mode(call_duration)),
|
"modeduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := Mode(call_duration)),
|
||||||
"entropyduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := entropy.MillerMadow(call_duration)),
|
"entropyduration" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := entropy.MillerMadow(call_duration)),
|
||||||
"timefirstcall" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
|
"timefirstcall" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
|
||||||
"timelastcall" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
|
"timelastcall" = feature %>% summarise(!!paste(call_type, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
|
||||||
|
|
||||||
features <- merge(features, feature, by="local_segment", all = TRUE)
|
features <- merge(features, feature, by="local_segment", all = TRUE)
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,8 +19,8 @@ compute_data_yield_features <- function(data, feature_name, day_segment, provide
|
||||||
valid_yielded_hours = sum(valid_hour == TRUE) / 1.0,
|
valid_yielded_hours = sum(valid_hour == TRUE) / 1.0,
|
||||||
duration_minutes = first(duration_minutes),
|
duration_minutes = first(duration_minutes),
|
||||||
duration_hours = duration_minutes / 60.0,
|
duration_hours = duration_minutes / 60.0,
|
||||||
phone_data_yield_rapids_ratiovalidyieldedminutes = valid_yielded_minutes / duration_minutes,
|
ratiovalidyieldedminutes = valid_yielded_minutes / duration_minutes,
|
||||||
phone_data_yield_rapids_ratiovalidyieldedhours = if_else(duration_hours > 1, valid_yielded_hours / duration_hours, valid_yielded_hours))
|
ratiovalidyieldedhours = if_else(duration_hours > 1, valid_yielded_hours / duration_hours, valid_yielded_hours))
|
||||||
return(features)
|
return(features)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -41,7 +41,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){
|
||||||
features_to_compute <- intersect(base_features_names, requested_features)
|
features_to_compute <- intersect(base_features_names, requested_features)
|
||||||
|
|
||||||
features <- compute_data_yield_features(yield_data, feature_name, day_segment, provider) %>%
|
features <- compute_data_yield_features(yield_data, feature_name, day_segment, provider) %>%
|
||||||
select(c("local_segment", paste0("phone_data_yield_rapids_", features_to_compute)))
|
select(c("local_segment", features_to_compute))
|
||||||
|
|
||||||
return(features)
|
return(features)
|
||||||
}
|
}
|
|
@ -8,22 +8,22 @@ sapply(file.sources,source,.GlobalEnv)
|
||||||
|
|
||||||
create_empty_file <- function(requested_features){
|
create_empty_file <- function(requested_features){
|
||||||
return(data.frame(local_segment= character(),
|
return(data.frame(local_segment= character(),
|
||||||
locations_barnett_hometime= numeric(),
|
hometime= numeric(),
|
||||||
locations_barnett_disttravelled= numeric(),
|
disttravelled= numeric(),
|
||||||
locations_barnett_rog= numeric(),
|
rog= numeric(),
|
||||||
locations_barnett_maxdiam= numeric(),
|
maxdiam= numeric(),
|
||||||
locations_barnett_maxhomedist= numeric(),
|
maxhomedist= numeric(),
|
||||||
locations_barnett_siglocsvisited= numeric(),
|
siglocsvisited= numeric(),
|
||||||
locations_barnett_avgflightlen= numeric(),
|
avgflightlen= numeric(),
|
||||||
locations_barnett_stdflightlen= numeric(),
|
stdflightlen= numeric(),
|
||||||
locations_barnett_avgflightdur= numeric(),
|
avgflightdur= numeric(),
|
||||||
locations_barnett_stdflightdur= numeric(),
|
stdflightdur= numeric(),
|
||||||
locations_barnett_probpause= numeric(),
|
probpause= numeric(),
|
||||||
locations_barnett_siglocentropy= numeric(),
|
siglocentropy= numeric(),
|
||||||
locations_barnett_minsmissing= numeric(),
|
minsmissing= numeric(),
|
||||||
locations_barnett_circdnrtn= numeric(),
|
circdnrtn= numeric(),
|
||||||
locations_barnett_wkenddayrtn= numeric(),
|
wkenddayrtn= numeric(),
|
||||||
locations_barnett_minutes_data_used= numeric()
|
minutes_data_used= numeric()
|
||||||
) %>% select(all_of(requested_features)))
|
) %>% select(all_of(requested_features)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -41,9 +41,9 @@ barnett_features <- function(sensor_data_files, day_segment, params){
|
||||||
available_features <- c("hometime","disttravelled","rog","maxdiam", "maxhomedist","siglocsvisited","avgflightlen", "stdflightlen",
|
available_features <- c("hometime","disttravelled","rog","maxdiam", "maxhomedist","siglocsvisited","avgflightlen", "stdflightlen",
|
||||||
"avgflightdur","stdflightdur", "probpause","siglocentropy","minsmissing", "circdnrtn","wkenddayrtn")
|
"avgflightdur","stdflightdur", "probpause","siglocentropy","minsmissing", "circdnrtn","wkenddayrtn")
|
||||||
requested_features <- intersect(unlist(params["FEATURES"], use.names = F), available_features)
|
requested_features <- intersect(unlist(params["FEATURES"], use.names = F), available_features)
|
||||||
requested_features <- c("local_segment", paste("locations_barnett", requested_features, sep = "_"))
|
requested_features <- c("local_segment", requested_features)
|
||||||
if(minutes_data_used)
|
if(minutes_data_used)
|
||||||
requested_features <- c(requested_features, "locations_barnett_minutes_data_used")
|
requested_features <- c(requested_features, "minutes_data_used")
|
||||||
|
|
||||||
# Excludes datasets with less than 24 hours of data
|
# Excludes datasets with less than 24 hours of data
|
||||||
if(max(location$timestamp) - min(location$timestamp) < 86400000)
|
if(max(location$timestamp) - min(location$timestamp) < 86400000)
|
||||||
|
@ -67,8 +67,8 @@ barnett_features <- function(sensor_data_files, day_segment, params){
|
||||||
group_by(local_date, local_hour) %>%
|
group_by(local_date, local_hour) %>%
|
||||||
summarise(n_minutes = n_distinct(local_minute)) %>%
|
summarise(n_minutes = n_distinct(local_minute)) %>%
|
||||||
group_by(local_date) %>%
|
group_by(local_date) %>%
|
||||||
summarise(locations_barnett_minutes_data_used = sum(n_minutes)) %>%
|
summarise(minutes_data_used = sum(n_minutes)) %>%
|
||||||
select(local_date, locations_barnett_minutes_data_used)
|
select(local_date, minutes_data_used)
|
||||||
|
|
||||||
# Save day segment to attach it later
|
# Save day segment to attach it later
|
||||||
location_dates_segments <- location %>% select(local_date, local_segment) %>% distinct(local_date, .keep_all = TRUE)
|
location_dates_segments <- location %>% select(local_date, local_segment) %>% distinct(local_date, .keep_all = TRUE)
|
||||||
|
@ -89,7 +89,7 @@ barnett_features <- function(sensor_data_files, day_segment, params){
|
||||||
features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg)
|
features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg)
|
||||||
features <- as.data.frame(features)
|
features <- as.data.frame(features)
|
||||||
features[-1] <- lapply(lapply(features[-1], as.character), as.numeric)
|
features[-1] <- lapply(lapply(features[-1], as.character), as.numeric)
|
||||||
colnames(features)=c("local_date",tolower(paste("locations_barnett", colnames(outputMobility$featavg), sep = "_")))
|
colnames(features)=c("local_date",tolower(colnames(outputMobility$featavg)))
|
||||||
# Add the minute count column
|
# Add the minute count column
|
||||||
features <- left_join(features, location_minutes_used, by = "local_date")
|
features <- left_join(features, location_minutes_used, by = "local_date")
|
||||||
# Add the day segment column for consistency
|
# Add the day segment column for consistency
|
||||||
|
|
|
@ -15,7 +15,7 @@ message_features_of_type <- function(messages, messages_type, day_segment, reque
|
||||||
if(length(features_to_compute) == 0)
|
if(length(features_to_compute) == 0)
|
||||||
return(features)
|
return(features)
|
||||||
if(nrow(messages) < 1)
|
if(nrow(messages) < 1)
|
||||||
return(cbind(features, read.csv(text = paste(paste("messages_rapids", messages_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
|
return(cbind(features, read.csv(text = paste(paste(messages_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
|
||||||
|
|
||||||
for(feature_name in features_to_compute){
|
for(feature_name in features_to_compute){
|
||||||
if(feature_name == "countmostfrequentcontact"){
|
if(feature_name == "countmostfrequentcontact"){
|
||||||
|
@ -29,17 +29,17 @@ message_features_of_type <- function(messages, messages_type, day_segment, reque
|
||||||
pull(trace)
|
pull(trace)
|
||||||
feature <- messages %>%
|
feature <- messages %>%
|
||||||
group_by(local_segment) %>%
|
group_by(local_segment) %>%
|
||||||
summarise(!!paste("messages_rapids", messages_type, feature_name, sep = "_") := sum(trace == mostfrequentcontact))
|
summarise(!!paste(messages_type, feature_name, sep = "_") := sum(trace == mostfrequentcontact))
|
||||||
features <- merge(features, feature, by="local_segment", all = TRUE)
|
features <- merge(features, feature, by="local_segment", all = TRUE)
|
||||||
} else {
|
} else {
|
||||||
feature <- messages %>%
|
feature <- messages %>%
|
||||||
group_by(local_segment)
|
group_by(local_segment)
|
||||||
|
|
||||||
feature <- switch(feature_name,
|
feature <- switch(feature_name,
|
||||||
"count" = feature %>% summarise(!!paste("messages_rapids", messages_type, feature_name, sep = "_") := n()),
|
"count" = feature %>% summarise(!!paste(messages_type, feature_name, sep = "_") := n()),
|
||||||
"distinctcontacts" = feature %>% summarise(!!paste("messages_rapids", messages_type, feature_name, sep = "_") := n_distinct(trace)),
|
"distinctcontacts" = feature %>% summarise(!!paste(messages_type, feature_name, sep = "_") := n_distinct(trace)),
|
||||||
"timefirstmessage" = feature %>% summarise(!!paste("messages_rapids", messages_type, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
|
"timefirstmessage" = feature %>% summarise(!!paste(messages_type, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
|
||||||
"timelastmessage" = feature %>% summarise(!!paste("messages_rapids", messages_type, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
|
"timelastmessage" = feature %>% summarise(!!paste(messages_type, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
|
||||||
|
|
||||||
features <- merge(features, feature, by="local_segment", all = TRUE)
|
features <- merge(features, feature, by="local_segment", all = TRUE)
|
||||||
}
|
}
|
||||||
|
@ -57,7 +57,7 @@ rapids_features <- function(sensor_data_files, day_segment, provider){
|
||||||
# Filter rows that belong to the message type and day segment of interest
|
# Filter rows that belong to the message type and day segment of interest
|
||||||
message_type_label = ifelse(message_type == "received", "1", ifelse(message_type == "sent", "2", NA))
|
message_type_label = ifelse(message_type == "received", "1", ifelse(message_type == "sent", "2", NA))
|
||||||
if(is.na(message_type_label))
|
if(is.na(message_type_label))
|
||||||
stop(paste("Message type can online be received or sent but instead you typed: ", message_type, " in config[MESSAGES][MESSAGES_TYPES]"))
|
stop(paste("Message type can online be received or sent but instead you typed: ", message_type, " in config[PHONE_MESSAGES][MESSAGES_TYPES]"))
|
||||||
|
|
||||||
requested_features <- provider[["FEATURES"]][[message_type]]
|
requested_features <- provider[["FEATURES"]][[message_type]]
|
||||||
messages_of_type <- messages_data %>% filter(message_type == message_type_label)
|
messages_of_type <- messages_data %>% filter(message_type == message_type_label)
|
||||||
|
|
|
@ -5,8 +5,8 @@ compute_wifi_feature <- function(data, feature, day_segment){
|
||||||
if(feature %in% c("countscans", "uniquedevices")){
|
if(feature %in% c("countscans", "uniquedevices")){
|
||||||
data <- data %>% group_by(local_segment)
|
data <- data %>% group_by(local_segment)
|
||||||
data <- switch(feature,
|
data <- switch(feature,
|
||||||
"countscans" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n()),
|
"countscans" = data %>% summarise(!!feature := n()),
|
||||||
"uniquedevices" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n_distinct(bssid)))
|
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
|
||||||
return(data)
|
return(data)
|
||||||
} else if(feature == "countscansmostuniquedevice"){
|
} else if(feature == "countscansmostuniquedevice"){
|
||||||
# Get the most scanned device
|
# Get the most scanned device
|
||||||
|
@ -20,7 +20,7 @@ compute_wifi_feature <- function(data, feature, day_segment){
|
||||||
return(data %>%
|
return(data %>%
|
||||||
filter(bssid == mostuniquedevice) %>%
|
filter(bssid == mostuniquedevice) %>%
|
||||||
group_by(local_segment) %>%
|
group_by(local_segment) %>%
|
||||||
summarise(!!paste("wifi_rapids", feature, sep = "_") := n()) %>%
|
summarise(!!feature := n()) %>%
|
||||||
replace(is.na(.), 0))
|
replace(is.na(.), 0))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,8 +5,8 @@ compute_wifi_feature <- function(data, feature, day_segment){
|
||||||
if(feature %in% c("countscans", "uniquedevices")){
|
if(feature %in% c("countscans", "uniquedevices")){
|
||||||
data <- data %>% group_by(local_segment)
|
data <- data %>% group_by(local_segment)
|
||||||
data <- switch(feature,
|
data <- switch(feature,
|
||||||
"countscans" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n()),
|
"countscans" = data %>% summarise(!!feature := n()),
|
||||||
"uniquedevices" = data %>% summarise(!!paste("wifi_rapids", feature, sep = "_") := n_distinct(bssid)))
|
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
|
||||||
return(data)
|
return(data)
|
||||||
} else if(feature == "countscansmostuniquedevice"){
|
} else if(feature == "countscansmostuniquedevice"){
|
||||||
# Get the most scanned device
|
# Get the most scanned device
|
||||||
|
@ -20,7 +20,7 @@ compute_wifi_feature <- function(data, feature, day_segment){
|
||||||
return(data %>%
|
return(data %>%
|
||||||
filter(bssid == mostuniquedevice) %>%
|
filter(bssid == mostuniquedevice) %>%
|
||||||
group_by(local_segment) %>%
|
group_by(local_segment) %>%
|
||||||
summarise(!!paste("wifi_rapids", feature, sep = "_") := n()) %>%
|
summarise(!!feature := n()) %>%
|
||||||
replace(is.na(.), 0))
|
replace(is.na(.), 0))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -61,12 +61,9 @@ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_d
|
||||||
print(paste(rapids_log_tag,"Processing", sensor_key, provider_key, day_segment))
|
print(paste(rapids_log_tag,"Processing", sensor_key, provider_key, day_segment))
|
||||||
|
|
||||||
features <- features_function(sensor_data_files, day_segment, provider)
|
features <- features_function(sensor_data_files, day_segment, provider)
|
||||||
|
if(!"local_segment" %in% colnames(features))
|
||||||
# Check all features names contain the provider key so they are unique
|
stop(paste0("The dataframe returned by the ",sensor_key," provider '", provider_key,"' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (", code_path,")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different day segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)"))
|
||||||
features_names <- colnames(features %>% select(-local_segment))
|
features <- features %>% rename_at(vars(!matches("local_segment")), ~ paste(sensor_key, provider_key, ., sep = "_"))
|
||||||
if(any(!grepl(paste0(".*(",str_to_lower(provider_key),").*"), features_names)))
|
|
||||||
stop(paste("The name of all calls features of", provider_key," must contain its name in lower case but the following don't [", paste(features_names[!grepl(paste0(".*(",str_to_lower(provider_key),").*"), features_names)], collapse = ", "), "]"))
|
|
||||||
|
|
||||||
sensor_features <- merge(sensor_features, features, all = TRUE)
|
sensor_features <- merge(sensor_features, features, all = TRUE)
|
||||||
}
|
}
|
||||||
} else { # This is redundant, if COMPUTE is FALSE this script will be never executed
|
} else { # This is redundant, if COMPUTE is FALSE this script will be never executed
|
||||||
|
|
Loading…
Reference in New Issue