Add resampling for fused location
parent
6a79fbe1e8
commit
0ba88203f4
|
@ -46,6 +46,11 @@ PHONE_VALID_SENSED_DAYS:
|
||||||
MIN_VALID_HOURS: 20 # (out of 24)
|
MIN_VALID_HOURS: 20 # (out of 24)
|
||||||
MIN_BINS_PER_HOUR: 8 # (out of 60min/BIN_SIZE bins)
|
MIN_BINS_PER_HOUR: 8 # (out of 60min/BIN_SIZE bins)
|
||||||
|
|
||||||
|
RESAMPLE_FUSED_LOCATION:
|
||||||
|
CONSECUTIVE_THRESHOLD: 30 # minutes, a location sample is only replicated to the next sensed bin if the gap between them (i.e. the time the phone stopped collecting data) is not longer than this threshold
|
||||||
|
TIME_SINCE_VALID_LOCATION: 12 # hours, a location sample is only replicated to consecutive sensed bins that were logged less than this many hours after the valid location row
|
||||||
|
TIMEZONE: *timezone
|
||||||
|
|
||||||
BARNETT_LOCATION:
|
BARNETT_LOCATION:
|
||||||
ACCURACY_LIMIT: 51 # filters location coordinates with an accuracy higher than this
|
ACCURACY_LIMIT: 51 # filters location coordinates with an accuracy higher than this
|
||||||
TIMEZONE: *timezone
|
TIMEZONE: *timezone
|
||||||
|
|
|
@ -51,4 +51,18 @@ rule unify_ios_android:
|
||||||
output:
|
output:
|
||||||
"data/raw/{pid}/{sensor}_with_datetime_unified.csv"
|
"data/raw/{pid}/{sensor}_with_datetime_unified.csv"
|
||||||
script:
|
script:
|
||||||
"../src/data/unify_ios_android.R"
|
"../src/data/unify_ios_android.R"
|
||||||
|
|
||||||
|
# Resamples fused location rows onto the minute bins in which the phone sensed
# data. Thresholds and timezone come from the RESAMPLE_FUSED_LOCATION config
# section; the bin size is shared with PHONE_VALID_SENSED_DAYS.
rule resample_fused_location:
    input:
        # Raw location rows for one participant
        locations = "data/raw/{pid}/locations_raw.csv",
        # Per-day sensed-bin table produced by the phone_sensed_bins rule
        phone_sensed_bins = rules.phone_sensed_bins.output
    params:
        bin_size = config["PHONE_VALID_SENSED_DAYS"]["BIN_SIZE"],
        timezone = config["RESAMPLE_FUSED_LOCATION"]["TIMEZONE"],
        # minutes
        consecutive_threshold = config["RESAMPLE_FUSED_LOCATION"]["CONSECUTIVE_THRESHOLD"],
        # hours
        time_since_valid_location = config["RESAMPLE_FUSED_LOCATION"]["TIME_SINCE_VALID_LOCATION"]
    output:
        "data/raw/{pid}/locations_resampled.csv"
    script:
        "../src/data/resample_fused_location.R"
|
|
@ -0,0 +1,43 @@
|
||||||
|
source("packrat/init.R")

library(dplyr)
library(readr)
library(tidyr)

# Resample fused location rows.
#
# Fused location rows are replicated onto every minute in which the phone
# sensed data, as long as (a) consecutive rows are no more than
# `consecutive_threshold` minutes apart and (b) the minute falls less than
# `time_since_valid_location` hours after the row that starts its group.
# The result is written to the first snakemake output as CSV.

# Parameters supplied by the snakemake rule.
bin_size <- snakemake@params[["bin_size"]] # read for parity with the rule; not used below
timezone <- snakemake@params[["timezone"]]
consecutive_threshold <- snakemake@params[["consecutive_threshold"]] # minutes
time_since_valid_location <- snakemake@params[["time_since_valid_location"]] # hours

locations <- read_csv(snakemake@input[["locations"]], col_types = cols())
phone_sensed_bins <- read_csv(
  snakemake@input[["phone_sensed_bins"]],
  col_types = cols(local_date = col_character())
)

if (nrow(locations) > 0) {
  # Build one timestamp (in milliseconds) per sensed minute.
  # Every column except local_date is split on "_" into an hour and a bin.
  # NOTE(review): newer tidyr releases expect `names_transform` for this
  # character-to-integer conversion; confirm against the packrat-pinned version.
  sensed_minute_bins <- phone_sensed_bins %>%
    pivot_longer(
      -local_date,
      names_to = c("hour", "bin"),
      names_ptypes = list(hour = integer(), bin = integer()),
      names_sep = "_",
      values_to = "sensor_count"
    ) %>%
    # Expand every (local_date, hour) pair to all 60 minutes and carry the
    # last known sensor_count forward into the minutes that were added.
    complete(nesting(local_date, hour), bin = seq(0, 59, 1)) %>%
    fill(sensor_count) %>%
    mutate(
      timestamp = as.numeric(as.POSIXct(
        paste0(local_date, " ", hour, ":", bin, ":00"),
        format = "%Y-%m-%d %H:%M:%S",
        tz = timezone
      )) * 1000
    ) %>%
    # Keep only the minutes with a positive sensor count.
    filter(sensor_count > 0) %>%
    select(timestamp)

  resampled_locations <- locations %>%
    filter(provider == "fused") %>%
    # Sensed minutes come in as rows holding only a timestamp; every other
    # column is NA until it is filled below.
    bind_rows(sensed_minute_bins) %>%
    arrange(timestamp) %>%
    # We group and therefore, fill in, missing rows that appear after a valid
    # fused location record and exist within consecutive_threshold minutes
    # from each other. A new group starts at every row that has a longitude
    # or that follows a gap longer than the threshold (timestamps are in ms).
    mutate(
      consecutive_time_diff = c(1, diff(timestamp)),
      resample_group = cumsum(
        !is.na(double_longitude) |
          consecutive_time_diff > (1000 * 60 * consecutive_threshold)
      )
    ) %>%
    group_by(resample_group) %>%
    # drop rows that are logged after time_since_valid_location hours from the
    # last valid fused location (the first row of the group)
    filter(
      (timestamp - first(timestamp) < (1000 * 60 * 60 * time_since_valid_location))
    ) %>%
    # Copy the group's location values down onto the sensed-minute rows.
    fill(-timestamp, -resample_group) %>%
    select(-consecutive_time_diff) %>%
    # Groups opened by a gap rather than by a location have nothing to copy.
    drop_na(double_longitude, double_latitude, accuracy) %>%
    # resample_group stays as a column in the output; only the grouping is dropped.
    ungroup()

  write.csv(resampled_locations, snakemake@output[[1]], row.names = FALSE)
} else {
  # No location rows at all: pass the empty table through unchanged.
  write.csv(locations, snakemake@output[[1]], row.names = FALSE)
}
|
Loading…
Reference in New Issue