diff --git a/config.yaml b/config.yaml
index a16699ce..a56c3ef4 100644
--- a/config.yaml
+++ b/config.yaml
@@ -46,6 +46,11 @@ PHONE_VALID_SENSED_DAYS:
   MIN_VALID_HOURS: 20 # (out of 24)
   MIN_BINS_PER_HOUR: 8 # (out of 60min/BIN_SIZE bins)
 
+RESAMPLE_FUSED_LOCATION:
+  CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
+  TIME_SINCE_VALID_LOCATION: 12 # hours, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
+  TIMEZONE: *timezone
+
 BARNETT_LOCATION:
   ACCURACY_LIMIT: 51 # filters location coordinates with an accuracy higher than this
   TIMEZONE: *timezone
diff --git a/rules/preprocessing.snakefile b/rules/preprocessing.snakefile
index 564b2a57..29bc3b06 100644
--- a/rules/preprocessing.snakefile
+++ b/rules/preprocessing.snakefile
@@ -51,4 +51,18 @@ rule unify_ios_android:
     output:
         "data/raw/{pid}/{sensor}_with_datetime_unified.csv"
     script:
-        "../src/data/unify_ios_android.R"
\ No newline at end of file
+        "../src/data/unify_ios_android.R"
+
+rule resample_fused_location:
+    input:
+        locations = "data/raw/{pid}/locations_raw.csv",
+        phone_sensed_bins = rules.phone_sensed_bins.output
+    params:
+        bin_size = config["PHONE_VALID_SENSED_DAYS"]["BIN_SIZE"],
+        timezone = config["RESAMPLE_FUSED_LOCATION"]["TIMEZONE"],
+        consecutive_threshold = config["RESAMPLE_FUSED_LOCATION"]["CONSECUTIVE_THRESHOLD"],
+        time_since_valid_location = config["RESAMPLE_FUSED_LOCATION"]["TIME_SINCE_VALID_LOCATION"]
+    output:
+        "data/raw/{pid}/locations_resampled.csv"
+    script:
+        "../src/data/resample_fused_location.R"
\ No newline at end of file
diff --git a/src/data/resample_fused_location.R b/src/data/resample_fused_location.R
new file mode 100644
index 00000000..ddec05a7
--- /dev/null
+++ b/src/data/resample_fused_location.R
@@ -0,0 +1,57 @@
+source("packrat/init.R")
+
+library(dplyr)
+library(readr)
+library(tidyr)
+
+# Resample fused location rows: sensed minutes that follow a valid fused
+# location row inherit its values while the phone keeps sensing.
+#
+# Snakemake params:
+#   bin_size: read from the rule but not used by this script
+#   timezone: timezone used to turn local date, hour and minute into timestamps
+#   consecutive_threshold: minutes; a longer gap between rows starts a new group
+#   time_since_valid_location: hours; rows this long after their group's first row are dropped
+bin_size <- snakemake@params[["bin_size"]]
+timezone <- snakemake@params[["timezone"]]
+consecutive_threshold <- snakemake@params[["consecutive_threshold"]]
+time_since_valid_location <- snakemake@params[["time_since_valid_location"]]
+
+locations <- read_csv(snakemake@input[["locations"]], col_types = cols())
+phone_sensed_bins <- read_csv(snakemake@input[["phone_sensed_bins"]], col_types = cols(local_date = col_character()))
+
+if (nrow(locations) > 0) {
+  # One row per sensed minute, keyed by its epoch timestamp in milliseconds
+  sensed_minute_bins <- phone_sensed_bins %>%
+    pivot_longer(-local_date, names_to = c("hour", "bin"), names_sep = "_", values_to = "sensor_count") %>%
+    # Convert explicitly: names_ptypes only coerces on tidyr 1.0.x, newer releases reject character names
+    mutate(hour = as.integer(hour), bin = as.integer(bin)) %>%
+    # Expand every (local_date, hour) to 60 one-minute bins; added minutes take the count of the row above
+    complete(nesting(local_date, hour), bin = seq(0, 59, 1)) %>%
+    fill(sensor_count) %>%
+    mutate(timestamp = as.numeric(as.POSIXct(paste0(local_date, " ", hour, ":", bin, ":00"), format = "%Y-%m-%d %H:%M:%S", tz = timezone)) * 1000) %>%
+    filter(sensor_count > 0) %>%
+    select(timestamp)
+
+  resampled_locations <- locations %>%
+    filter(provider == "fused") %>%
+    bind_rows(sensed_minute_bins) %>%
+    arrange(timestamp) %>%
+    # A new group starts at every valid fused location row and after every gap longer than
+    # consecutive_threshold minutes, so a location is only carried across consecutive sensed minutes
+    mutate(consecutive_time_diff = c(1, diff(timestamp)),
+           resample_group = cumsum(!is.na(double_longitude) | consecutive_time_diff > (1000 * 60 * consecutive_threshold))) %>%
+    group_by(resample_group) %>%
+    # Drop rows logged time_since_valid_location hours or more after the first row of their group
+    filter((timestamp - first(timestamp) < (1000 * 60 * 60 * time_since_valid_location))) %>%
+    fill(-timestamp, -resample_group) %>%
+    # Remove the grouping and the helper columns so they are not written to the output file
+    ungroup() %>%
+    select(-consecutive_time_diff, -resample_group) %>%
+    # Sensed minutes that never followed a valid location are still empty; discard them
+    drop_na(double_longitude, double_latitude, accuracy)
+
+  write.csv(resampled_locations, snakemake@output[[1]], row.names = FALSE)
+} else {
+  write.csv(locations, snakemake@output[[1]], row.names = FALSE)
+}