Refactor resample fused location to filter fused rows earlier

2019-12-09 19:15:57 -05:00 · 2019-12-09 19:15:57 -05:00 · 3947f1ec29
parent cb2ee1ec82
commit 3947f1ec29
1 changed files with 19 additions and 20 deletions
--- a/src/data/resample_fused_location.r
+++ b/src/data/resample_fused_location.r
@ -10,32 +10,31 @@ timezone <- snakemake@params[["timezone"]]
 consecutive_threshold <- snakemake@params[["consecutive_threshold"]]
 time_since_valid_location <- snakemake@params[["time_since_valid_location"]]
-locations <- read_csv(snakemake@input[["locations"]], col_types = cols())
+locations <- read_csv(snakemake@input[["locations"]], col_types = cols()) %>% filter(provider == "fused")
 phone_sensed_bins  <- read_csv(snakemake@input[["phone_sensed_bins"]], col_types = cols(local_date = col_character()))
 if(nrow(locations) > 0){
    sensed_minute_bins <- phone_sensed_bins %>% 
-    pivot_longer(-local_date, names_to = c("hour", "bin"), names_ptypes = list(hour = integer(), bin = integer()), names_sep = "_", values_to = "sensor_count") %>% 
+        pivot_longer(-local_date, names_to = c("hour", "bin"), names_ptypes = list(hour = integer(), bin = integer()), names_sep = "_", values_to = "sensor_count") %>% 
-    complete(nesting(local_date, hour), bin = seq(0, 59,1)) %>% 
+        complete(nesting(local_date, hour), bin = seq(0, 59,1)) %>% 
-    fill(sensor_count) %>% 
+        fill(sensor_count) %>% 
-    mutate(timestamp = as.numeric(as.POSIXct(paste0(local_date, " ", hour,":", bin,":00"), format = "%Y-%m-%d %H:%M:%S", tz = timezone)) * 1000 ) %>%
+        mutate(timestamp = as.numeric(as.POSIXct(paste0(local_date, " ", hour,":", bin,":00"), format = "%Y-%m-%d %H:%M:%S", tz = timezone)) * 1000 ) %>%
-    filter(sensor_count > 0) %>% 
+        filter(sensor_count > 0) %>% 
-    select(timestamp)
+        select(timestamp)
    resampled_locations <- locations %>%
-    filter(provider == "fused") %>% 
+        bind_rows(sensed_minute_bins) %>% 
-    bind_rows(sensed_minute_bins) %>% 
+        arrange(timestamp) %>% 
-    arrange(timestamp) %>% 
+        # We group and therefore, fill in, missing rows that appear after a valid fused location record and exist
-    # We group and therefore, fill in, missing rows that appear after a valid fused location record and exist
+        # within consecutive_threshold minutes from each other
-    # within consecutive_threshold minutes from each other
+        mutate(consecutive_time_diff = c(1, diff(timestamp)),
-    mutate(consecutive_time_diff = c(1, diff(timestamp)),
+            resample_group = cumsum(!is.na(double_longitude) | consecutive_time_diff > (1000 * 60 * consecutive_threshold))) %>% 
-         resample_group = cumsum(!is.na(double_longitude) | consecutive_time_diff > (1000 * 60 * consecutive_threshold))) %>% 
+        group_by(resample_group) %>% 
-    group_by(resample_group) %>% 
+        # drop rows that are logged after time_since_valid_location hours from the last valid fused location
-    # drop rows that are logged after time_since_valid_location hours from the last valid fused location
+        filter((timestamp - first(timestamp) < (1000 * 60 * 60 * time_since_valid_location))) %>% 
-    filter((timestamp - first(timestamp) < (1000 * 60 * 60 * time_since_valid_location))) %>% 
+        fill(-timestamp, -resample_group) %>% 
-    fill(-timestamp, -resample_group) %>% 
+        select(-consecutive_time_diff) %>% 
-    select(-consecutive_time_diff) %>% 
+        drop_na(double_longitude, double_latitude, accuracy)
    drop_na(double_longitude, double_latitude, accuracy)
    write.csv(resampled_locations,snakemake@output[[1]], row.names = F)
 } else {