Fix location bug and improve fused resampling
parent
b426e2ce47
commit
8fbafcbe0c
@@ -65,7 +65,7 @@ PHONE_VALID_SENSED_DAYS:
 RESAMPLE_FUSED_LOCATION:
     CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
-    TIME_SINCE_VALID_LOCATION: 12 # hours, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
+    TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
     TIMEZONE: *timezone

 BARNETT_LOCATION:
@@ -74,6 +74,7 @@ BARNETT_LOCATION:
     LOCATIONS_TO_USE: ALL # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED
     ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
     TIMEZONE: *timezone
+    MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features

 BLUETOOTH:
     DAY_SEGMENTS: *day_segments
@@ -576,9 +576,10 @@ See `Location (Barnett’s) Config Code`_
 =================  ===================
 Name               Description
 =================  ===================
-location_to_use    This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``ALL_EXCEPT_FUSED``, or ``RESAMPLE_FUSED``
+location_to_use    *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``ALL_EXCEPT_FUSED``, or ``RESAMPLE_FUSED``
 accuracy_limit     This is in meters. The sensor drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within the radius specified.
 timezone           The timezone used to calculate location.
+minutes_data_used  This is NOT a feature. This is just a quality control check, and if set to TRUE, a new column is added to the output file with the number of minutes containing location data that were used to compute all features. The more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
 features           Features to be computed, see table below
 =================  ===================

@@ -609,7 +610,11 @@ wkenddayrtn Same as circdnrtn but computed separately for w

 **Assumptions/Observations:**

-Types of location data to use. Aware Android and iOS clients can collect location coordinates through the phone's GPS or Google's fused location API. If your Aware client was ONLY configured to use GPS set ``location_to_use`` to ``ALL``, if your client was configured to use BOTH GPS and fused location set ``location_to_use`` to ``ALL_EXCEPT_FUSED`` to ignore fused coordinates, if your client was configured to use fused location set ``location_to_use`` to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days <phone-valid-sensed-days>`), this is done because Google's API only logs a new location coordinate pair when it is sufficiently different from the previous one.
+*Types of location data to use*
+
+Aware Android and iOS clients can collect location coordinates through the phone's GPS or Google's fused location API. If your Aware client was configured to use GPS only, set ``location_to_use`` to ``ALL``. If it was configured to use BOTH GPS and fused location, you can use ``ALL`` or set ``location_to_use`` to ``ALL_EXCEPT_FUSED`` to ignore fused coordinates. If it was configured to use fused location only, set ``location_to_use`` to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data, as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days <phone-valid-sensed-days>`); this is done because Google's API only logs a new location coordinate pair when it is sufficiently different from the previous one.
+
+There are two parameters associated with resampling fused location in the ``RESAMPLE_FUSED_LOCATION`` section of the ``config.yaml`` file. ``CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs across which the last known pair is replicated. For example, if participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second; in other words, we assume we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes. ``TIME_SINCE_VALID_LOCATION`` (in minutes, default 720, i.e. 12 hours) ensures the last known fused location is not carried over longer than this threshold even if the phone was sensing data continuously. For example, if participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am. If you have suggestions to modify or improve this imputation, let us know.
+
 *Significant Locations Identified*
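The two thresholds described above can be sketched outside the pipeline. This is an illustrative Python approximation, not part of the commit: the function name, the minute-indexed ``bins`` layout, and the carry-forward loop are all our own simplification of the replication rules (the real implementation is the R ``resample_fused_location`` script later in this diff).

```python
# Hypothetical sketch of the replication rules: a known fused coordinate is
# carried forward into later sensed minute bins only while (a) the gap to the
# previous bin stays under CONSECUTIVE_THRESHOLD minutes and (b) the bin falls
# within TIME_SINCE_VALID_LOCATION minutes of the original valid fix.
CONSECUTIVE_THRESHOLD = 30        # minutes
TIME_SINCE_VALID_LOCATION = 720   # minutes (12 hours)

def resample_fused(bins):
    """bins: sorted (minute_offset, coord_or_None) pairs; returns filled pairs."""
    filled = []
    last_coord, last_valid_at, prev_minute = None, None, None
    for minute, coord in bins:
        if coord is not None:                      # a real fused fix
            last_coord, last_valid_at = coord, minute
        elif last_coord is not None:
            gap = minute - prev_minute
            if gap > CONSECUTIVE_THRESHOLD or minute - last_valid_at >= TIME_SINCE_VALID_LOCATION:
                last_coord = None                  # stop carrying the fix forward
            else:
                coord = last_coord                 # replicate into this sensed bin
        filled.append((minute, coord))
        prev_minute = minute
    return filled

bins = [(0, (40.44, -79.99)), (10, None), (50, None)]  # 40-min gap before minute 50
print(resample_fused(bins))
# → [(0, (40.44, -79.99)), (10, (40.44, -79.99)), (50, None)]
```

The fix at minute 0 is replicated to minute 10 (gap 10 ≤ 30), but not to minute 50, since the 40-minute sensing gap exceeds the consecutive threshold.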
@@ -9,6 +9,12 @@ def optional_ar_input(wildcards):
     return ["data/raw/{pid}/plugin_ios_activity_recognition_with_datetime_unified.csv",
             "data/processed/{pid}/plugin_ios_activity_recognition_deltas.csv"]

+def optional_location_input(wildcards):
+    if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
+        return rules.resample_fused_location.output
+    else:
+        return "data/raw/{pid}/locations_with_datetime.csv",
+
 rule sms_features:
     input:
         "data/raw/{pid}/messages_with_datetime.csv"
@@ -68,13 +74,13 @@ rule ios_activity_recognition_deltas:

 rule location_barnett_features:
     input:
-        raw = "data/raw/{pid}/locations_raw.csv",
-        fused = rules.resample_fused_location.output
+        locations = optional_location_input
     params:
         features = config["BARNETT_LOCATION"]["FEATURES"],
         locations_to_use = config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"],
         accuracy_limit = config["BARNETT_LOCATION"]["ACCURACY_LIMIT"],
         timezone = config["BARNETT_LOCATION"]["TIMEZONE"],
+        minutes_data_used = config["BARNETT_LOCATION"]["MINUTES_DATA_USED"],
         day_segment = "{day_segment}"
     output:
         "data/processed/{pid}/location_barnett_{day_segment}.csv"
@@ -22,18 +22,30 @@ if(nrow(locations) > 0){
         select(timestamp)

     resampled_locations <- locations %>%
         bind_rows(sensed_minute_bins) %>%
         mutate(provider = replace_na(provider, "resampled")) %>%
         arrange(timestamp) %>%
         # We group, and later fill in, missing rows that appear after a valid fused location record and exist
         # within consecutive_threshold minutes from each other
         mutate(consecutive_time_diff = c(1, diff(timestamp)),
                resample_group = cumsum(!is.na(double_longitude) | consecutive_time_diff > (1000 * 60 * consecutive_threshold))) %>%
         group_by(resample_group) %>%
-        # drop rows that are logged after time_since_valid_location hours from the last valid fused location
-        filter((timestamp - first(timestamp) < (1000 * 60 * 60 * time_since_valid_location))) %>%
+        # drop rows that are logged after time_since_valid_location minutes from the last valid fused location
+        filter((timestamp - first(timestamp) < (1000 * 60 * time_since_valid_location))) %>%
         fill(-timestamp, -resample_group) %>%
         select(-consecutive_time_diff) %>%
-        drop_na(double_longitude, double_latitude, accuracy)
+        drop_na(double_longitude, double_latitude, accuracy) %>%
+        # Add local date_time
+        mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
+               local_date_time = format(utc_date_time, tz = timezone, usetz = F)) %>%
+        separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
+        separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
+        mutate(local_hour = as.numeric(local_hour), local_minute = as.numeric(local_minute)) %>%
+        # Delete resampled rows that exist in the same minute as other original (fused) rows
+        group_by(local_date, local_hour, local_minute) %>%
+        mutate(n = n()) %>%
+        filter(n == 1 | (n > 1 & provider == "fused")) %>%
+        select(-n)

     write.csv(resampled_locations, snakemake@output[[1]], row.names = F)
 } else {
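The ``resample_group = cumsum(...)`` line is the core of this hunk: a new group starts at every valid fix (non-missing longitude) and at every gap larger than the consecutive threshold, so the later ``fill()`` can only propagate a coordinate within its own group. A rough Python equivalent, illustrative only (the function name and argument layout are ours, not the pipeline's):

```python
# Sketch of the cumsum grouping trick: running sum of "group starts here"
# flags yields a group id per row, exactly like R's cumsum over a logical.
from itertools import accumulate

def resample_groups(timestamps_ms, longitudes, consecutive_threshold_min=30):
    # mirror R's c(1, diff(timestamp)): first row always starts a group
    diffs = [1] + [b - a for a, b in zip(timestamps_ms, timestamps_ms[1:])]
    starts = [int(lon is not None or d > 1000 * 60 * consecutive_threshold_min)
              for lon, d in zip(longitudes, diffs)]
    return list(accumulate(starts))   # running sum = group id per row

ts = [0, 60_000, 120_000, 3_720_000]   # last row is 60 min after the third
lon = [-79.99, None, None, None]       # only the first row is a valid fix
print(resample_groups(ts, lon))        # → [1, 1, 1, 2]
```

The last row lands in its own group because its 60-minute gap exceeds the 30-minute threshold, so a fill step would never copy the first row's coordinates into it.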
@@ -25,7 +25,7 @@ function(locations_df,itrvl=10,accuracylim=51,r=NULL,w=NULL,tint_m=NULL,tint_k=N
     numitrvl=1
     cat("Collapse data within",itrvl,"second intervals...\n")
     for(i in 2:nrow(mat)){
-        #ProgressBar(nrow(mat)-1,i-1)
+        ProgressBar(nrow(mat)-1,i-1)
         if(mat[i,1]/1000<tstart+itrvl){
             nextline[3]=nextline[3]+mat[i,2]
             nextline[4]=nextline[4]+mat[i,3]
@@ -62,7 +62,7 @@ function(locations_df,itrvl=10,accuracylim=51,r=NULL,w=NULL,tint_m=NULL,tint_k=N
     curind=1
     cat("Convert from X/Y to flights/pauses...\n")
     for(i in 1:nrow(avgmat)){
-        #ProgressBar(nrow(avgmat),i)
+        ProgressBar(nrow(avgmat),i)
         if(avgmat[i,1]==4){
             outmat=rbind(outmat,ExtractFlights(avgmat[curind:(i-1),c(5,6,2)],r,w),
                          c(avgmat[i,1],NA,NA,avgmat[i,2],NA,NA,avgmat[i,3]))
@@ -56,7 +56,7 @@ function(mat,mindur=300,r=75){
         outmat=mat[1:(flatmat[1,1]-1),]
     }
     for(i in 1:nrow(flatmat)){
-        #ProgressBar(nrow(flatmat),i)
+        ProgressBar(nrow(flatmat),i)
         outmat=rbind(outmat,Collapse2Pause(mat[flatmat[i,1]:flatmat[i,2],]))
         if(i<nrow(flatmat) && flatmat[i,2]<flatmat[i+1,1]-1){
             outmat=rbind(outmat,mat[(flatmat[i,2]+1):(flatmat[i+1,1]-1),])
@@ -1,8 +1,11 @@
 source("renv/activate.R")
-# Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility
-file.sources = list.files(c("src/features/location_barnett"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE)
-sapply(file.sources,source,.GlobalEnv)
-
 library(dplyr)

-write_empty_file <- function(file_path, requested_feature){
+write_empty_file <- function(file_path, requested_features){
     write.csv(data.frame(local_date= character(),
                          location_barnett_hometime= numeric(),
                          location_barnett_disttravelled= numeric(),
@@ -18,48 +21,69 @@ write_empty_file <- function(file_path, requested_feature){
                          location_barnett_siglocentropy= numeric(),
                          location_barnett_minsmissing= numeric(),
                          location_barnett_circdnrtn= numeric(),
-                         location_barnett_wkenddayrtn= numeric()
-                         ) %>% select(requested_feature), file_path, row.names = F)
+                         location_barnett_wkenddayrtn= numeric(),
+                         minutes_data_used= numeric()
+                         ) %>% select(requested_features), file_path, row.names = F)
 }

+# Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility
+file.sources = list.files(c("src/features/location_barnett"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE)
+sapply(file.sources,source,.GlobalEnv)
+
+location <- read.csv(snakemake@input[["locations"]], stringsAsFactors = F)
+# The choice between RESAMPLE_FUSED and the original location data happens at the rule level in the function
+# optional_location_input in features.snakefile
 locations_to_use <- snakemake@params[["locations_to_use"]]
 accuracy_limit <- snakemake@params[["accuracy_limit"]]
 timezone <- snakemake@params[["timezone"]]
-requested_feature <- intersect(unlist(snakemake@params["features"], use.names = F),
+minutes_data_used <- snakemake@params[["minutes_data_used"]]
+requested_features <- intersect(unlist(snakemake@params["features"], use.names = F),
                                c("hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","minsmissing","circdnrtn","wkenddayrtn"))
-requested_feature <- c("local_date", paste("location_barnett", requested_feature, sep = "_"))
+requested_features <- c("local_date", paste("location_barnett", requested_features, sep = "_"))
+if(minutes_data_used)
+    requested_features <- c(requested_features, "minutes_data_used")

-# By default we use all raw locations: fused without resampling and not fused (gps, network)
-location <- read.csv(snakemake@input[["raw"]], stringsAsFactors = F) %>%
-    select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)
-
-if(locations_to_use == "ALL_EXCEPT_FUSED"){
-    location <- location %>% filter(provider != "fused")
-} else if (locations_to_use == "RESAMPLE_FUSED"){
-    location <- read.csv(snakemake@input[["fused"]], stringsAsFactors = F) %>%
-        select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)
-} else if (locations_to_use != "ALL"){
+if(!locations_to_use %in% c("ALL_EXCEPT_FUSED", "RESAMPLE_FUSED", "ALL")){
     print("Unknown filter, provide one of the following three: ALL, ALL_EXCEPT_FUSED, or RESAMPLE_FUSED")
     quit(save = "no", status = 1, runLast = FALSE)
 }

+# excludes fused and resampled rows
+if(locations_to_use == "ALL_EXCEPT_FUSED")
+    location <- location %>% filter(provider == "gps")
+
+# Remove 0,0 location coordinates
+location <- location %>% filter(double_latitude != 0 & double_longitude != 0)
+
+# Excludes datasets with less than 24 hours of data
+if(max(location$timestamp) - min(location$timestamp) < 86400000)
+    location <- head(location, 0)
+
 if (nrow(location) > 1){
-    features <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)
-    if(is.null(features)){
-        write_empty_file(snakemake@output[[1]], requested_feature)
+    # Count how many minutes of data we use to get location features
+    # Some minutes have multiple fused rows
+    location_minutes_used <- location %>%
+        group_by(local_date, local_hour) %>%
+        summarise(n_minutes = n_distinct(local_minute)) %>%
+        group_by(local_date) %>%
+        summarise(minutes_data_used = sum(n_minutes)) %>%
+        select(local_date, minutes_data_used)
+
+    location <- location %>%
+        select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)
+
+    outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)
+
+    if(is.null(outputMobility)){
+        write_empty_file(snakemake@output[[1]], requested_features)
     } else{
         # Copy index (dates) as a column
-        outmatrix <- cbind(rownames(features$featavg), features$featavg)
-        outmatrix <- as.data.frame(outmatrix)
-        outmatrix[-1] <- lapply(lapply(outmatrix[-1], as.character), as.numeric)
-        colnames(outmatrix)=c("local_date",tolower(paste("location_barnett", colnames(features$featavg), sep = "_")))
-        write.csv(outmatrix %>% select(requested_feature), snakemake@output[[1]], row.names = F)
+        features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg)
+        features <- as.data.frame(features)
+        features[-1] <- lapply(lapply(features[-1], as.character), as.numeric)
+        colnames(features) = c("local_date", tolower(paste("location_barnett", colnames(outputMobility$featavg), sep = "_")))
+        # Add the minute count column
+        features <- left_join(features, location_minutes_used, by = "local_date")
+        write.csv(features %>% select(requested_features), snakemake@output[[1]], row.names = F)
     }

 } else {
-    write_empty_file(snakemake@output[[1]], requested_feature)
+    write_empty_file(snakemake@output[[1]], requested_features)
 }
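The ``location_minutes_used`` block added above counts distinct minutes with location data per local date, so several fused rows inside the same minute count only once. A toy Python sketch of that count, illustrative only (the function name and tuple layout are ours):

```python
# Sketch of the minutes_data_used quality-control count: dedupe rows to
# (hour, minute) per date, then report the number of distinct minutes.
from collections import defaultdict

def minutes_data_used(rows):
    """rows: (local_date, local_hour, local_minute) tuples; returns {date: minutes}."""
    seen = defaultdict(set)
    for date, hour, minute in rows:
        seen[date].add((hour, minute))   # duplicate rows within a minute collapse here
    return {date: len(minutes) for date, minutes in seen.items()}

rows = [("2020-03-01", 9, 15), ("2020-03-01", 9, 15), ("2020-03-01", 9, 16),
        ("2020-03-02", 22, 5)]
print(minutes_data_used(rows))   # → {'2020-03-01': 2, '2020-03-02': 1}
```

The two rows at 9:15 on 2020-03-01 count as one minute, matching the R comment that "some minutes have multiple fused rows."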