From 32472461ec8556f4d07585ed370da513816c90dd Mon Sep 17 00:00:00 2001
From: JulioV <JulioV@users.noreply.github.com>
Date: Wed, 26 May 2021 14:04:29 -0400
Subject: [PATCH] - Fix bug where phone data yield was required to process
 location data even when it is not needed

- Deduplicate location rows that share a timestamp, keeping the row with the best (lowest) accuracy value
---
 docs/change-log.md                | 2 ++
 rules/common.smk                  | 5 +++++
 rules/preprocessing.smk           | 2 +-
 src/data/process_location_types.R | 9 +++++++--
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/docs/change-log.md b/docs/change-log.md
index cc444738..7e68e20f 100644
--- a/docs/change-log.md
+++ b/docs/change-log.md
@@ -4,6 +4,8 @@
 - Fix bug that did not correctly parse participants with more than 2 phones or more than 1 wearable
 - New keyboard features
 - Add the `EXCLUDE_SLEEP` module for steps intraday features
+- Fix bug where phone data yield was required to process location data even when it is not needed (it is only used by `ALL_RESAMPLED` and `FUSED_RESAMPLED`)
+- Deduplicate location rows that share a timestamp, keeping the row with the best (lowest) accuracy value
 ## v1.2.0
 - Sleep summary and intraday features are more consistent.
 - Add wake and bedtime features for sleep summary data.
diff --git a/rules/common.smk b/rules/common.smk
index 550b2507..da31700a 100644
--- a/rules/common.smk
+++ b/rules/common.smk
@@ -11,6 +11,11 @@ def get_script_language(script_path):
 
 
 # Features.smk #########################################################################################################
+def optional_phone_yield_input_for_locations(wildcards):
+    if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] in ["ALL_RESAMPLED","FUSED_RESAMPLED"]:
+        return "data/interim/{pid}/phone_yielded_timestamps.csv"
+    return []
+
 def get_barnett_daily(wildcards):
     if wildcards.provider_key.upper() == "BARNETT":
         return "data/interim/{pid}/phone_locations_barnett_daily.csv"
diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk
index 281ecb86..0d393463 100644
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@@ -94,7 +94,7 @@ rule unify_ios_android:
 rule process_phone_locations_types:
     input:
         locations = "data/raw/{pid}/phone_locations_raw.csv",
-        phone_sensed_timestamps = "data/interim/{pid}/phone_yielded_timestamps.csv",
+        phone_sensed_timestamps = optional_phone_yield_input_for_locations,
     params:
         consecutive_threshold = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"],
         time_since_valid_location = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"],
diff --git a/src/data/process_location_types.R b/src/data/process_location_types.R
index 50e8705e..8a2d58d5 100644
--- a/src/data/process_location_types.R
+++ b/src/data/process_location_types.R
@@ -7,10 +7,13 @@ consecutive_threshold <- snakemake@params[["consecutive_threshold"]]
 time_since_valid_location <- snakemake@params[["time_since_valid_location"]]
 locations_to_use <- snakemake@params[["locations_to_use"]]
 
-phone_sensed_timestamps  <- read_csv(snakemake@input[["phone_sensed_timestamps"]], col_types = cols_only(timestamp = col_double()))
 locations <- read.csv(snakemake@input[["locations"]]) %>% 
             filter(double_latitude != 0 & double_longitude != 0) %>% 
-            drop_na(double_longitude, double_latitude)
+            drop_na(double_longitude, double_latitude) %>% 
+            group_by(timestamp) %>% # keep only the row with the best accuracy if two or more have the same timestamp
+            filter(accuracy == min(accuracy, na.rm=TRUE)) %>%  
+            filter(row_number()==1) %>% 
+            ungroup()
 
 if(!locations_to_use %in% c("ALL", "FUSED_RESAMPLED", "GPS", "ALL_RESAMPLED")){
     print("Unkown location filter, provide one of the following three: ALL, GPS, ALL_RESAMPLED, or FUSED_RESAMPLED")
@@ -39,6 +42,8 @@ if(locations_to_use == "ALL"){
     }
 
     if(nrow(locations) > 0){
+        phone_sensed_timestamps  <- read_csv(snakemake@input[["phone_sensed_timestamps"]], col_types = cols_only(timestamp = col_double()))
+
         processed_locations <- locations %>%
             distinct(timestamp, .keep_all = TRUE) %>% 
             bind_rows(phone_sensed_timestamps) %>%