- Fix bug where the phone data yield file was required even when it is not needed to process location data

- Remove duplicate location rows that share a timestamp, keeping only the row with the best accuracy
2021-05-26 14:04:29 -04:00 · 2021-05-26 14:04:29 -04:00 · 32472461ec
parent 9b21196f35
commit 32472461ec
4 changed files with 15 additions and 3 deletions
--- a/docs/change-log.md
+++ b/docs/change-log.md
@ -4,6 +4,8 @@
 - Fix bug that did not correctly parse participants with more than 2 phones or more than 1 wearable
 - New keyboard features
 - Add the `EXCLUDE_SLEEP` module for steps intraday features
+- Fix bug where the phone data yield file was required even when it is not needed to process location data
+- Remove duplicate location rows that share a timestamp, keeping only the row with the best accuracy
 ## v1.2.0
 - Sleep summary and intraday features are more consistent.
 - Add wake and bedtime features for sleep summary data.
--- a/rules/common.smk
+++ b/rules/common.smk
@ -11,6 +11,11 @@ def get_script_language(script_path):


 # Features.smk #########################################################################################################
+def optional_phone_yield_input_for_locations(wildcards):
+    if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] in ["ALL_RESAMPLED","FUSED_RESAMPLED"]:
+        return "data/interim/{pid}/phone_yielded_timestamps.csv"
+    return []
+
 def get_barnett_daily(wildcards):
    if wildcards.provider_key.upper() == "BARNETT":
        return "data/interim/{pid}/phone_locations_barnett_daily.csv"
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@ -94,7 +94,7 @@ rule unify_ios_android:
 rule process_phone_locations_types:
    input:
        locations = "data/raw/{pid}/phone_locations_raw.csv",
-        phone_sensed_timestamps = "data/interim/{pid}/phone_yielded_timestamps.csv",
+        phone_sensed_timestamps = optional_phone_yield_input_for_locations,
    params:
        consecutive_threshold = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"],
        time_since_valid_location = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"],
--- a/src/data/process_location_types.R
+++ b/src/data/process_location_types.R
@ -7,10 +7,13 @@ consecutive_threshold <- snakemake@params[["consecutive_threshold"]]
 time_since_valid_location <- snakemake@params[["time_since_valid_location"]]
 locations_to_use <- snakemake@params[["locations_to_use"]]

-phone_sensed_timestamps  <- read_csv(snakemake@input[["phone_sensed_timestamps"]], col_types = cols_only(timestamp = col_double()))
 locations <- read.csv(snakemake@input[["locations"]]) %>% 
            filter(double_latitude != 0 & double_longitude != 0) %>% 
-            drop_na(double_longitude, double_latitude)
+            drop_na(double_longitude, double_latitude) %>% 
+            group_by(timestamp) %>% # keep only the row with the best accuracy if two or more have the same timestamp
+            filter(accuracy == min(accuracy, na.rm=TRUE)) %>%  
+            filter(row_number()==1) %>% 
+            ungroup()

 if(!locations_to_use %in% c("ALL", "FUSED_RESAMPLED", "GPS", "ALL_RESAMPLED")){
    print("Unkown location filter, provide one of the following three: ALL, GPS, ALL_RESAMPLED, or FUSED_RESAMPLED")
@ -39,6 +42,8 @@ if(locations_to_use == "ALL"){
    }

    if(nrow(locations) > 0){
+        phone_sensed_timestamps  <- read_csv(snakemake@input[["phone_sensed_timestamps"]], col_types = cols_only(timestamp = col_double()))
+
        processed_locations <- locations %>%
            distinct(timestamp, .keep_all = TRUE) %>% 
            bind_rows(phone_sensed_timestamps) %>%