From 32472461ec8556f4d07585ed370da513816c90dd Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 26 May 2021 14:04:29 -0400 Subject: [PATCH] - Fix bug when no phone data yield is needed to process location data - Remove location rows with the same timestamp based on their accuracy --- docs/change-log.md | 2 ++ rules/common.smk | 5 +++++ rules/preprocessing.smk | 2 +- src/data/process_location_types.R | 9 +++++++-- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/docs/change-log.md b/docs/change-log.md index cc444738..7e68e20f 100644 --- a/docs/change-log.md +++ b/docs/change-log.md @@ -4,6 +4,8 @@ - Fix bug that did not correctly parse participants with more than 2 phones or more than 1 wearable - New keyboard features - Add the `EXCLUDE_SLEEP` module for steps intraday features +- Fix bug when no phone data yield is needed to process location data +- Remove location rows with the same timestamp based on their accuracy ## v1.2.0 - Sleep summary and intraday features are more consistent. - Add wake and bedtime features for sleep summary data. 
diff --git a/rules/common.smk b/rules/common.smk index 550b2507..da31700a 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -11,6 +11,11 @@ def get_script_language(script_path): # Features.smk ######################################################################################################### +def optional_phone_yield_input_for_locations(wildcards): + if config["PHONE_LOCATIONS"]["LOCATIONS_TO_USE"] in ["ALL_RESAMPLED","FUSED_RESAMPLED"]: + return "data/interim/{pid}/phone_yielded_timestamps.csv" + return [] + def get_barnett_daily(wildcards): if wildcards.provider_key.upper() == "BARNETT": return "data/interim/{pid}/phone_locations_barnett_daily.csv" diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index 281ecb86..0d393463 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -94,7 +94,7 @@ rule unify_ios_android: rule process_phone_locations_types: input: locations = "data/raw/{pid}/phone_locations_raw.csv", - phone_sensed_timestamps = "data/interim/{pid}/phone_yielded_timestamps.csv", + phone_sensed_timestamps = optional_phone_yield_input_for_locations, params: consecutive_threshold = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD"], time_since_valid_location = config["PHONE_LOCATIONS"]["FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION"], diff --git a/src/data/process_location_types.R b/src/data/process_location_types.R index 50e8705e..8a2d58d5 100644 --- a/src/data/process_location_types.R +++ b/src/data/process_location_types.R @@ -7,10 +7,13 @@ consecutive_threshold <- snakemake@params[["consecutive_threshold"]] time_since_valid_location <- snakemake@params[["time_since_valid_location"]] locations_to_use <- snakemake@params[["locations_to_use"]] -phone_sensed_timestamps <- read_csv(snakemake@input[["phone_sensed_timestamps"]], col_types = cols_only(timestamp = col_double())) locations <- read.csv(snakemake@input[["locations"]]) %>% filter(double_latitude != 0 & double_longitude != 0) %>% - 
drop_na(double_longitude, double_latitude) + drop_na(double_longitude, double_latitude) %>% + group_by(timestamp) %>% # keep only the row with the best accuracy if two or more have the same timestamp + # order() sorts NA last and is stable on ties, so a timestamp whose accuracy values are all NA keeps its first row instead of being dropped + filter(row_number() == order(accuracy)[1]) %>% + ungroup() if(!locations_to_use %in% c("ALL", "FUSED_RESAMPLED", "GPS", "ALL_RESAMPLED")){ print("Unkown location filter, provide one of the following three: ALL, GPS, ALL_RESAMPLED, or FUSED_RESAMPLED") @@ -39,6 +42,8 @@ if(locations_to_use == "ALL"){ } if(nrow(locations) > 0){ + phone_sensed_timestamps <- read_csv(snakemake@input[["phone_sensed_timestamps"]], col_types = cols_only(timestamp = col_double())) + processed_locations <- locations %>% distinct(timestamp, .keep_all = TRUE) %>% bind_rows(phone_sensed_timestamps) %>%