From 8fbafcbe0ca7aa68a1e3503201cb8718838541cb Mon Sep 17 00:00:00 2001 From: JulioV Date: Thu, 11 Jun 2020 12:25:49 -0400 Subject: [PATCH] Fix location bug and improve fused resampling --- config.yaml | 3 +- docs/features/extracted.rst | 11 ++- rules/features.snakefile | 10 ++- src/data/resample_fused_location.R | 20 ++++-- src/features/location_barnett/GPS2MobMat.R | 4 +- src/features/location_barnett/GuessPause.R | 2 +- src/features/location_barnett_features.R | 80 ++++++++++++++-------- 7 files changed, 89 insertions(+), 41 deletions(-) diff --git a/config.yaml b/config.yaml index 264c1fae..456c1cb1 100644 --- a/config.yaml +++ b/config.yaml @@ -65,7 +65,7 @@ PHONE_VALID_SENSED_DAYS: RESAMPLE_FUSED_LOCATION: CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold - TIME_SINCE_VALID_LOCATION: 12 # hours, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row + TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row TIMEZONE: *timezone BARNETT_LOCATION: @@ -74,6 +74,7 @@ BARNETT_LOCATION: LOCATIONS_TO_USE: ALL # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. 
This number means there's a 68% probability the true location is within this radius TIMEZONE: *timezone + MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features BLUETOOTH: DAY_SEGMENTS: *day_segments diff --git a/docs/features/extracted.rst index 1d29625a..dde687fa 100644 --- a/docs/features/extracted.rst +++ b/docs/features/extracted.rst @@ -576,9 +576,10 @@ See `Location (Barnett’s) Config Code`_ ================= =================== Name Description ================= =================== -location_to_use The specifies what type of location data will be use in the analysis. Possible options are ``ALL``, ``ALL_EXCEPT_FUSED`` OR ``RESAMPLE_FUSED`` +location_to_use *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``ALL_EXCEPT_FUSED`` OR ``RESAMPLE_FUSED`` accuracy_limit This is in meters. The sensor drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius specified. -timezone The timezone used to calculate location. +timezone The timezone used to calculate location. +minutes_data_used This is NOT a feature. This is just a quality control check, and if set to TRUE, a new column is added to the output file with the number of minutes containing location data that were used to compute all features. The more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough. features Features to be computed, see table below ================= =================== @@ -609,7 +610,11 @@ wkenddayrtn Same as circdnrtn but computed separately for w **Assumptions/Observations:** -Types of location data to use.
Aware Android and iOS clients can collect location coordinates through the phone's GPS or Google's fused location API. If your Aware client was ONLY configured to use GPS set ``location_to_use`` to ``ALL``, if your client was configured to use BOTH GPS and fused location set ``location_to_use`` to ``ALL_EXCEPT_FUSED`` to ignore fused coordinates, if your client was configured to use fused location set ``location_to_use`` to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days `), this is done because Google's API only logs a new location coordinate pair when it is sufficiently different from the previous one. +*Types of location data to use* + +Aware Android and iOS clients can collect location coordinates through the phone's GPS or Google's fused location API. If your Aware client was ONLY configured to use GPS set ``location_to_use`` to ``ALL``, if your client was configured to use BOTH GPS and fused location you can use ``ALL`` or set ``location_to_use`` to ``ALL_EXCEPT_FUSED`` to ignore fused coordinates, if your client was configured to use fused location only, set ``location_to_use`` to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days `), this is done because Google's API only logs a new location coordinate pair when it is sufficiently different from the previous one. + +There are two parameters associated with resampling fused location in the ``RESAMPLE_FUSED_LOCATION`` section of the ``config.yaml`` file. 
``CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second, in other words, we assume that we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). ``TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours): the last known fused location won't be carried over longer than this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know. *Significant Locations Identified* diff --git a/rules/features.snakefile b/rules/features.snakefile index 3fc73dad..5b38e15a 100644 --- a/rules/features.snakefile +++ b/rules/features.snakefile @@ -9,6 +9,12 @@ def optional_ar_input(wildcards): return ["data/raw/{pid}/plugin_ios_activity_recognition_with_datetime_unified.csv", "data/processed/{pid}/plugin_ios_activity_recognition_deltas.csv"] +def optional_location_input(wildcards): + if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": + return rules.resample_fused_location.output + else: + return "data/raw/{pid}/locations_with_datetime.csv", + rule sms_features: input: "data/raw/{pid}/messages_with_datetime.csv" @@ -68,13 +74,13 @@ rule ios_activity_recognition_deltas: rule location_barnett_features: input: - raw = "data/raw/{pid}/locations_raw.csv", - fused = rules.resample_fused_location.output + locations = optional_location_input params: features = config["BARNETT_LOCATION"]["FEATURES"], locations_to_use = 
config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"], accuracy_limit = config["BARNETT_LOCATION"]["ACCURACY_LIMIT"], timezone = config["BARNETT_LOCATION"]["TIMEZONE"], + minutes_data_used = config["BARNETT_LOCATION"]["MINUTES_DATA_USED"], day_segment = "{day_segment}" output: "data/processed/{pid}/location_barnett_{day_segment}.csv" diff --git a/src/data/resample_fused_location.R b/src/data/resample_fused_location.R index 8fd93906..b31048c5 100644 --- a/src/data/resample_fused_location.R +++ b/src/data/resample_fused_location.R @@ -22,18 +22,30 @@ if(nrow(locations) > 0){ select(timestamp) resampled_locations <- locations %>% - bind_rows(sensed_minute_bins) %>% + bind_rows(sensed_minute_bins) %>% + mutate(provider = replace_na(provider, "resampled")) %>% arrange(timestamp) %>% # We group and therefore, fill in, missing rows that appear after a valid fused location record and exist # within consecutive_threshold minutes from each other mutate(consecutive_time_diff = c(1, diff(timestamp)), resample_group = cumsum(!is.na(double_longitude) | consecutive_time_diff > (1000 * 60 * consecutive_threshold))) %>% group_by(resample_group) %>% - # drop rows that are logged after time_since_valid_location hours from the last valid fused location - filter((timestamp - first(timestamp) < (1000 * 60 * 60 * time_since_valid_location))) %>% + # drop rows that are logged after time_since_valid_location minutes from the last valid fused location + filter((timestamp - first(timestamp) < (1000 * 60 * time_since_valid_location))) %>% fill(-timestamp, -resample_group) %>% select(-consecutive_time_diff) %>% - drop_na(double_longitude, double_latitude, accuracy) + drop_na(double_longitude, double_latitude, accuracy) %>% + # Add local date_time + mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"), + local_date_time = format(utc_date_time, tz = timezone, usetz = F)) %>% + separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>% + 
separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>% + mutate(local_hour = as.numeric(local_hour), local_minute = as.numeric(local_minute)) %>% + # Delete resampled rows that exist in the same minute as other original (fused) rows + group_by(local_date, local_hour, local_minute) %>% + mutate(n = n()) %>% + filter(n == 1 | (n > 1 & provider == "fused")) %>% + select(-n) write.csv(resampled_locations,snakemake@output[[1]], row.names = F) } else { diff --git a/src/features/location_barnett/GPS2MobMat.R b/src/features/location_barnett/GPS2MobMat.R index 89eeab7a..a734104f 100644 --- a/src/features/location_barnett/GPS2MobMat.R +++ b/src/features/location_barnett/GPS2MobMat.R @@ -25,7 +25,7 @@ function(locations_df,itrvl=10,accuracylim=51,r=NULL,w=NULL,tint_m=NULL,tint_k=N numitrvl=1 cat("Collapse data within",itrvl,"second intervals...\n") for(i in 2:nrow(mat)){ - #ProgressBar(nrow(mat)-1,i-1) + ProgressBar(nrow(mat)-1,i-1) if(mat[i,1]/1000% select(requested_feature), file_path, row.names = F) + location_barnett_wkenddayrtn= numeric(), + minutes_data_used= numeric() + ) %>% select(requested_features), file_path, row.names = F) } -# Load Ian Barnett's code. 
Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility -file.sources = list.files(c("src/features/location_barnett"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE) -sapply(file.sources,source,.GlobalEnv) - +location <- read.csv(snakemake@input[["locations"]], stringsAsFactors = F) +# The choice between RESAMPLE_FUSED and the original location data happens at the rule level in the function +# optional_location_input in features.snakefile locations_to_use <- snakemake@params[["locations_to_use"]] accuracy_limit <- snakemake@params[["accuracy_limit"]] timezone <- snakemake@params[["timezone"]] -requested_feature <- intersect(unlist(snakemake@params["features"], use.names = F), +minutes_data_used <- snakemake@params[["minutes_data_used"]] +requested_features <- intersect(unlist(snakemake@params["features"], use.names = F), c("hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","minsmissing","circdnrtn","wkenddayrtn")) -requested_feature <- c("local_date", paste("location_barnett", requested_feature, sep = "_")) +requested_features <- c("local_date", paste("location_barnett", requested_features, sep = "_")) +if(minutes_data_used) + requested_features <- c(requested_features, "minutes_data_used") -# By deafult we use all raw locations: fused without resampling and not fused (gps, network) -location <- read.csv(snakemake@input[["raw"]], stringsAsFactors = F) %>% - select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy) - -if(locations_to_use == "ALL_EXCEPT_FUSED"){ - location <- location %>% filter(provider != "fused") -} else if (locations_to_use == "RESAMPLE_FUSED"){ - location <- read.csv(snakemake@input[["fused"]], stringsAsFactors = F) %>% - select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy) -} else if (locations_to_use != 
"ALL"){ +if(!locations_to_use %in% c("ALL_EXCEPT_FUSED", "RESAMPLE_FUSED", "ALL")){ print("Unkown filter, provide one of the following three: ALL, ALL_EXCEPT_FUSED, or RESAMPLE_FUSED") quit(save = "no", status = 1, runLast = FALSE) } + # excludes fused and resample +if(locations_to_use == "ALL_EXCEPT_FUSED") + location <- location %>% filter(provider == "gps") + +# Remove 0,0 location coordinates +location <- location %>% filter(double_latitude != 0 & double_longitude != 0) + +# Excludes datasets with less than 24 hours of data +if(max(location$timestamp) - min(location$timestamp) < 86400000) + location <- head(location, 0) + if (nrow(location) > 1){ - features <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone) - if(is.null(features)){ - write_empty_file(snakemake@output[[1]], requested_feature) + + # Count how many minutes of data we use to get location features + # Some minutes have multiple fused rows + location_minutes_used <- location %>% + group_by(local_date, local_hour) %>% + summarise(n_minutes = n_distinct(local_minute)) %>% + group_by(local_date) %>% + summarise(minutes_data_used = sum(n_minutes)) %>% + select(local_date, minutes_data_used) + + location <- location %>% + select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy) + + outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone) + + if(is.null(outputMobility)){ + write_empty_file(snakemake@output[[1]], requested_features) } else{ # Copy index (dates) as a column - outmatrix <- cbind(rownames(features$featavg), features$featavg) - outmatrix <- as.data.frame(outmatrix) - outmatrix[-1] <- lapply(lapply(outmatrix[-1], as.character), as.numeric) - colnames(outmatrix)=c("local_date",tolower(paste("location_barnett", colnames(features$featavg), sep = "_"))) - write.csv(outmatrix %>% select(requested_feature), snakemake@output[[1]], row.names = F) + features <- 
cbind(rownames(outputMobility$featavg), outputMobility$featavg) + features <- as.data.frame(features) + features[-1] <- lapply(lapply(features[-1], as.character), as.numeric) + colnames(features)=c("local_date",tolower(paste("location_barnett", colnames(outputMobility$featavg), sep = "_"))) + # Add the minute count column + features <- left_join(features, location_minutes_used, by = "local_date") + write.csv(features %>% select(requested_features), snakemake@output[[1]], row.names = F) } } else { - write_empty_file(snakemake@output[[1]], requested_feature) + write_empty_file(snakemake@output[[1]], requested_features) }