Add ALL_RESAMPLED flag and accuracy limit
parent
4c1e311135
commit
d0fe4d4c28
|
@ -232,20 +232,21 @@ PHONE_LIGHT:
|
||||||
# See https://www.rapids.science/latest/features/phone-locations/
|
# See https://www.rapids.science/latest/features/phone-locations/
|
||||||
PHONE_LOCATIONS:
|
PHONE_LOCATIONS:
|
||||||
TABLE: locations
|
TABLE: locations
|
||||||
LOCATIONS_TO_USE: FUSED_RESAMPLED # ALL, GPS OR FUSED_RESAMPLED
|
LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED
|
||||||
FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
|
FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
|
||||||
FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
|
FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
|
||||||
PROVIDERS:
|
PROVIDERS:
|
||||||
DORYAB:
|
DORYAB:
|
||||||
COMPUTE: False
|
COMPUTE: False
|
||||||
FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
|
FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
|
||||||
DBSCAN_EPS: 100 # meters
|
ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
|
||||||
|
DBSCAN_EPS: 10 # meters
|
||||||
DBSCAN_MINSAMPLES: 5
|
DBSCAN_MINSAMPLES: 5
|
||||||
THRESHOLD_STATIC : 1 # km/h
|
THRESHOLD_STATIC : 1 # km/h
|
||||||
MAXIMUM_GAP_ALLOWED: 300
|
MAXIMUM_GAP_ALLOWED: 300
|
||||||
MINUTES_DATA_USED: False
|
MINUTES_DATA_USED: False
|
||||||
SAMPLING_FREQUENCY: 0
|
SAMPLING_FREQUENCY: 0
|
||||||
CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT
|
CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT
|
||||||
CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS
|
CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS
|
||||||
SRC_FOLDER: "doryab" # inside src/features/phone_locations
|
SRC_FOLDER: "doryab" # inside src/features/phone_locations
|
||||||
SRC_LANGUAGE: "python"
|
SRC_LANGUAGE: "python"
|
||||||
|
|
|
@ -5,13 +5,13 @@ Sensor parameters description for `[PHONE_LOCATIONS]`:
|
||||||
|Key | Description |
|
|Key | Description |
|
||||||
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|
||||||
|`[TABLE]`| Database table where the location data is stored
|
|`[TABLE]`| Database table where the location data is stored
|
||||||
|`[LOCATIONS_TO_USE]`| Type of location data to use, one of `ALL`, `GPS` or `FUSED_RESAMPLED`. This filter is based on the `provider` column of the AWARE locations table, `ALL` includes every row, `GPS` only includes rows where provider is gps, and `FUSED_RESAMPLED` only includes rows where provider is fused after being resampled.
|
|`[LOCATIONS_TO_USE]`| Type of location data to use, one of `ALL`, `GPS`, `ALL_RESAMPLED` or `FUSED_RESAMPLED`. This filter is based on the `provider` column of the AWARE locations table, `ALL` includes every row, `GPS` only includes rows where provider is gps, `ALL_RESAMPLED` includes all rows after being resampled, and `FUSED_RESAMPLED` only includes rows where provider is fused after being resampled.
|
||||||
|`[FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD]`| if `FUSED_RESAMPLED` is used, the original fused data has to be resampled, a location row will be resampled to the next valid timestamp (see the Assumptions/Observations below) only if the time difference between them is less or equal than this threshold (in minutes).
|
|`[FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD]`| If `ALL_RESAMPLED` or `FUSED_RESAMPLED` is used, the original location data has to be resampled; a location row will be resampled to the next valid timestamp (see the Assumptions/Observations below) only if the time difference between them is less than or equal to this threshold (in minutes).
|
||||||
|`[FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION]`| if `FUSED_RESAMPLED` is used, the original fused data has to be resampled, a location row will be resampled at most for this long (in minutes)
|
|`[FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION]`| If `ALL_RESAMPLED` or `FUSED_RESAMPLED` is used, the original location data has to be resampled; a location row will be resampled at most for this long (in minutes).
|
||||||
|
|
||||||
!!! note "Assumptions/Observations"
|
!!! note "Assumptions/Observations"
|
||||||
**Types of location data to use**
|
**Types of location data to use**
|
||||||
AWARE Android and iOS clients can collect location coordinates through the phone\'s GPS, the network cellular towers around the phone or Google\'s fused location API. If you want to use only the GPS provider set `[LOCATIONS_TO_USE]` to `GPS`, if you want to use all providers (not recommended due to the difference in accuracy) set `[LOCATIONS_TO_USE]` to `ALL`, if your AWARE client was configured to use fused location only or want to focus only on this provider, set `[LOCATIONS_TO_USE]` to `RESAMPLE_FUSED`. `RESAMPLE_FUSED` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by the joined timestamps of [`[PHONE_DATA_YIELD][SENSORS]`](../phone-data-yield/), this is done because Google\'s API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one.
|
AWARE Android and iOS clients can collect location coordinates through the phone\'s GPS, the network cellular towers around the phone, or Google\'s fused location API. If you want to use only the GPS provider, set `[LOCATIONS_TO_USE]` to `GPS`; if you want to use all providers, set `[LOCATIONS_TO_USE]` to `ALL`; if you collected location data from different providers including the fused API, use `ALL_RESAMPLED`; if your AWARE client was configured to use fused location only or you want to focus only on this provider, set `[LOCATIONS_TO_USE]` to `FUSED_RESAMPLED`. `ALL_RESAMPLED` and `FUSED_RESAMPLED` take the original location coordinates and replicate each pair forward in time as long as the phone was sensing data as indicated by the joined timestamps of [`[PHONE_DATA_YIELD][SENSORS]`](../phone-data-yield/). This is done because Google\'s API only logs a new location coordinate pair when it is sufficiently different in time or space from the previous one, and because GPS and network providers can log data at variable rates.
|
||||||
|
|
||||||
There are two parameters associated with resampling fused location. `FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, participant A\'s phone did not collect data between 10.30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second, in other words, we assume that we cannot longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). `FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION` (in minutes, default 720 or 12 hours) stops the last known fused location from being replicated longer that this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this resampling, let us know.
|
There are two parameters associated with resampling fused location. `FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD` (in minutes, default 30) controls the maximum gap between any two coordinate pairs to replicate the last known pair (for example, participant A\'s phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second, in other words, we assume that we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). `FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION` (in minutes, default 720 or 12 hours) stops the last known fused location from being replicated longer than this threshold even if the phone was sensing data continuously (for example, participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this resampling, let us know.
|
||||||
|
|
||||||
|
@ -100,6 +100,7 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
|
||||||
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|
|----------------|-----------------------------------------------------------------------------------------------------------------------------------
|
||||||
|`[COMPUTE]`| Set to `True` to extract `PHONE_LOCATIONS` features from the `BARNETT` provider|
|
|`[COMPUTE]`| Set to `True` to extract `PHONE_LOCATIONS` features from the `DORYAB` provider|
|
||||||
|`[FEATURES]` | Features to be computed, see table below
|
|`[FEATURES]` | Features to be computed, see table below
|
||||||
|
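|`[ACCURACY_LIMIT]` | An integer in meters; only location rows with an accuracy value lower than this limit are kept, all other rows are dropped (a larger accuracy value means a less precise location). The accuracy value of a row means there's a 68% probability the true location is within that radius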
|
||||||
| `[DBSCAN_EPS]` | The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
|
| `[DBSCAN_EPS]` | The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
|
||||||
| `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself.
|
| `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself.
|
||||||
| `[THRESHOLD_STATIC]` | It is the threshold value in km/hr which labels a row as Static or Moving.
|
| `[THRESHOLD_STATIC]` | It is the threshold value in km/hr which labels a row as Static or Moving.
|
||||||
|
|
|
@ -12,21 +12,34 @@ locations <- read.csv(snakemake@input[["locations"]]) %>%
|
||||||
filter(double_latitude != 0 & double_longitude != 0) %>%
|
filter(double_latitude != 0 & double_longitude != 0) %>%
|
||||||
drop_na(double_longitude, double_latitude)
|
drop_na(double_longitude, double_latitude)
|
||||||
|
|
||||||
if(!locations_to_use %in% c("ALL", "FUSED_RESAMPLED", "GPS")){
|
if(!locations_to_use %in% c("ALL", "FUSED_RESAMPLED", "GPS", "ALL_RESAMPLED")){
|
||||||
print("Unkown location filter, provide one of the following three: ALL, GPS, or FUSED_RESAMPLED")
|
print("Unkown location filter, provide one of the following three: ALL, GPS, ALL_RESAMPLED, or FUSED_RESAMPLED")
|
||||||
quit(save = "no", status = 1, runLast = FALSE)
|
quit(save = "no", status = 1, runLast = FALSE)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# keep the location row that has the best (lowest) accuracy if more than 1 row was logged within any 1 second
|
||||||
|
if(locations_to_use %in% c("FUSED_RESAMPLED", "ALL_RESAMPLED"))
|
||||||
|
locations <- locations %>% drop_na(double_longitude, double_latitude) %>%
|
||||||
|
mutate(minute_bin = timestamp %/% 1001) %>%
|
||||||
|
group_by(minute_bin) %>%
|
||||||
|
slice(which.min(accuracy)) %>%
|
||||||
|
ungroup() %>%
|
||||||
|
select(-minute_bin)
|
||||||
|
|
||||||
if(locations_to_use == "ALL"){
|
if(locations_to_use == "ALL"){
|
||||||
processed_locations <- locations
|
processed_locations <- locations
|
||||||
} else if(locations_to_use == "GPS"){
|
} else if(locations_to_use == "GPS"){
|
||||||
processed_locations <- locations %>% filter(provider == "gps")
|
processed_locations <- locations %>% filter(provider == "gps")
|
||||||
} else if(locations_to_use == "FUSED_RESAMPLED"){
|
} else if(locations_to_use %in% c("FUSED_RESAMPLED", "ALL_RESAMPLED")){
|
||||||
locations <- locations %>% filter(provider == "fused")
|
if (locations_to_use == "FUSED_RESAMPLED"){
|
||||||
|
locations <- locations %>% filter(provider == "fused")
|
||||||
|
providers_to_keep = c("fused")
|
||||||
|
} else if(locations_to_use == "ALL_RESAMPLED"){
|
||||||
|
providers_to_keep = c("fused", "gps", "network")
|
||||||
|
}
|
||||||
|
|
||||||
if(nrow(locations) > 0){
|
if(nrow(locations) > 0){
|
||||||
processed_locations <- locations %>%
|
processed_locations <- locations %>%
|
||||||
# TODO filter repeated location rows based on the accurcy
|
|
||||||
distinct(timestamp, .keep_all = TRUE) %>%
|
distinct(timestamp, .keep_all = TRUE) %>%
|
||||||
bind_rows(phone_sensed_timestamps) %>%
|
bind_rows(phone_sensed_timestamps) %>%
|
||||||
arrange(timestamp) %>%
|
arrange(timestamp) %>%
|
||||||
|
@ -37,7 +50,7 @@ if(locations_to_use == "ALL"){
|
||||||
group_by(resample_group) %>%
|
group_by(resample_group) %>%
|
||||||
# Filter those rows that are further away than time_since_valid_location since the last fused location
|
# Filter those rows that are further away than time_since_valid_location since the last fused location
|
||||||
mutate(time_from_fused = timestamp - first(timestamp)) %>%
|
mutate(time_from_fused = timestamp - first(timestamp)) %>%
|
||||||
filter(provider == "fused" | (time_from_fused < (1000 * 60 * time_since_valid_location))) %>%
|
filter(provider %in% providers_to_keep | (time_from_fused < (1000 * 60 * time_since_valid_location))) %>%
|
||||||
# Summarise the period to resample for
|
# Summarise the period to resample for
|
||||||
summarise(limit = max(timestamp), timestamp = first(timestamp), double_latitude = first(double_latitude), double_longitude = first(double_longitude),
|
summarise(limit = max(timestamp), timestamp = first(timestamp), double_latitude = first(double_latitude), double_longitude = first(double_longitude),
|
||||||
double_bearing=first(double_bearing), double_speed = first(double_speed), double_altitude=first(double_altitude), provider=first(provider),
|
double_bearing=first(double_bearing), double_speed = first(double_speed), double_altitude=first(double_altitude), provider=first(provider),
|
||||||
|
|
|
@ -65,9 +65,9 @@ barnett_features <- function(sensor_data_files, time_segment, params){
|
||||||
# Some minutes have multiple fused rows
|
# Some minutes have multiple fused rows
|
||||||
location_minutes_used <- location %>%
|
location_minutes_used <- location %>%
|
||||||
group_by(local_date, local_hour) %>%
|
group_by(local_date, local_hour) %>%
|
||||||
summarise(n_minutes = n_distinct(local_minute)) %>%
|
summarise(n_minutes = n_distinct(local_minute), .groups = 'drop_last') %>%
|
||||||
group_by(local_date) %>%
|
group_by(local_date) %>%
|
||||||
summarise(minutes_data_used = sum(n_minutes)) %>%
|
summarise(minutes_data_used = sum(n_minutes), .groups = 'drop_last') %>%
|
||||||
select(local_date, minutes_data_used)
|
select(local_date, minutes_data_used)
|
||||||
|
|
||||||
# Save time segment to attach it later
|
# Save time segment to attach it later
|
||||||
|
@ -78,7 +78,7 @@ barnett_features <- function(sensor_data_files, time_segment, params){
|
||||||
if(nrow(location %>% filter(accuracy < accuracy_limit)) > 1){
|
if(nrow(location %>% filter(accuracy < accuracy_limit)) > 1){
|
||||||
outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)
|
outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)
|
||||||
} else {
|
} else {
|
||||||
print(paste("Cannot compute location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT", accuracy_limit))
|
print(paste("Cannot compute Barnett location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT", accuracy_limit))
|
||||||
outputMobility <- NULL
|
outputMobility <- NULL
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import warnings
|
||||||
from astropy.timeseries import LombScargle
|
from astropy.timeseries import LombScargle
|
||||||
from sklearn.cluster import DBSCAN,OPTICS
|
from sklearn.cluster import DBSCAN,OPTICS
|
||||||
from math import radians, cos, sin, asin, sqrt
|
from math import radians, cos, sin, asin, sqrt
|
||||||
|
@ -8,6 +9,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
|
||||||
|
|
||||||
location_data = pd.read_csv(sensor_data_files["sensor_data"])
|
location_data = pd.read_csv(sensor_data_files["sensor_data"])
|
||||||
requested_features = provider["FEATURES"]
|
requested_features = provider["FEATURES"]
|
||||||
|
accuracy_limit = provider["ACCURACY_LIMIT"]
|
||||||
dbscan_eps = provider["DBSCAN_EPS"]
|
dbscan_eps = provider["DBSCAN_EPS"]
|
||||||
dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
|
dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
|
||||||
threshold_static = provider["THRESHOLD_STATIC"]
|
threshold_static = provider["THRESHOLD_STATIC"]
|
||||||
|
@ -32,6 +34,11 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
|
||||||
else:
|
else:
|
||||||
raise ValueError("config[PHONE_LOCATIONS][DORYAB][CLUSTERING ALGORITHM] only accepts DBSCAN or OPTICS but you provided ",clustering_algorithm)
|
raise ValueError("config[PHONE_LOCATIONS][DORYAB][CLUSTERING ALGORITHM] only accepts DBSCAN or OPTICS but you provided ",clustering_algorithm)
|
||||||
|
|
||||||
|
rows_before_accuracy_filter = len(location_data)
|
||||||
|
location_data.query("accuracy < @accuracy_limit", inplace=True)
|
||||||
|
if rows_before_accuracy_filter > 0 and len(location_data) == 0:
|
||||||
|
warnings.warn("Cannot compute Doryab location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT: {}".format(accuracy_limit))
|
||||||
|
|
||||||
if location_data.empty:
|
if location_data.empty:
|
||||||
location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
|
location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue