diff --git a/Snakefile b/Snakefile index eca70344..71a6fc5c 100644 --- a/Snakefile +++ b/Snakefile @@ -205,6 +205,9 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): else: raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data) which is used to resample fused location data (ALL_RESAMPLED and RESAMPLED_FUSED)") + if provider == "BARNETT": + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_barnett_daily.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"])) diff --git a/docs/features/phone-locations.md b/docs/features/phone-locations.md index eeb34a82..d46ce1ef 100644 --- a/docs/features/phone-locations.md +++ b/docs/features/phone-locations.md @@ -33,7 +33,7 @@ These features are based on the original open-source implementation by [Barnett !!! info "Available time segments and platforms" - - Available only for segments that start at 00:00:00 and end at 23:59:59 of the same day (daily segments) + - Available only for segments that start at 00:00:00 and end at 23:59:59 of the same or a different day (daily, weekly, weekend, etc.) - Available for Android and iOS !!! info "File Sequence" @@ -78,7 +78,17 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][BARNETT]` adapted from [B |wkenddayrtn | - | Same as circdnrtn but computed separately for weekends and weekdays. !!! note "Assumptions/Observations" - **Barnett\'s et al features** + **Multi day segment features** + Barnett's features are only available on time segments that span entire days (00:00:00 to 23:59:59). Such segments can be one-day long (daily) or multi-day (weekly, for example). Multi-day segment features are computed based on daily features summarized the following way: + + - sum for `hometime`, `disttravelled`, `siglocsvisited`, and `minutes_data_used` + - max for `maxdiam`, and `maxhomedist` + - mean for `rog`, `avgflightlen`, `stdflightlen`, `avgflightdur`, `stdflightdur`, `probpause`, `siglocentropy`, `circdnrtn`, `wkenddayrtn`, and `minsmissing` + + **Computation speed** + The process to extract these features can be slow compared to other sensors and providers due to the required simulation. + + **How are these features computed?** These features are based on a Pause-Flight model. A pause is defined as a mobility trace (location pings) within a certain duration and distance (by default, 300 seconds and 60 meters). A flight is any mobility trace between two pauses. Data is resampled and imputed before the features are computed. See [Barnett et al](../../citation#barnett-locations) for more information. In RAPIDS, we only expose one parameter for these features (accuracy limit). You can change other parameters in `src/features/phone_locations/barnett/library/MobilityFeatures.R`. **Significant Locations** diff --git a/rules/common.smk b/rules/common.smk index 3e34a105..6651677a 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -11,6 +11,11 @@ def get_script_language(script_path): # Features.smk ######################################################################################################### +def get_barnett_daily(wildcards): + if wildcards.provider_key.upper() == "BARNETT": + return "data/interim/{pid}/phone_locations_barnett_daily.csv" + return [] + def find_features_files(wildcards): feature_files = [] for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items(): diff --git a/rules/features.smk b/rules/features.smk index a2105145..7f471968 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -379,10 +379,22 @@ rule phone_locations_python_features: script: "../src/features/entry.py" +rule phone_locations_barnett_daily_features: + input: + sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv", + params: + provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"]["BARNETT"], + output: + "data/interim/{pid}/phone_locations_barnett_daily.csv" + script: + "../src/features/phone_locations/barnett/daily_features.R" + rule phone_locations_r_features: input: sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv", - time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv", + barnett_daily = get_barnett_daily params: provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()], provider_key = "{provider_key}", diff --git a/src/features/phone_locations/barnett/daily_features.R b/src/features/phone_locations/barnett/daily_features.R new file mode 100644 index 00000000..4a9009c5 --- /dev/null +++ b/src/features/phone_locations/barnett/daily_features.R @@ -0,0 +1,68 @@ +source("renv/activate.R") +library("dplyr", warn.conflicts = F) +library("stringr") +library("lubridate") +library("purrr") + +# Load Ian Barnett's code. From https://scholar.harvard.edu/ibarnett/software/gpsmobility +file.sources = list.files(c("src/features/phone_locations/barnett/library"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE) +output_apply <- sapply(file.sources,source,.GlobalEnv) + +create_empty_file <- function(){ + return(data.frame(local_date= character(), hometime= numeric(), disttravelled= numeric(), rog= numeric(), maxdiam= numeric(), + maxhomedist= numeric(), siglocsvisited= numeric(), avgflightlen= numeric(), stdflightlen= numeric(), + avgflightdur= numeric(), stdflightdur= numeric(), probpause= numeric(), siglocentropy= numeric(), minsmissing= numeric(), + circdnrtn= numeric(), wkenddayrtn= numeric(), minutes_data_used= numeric() + )) +} + +barnett_daily_features <- function(snakemake){ + location_features <- NULL + location <- read.csv(snakemake@input[["sensor_data"]], stringsAsFactors = FALSE) + segment_labels <- read.csv(snakemake@input[["time_segments_labels"]], stringsAsFactors = FALSE) + accuracy_limit <- snakemake@params[["provider"]][["ACCURACY_LIMIT"]] + datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00" + datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59" + location <- location %>% + filter(accuracy < accuracy_limit) %>% + mutate(is_daily = str_detect(assigned_segments, paste0(".*#", datetime_start_regex, ",", datetime_end_regex, ".*"))) + + + if(nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)){ + warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:", + "\nLocation data rows within accuracy: ", nrow(location %>% filter(accuracy < accuracy_limit)), + "\nLocation data rows within a daily time segment: ", nrow(filter(location, is_daily)), + "\nLocation data time span in days: ", round((max(location$timestamp) - min(location$timestamp)) / 86400000, 2) + ) + location_features <- create_empty_file() + } else{ + # Count how many minutes of data we use to get location features. Some minutes have multiple fused rows + location_minutes_used <- location %>% + group_by(local_date, local_hour) %>% + summarise(n_minutes = n_distinct(local_minute), .groups = 'drop_last') %>% + group_by(local_date) %>% + summarise(minutes_data_used = sum(n_minutes), .groups = 'drop_last') %>% + select(local_date, minutes_data_used) + + # Select only the columns that the algorithm needs + all_timezones <- table(location %>% pull(local_timezone)) + location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy) + timezone <- names(all_timezones)[as.vector(all_timezones)==max(all_timezones)] + outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone) + + if(is.null(outputMobility)){ + location_features <- create_empty_file() + } else { + # Copy index (dates) as a column + features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg) + features <- as.data.frame(features) + features[-1] <- lapply(lapply(features[-1], as.character), as.numeric) + colnames(features)=c("local_date",tolower(colnames(outputMobility$featavg))) + location_features <- left_join(features, location_minutes_used, by = "local_date") + } + + } + write.csv(location_features, snakemake@output[[1]], row.names =FALSE) +} + +barnett_daily_features(snakemake) \ No newline at end of file diff --git a/src/features/phone_locations/barnett/main.R b/src/features/phone_locations/barnett/main.R index 127cb68e..db58dfe8 100644 --- a/src/features/phone_locations/barnett/main.R +++ b/src/features/phone_locations/barnett/main.R @@ -1,108 +1,83 @@ source("renv/activate.R") library("dplyr", warn.conflicts = F) library("stringr") - -# Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility -file.sources = list.files(c("src/features/phone_locations/barnett/library"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE) -sapply(file.sources,source,.GlobalEnv) +library("lubridate") +library("purrr") create_empty_file <- function(requested_features){ return(data.frame(local_segment= character(), - hometime= numeric(), - disttravelled= numeric(), - rog= numeric(), - maxdiam= numeric(), - maxhomedist= numeric(), - siglocsvisited= numeric(), - avgflightlen= numeric(), - stdflightlen= numeric(), - avgflightdur= numeric(), - stdflightdur= numeric(), - probpause= numeric(), - siglocentropy= numeric(), - minsmissing= numeric(), - circdnrtn= numeric(), - wkenddayrtn= numeric(), - minutes_data_used= numeric() - ) %>% select(all_of(requested_features))) + hometime= numeric(), + disttravelled= numeric(), + rog= numeric(), + maxdiam= numeric(), + maxhomedist= numeric(), + siglocsvisited= numeric(), + avgflightlen= numeric(), + stdflightlen= numeric(), + avgflightdur= numeric(), + stdflightdur= numeric(), + probpause= numeric(), + siglocentropy= numeric(), + minsmissing= numeric(), + circdnrtn= numeric(), + wkenddayrtn= numeric(), + minutes_data_used= numeric() + ) %>% select(all_of(requested_features))) +} + +summarise_multiday_segments <- function(segments, features){ + features <- features %>% mutate(local_date=ymd(local_date)) + segments <- segments %>% extract(col = local_segment, + into = c ("local_segment_start_datetime", "local_segment_end_datetime"), + ".*#(.*) .*,(.*) .*", + remove = FALSE) %>% + mutate(local_segment_start_datetime = ymd(local_segment_start_datetime), + local_segment_end_datetime = ymd(local_segment_end_datetime)) %>% + group_by(local_segment) %>% + nest() %>% + mutate(data = map(data, function(nested_data, nested_features){ + + summary <- nested_features %>% filter(local_date >= nested_data$local_segment_start_datetime & + local_date <= nested_data$local_segment_end_datetime) + if(nrow(summary) > 0) + summary <- summary %>% + summarise(across(c(hometime, disttravelled, siglocsvisited, minutes_data_used), sum), + across(c(maxdiam, maxhomedist), max), + across(c(rog, avgflightlen, stdflightlen, avgflightdur, stdflightdur, probpause, siglocentropy, circdnrtn, wkenddayrtn, minsmissing), mean)) + return(summary) + + }, features)) %>% + unnest(cols = everything()) %>% + ungroup() + return(segments) } barnett_features <- function(sensor_data_files, time_segment, params){ - - location_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) location_features <- NULL - - location <- location_data - accuracy_limit <- params[["ACCURACY_LIMIT"]] + daily_features <- read.csv(sensor_data_files[["barnett_daily"]], stringsAsFactors = FALSE) + location <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) minutes_data_used <- params[["MINUTES_DATA_USED"]] - - # Compute what features were requested available_features <- c("hometime","disttravelled","rog","maxdiam", "maxhomedist","siglocsvisited","avgflightlen", "stdflightlen", - "avgflightdur","stdflightdur", "probpause","siglocentropy","minsmissing", "circdnrtn","wkenddayrtn") + "avgflightdur","stdflightdur", "probpause","siglocentropy", "circdnrtn","wkenddayrtn") requested_features <- intersect(unlist(params["FEATURES"], use.names = F), available_features) requested_features <- c("local_segment", requested_features) if(minutes_data_used) requested_features <- c(requested_features, "minutes_data_used") - - # Excludes datasets with less than 24 hours of data - if(max(location$timestamp) - min(location$timestamp) < 86400000) - location <- head(location, 0) - - if (nrow(location) > 1){ - # Filter by segment and skipping any non-daily segment + + if (nrow(location) > 0 & nrow(daily_features) > 0){ location <- location %>% filter_data_by_segment(time_segment) - datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00" datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59" location <- location %>% mutate(is_daily = str_detect(local_segment, paste0(time_segment, "#", datetime_start_regex, ",", datetime_end_regex))) - - if(!all(location$is_daily)){ - message(paste("Barnett's location features cannot be computed for time segmentes that are not daily (cover 00:00:00 to 23:59:59 of every day). Skipping ", time_segment)) + if(nrow(location) == 0 || !all(location$is_daily)){ + message(paste("Barnett's location features cannot be computed for data or time segmentes that do not span entire days (00:00:00 to 23:59:59). Skipping ", time_segment)) location_features <- create_empty_file(requested_features) } else { - # Count how many minutes of data we use to get location features - # Some minutes have multiple fused rows - location_minutes_used <- location %>% - group_by(local_date, local_hour) %>% - summarise(n_minutes = n_distinct(local_minute), .groups = 'drop_last') %>% - group_by(local_date) %>% - summarise(minutes_data_used = sum(n_minutes), .groups = 'drop_last') %>% - select(local_date, minutes_data_used) - - # Save time segment to attach it later - location_dates_segments <- location %>% select(local_date, local_segment) %>% distinct(local_date, .keep_all = TRUE) - - # Select only the columns that the algorithm needs - all_timezones <- table(location %>% pull(local_timezone)) - location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy) - if(nrow(location %>% filter(accuracy < accuracy_limit)) > 1){ - timezone <- names(all_timezones)[as.vector(all_timezones)==max(all_timezones)] - outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone) - } else { - print(paste("Cannot compute Barnett location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT", accuracy_limit)) - outputMobility <- NULL - } - - if(is.null(outputMobility)){ - location_features <- create_empty_file(requested_features) - } else{ - # Copy index (dates) as a column - features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg) - features <- as.data.frame(features) - features[-1] <- lapply(lapply(features[-1], as.character), as.numeric) - colnames(features)=c("local_date",tolower(colnames(outputMobility$featavg))) - # Add the minute count column - features <- left_join(features, location_minutes_used, by = "local_date") - # Add the time segment column for consistency - features <- left_join(features, location_dates_segments, by = "local_date") - location_features <- features %>% select(all_of(requested_features)) - } + location_dates_segments <- location %>% select(local_segment) %>% distinct(local_segment, .keep_all = TRUE) + features <- summarise_multiday_segments(location_dates_segments, daily_features) + location_features <- features %>% select(all_of(requested_features)) } - } else { - location_features <- create_empty_file(requested_features) - } - - if(ncol(location_features) != length(requested_features)) - stop(paste0("The number of features in the output dataframe (=", ncol(location_features),") does not match the expected value (=", length(requested_features),"). Verify your barnett location features")) + } else + location_features <- create_empty_file(requested_features) return(location_features) } \ No newline at end of file