rapids/src/features/phone_locations/barnett/daily_features.R

source("renv/activate.R")
library("dplyr", warn.conflicts = F)
library("stringr")
library("lubridate")
library("purrr")

# Load Ian Barnett's code. From https://scholar.harvard.edu/ibarnett/software/gpsmobility
file.sources = list.files(c("src/features/phone_locations/barnett/library"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE)
output_apply <- sapply(file.sources,source,.GlobalEnv)

create_empty_file <- function(){
  return(data.frame(local_date= character(), hometime= numeric(), disttravelled= numeric(), rog= numeric(), maxdiam= numeric(), 
                    maxhomedist= numeric(), siglocsvisited= numeric(), avgflightlen= numeric(), stdflightlen= numeric(), 
                    avgflightdur= numeric(), stdflightdur= numeric(), probpause= numeric(), siglocentropy= numeric(), minsmissing= numeric(), 
                    circdnrtn= numeric(), wkenddayrtn= numeric(), minutes_data_used= numeric()
  ))
}

barnett_daily_features <- function(snakemake){
  location_features <- NULL
  location <- read.csv(snakemake@input[["sensor_data"]], stringsAsFactors = FALSE)
  segment_labels <- read.csv(snakemake@input[["time_segments_labels"]], stringsAsFactors = FALSE)
  accuracy_limit = 999999999 # We filter rows based on accuracy in src/data/process_location_types.R script
  datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00"
  datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59"
  location <- location %>% 
    mutate(is_daily = str_detect(assigned_segments, paste0(".*#", datetime_start_regex, ",", datetime_end_regex, ".*")))
  
  if(nrow(segment_labels) == 0 || nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)){
    warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:",
            "\nLocation data rows within a daily time segment: ", nrow(filter(location, is_daily)),
            "\nLocation data time span in days: ", round((max(location$timestamp) - min(location$timestamp)) / 86400000, 2)
            )
    location_features <- create_empty_file()  
  } else{
    # Count how many minutes of data we use to get location features. Some minutes have multiple fused rows
    location_minutes_used <- location %>% 
      group_by(local_date, local_hour) %>% 
      summarise(n_minutes = n_distinct(local_minute), .groups = 'drop_last') %>% 
      group_by(local_date) %>% 
      summarise(minutes_data_used = sum(n_minutes), .groups = 'drop_last') %>% 
      select(local_date, minutes_data_used)
    
    # Select only the columns that the algorithm needs
    all_timezones <- table(location %>% pull(local_timezone))
    location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)
    timezone <- names(all_timezones)[as.vector(all_timezones)==max(all_timezones)]
    outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)
    
    if(is.null(outputMobility)){
      location_features <- create_empty_file()
    } else {
      # Copy index (dates) as a column 
      features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg)
      features <- as.data.frame(features)
      features[-1] <- lapply(lapply(features[-1], as.character), as.numeric)
      colnames(features)=c("local_date",tolower(colnames(outputMobility$featavg)))
      location_features <- left_join(features, location_minutes_used, by = "local_date")
    }
    
  }
  write.csv(location_features, snakemake@output[[1]], row.names =FALSE)
}

barnett_daily_features(snakemake)
Optimize Barnett's computation multi-day segments 2021-03-23 19:04:01 +01:00			`source("renv/activate.R")`
			`library("dplyr", warn.conflicts = F)`
			`library("stringr")`
			`library("lubridate")`
			`library("purrr")`

			`# Load Ian Barnett's code. From https://scholar.harvard.edu/ibarnett/software/gpsmobility`
			`file.sources = list.files(c("src/features/phone_locations/barnett/library"), pattern="*.R$", full.names=TRUE, ignore.case=TRUE)`
			`output_apply <- sapply(file.sources,source,.GlobalEnv)`

			`create_empty_file <- function(){`
			`return(data.frame(local_date= character(), hometime= numeric(), disttravelled= numeric(), rog= numeric(), maxdiam= numeric(),`
			`maxhomedist= numeric(), siglocsvisited= numeric(), avgflightlen= numeric(), stdflightlen= numeric(),`
			`avgflightdur= numeric(), stdflightdur= numeric(), probpause= numeric(), siglocentropy= numeric(), minsmissing= numeric(),`
			`circdnrtn= numeric(), wkenddayrtn= numeric(), minutes_data_used= numeric()`
			`))`
			`}`

			`barnett_daily_features <- function(snakemake){`
			`location_features <- NULL`
			`location <- read.csv(snakemake@input[["sensor_data"]], stringsAsFactors = FALSE)`
			`segment_labels <- read.csv(snakemake@input[["time_segments_labels"]], stringsAsFactors = FALSE)`
Temporary revert PHONE_LOCATIONS BARNETT provider to use R script 2021-09-24 00:16:13 +02:00			`accuracy_limit = 999999999 # We filter rows based on accuracy in src/data/process_location_types.R script`
Optimize Barnett's computation multi-day segments 2021-03-23 19:04:01 +01:00			`datetime_start_regex = "[0-9]{4}[\\-\|\\/][0-9]{2}[\\-\|\\/][0-9]{2} 00:00:00"`
			`datetime_end_regex = "[0-9]{4}[\\-\|\\/][0-9]{2}[\\-\|\\/][0-9]{2} 23:59:59"`
			`location <- location %>%`
			`mutate(is_daily = str_detect(assigned_segments, paste0(".#", datetime_start_regex, ",", datetime_end_regex, ".")))`

Update segment labels and fix the bug when we do not have any labels for event segments 2021-06-26 01:06:46 +02:00			`if(nrow(segment_labels) == 0 \|\| nrow(location) == 0 \|\| all(location$is_daily == FALSE) \|\| (max(location$timestamp) - min(location$timestamp) < 86400000)){`
Optimize Barnett's computation multi-day segments 2021-03-23 19:04:01 +01:00			`warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:",`
			`"\nLocation data rows within a daily time segment: ", nrow(filter(location, is_daily)),`
			`"\nLocation data time span in days: ", round((max(location$timestamp) - min(location$timestamp)) / 86400000, 2)`
			`)`
			`location_features <- create_empty_file()`
			`} else{`
			`# Count how many minutes of data we use to get location features. Some minutes have multiple fused rows`
			`location_minutes_used <- location %>%`
			`group_by(local_date, local_hour) %>%`
			`summarise(n_minutes = n_distinct(local_minute), .groups = 'drop_last') %>%`
			`group_by(local_date) %>%`
			`summarise(minutes_data_used = sum(n_minutes), .groups = 'drop_last') %>%`
			`select(local_date, minutes_data_used)`

			`# Select only the columns that the algorithm needs`
			`all_timezones <- table(location %>% pull(local_timezone))`
			`location <- location %>% select(timestamp, latitude = double_latitude, longitude = double_longitude, altitude = double_altitude, accuracy)`
			`timezone <- names(all_timezones)[as.vector(all_timezones)==max(all_timezones)]`
			`outputMobility <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone)`

			`if(is.null(outputMobility)){`
			`location_features <- create_empty_file()`
			`} else {`
			`# Copy index (dates) as a column`
			`features <- cbind(rownames(outputMobility$featavg), outputMobility$featavg)`
			`features <- as.data.frame(features)`
			`features[-1] <- lapply(lapply(features[-1], as.character), as.numeric)`
			`colnames(features)=c("local_date",tolower(colnames(outputMobility$featavg)))`
			`location_features <- left_join(features, location_minutes_used, by = "local_date")`
			`}`

			`}`
			`write.csv(location_features, snakemake@output[[1]], row.names =FALSE)`
			`}`

			`barnett_daily_features(snakemake)`