2020-08-28 19:53:00 +02:00
source ( " renv/activate.R" )
2020-10-23 16:41:00 +02:00
library ( " dplyr" , warn.conflicts = F )
2020-08-28 19:53:00 +02:00
library ( " stringr" )
# Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility
2020-10-19 21:07:12 +02:00
file.sources = list.files ( c ( " src/features/phone_locations/barnett/library" ) , pattern = " *.R$" , full.names = TRUE , ignore.case = TRUE )
2020-08-28 19:53:00 +02:00
sapply ( file.sources , source , .GlobalEnv )
create_empty_file <- function ( requested_features ) {
return ( data.frame ( local_segment = character ( ) ,
2020-11-25 20:49:42 +01:00
hometime = numeric ( ) ,
disttravelled = numeric ( ) ,
rog = numeric ( ) ,
maxdiam = numeric ( ) ,
maxhomedist = numeric ( ) ,
siglocsvisited = numeric ( ) ,
avgflightlen = numeric ( ) ,
stdflightlen = numeric ( ) ,
avgflightdur = numeric ( ) ,
stdflightdur = numeric ( ) ,
probpause = numeric ( ) ,
siglocentropy = numeric ( ) ,
minsmissing = numeric ( ) ,
circdnrtn = numeric ( ) ,
wkenddayrtn = numeric ( ) ,
minutes_data_used = numeric ( )
2020-08-28 19:53:00 +02:00
) %>% select ( all_of ( requested_features ) ) )
}
2020-12-03 00:41:03 +01:00
barnett_features <- function ( sensor_data_files , time_segment , params ) {
2020-10-08 00:11:06 +02:00
location_data <- read.csv ( sensor_data_files [ [ " sensor_data" ] ] , stringsAsFactors = FALSE )
2020-08-28 19:53:00 +02:00
location_features <- NULL
2020-10-08 00:11:06 +02:00
2020-08-28 19:53:00 +02:00
location <- location_data
accuracy_limit <- params [ [ " ACCURACY_LIMIT" ] ]
minutes_data_used <- params [ [ " MINUTES_DATA_USED" ] ]
# Compute what features were requested
available_features <- c ( " hometime" , " disttravelled" , " rog" , " maxdiam" , " maxhomedist" , " siglocsvisited" , " avgflightlen" , " stdflightlen" ,
" avgflightdur" , " stdflightdur" , " probpause" , " siglocentropy" , " minsmissing" , " circdnrtn" , " wkenddayrtn" )
requested_features <- intersect ( unlist ( params [ " FEATURES" ] , use.names = F ) , available_features )
2020-11-25 20:49:42 +01:00
requested_features <- c ( " local_segment" , requested_features )
2020-08-28 19:53:00 +02:00
if ( minutes_data_used )
2020-11-25 20:49:42 +01:00
requested_features <- c ( requested_features , " minutes_data_used" )
2020-08-28 19:53:00 +02:00
# Excludes datasets with less than 24 hours of data
if ( max ( location $ timestamp ) - min ( location $ timestamp ) < 86400000 )
location <- head ( location , 0 )
if ( nrow ( location ) > 1 ) {
# Filter by segment and skipping any non-daily segment
2020-12-03 00:41:03 +01:00
location <- location %>% filter_data_by_segment ( time_segment )
2020-10-19 21:07:12 +02:00
datetime_start_regex = " [0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00"
datetime_end_regex = " [0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59"
2020-12-03 00:41:03 +01:00
location <- location %>% mutate ( is_daily = str_detect ( local_segment , paste0 ( time_segment , " #" , datetime_start_regex , " ," , datetime_end_regex ) ) )
2020-10-19 21:07:12 +02:00
if ( ! all ( location $ is_daily ) ) {
2020-12-03 00:41:03 +01:00
message ( paste ( " Barnett's location features cannot be computed for time segmentes that are not daily (cover 00:00:00 to 23:59:59 of every day). Skipping " , time_segment ) )
2020-08-28 19:53:00 +02:00
location_features <- create_empty_file ( requested_features )
} else {
# Count how many minutes of data we use to get location features
# Some minutes have multiple fused rows
location_minutes_used <- location %>%
group_by ( local_date , local_hour ) %>%
2021-01-14 20:27:35 +01:00
summarise ( n_minutes = n_distinct ( local_minute ) , .groups = ' drop_last' ) %>%
2020-08-28 19:53:00 +02:00
group_by ( local_date ) %>%
2021-01-14 20:27:35 +01:00
summarise ( minutes_data_used = sum ( n_minutes ) , .groups = ' drop_last' ) %>%
2020-11-25 20:49:42 +01:00
select ( local_date , minutes_data_used )
2020-08-28 19:53:00 +02:00
2020-12-03 00:41:03 +01:00
# Save time segment to attach it later
2020-08-28 19:53:00 +02:00
location_dates_segments <- location %>% select ( local_date , local_segment ) %>% distinct ( local_date , .keep_all = TRUE )
# Select only the columns that the algorithm needs
2021-03-05 23:49:37 +01:00
all_timezones <- table ( location %>% pull ( local_timezone ) )
2020-08-28 19:53:00 +02:00
location <- location %>% select ( timestamp , latitude = double_latitude , longitude = double_longitude , altitude = double_altitude , accuracy )
2020-10-28 22:50:37 +01:00
if ( nrow ( location %>% filter ( accuracy < accuracy_limit ) ) > 1 ) {
2021-03-05 23:49:37 +01:00
timezone <- names ( all_timezones ) [as.vector ( all_timezones ) == max ( all_timezones ) ]
2020-10-28 22:50:37 +01:00
outputMobility <- MobilityFeatures ( location , ACCURACY_LIM = accuracy_limit , tz = timezone )
} else {
2021-01-14 20:27:35 +01:00
print ( paste ( " Cannot compute Barnett location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT" , accuracy_limit ) )
2020-10-28 22:50:37 +01:00
outputMobility <- NULL
}
2020-08-28 19:53:00 +02:00
if ( is.null ( outputMobility ) ) {
location_features <- create_empty_file ( requested_features )
} else {
# Copy index (dates) as a column
features <- cbind ( rownames ( outputMobility $ featavg ) , outputMobility $ featavg )
features <- as.data.frame ( features )
features [ -1 ] <- lapply ( lapply ( features [ -1 ] , as.character ) , as.numeric )
2020-11-25 20:49:42 +01:00
colnames ( features ) = c ( " local_date" , tolower ( colnames ( outputMobility $ featavg ) ) )
2020-08-28 19:53:00 +02:00
# Add the minute count column
features <- left_join ( features , location_minutes_used , by = " local_date" )
2020-12-03 00:41:03 +01:00
# Add the time segment column for consistency
2020-08-28 19:53:00 +02:00
features <- left_join ( features , location_dates_segments , by = " local_date" )
location_features <- features %>% select ( all_of ( requested_features ) )
}
}
} else {
location_features <- create_empty_file ( requested_features )
}
if ( ncol ( location_features ) != length ( requested_features ) )
stop ( paste0 ( " The number of features in the output dataframe (=" , ncol ( location_features ) , " ) does not match the expected value (=" , length ( requested_features ) , " ). Verify your barnett location features" ) )
return ( location_features )
}