import warnings

import numpy as np
import pandas as pd
from astropy.timeseries import LombScargle
from sklearn.cluster import DBSCAN, OPTICS

def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):

    location_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_features = provider["FEATURES"]
    accuracy_limit = provider["ACCURACY_LIMIT"]
    dbscan_eps = provider["DBSCAN_EPS"]
    dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
    threshold_static = provider["THRESHOLD_STATIC"]
    maximum_gap_allowed = provider["MAXIMUM_ROW_GAP"]
    maximum_row_duration = provider["MAXIMUM_ROW_DURATION"]
    cluster_on = provider["CLUSTER_ON"]
    clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
    radius_from_home = provider["RADIUS_FOR_HOME"]

    minutes_data_used = provider["MINUTES_DATA_USED"]
    if minutes_data_used:
        requested_features.append("minutesdataused")
    # names of the features this function can compute
    base_features_names = ["locationvariance", "loglocationvariance", "totaldistance", "averagespeed", "varspeed", "circadianmovement", "numberofsignificantplaces", "numberlocationtransitions", "radiusgyration", "timeattop1location", "timeattop2location", "timeattop3location", "movingtostaticratio", "outlierstimepercent", "maxlengthstayatclusters", "minlengthstayatclusters", "meanlengthstayatclusters", "stdlengthstayatclusters", "locationentropy", "normalizedlocationentropy", "minutesdataused", "timeathome"]

    # the subset of requested features this function can compute
    features_to_compute = list(set(requested_features) & set(base_features_names))
    if clustering_algorithm == "DBSCAN":
        hyperparameters = {'eps': distance_to_degrees(dbscan_eps), 'min_samples': dbscan_minsamples}
    elif clustering_algorithm == "OPTICS":
        hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric': 'euclidean', 'cluster_method': 'dbscan'}
    else:
        raise ValueError("config[PHONE_LOCATIONS][DORYAB][CLUSTERING_ALGORITHM] only accepts DBSCAN or OPTICS but you provided " + str(clustering_algorithm))
    rows_before_accuracy_filter = len(location_data)
    location_data.query("accuracy < @accuracy_limit", inplace=True)
    if rows_before_accuracy_filter > 0 and len(location_data) == 0:
        warnings.warn("Cannot compute Doryab location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT: {}".format(accuracy_limit))
    if location_data.empty:
        location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
    else:
        if cluster_on == "PARTICIPANT_DATASET":
            location_data = cluster_and_label(location_data, clustering_algorithm, threshold_static, **hyperparameters)
            location_data = filter_data_by_segment(location_data, time_segment)
        elif cluster_on == "TIME_SEGMENT":
            location_data = filter_data_by_segment(location_data, time_segment)
            location_data = cluster_and_label(location_data, clustering_algorithm, threshold_static, **hyperparameters)
        else:
            raise ValueError("config[PHONE_LOCATIONS][DORYAB][CLUSTER_ON] only accepts PARTICIPANT_DATASET or TIME_SEGMENT but you provided " + str(cluster_on))

        if location_data.empty:
            location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
        else:
            location_features = pd.DataFrame()
            if "minutesdataused" in features_to_compute:
                for localDate in location_data["local_segment"].unique():
                    location_features.loc[localDate, "minutesdataused"] = getMinutesData(location_data[location_data["local_segment"] == localDate])

            location_features.index.name = 'local_segment'

            # drop rows with invalid (0, 0) coordinates
            location_data = location_data[(location_data['double_latitude'] != 0.0) & (location_data['double_longitude'] != 0.0)]
            if location_data.empty:
                location_features = pd.DataFrame(columns=["local_segment"] + ["location_" + time_segment + "_" + x for x in features_to_compute])
                location_features = location_features.reset_index(drop=True)
                return location_features

            # elapsed time between each row and the next one, in seconds
            location_data['timeInSeconds'] = (location_data.timestamp.diff(-1) * -1) / 1000
            if "locationvariance" in features_to_compute:
                location_features["locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()

            if "loglocationvariance" in features_to_compute:
                location_features["loglocationvariance"] = (location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None)

            preComputedDistanceandSpeed = pd.DataFrame()
            for localDate in location_data['local_segment'].unique():
                speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_segment'] == localDate], threshold_static, maximum_gap_allowed)
                preComputedDistanceandSpeed.loc[localDate, "distance"] = speeddf['distances'].sum()
                preComputedDistanceandSpeed.loc[localDate, "avgspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].mean()
                preComputedDistanceandSpeed.loc[localDate, "varspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].var()
if " totaldistance " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in location_data [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " totaldistance " ] = preComputedDistanceandSpeed . loc [ localDate , " distance " ]
2020-07-16 20:26:43 +02:00
if " averagespeed " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in location_data [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " averagespeed " ] = preComputedDistanceandSpeed . loc [ localDate , " avgspeed " ]
2020-07-16 20:26:43 +02:00
if " varspeed " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in location_data [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " varspeed " ] = preComputedDistanceandSpeed . loc [ localDate , " varspeed " ]
2020-07-16 20:26:43 +02:00
if " circadianmovement " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in location_data [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " circadianmovement " ] = circadian_movement ( location_data [ location_data [ ' local_segment ' ] == localDate ] )
2020-07-16 20:26:43 +02:00
            stationaryLocations = location_data[location_data['stationary_or_not'] == 1]

            if "numberofsignificantplaces" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "numberofsignificantplaces"] = number_of_significant_places(stationaryLocations[stationaryLocations['local_segment'] == localDate])

            if "numberlocationtransitions" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "numberlocationtransitions"] = number_location_transitions(stationaryLocations[stationaryLocations['local_segment'] == localDate])

            if "radiusgyration" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment'] == localDate])
            preComputedTimeArray = pd.DataFrame()
            for localDate in stationaryLocations["local_segment"].unique():
                top1, top2, top3, smax, smin, sstd, smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"] == localDate], maximum_gap_allowed, maximum_row_duration)
                preComputedTimeArray.loc[localDate, "timeattop1"] = top1
                preComputedTimeArray.loc[localDate, "timeattop2"] = top2
                preComputedTimeArray.loc[localDate, "timeattop3"] = top3
                preComputedTimeArray.loc[localDate, "maxlengthstayatclusters"] = smax
                preComputedTimeArray.loc[localDate, "minlengthstayatclusters"] = smin
                preComputedTimeArray.loc[localDate, "stdlengthstayatclusters"] = sstd
                preComputedTimeArray.loc[localDate, "meanlengthstayatclusters"] = smean
            if "timeattop1location" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "timeattop1location"] = preComputedTimeArray.loc[localDate, "timeattop1"]

            if "timeattop2location" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "timeattop2location"] = preComputedTimeArray.loc[localDate, "timeattop2"]

            if "timeattop3location" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "timeattop3location"] = preComputedTimeArray.loc[localDate, "timeattop3"]

            if "movingtostaticratio" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "movingtostaticratio"] = (stationaryLocations[stationaryLocations['local_segment'] == localDate]['timeInSeconds'].sum()) / (location_data[location_data['local_segment'] == localDate]['timeInSeconds'].sum())

            if "outlierstimepercent" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "outlierstimepercent"] = outlier_time_percent_new(stationaryLocations[stationaryLocations['local_segment'] == localDate])
            if "maxlengthstayatclusters" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "maxlengthstayatclusters"] = preComputedTimeArray.loc[localDate, "maxlengthstayatclusters"]

            if "minlengthstayatclusters" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "minlengthstayatclusters"] = preComputedTimeArray.loc[localDate, "minlengthstayatclusters"]

            if "stdlengthstayatclusters" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "stdlengthstayatclusters"] = preComputedTimeArray.loc[localDate, "stdlengthstayatclusters"]

            if "meanlengthstayatclusters" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "meanlengthstayatclusters"] = preComputedTimeArray.loc[localDate, "meanlengthstayatclusters"]

            if "locationentropy" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "locationentropy"] = location_entropy(stationaryLocations[stationaryLocations['local_segment'] == localDate])

            if "normalizedlocationentropy" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "normalizedlocationentropy"] = location_entropy_normalized(stationaryLocations[stationaryLocations['local_segment'] == localDate])
            if "timeathome" in features_to_compute:
                calculationDf = stationaryLocations[['local_segment', 'distancefromhome', 'timeInSeconds']].copy()
                calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed, 'timeInSeconds'] = maximum_row_duration
                location_features["timeathome"] = calculationDf[calculationDf["distancefromhome"] <= radius_from_home].groupby("local_segment")["timeInSeconds"].sum() / 60

            location_features = location_features.reset_index()

    return location_features
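
# A hypothetical call, sketched only to show the shape of the inputs this
# function expects; the key names are the ones read above, the values are
# arbitrary, and filter_data_by_segment is injected by the pipeline:
# >>> provider = {"FEATURES": ["locationvariance", "timeathome"],
# ...             "ACCURACY_LIMIT": 100, "DBSCAN_EPS": 10, "DBSCAN_MINSAMPLES": 5,
# ...             "THRESHOLD_STATIC": 1, "MAXIMUM_ROW_GAP": 300, "MAXIMUM_ROW_DURATION": 60,
# ...             "CLUSTER_ON": "PARTICIPANT_DATASET", "CLUSTERING_ALGORITHM": "DBSCAN",
# ...             "RADIUS_FOR_HOME": 300, "MINUTES_DATA_USED": False}
# >>> doryab_features({"sensor_data": "locations.csv"}, "daily", provider,
# ...                 filter_data_by_segment)  # doctest: +SKIP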

def len_stay_timeattopn(locationData, maximum_gap_allowed, maximum_row_duration):
    if locationData is None or len(locationData) == 0:
        return (None, None, None, None, None, None, None)

    calculationDf = locationData[locationData["location_label"] >= 1][['location_label', 'timeInSeconds']].copy()
    calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed, 'timeInSeconds'] = maximum_row_duration
    # minutes per cluster; index labels 0, 1, 2 correspond to location_label 1, 2, 3,
    # i.e. the first, second, and third most visited clusters, and label-based
    # indexing (timeArray[0]) still retrieves them after sort_values
    timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False) / 60

    if len(timeArray) >= 3:
        return (timeArray[0], timeArray[1], timeArray[2], timeArray.max(), timeArray.min(), timeArray.std(), timeArray.mean())
    elif len(timeArray) == 2:
        return (timeArray[0], timeArray[1], None, timeArray.max(), timeArray.min(), timeArray.std(), timeArray.mean())
    elif len(timeArray) == 1:
        return (timeArray[0], None, None, timeArray.max(), timeArray.min(), timeArray.std(), timeArray.mean())
    else:
        return (None, None, None, timeArray.max(), timeArray.min(), timeArray.std(), timeArray.mean())
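
# A sketch of the return value (assumes a dataframe shaped like the
# stationaryLocations slices built above; values are illustrative):
# >>> top1, top2, top3, smax, smin, sstd, smean = len_stay_timeattopn(
# ...     segment_df, maximum_gap_allowed=300, maximum_row_duration=60)  # doctest: +SKIP
# All seven values are minutes: time at the three most visited clusters, then
# the max/min/std/mean stay length across all labeled clusters.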

def getMinutesData(locationData):
    # number of distinct minutes of the day with at least one location row
    return locationData[['local_hour', 'local_minute']].drop_duplicates(inplace=False).shape[0]
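
# For example, rows sampled within the same minute of the day count once:
# >>> getMinutesData(pd.DataFrame({"local_hour": [9, 9, 9, 10],
# ...                              "local_minute": [5, 5, 5, 0]}))
# 2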

def distance_to_degrees(d):
    # Just an approximation, but speeds up clustering by a huge amount and
    # doesn't introduce much error over small distances
    d = d / 1852  # meters to arc-minutes (1 nautical mile = 1 arc-minute of latitude)
    d = d / 60    # arc-minutes to degrees
    return d
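
# The approximation rests on 1 nautical mile (1852 m) being one arc-minute of
# latitude, so one degree is roughly 111 km:
# >>> round(distance_to_degrees(1852), 6)
# 0.016667
# >>> round(distance_to_degrees(111320), 2)
# 1.0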

def get_all_travel_distances_meters_speed(locationData, threshold, maximum_gap_allowed):
    lat_lon_temp = locationData[locationData['timeInSeconds'] <= maximum_gap_allowed][['double_latitude', 'double_longitude', 'timeInSeconds']]

    if lat_lon_temp.empty:
        return pd.DataFrame({"speed": [], "speedTag": [], "distances": []})

    lat_lon_temp['distances'] = haversine(lat_lon_temp['double_longitude'], lat_lon_temp['double_latitude'], lat_lon_temp['double_longitude'].shift(-1), lat_lon_temp['double_latitude'].shift(-1))
    lat_lon_temp['speed'] = lat_lon_temp['distances'] / lat_lon_temp['timeInSeconds']  # meter/second
    lat_lon_temp['speed'] = lat_lon_temp['speed'].replace(np.inf, np.nan) * 3.6  # km/h

    lat_lon_temp['speedTag'] = np.where(lat_lon_temp['speed'] >= threshold, "Moving", "Static")

    return lat_lon_temp[['speed', 'speedTag', 'distances']]
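
# A hypothetical call (sketch; expects the double_latitude, double_longitude,
# and timeInSeconds columns built in doryab_features):
# >>> speeddf = get_all_travel_distances_meters_speed(segment_df, threshold=1,
# ...                                                 maximum_gap_allowed=300)  # doctest: +SKIP
# 'distances' is in meters and 'speed' in km/h; rows at or above the threshold
# are tagged 'Moving', the rest 'Static'.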

def vincenty_row(x):
    """
    :param x: A row from a dataframe
    :return: The distance in meters between the before and after points in the row
    """
    # NOTE: relies on geopy's vincenty, which was removed in geopy 2.0
    # (geopy.distance.geodesic is the modern replacement)
    try:
        return vincenty((x['_lat_before'], x['_lon_before']), (x['_lat_after'], x['_lon_after'])).meters
    except Exception:
        return 0

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great-circle distance in meters between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2])

    # haversine formula
    a = np.sin((lat2 - lat1) / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0)**2
    r = 6371  # Radius of earth in kilometers. Use 3956 for miles
    return r * 2 * np.arcsin(np.sqrt(a)) * 1000
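
# Sanity check: one degree of longitude at the equator is about 111.2 km:
# >>> round(float(haversine(0.0, 0.0, 1.0, 0.0)))
# 111195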

def circadian_movement_energies(locationData):
    time = locationData["timestamp"].values / 1000.0  # seconds
    ylat = locationData["double_latitude"].values
    ylong = locationData["double_longitude"].values
    hours_intervals = np.arange(23.5, 24.51, 0.01)  # hours
    seconds_intervals = hours_intervals * 60 * 60  # seconds
    frequency = 1 / seconds_intervals

    power_latitude = LombScargle(time, ylat).power(frequency=frequency, normalization='psd')
    power_longitude = LombScargle(time, ylong).power(frequency=frequency, normalization='psd')

    energy_latitude = np.sum(power_latitude)
    energy_longitude = np.sum(power_longitude)
    return (energy_latitude, energy_longitude)

def circadian_movement(locationData):
    energy_latitude, energy_longitude = circadian_movement_energies(locationData)
    return np.log10(energy_latitude + energy_longitude)
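
# A minimal sketch on synthetic data (not from the pipeline): a latitude trace
# that repeats on a ~24 h cycle concentrates spectral power in the 23.5-24.5 h
# band scanned above, so its circadian movement score is comparatively high:
# >>> ts = np.arange(0, 7 * 24 * 3600, 600) * 1000  # a week of 10-min samples, in ms
# >>> df = pd.DataFrame({"timestamp": ts,
# ...                    "double_latitude": np.sin(2 * np.pi * ts / (24 * 3600 * 1000.0)),
# ...                    "double_longitude": np.zeros(len(ts))})
# >>> circadian_movement(df)  # doctest: +SKIP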

def cluster_and_label(df, clustering_algorithm, threshold_static, **kwargs):
    """
    :param df: a df with columns "double_latitude", "double_longitude", and either
        a "local_date_time" column or a datetime index
    :param kwargs: arguments for sklearn's DBSCAN (or OPTICS)
    :return: a new df with a stationary_or_not flag and a location_label column,
        where the cluster labeled "1" is the most visited, "2" the second most
        visited, and so on
    """
    if not df.empty:
        location_data = df
        if not isinstance(df.index, pd.DatetimeIndex):
            location_data = df.set_index("local_date_time")

        stationary = mark_moving(location_data, threshold_static)

        counts_df = stationary[["double_latitude", "double_longitude"]].groupby(["double_latitude", "double_longitude"]).size().reset_index()
        counts = counts_df[0]
        lat_lon = counts_df[["double_latitude", "double_longitude"]].values

        if clustering_algorithm == "DBSCAN":
            clusterer = DBSCAN(**kwargs)
            cluster_results = clusterer.fit_predict(lat_lon, sample_weight=counts)
        else:
            clusterer = OPTICS(**kwargs)
            cluster_results = clusterer.fit_predict(lat_lon)

        # Need to extend labels back to original df without weights
        counts_df["location_label"] = cluster_results
        # remove the old count column
        del counts_df[0]

        merged = pd.merge(stationary, counts_df, on=["double_latitude", "double_longitude"])

        # Now compute the label mapping:
        cluster_results = merged["location_label"].values
        valid_clusters = cluster_results[np.where(cluster_results != -1)]
        label_map = rank_count_map(valid_clusters)

        # And remap the labels:
        merged.index = stationary.index
        stationary = stationary.assign(location_label=merged["location_label"].map(label_map).values)
        return stationary
    else:
        return df
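
# A hypothetical call (sketch; eps is in degrees, hence distance_to_degrees):
# >>> labeled = cluster_and_label(df, "DBSCAN", threshold_static=1,
# ...                             eps=distance_to_degrees(10), min_samples=5)  # doctest: +SKIP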

def rank_count_map(clusters):
    """Return a function that maps each cluster label to its rank by frequency,
    such that the most common label maps to 1, the second most common to 2,
    and so on. Used here to sort cluster labels so that the cluster with rank 1
    is the most visited. Labels the function can't find map to -1.
    """
    labels, counts = tuple(np.unique(clusters, return_counts=True))
    sorted_by_count = [x for (y, x) in sorted(zip(counts, labels), reverse=True)]
    label_to_rank = {label: rank + 1 for rank, label in enumerate(sorted_by_count)}
    return lambda x: label_to_rank.get(x, -1)
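
# For example, label 2 appears most often, so it maps to rank 1; unseen labels
# map to -1:
# >>> label_map = rank_count_map(np.array([2, 2, 2, 5, 5, 7]))
# >>> [label_map(x) for x in (2, 5, 7, 99)]
# [1, 2, 3, -1]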

def mark_moving(df, threshold_static):
    if not df.index.is_monotonic_increasing:
        df = df.sort_index()

    # distance in km between consecutive rows and elapsed time in hours,
    # so distance / time is a speed in km/h
    distance = haversine(df.double_longitude, df.double_latitude, df.double_longitude.shift(-1), df.double_latitude.shift(-1)) / 1000
    time = (df.timestamp.diff(-1) * -1) / (1000 * 60 * 60)

    df['stationary_or_not'] = np.where((distance / time) < threshold_static, 1, 0)  # 1 for stationary, 0 for moving

    return df
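
# A small synthetic trace: the first hop moves ~111 m in one minute (~6.7 km/h,
# above a 1 km/h threshold), the second stays put; the last row has no
# successor and is marked moving:
# >>> demo = pd.DataFrame({"timestamp": [0, 60000, 120000],
# ...                      "double_latitude": [0.0, 0.0, 0.0],
# ...                      "double_longitude": [0.0, 0.001, 0.001]})
# >>> mark_moving(demo, threshold_static=1)["stationary_or_not"].tolist()
# [0, 1, 0]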

def number_of_significant_places(locationData):
    uniquelst = locationData[locationData["location_label"] >= 1]["location_label"].unique()
    return len(uniquelst)

def number_location_transitions(locationData):
    # ignores transitions from moving to static and vice versa, but counts
    # transitions from outliers to major location clusters
    df = pd.DataFrame()
    df['boolCol'] = (locationData.location_label == locationData.location_label.shift())
    return df[df['boolCol'] == False].shape[0] - 1
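
# For example, labels [1, 1, 2, 2, 1] contain two transitions (1->2 and 2->1);
# the leading NaN from shift() is why one spurious change is subtracted:
# >>> number_location_transitions(pd.DataFrame({"location_label": [1, 1, 2, 2, 1]}))
# 2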

def radius_of_gyration(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    # Center is the centroid of the cluster centroids, not the home location
    valid_clusters = locationData[locationData["location_label"] != -1]
    centroid_all_clusters = (valid_clusters.groupby('location_label')[['double_latitude', 'double_longitude']].mean()).mean()
    clusters_centroid = valid_clusters.groupby('location_label')[['double_latitude', 'double_longitude']].mean()

    rog = 0
    for labels in clusters_centroid.index:
        distance = haversine(clusters_centroid.loc[labels].double_longitude, clusters_centroid.loc[labels].double_latitude,
                             centroid_all_clusters.double_longitude, centroid_all_clusters.double_latitude)**2

        time_in_cluster = locationData[locationData["location_label"] == labels]['timeInSeconds'].sum()
        rog = rog + (time_in_cluster * distance)

    time_all_clusters = valid_clusters['timeInSeconds'].sum()
    if time_all_clusters == 0:
        return 0

    final_rog = (1 / time_all_clusters) * rog
    return np.sqrt(final_rog)
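
# The loop above computes the time-weighted radius of gyration
#     rog = sqrt((1 / sum_k t_k) * sum_k t_k * d(c_k, c)**2)
# where c_k is the centroid of cluster k, t_k the seconds spent in it, and c
# the unweighted mean of all cluster centroids.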

def outlier_time_percent_new(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    clustersDf = locationData[["location_label", "timeInSeconds"]]
    numoutliers = clustersDf[clustersDf["location_label"] == -1]["timeInSeconds"].sum()
    numtotal = clustersDf.timeInSeconds.sum()

    return numoutliers / numtotal

def location_entropy(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    clusters = locationData[locationData["location_label"] >= 1]  # remove outliers/cluster noise
    if len(clusters) > 0:
        # time-based percentage for each location cluster
        percents = clusters.groupby(['location_label'])['timeInSeconds'].sum() / clusters['timeInSeconds'].sum()
        entropy = -1 * percents.map(lambda x: x * np.log(x)).sum()
        return entropy
    else:
        return None

def location_entropy_normalized(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    locationData = locationData[locationData["location_label"] >= 1]  # remove outliers/cluster noise
    entropy = location_entropy(locationData)
    unique_clusters = locationData["location_label"].unique()
    num_clusters = len(unique_clusters)
    if num_clusters == 0 or len(locationData) == 0 or entropy is None:
        return None
    elif np.log(num_clusters) == 0:
        return None
    else:
        return entropy / np.log(num_clusters)
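
# A minimal smoke test for the standalone helpers above; synthetic values only,
# not representative of real phone traces:
if __name__ == "__main__":
    print(round(float(haversine(0.0, 0.0, 1.0, 0.0))))  # ~111195 m
    label_map = rank_count_map(np.array([2, 2, 2, 5, 5, 7]))
    print([label_map(x) for x in (2, 5, 7, 99)])  # [1, 2, 3, -1]
    print(number_location_transitions(pd.DataFrame({"location_label": [1, 1, 2, 2, 1]})))  # 2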