import pandas as pd
import numpy as np
from astropy.timeseries import LombScargle
from sklearn.cluster import DBSCAN
from math import radians, cos, sin, asin, sqrt
def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    location_data = pd.read_csv(sensor_data_files["sensor_data"])
    requested_features = provider["FEATURES"]
    dbscan_eps = provider["DBSCAN_EPS"]
    dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
    threshold_static = provider["THRESHOLD_STATIC"]
    maximum_gap_allowed = provider["MAXIMUM_GAP_ALLOWED"]
    sampling_frequency = provider["SAMPLING_FREQUENCY"]
    minutes_data_used = provider["MINUTES_DATA_USED"]
    if minutes_data_used:
        requested_features.append("minutesdataused")
    # names of the features this function can compute
    base_features_names = ["locationvariance", "loglocationvariance", "totaldistance", "averagespeed", "varspeed", "circadianmovement", "numberofsignificantplaces", "numberlocationtransitions", "radiusgyration", "timeattop1location", "timeattop2location", "timeattop3location", "movingtostaticratio", "outlierstimepercent", "maxlengthstayatclusters", "minlengthstayatclusters", "meanlengthstayatclusters", "stdlengthstayatclusters", "locationentropy", "normalizedlocationentropy", "minutesdataused"]
    # the subset of requested features this function can compute
    features_to_compute = list(set(requested_features) & set(base_features_names))
    if location_data.empty:
        location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
    else:
        location_data = filter_data_by_segment(location_data, time_segment)

        if location_data.empty:
            location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
        else:
            location_features = pd.DataFrame()
            if sampling_frequency == 0:
                sampling_frequency = getSamplingFrequency(location_data)

            if "minutesdataused" in features_to_compute:
                for localDate in location_data["local_segment"].unique():
                    location_features.loc[localDate, "minutesdataused"] = getMinutesData(location_data[location_data["local_segment"] == localDate])

            location_features.index.name = 'local_segment'
            location_data = location_data[(location_data['double_latitude'] != 0.0) & (location_data['double_longitude'] != 0.0)]
            if location_data.empty:
                # keep the empty-result schema consistent with the branches above
                location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
                location_features = location_features.reset_index(drop=True)
                return location_features
if " locationvariance " in features_to_compute :
2020-11-30 20:42:19 +01:00
location_features [ " locationvariance " ] = location_data . groupby ( [ ' local_segment ' ] ) [ ' double_latitude ' ] . var ( ) + location_data . groupby ( [ ' local_segment ' ] ) [ ' double_longitude ' ] . var ( )
2020-07-16 20:26:43 +02:00
if " loglocationvariance " in features_to_compute :
2020-11-30 20:42:19 +01:00
location_features [ " loglocationvariance " ] = ( location_data . groupby ( [ ' local_segment ' ] ) [ ' double_latitude ' ] . var ( ) + location_data . groupby ( [ ' local_segment ' ] ) [ ' double_longitude ' ] . var ( ) ) . apply ( lambda x : np . log10 ( x ) if x > 0 else None )
            preComputedDistanceandSpeed = pd.DataFrame()
            for localDate in location_data['local_segment'].unique():
                distance, speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_segment'] == localDate], threshold_static, maximum_gap_allowed)
                preComputedDistanceandSpeed.loc[localDate, "distance"] = distance.sum()
                preComputedDistanceandSpeed.loc[localDate, "avgspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].mean()
                preComputedDistanceandSpeed.loc[localDate, "varspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].var()
if " totaldistance " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in location_data [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " totaldistance " ] = preComputedDistanceandSpeed . loc [ localDate , " distance " ]
2020-07-16 20:26:43 +02:00
if " averagespeed " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in location_data [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " averagespeed " ] = preComputedDistanceandSpeed . loc [ localDate , " avgspeed " ]
2020-07-16 20:26:43 +02:00
if " varspeed " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in location_data [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " varspeed " ] = preComputedDistanceandSpeed . loc [ localDate , " varspeed " ]
if " circadianmovement " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in location_data [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " circadianmovement " ] = circadian_movement ( location_data [ location_data [ ' local_segment ' ] == localDate ] )
            newLocationData = cluster_and_label(location_data, eps=distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)
if " numberofsignificantplaces " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " numberofsignificantplaces " ] = number_of_significant_places ( newLocationData [ newLocationData [ ' local_segment ' ] == localDate ] )
2020-07-16 20:26:43 +02:00
if " numberlocationtransitions " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " numberlocationtransitions " ] = number_location_transitions ( newLocationData [ newLocationData [ ' local_segment ' ] == localDate ] )
if " radiusgyration " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " radiusgyration " ] = radius_of_gyration ( newLocationData [ newLocationData [ ' local_segment ' ] == localDate ] , sampling_frequency )
if " timeattop1location " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " timeattop1 " ] = time_at_topn_clusters_in_group ( newLocationData [ newLocationData [ ' local_segment ' ] == localDate ] , 1 , sampling_frequency )
2020-07-16 20:26:43 +02:00
if " timeattop2location " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " timeattop2 " ] = time_at_topn_clusters_in_group ( newLocationData [ newLocationData [ ' local_segment ' ] == localDate ] , 2 , sampling_frequency )
2020-07-16 20:26:43 +02:00
if " timeattop3location " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " timeattop3 " ] = time_at_topn_clusters_in_group ( newLocationData [ newLocationData [ ' local_segment ' ] == localDate ] , 3 , sampling_frequency )
if " movingtostaticratio " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " movingtostaticratio " ] = ( newLocationData [ newLocationData [ ' local_segment ' ] == localDate ] . shape [ 0 ] * sampling_frequency ) / ( location_data [ location_data [ ' local_segment ' ] == localDate ] . shape [ 0 ] * sampling_frequency )
if " outlierstimepercent " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " outlierstimepercent " ] = outliers_time_percent ( newLocationData [ newLocationData [ ' local_segment ' ] == localDate ] , sampling_frequency )
            preComputedmaxminCluster = pd.DataFrame()
            for localDate in newLocationData['local_segment'].unique():
                smax, smin, sstd, smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_segment'] == localDate], sampling_frequency)
                preComputedmaxminCluster.loc[localDate, "maxlengthstayatclusters"] = smax
                preComputedmaxminCluster.loc[localDate, "minlengthstayatclusters"] = smin
                preComputedmaxminCluster.loc[localDate, "stdlengthstayatclusters"] = sstd
                preComputedmaxminCluster.loc[localDate, "meanlengthstayatclusters"] = smean
if " maxlengthstayatclusters " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " maxlengthstayatclusters " ] = preComputedmaxminCluster . loc [ localDate , " maxlengthstayatclusters " ]
2020-07-16 20:26:43 +02:00
if " minlengthstayatclusters " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " minlengthstayatclusters " ] = preComputedmaxminCluster . loc [ localDate , " minlengthstayatclusters " ]
2020-07-16 20:26:43 +02:00
if " stdlengthstayatclusters " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " stdlengthstayatclusters " ] = preComputedmaxminCluster . loc [ localDate , " stdlengthstayatclusters " ]
2020-07-16 20:26:43 +02:00
if " meanlengthstayatclusters " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " meanlengthstayatclusters " ] = preComputedmaxminCluster . loc [ localDate , " meanlengthstayatclusters " ]
if " locationentropy " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " locationentropy " ] = location_entropy ( newLocationData [ newLocationData [ ' local_segment ' ] == localDate ] )
2020-07-16 20:26:43 +02:00
if " normalizedlocationentropy " in features_to_compute :
2020-08-28 19:53:00 +02:00
for localDate in newLocationData [ ' local_segment ' ] . unique ( ) :
2020-11-30 20:42:19 +01:00
location_features . loc [ localDate , " normalizedlocationentropy " ] = location_entropy_normalized ( newLocationData [ newLocationData [ ' local_segment ' ] == localDate ] )
    location_features = location_features.reset_index()
    return location_features
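# A minimal usage sketch (hypothetical file name and segment label; the real
# call is made by the surrounding pipeline, which also supplies
# filter_data_by_segment; provider keys are the ones read above):
#
#   provider = {"FEATURES": ["locationvariance", "radiusgyration"],
#               "DBSCAN_EPS": 10, "DBSCAN_MINSAMPLES": 5,
#               "THRESHOLD_STATIC": 1, "MAXIMUM_GAP_ALLOWED": 300,
#               "SAMPLING_FREQUENCY": 0, "MINUTES_DATA_USED": False}
#   features = doryab_features({"sensor_data": "locations.csv"}, "daily",
#                              provider, filter_data_by_segment)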
def getMinutesData(locationData):
    return locationData[['local_hour', 'local_minute']].drop_duplicates(inplace=False).shape[0]
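# getMinutesData counts distinct (hour, minute) pairs, i.e. minutes with at
# least one location sample. Sketch with hypothetical rows: local_hour/
# local_minute values (10, 0), (10, 0), (10, 1) yield 2, since the two
# 10:00 rows collapse into a single minute.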
def distance_to_degrees(d):
    # Just an approximation, but speeds up clustering by a huge amount and
    # doesn't introduce much error over small distances
    d = d / 1852
    d = d / 60
    return d
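# 1852 m is one nautical mile, i.e. one arcminute of latitude, and 60
# arcminutes make a degree. Rough check: distance_to_degrees(111120)
# returns exactly 1.0, about one degree of latitude.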
def get_all_travel_distances_meters_speed(locationData, threshold, maximum_gap_allowed):
    lat_lon_temp = pd.DataFrame()

    lat_lon_temp['_lat_before'] = locationData.double_latitude
    lat_lon_temp['_lat_after'] = locationData.double_latitude.shift(-1)
    lat_lon_temp['_lon_before'] = locationData.double_longitude
    lat_lon_temp['_lon_after'] = locationData.double_longitude.shift(-1)
    lat_lon_temp['time_before'] = pd.to_datetime(locationData['local_time'], format="%H:%M:%S")
    lat_lon_temp['time_after'] = lat_lon_temp['time_before'].shift(-1)
    lat_lon_temp['time_diff'] = lat_lon_temp['time_after'] - lat_lon_temp['time_before']
    lat_lon_temp['timeInSeconds'] = lat_lon_temp['time_diff'].apply(lambda x: x.total_seconds())

    lat_lon_temp = lat_lon_temp[lat_lon_temp['timeInSeconds'] <= maximum_gap_allowed]
    if lat_lon_temp.empty:
        return pd.Series(dtype="float64"), pd.DataFrame({"speed": [], "speedTag": []})

    lat_lon_temp['distances'] = lat_lon_temp.apply(haversine, axis=1)  # meters
    lat_lon_temp['speed'] = (lat_lon_temp['distances'] / lat_lon_temp['timeInSeconds'])  # m/s
    lat_lon_temp['speed'] = lat_lon_temp['speed'].replace(np.inf, np.nan) * 3.6  # km/h

    distances = lat_lon_temp['distances']
    lat_lon_temp = lat_lon_temp.dropna()
    lat_lon_temp['speedTag'] = np.where(lat_lon_temp['speed'] >= threshold, "Moving", "Static")
    return distances, lat_lon_temp[['speed', 'speedTag']]
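# Note: speeds are converted to km/h (m/s * 3.6), and row pairs whose gap
# exceeds maximum_gap_allowed (seconds) are dropped first, so a single long
# recording gap cannot masquerade as a huge travel distance or speed.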
def vincenty_row(x):
    """
    :param x: A row from a dataframe with _lat/_lon before and after columns
    :return: The distance in meters between the two points (0 on failure)
    """
    # Unused alternative to haversine; requires geopy < 2.0 (geopy 2.x
    # replaced vincenty with geodesic)
    try:
        from geopy.distance import vincenty
        return vincenty((x['_lat_before'], x['_lon_before']), (x['_lat_after'], x['_lon_after'])).meters
    except Exception:
        return 0
def haversine(x):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = x['_lon_before'], x['_lat_before'], x['_lon_after'], x['_lat_after']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # radius of earth in kilometers; use 3956 for miles
    return c * r * 1000
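# Worked example: one degree of longitude at the equator.
#   haversine({'_lon_before': 0.0, '_lat_before': 0.0,
#              '_lon_after': 1.0, '_lat_after': 0.0})
# returns ~111195 meters, matching the usual ~111.2 km-per-degree figure.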
def circadian_movement_energies(locationData):
    time = (locationData["timestamp"].values / 1000.0)  # seconds
    ylat = locationData["double_latitude"].values
    ylong = locationData["double_longitude"].values
    hours_intervals = np.arange(23.5, 24.51, 0.01)  # hours
    seconds_intervals = hours_intervals * 60 * 60  # seconds
    frequency = 1 / seconds_intervals

    power_latitude = LombScargle(time, ylat).power(frequency=frequency, normalization='psd')
    power_longitude = LombScargle(time, ylong).power(frequency=frequency, normalization='psd')
    energy_latitude = np.sum(power_latitude)
    energy_longitude = np.sum(power_longitude)
    return (energy_latitude, energy_longitude)

def circadian_movement(locationData):
    energy_latitude, energy_longitude = circadian_movement_energies(locationData)
    return np.log10(energy_latitude + energy_longitude)
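# circadian_movement scans periods between 23.5 and 24.5 hours with a
# Lomb-Scargle periodogram (which tolerates the unevenly sampled GPS
# timestamps), sums the spectral power of latitude and longitude over that
# band, and takes log10 of the total; higher values mean movement follows a
# stronger ~24 h rhythm.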
def cluster_and_label(df, **kwargs):
    """
    :param df: a df with columns "double_latitude" and "double_longitude",
               and either a datetime index or a "local_date_time" column
    :param kwargs: arguments for sklearn's DBSCAN
    :return: a new df of labeled locations with moving points removed, where
             the cluster labeled "1" is the largest, "2" the second largest,
             and so on
    """
    location_data = df
    if not isinstance(df.index, pd.DatetimeIndex):
        location_data = df.set_index("local_date_time")

    stationary = remove_moving(location_data, 1)

    # Cluster unique (lat, lon) pairs weighted by how often each pair occurs,
    # instead of every raw row; this keeps DBSCAN's input small.
    clusterer = DBSCAN(**kwargs)
    counts_df = stationary[["double_latitude", "double_longitude"]].groupby(["double_latitude", "double_longitude"]).size().reset_index()
    counts = counts_df[0]
    lat_lon = counts_df[["double_latitude", "double_longitude"]].values
    cluster_results = clusterer.fit_predict(lat_lon, sample_weight=counts)

    # Extend labels back to the original df without weights
    counts_df["location_label"] = cluster_results
    del counts_df[0]  # remove the old count column
    merged = pd.merge(stationary, counts_df, on=["double_latitude", "double_longitude"])

    # Compute the label mapping (most visited cluster -> 1) and remap the labels
    cluster_results = merged["location_label"].values
    valid_clusters = cluster_results[np.where(cluster_results != -1)]
    label_map = rank_count_map(valid_clusters)

    merged.index = stationary.index
    stationary = stationary.assign(location_label=merged["location_label"].map(label_map).values)

    return stationary
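# A minimal sketch of the call as used above (eps is in degrees, converted
# from the configured meters; variable names as in doryab_features):
#
#   labeled = cluster_and_label(location_data,
#                               eps=distance_to_degrees(dbscan_eps),
#                               min_samples=dbscan_minsamples)
#   # labeled["location_label"]: 1 = most visited cluster, -1 = DBSCAN noise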
def rank_count_map(clusters):
    """Return a function which maps each cluster label to its rank by
    frequency, such that the most common label maps to 1, the second most
    common to 2, and so on; labels that can't be found map to -1.
    Used in this context to sort the cluster labels so that the cluster with
    rank 1 is the most visited.
    """
    labels, counts = tuple(np.unique(clusters, return_counts=True))
    sorted_by_count = [x for (y, x) in sorted(zip(counts, labels), reverse=True)]
    label_to_rank = {label: rank + 1 for (rank, label) in enumerate(sorted_by_count)}
    return lambda x: label_to_rank.get(x, -1)
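# Example: rank_count_map(np.array([7, 7, 7, 3, 3, 9])) yields a mapper where
# 7 -> 1 (most frequent), 3 -> 2, 9 -> 3, and any unseen label -> -1.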
def remove_moving(df, v):
    if not df.index.is_monotonic:
        df = df.sort_index()

    lat_lon_temp = pd.DataFrame()
    lat_lon_temp['_lat_before'] = df.double_latitude.shift()
    lat_lon_temp['_lat_after'] = df.double_latitude.shift(-1)
    lat_lon_temp['_lon_before'] = df.double_longitude.shift()
    lat_lon_temp['_lon_after'] = df.double_longitude.shift(-1)

    # speed = distance (km) / time (hours), compared against v (km/h)
    distance = lat_lon_temp.apply(haversine, axis=1) / 1000
    time = ((pd.to_datetime(df.reset_index().local_date_time.shift(-1), format="%Y-%m-%d %H:%M:%S")
             - pd.to_datetime(df.reset_index().local_date_time.shift(), format="%Y-%m-%d %H:%M:%S"))
            / np.timedelta64(1, 's')).fillna(-1) / (60. * 60)
    time.index = distance.index.copy()

    return df[(distance / time) < v]
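# remove_moving keeps only fixes whose implied speed between neighbors stays
# below v km/h; cluster_and_label calls it with v = 1, so points collected
# while in transit are dropped before clustering.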
def number_of_significant_places(locationData):
    uniquelst = locationData[locationData["location_label"] >= 1]["location_label"].unique()
    return len(uniquelst)

def number_location_transitions(locationData):
    # ignores transitions from moving to static and vice versa, but counts
    # transitions from outliers to major location clusters
    df = pd.DataFrame()
    df['boolCol'] = (locationData.location_label == locationData.location_label.shift())
    return df[df['boolCol'] == False].shape[0] - 1
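# Example: labels [1, 1, 2, 2, 1] produce 3 rows where the label differs from
# its predecessor (the first row always differs from NaN); subtracting 1 for
# that first row gives 2 transitions.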
def radius_of_gyration(locationData, sampling_frequency):
    if locationData is None or len(locationData) == 0:
        return None

    # Center is the centroid of all clusters, not the home location
    valid_clusters = locationData[locationData["location_label"] != -1]
    centroid_all_clusters = (valid_clusters.groupby('location_label')[['double_latitude', 'double_longitude']].mean()).mean()
    clusters_centroid = valid_clusters.groupby('location_label')[['double_latitude', 'double_longitude']].mean()

    rog = 0
    for labels in clusters_centroid.index:
        lat_lon_dict = dict()
        lat_lon_dict['_lon_before'] = clusters_centroid.loc[labels].double_longitude
        lat_lon_dict['_lat_before'] = clusters_centroid.loc[labels].double_latitude
        lat_lon_dict['_lon_after'] = centroid_all_clusters.double_longitude
        lat_lon_dict['_lat_after'] = centroid_all_clusters.double_latitude
        distance = haversine(lat_lon_dict)**2

        time_in_cluster = locationData[locationData["location_label"] == labels].shape[0] * sampling_frequency
        rog = rog + (time_in_cluster * distance)

    time_all_clusters = valid_clusters.shape[0] * sampling_frequency
    if time_all_clusters == 0:
        return 0
    final_rog = (1 / time_all_clusters) * rog

    return np.sqrt(final_rog)
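# This follows the usual radius-of-gyration form: with t_i the time spent in
# cluster i, d_i the haversine distance from cluster i's centroid to the
# centroid of all clusters, and T the total time across clusters,
#   rog = sqrt((1 / T) * sum_i(t_i * d_i**2))
# where time is approximated here as row_count * sampling_frequency.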
def time_at_topn_clusters_in_group(locationData, n, sampling_frequency):  # relevant only for global location features since top3_clusters = top3_clusters_in_group for local
    if locationData is None or len(locationData) == 0:
        return None

    locationData = locationData[locationData["location_label"] >= 1]  # remove outliers/cluster noise
    valcounts = locationData["location_label"].value_counts().to_dict()
    sorted_valcounts = sorted(valcounts.items(), key=lambda kv: (-kv[1], kv[0]))

    if len(sorted_valcounts) >= n:
        topn = sorted_valcounts[n - 1]
        topn_time = topn[1] * sampling_frequency
    else:
        topn_time = None

    return topn_time
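# Ties between equally visited clusters are broken by the smaller label
# (key=lambda kv: (-kv[1], kv[0])), so the result is deterministic; the
# returned value is row_count * sampling_frequency, i.e. minutes when
# sampling_frequency is the median gap in minutes.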
def outliers_time_percent(locationData, sampling_frequency):
    if locationData is None or len(locationData) == 0:
        return None

    clusters = locationData["location_label"]
    numoutliers = (clusters == -1).sum() * sampling_frequency
    numtotal = len(clusters) * sampling_frequency
    return (float(numoutliers) / numtotal)
def moving_time_percent(locationData):
    if locationData is None or len(locationData) == 0:
        return None
    lbls = locationData["location_label"]
    nummoving = lbls.isnull().sum()
    numtotal = len(lbls)
    return (float(nummoving) / numtotal)
def len_stay_at_clusters_in_minutes(locationData, sampling_frequency):
    if locationData is None or len(locationData) == 0:
        return (None, None, None, None)

    # count runs of consecutive samples at the same cluster label
    lenstays = []
    count = 0
    prev_loc_label = None
    for row in locationData.iterrows():
        cur_loc_label = row[1]["location_label"]
        if np.isnan(cur_loc_label):
            continue
        elif prev_loc_label is None:
            prev_loc_label = int(cur_loc_label)
            count += 1
        else:
            if prev_loc_label == int(cur_loc_label):
                count += 1
            else:
                lenstays.append(count)
                prev_loc_label = int(cur_loc_label)
                count = 1
    if count > 0:  # in case of no transition
        lenstays.append(count)
    lenstays = np.array(lenstays) * sampling_frequency

    if len(lenstays) > 0:
        smax = np.max(lenstays)
        smin = np.min(lenstays)
        sstd = np.std(lenstays)
        smean = np.mean(lenstays)
    else:
        smax = None
        smin = None
        sstd = None
        smean = None
    return (smax, smin, sstd, smean)
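# Example: labels [1, 1, 2, 2, 2] contain stays of 2 and 3 consecutive
# samples; with sampling_frequency = 1 (one minute per sample) this returns
# (smax, smin, sstd, smean) = (3.0, 2.0, 0.5, 2.5).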
def location_entropy(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    clusters = locationData[locationData["location_label"] >= 1]  # remove outliers/cluster noise
    if len(clusters) > 0:
        # get the proportion of samples at each location
        percents = clusters["location_label"].value_counts(normalize=True)
        entropy = -1 * percents.map(lambda x: x * np.log(x)).sum()
        return entropy
    else:
        return None
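# Example: time split evenly across two clusters gives Shannon entropy
# -2 * (0.5 * ln 0.5) = ln 2 ≈ 0.693; a single cluster gives 0.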
def location_entropy_normalized(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    locationData = locationData[locationData["location_label"] >= 1]  # remove outliers/cluster noise
    entropy = location_entropy(locationData)
    unique_clusters = locationData["location_label"].unique()
    num_clusters = len(unique_clusters)
    if num_clusters == 0 or len(locationData) == 0 or entropy is None:
        return None
    else:
        return entropy / num_clusters
def getSamplingFrequency(locationData):
    return ((pd.to_datetime(locationData['local_time'], format="%H:%M:%S")
             - pd.to_datetime(locationData['local_time'].shift(periods=1), format="%H:%M:%S"))
            .apply(lambda x: x.total_seconds()) / 60).median()
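# Sketch: local_time values "10:00:00", "10:01:00", "10:03:00" give gaps of
# 1 and 2 minutes, so the inferred sampling frequency (the median gap) is
# 1.5 minutes. This is only used when SAMPLING_FREQUENCY is configured as 0.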