import pandas as pd
import numpy as np
from astropy.timeseries import LombScargle
from sklearn.cluster import DBSCAN
from math import radians, cos, sin, asin, sqrt

def base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples, threshold_static):
    # name of the features this function can compute
    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]

    # the subset of requested features this function can compute
    features_to_compute = list(set(requested_features) & set(base_features_names))

    if location_data.empty:
        location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute])
    else:
        if day_segment != "daily":
            location_data = location_data[location_data["local_day_segment"] == day_segment]

        if location_data.empty:
            location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute])
        else:
            location_features = pd.DataFrame()

            # discard rows where both coordinates are exactly zero
            location_data = location_data[(location_data['double_latitude'] != 0.0) & (location_data['double_longitude'] != 0.0)]

if "locationvariance" in features_to_compute:
|
|
location_features["location_" + day_segment + "_locationvariance"] = location_data.groupby(['local_date'])['double_latitude'].var() + location_data.groupby(['local_date'])['double_longitude'].var()
|
|
|
|
if "loglocationvariance" in features_to_compute:
|
|
location_features["location_" + day_segment + "_loglocationvariance"] = np.log10(location_data.groupby(['local_date'])['double_latitude'].var() + location_data.groupby(['local_date'])['double_longitude'].var())
|
|
|
|
|
|
            # precompute per-day travel distance and speed summaries so the
            # distance/speed features below can reuse them
            preComputedDistanceandSpeed = pd.DataFrame()
            for localDate in location_data['local_date'].unique():
                distance, speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_date'] == localDate], threshold_static)
                preComputedDistanceandSpeed.loc[localDate, "distance"] = distance.sum()
                preComputedDistanceandSpeed.loc[localDate, "avgspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].mean()
                preComputedDistanceandSpeed.loc[localDate, "varspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].var()

if "totaldistance" in features_to_compute:
|
|
for localDate in location_data['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"]
|
|
|
|
if "averagespeed" in features_to_compute:
|
|
for localDate in location_data['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"]
|
|
|
|
if "varspeed" in features_to_compute:
|
|
for localDate in location_data['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"]
|
|
|
|
if "circadianmovement" in features_to_compute:
|
|
for localDate in location_data['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_circadianmovement"] = circadian_movement(location_data[location_data['local_date']==localDate])
|
|
|
|
            # cluster stationary samples into significant places; eps is converted from meters to degrees
            newLocationData = cluster_and_label(location_data, eps=distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)

if "numberofsignificantplaces" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_date']==localDate])
|
|
|
|
if "numberlocationtransitions" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_date']==localDate])
|
|
|
|
if "radiusgyration" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_date']==localDate])
|
|
|
|
if "timeattop1location" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],1)
|
|
|
|
if "timeattop2location" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],2)
|
|
|
|
if "timeattop3location" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],3)
|
|
|
|
if "movingtostaticratio" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_movingtostaticratio"] = (newLocationData[newLocationData['local_date']==localDate].shape[0] / location_data[location_data['local_date']==localDate].shape[0])
|
|
|
|
if "outlierstimepercent" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_date']==localDate])
|
|
|
|
            # precompute per-day length-of-stay summaries at the detected clusters
            preComputedmaxminCluster = pd.DataFrame()
            for localDate in newLocationData['local_date'].unique():
                smax, smin, sstd, smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_date'] == localDate])
                preComputedmaxminCluster.loc[localDate, "location_" + day_segment + "_maxlengthstayatclusters"] = smax
                preComputedmaxminCluster.loc[localDate, "location_" + day_segment + "_minlengthstayatclusters"] = smin
                preComputedmaxminCluster.loc[localDate, "location_" + day_segment + "_stdlengthstayatclusters"] = sstd
                preComputedmaxminCluster.loc[localDate, "location_" + day_segment + "_meanlengthstayatclusters"] = smean

if "maxlengthstayatclusters" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"]
|
|
|
|
if "minlengthstayatclusters" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"]
|
|
|
|
if "stdlengthstayatclusters" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"]
|
|
|
|
if "meanlengthstayatclusters" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"]
|
|
|
|
if "locationentropy" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_locationentropy"] = location_entropy(newLocationData[newLocationData['local_date']==localDate])
|
|
|
|
if "normalizedlocationentropy" in features_to_compute:
|
|
for localDate in newLocationData['local_date'].unique():
|
|
location_features.loc[localDate,"location_" + day_segment + "_normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_date']==localDate])
|
|
|
|
            location_features = location_features.reset_index()

    return location_features

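# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the pipeline): base_location_features
# expects a dataframe with at least the columns used above, i.e.
# double_latitude, double_longitude, timestamp (milliseconds), local_date,
# local_time, local_date_time and, for non-daily segments, local_day_segment.
# The parameter values below are made up for the example:
#
#   features = base_location_features(location_data,
#                                     day_segment="daily",
#                                     requested_features=["locationvariance", "radiusgyration"],
#                                     dbscan_eps=10,          # meters
#                                     dbscan_minsamples=5,
#                                     threshold_static=1)     # km/h
# ---------------------------------------------------------------------------
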
def distance_to_degrees(d):
    # Just an approximation, but it speeds up clustering by a huge amount and
    # doesn't introduce much error over small distances.
    d = d / 1852  # meters to nautical miles
    d = d / 60    # nautical miles (arc minutes) to degrees
    return d

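# For example, a DBSCAN eps of 10 meters becomes 10 / 1852 / 60 ≈ 9.0e-5 degrees.
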
def get_all_travel_distances_meters_speed(locationData, threshold):

    lat_lon_temp = pd.DataFrame()

    lat_lon_temp['_lat_before'] = locationData.double_latitude
    lat_lon_temp['_lat_after'] = locationData.double_latitude.shift(-1)
    lat_lon_temp['_lon_before'] = locationData.double_longitude
    lat_lon_temp['_lon_after'] = locationData.double_longitude.shift(-1)
    lat_lon_temp['time_before'] = pd.to_datetime(locationData['local_time'], format="%H:%M:%S")
    lat_lon_temp['time_after'] = lat_lon_temp['time_before'].shift(-1)
    lat_lon_temp['time_diff'] = lat_lon_temp['time_after'] - lat_lon_temp['time_before']
    lat_lon_temp['timeInSeconds'] = lat_lon_temp['time_diff'].apply(lambda x: x.total_seconds())

    # only keep pairs of consecutive samples that are at most 5 minutes apart
    lat_lon_temp = lat_lon_temp[lat_lon_temp['timeInSeconds'] <= 300]
    lat_lon_temp['distances'] = lat_lon_temp.apply(haversine, axis=1)  # meters
    lat_lon_temp['speed'] = (lat_lon_temp['distances'] / lat_lon_temp['timeInSeconds'])  # m/s
    lat_lon_temp['speed'] = lat_lon_temp['speed'].replace(np.inf, np.nan) * 3.6  # km/h

    distances = lat_lon_temp['distances']
    lat_lon_temp = lat_lon_temp.dropna()
    lat_lon_temp['speedTag'] = np.where(lat_lon_temp['speed'] >= threshold, "Moving", "Static")

    return distances, lat_lon_temp[['speed', 'speedTag']]

def vincenty_row(x):
    """
    :param x: A row from a dataframe with _lat_before/_lon_before and _lat_after/_lon_after columns
    :return: The distance in meters between the two points, or 0 if it cannot be computed

    Note: relies on a vincenty distance implementation (e.g., geopy's), which is not
    imported here; this helper is currently unused and haversine() is used instead.
    """
    try:
        return vincenty((x['_lat_before'], x['_lon_before']), (x['_lat_after'], x['_lon_after'])).meters
    except:
        return 0

def haversine(x):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = x['_lon_before'], x['_lat_before'], x['_lon_after'], x['_lat_after']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of earth in kilometers. Use 3956 for miles
    return c * r * 1000  # meters

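# Sanity check (approximate): with r = 6371 km, one degree of latitude spans about
# 111.2 km, so two fixes 0.001 degrees apart in latitude are roughly 111 meters apart.
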
def circadian_movement_energies(locationData):
    time = (locationData["timestamp"].values / 1000.0)  # seconds
    ylat = locationData["double_latitude"].values
    ylong = locationData["double_longitude"].values
    hours_intervals = np.arange(23.5, 24.51, 0.01)  # periods (hours) around the 24-hour cycle
    seconds_intervals = hours_intervals * 60 * 60  # periods in seconds
    frequency = 1 / seconds_intervals

    power_latitude = LombScargle(time, ylat).power(frequency=frequency, normalization='psd')
    power_longitude = LombScargle(time, ylong).power(frequency=frequency, normalization='psd')

    energy_latitude = np.sum(power_latitude)
    energy_longitude = np.sum(power_longitude)
    return (energy_latitude, energy_longitude)

def circadian_movement(locationData):
    energy_latitude, energy_longitude = circadian_movement_energies(locationData)
    return np.log10(energy_latitude + energy_longitude)

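# In other words, circadian movement = log10(E_lat + E_lon), where E_lat and E_lon
# are the Lomb-Scargle spectral powers of latitude and longitude summed over
# periods between 23.5 and 24.5 hours.
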
def cluster_and_label(df, **kwargs):
    """
    :param df: a df with columns "double_latitude", "double_longitude", and either
               a "local_date_time" column or a datetime index
    :param kwargs: arguments for sklearn's DBSCAN
    :return: a new df of labeled locations with moving points removed, where the cluster
             labeled as "1" is the largest, "2" the second largest, and so on
    """
    location_data = df
    if not isinstance(df.index, pd.DatetimeIndex):
        location_data = df.set_index("local_date_time")

    # keep only samples recorded while (roughly) stationary, i.e. moving slower than 1 km/h
    stationary = remove_moving(location_data, 1)

    # return degrees(arcminutes=nautical(meters= d))
    # nautical miles = m ÷ 1,852
    clusterer = DBSCAN(**kwargs)

    # cluster each unique coordinate once, weighted by how often it occurs
    counts_df = stationary[["double_latitude", "double_longitude"]].groupby(["double_latitude", "double_longitude"]).size().reset_index()
    counts = counts_df[0]
    lat_lon = counts_df[["double_latitude", "double_longitude"]].values
    cluster_results = clusterer.fit_predict(lat_lon, sample_weight=counts)

    # Need to extend labels back to original df without weights
    counts_df["location_label"] = cluster_results
    # remove the old count column
    del counts_df[0]

    merged = pd.merge(stationary, counts_df, on=["double_latitude", "double_longitude"])

    # Now compute the label mapping:
    cluster_results = merged["location_label"].values
    valid_clusters = cluster_results[np.where(cluster_results != -1)]
    label_map = rank_count_map(valid_clusters)

    # And remap the labels so the most visited cluster becomes 1, the second most visited 2, etc.
    merged.index = stationary.index
    stationary["location_label"] = merged["location_label"].map(label_map)

    return stationary

def rank_count_map(clusters):
    """ Returns a function which maps each cluster label to its rank,
    such that the most common label maps to 1.

    Used here to sort cluster labels so that the cluster with rank 1 is the most
    visited. Labels that cannot be found map to -1.
    """
    labels, counts = tuple(np.unique(clusters, return_counts=True))
    sorted_by_count = [x for (y, x) in sorted(zip(counts, labels), reverse=True)]
    label_to_rank = {label: rank + 1 for (label, rank) in [(sorted_by_count[i], i) for i in range(len(sorted_by_count))]}

    return lambda x: label_to_rank.get(x, -1)

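# For example, rank_count_map(np.array([2, 2, 2, 5, 5, 7])) maps 2 -> 1, 5 -> 2,
# 7 -> 3, and any unseen label (e.g. 99) -> -1.
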
def remove_moving(df, v):

    if not df.index.is_monotonic_increasing:
        df = df.sort_index()

    lat_lon_temp = pd.DataFrame()

    lat_lon_temp['_lat_before'] = df.double_latitude.shift()
    lat_lon_temp['_lat_after'] = df.double_latitude.shift(-1)
    lat_lon_temp['_lon_before'] = df.double_longitude.shift()
    lat_lon_temp['_lon_after'] = df.double_longitude.shift(-1)

    # speed between the previous and the next sample, in km/h
    distance = lat_lon_temp.apply(haversine, axis=1) / 1000
    time = (pd.to_datetime(df.reset_index().local_date_time.shift(-1), format="%Y-%m-%d %H:%M:%S") - pd.to_datetime(df.reset_index().local_date_time.shift(), format="%Y-%m-%d %H:%M:%S")).fillna(-1) / np.timedelta64(1, 's') / (60. * 60)
    time.index = distance.index.copy()

    # keep only samples whose speed is below v km/h
    return df[(distance / time) < v]

def number_of_significant_places(locationData):
    # clusters labeled >= 1 are significant places; -1 marks noise/outliers
    uniquelst = locationData[locationData["location_label"] >= 1]["location_label"].unique()
    return len(uniquelst)

def number_location_transitions(locationData):
    # ignores transitions from moving to static and vice-versa, but counts transitions from outliers to major location clusters
    df = pd.DataFrame()

    # a False in boolCol marks a row whose label differs from the previous row's label;
    # the first row is always False, hence the "- 1"
    df['boolCol'] = (locationData.location_label == locationData.location_label.shift())

    return df[df['boolCol'] == False].shape[0] - 1

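# For example, a day with labels [1, 1, 2, 2, 1] yields 2 transitions (1 -> 2 and 2 -> 1).
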
def radius_of_gyration(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    # Center is the centroid, not the home location
    valid_clusters = locationData[locationData["location_label"] != -1]
    changes_selector = (valid_clusters["location_label"].shift() != valid_clusters["location_label"])
    mobility_trace = valid_clusters[changes_selector]

    # Average x,y
    lat_lon = mobility_trace[["double_latitude", "double_longitude"]].values
    center = np.average(lat_lon, axis=0)
    norm = np.linalg.norm(lat_lon - center)

    if len(lat_lon) != 0:
        return np.sqrt(norm) / len(lat_lon)
    else:
        return 0

def time_at_topn_clusters_in_group(locationData, n):  # relevant only for global location features since top3_clusters = top3_clusters_in_group for local

    if locationData is None or len(locationData) == 0:
        return None

    locationData = locationData[locationData["location_label"] >= 1]  # remove outliers/cluster noise
    valcounts = locationData["location_label"].value_counts().to_dict()
    sorted_valcounts = sorted(valcounts.items(), key=lambda kv: (-kv[1], kv[0]))

    if len(sorted_valcounts) >= n:
        topn = sorted_valcounts[n - 1]
        topn_time = topn[1]
    else:
        topn_time = None

    return topn_time

def outliers_time_percent(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    # outliers are the samples DBSCAN labeled as noise (-1); summing them gives
    # -1 * (number of outliers), hence the sign flip below
    clusters = locationData["location_label"]
    numoutliers = clusters[(clusters == -1)].sum()
    numtotal = len(clusters)
    return (float(-1 * numoutliers) / numtotal)

def moving_time_percent(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    # samples with a null label are the ones recorded while moving
    lbls = locationData["location_label"]
    nummoving = lbls.isnull().sum()
    numtotal = len(lbls)
    # print(nummoving)
    # print(numtotal)
    return (float(nummoving) / numtotal)

def len_stay_at_clusters_in_minutes(locationData):
    if locationData is None or len(locationData) == 0:
        return (None, None, None, None)

    # a stay is counted as the number of consecutive samples assigned to the same cluster
    lenstays = []
    count = 0
    prev_loc_label = None
    for row in locationData.iterrows():
        cur_loc_label = row[1]["location_label"]
        if np.isnan(cur_loc_label):
            continue
        elif prev_loc_label is None:
            prev_loc_label = int(cur_loc_label)
            count += 1
        else:
            if prev_loc_label == int(cur_loc_label):
                count += 1
            else:
                lenstays.append(count)
                prev_loc_label = int(cur_loc_label)
                count = 0 + 1
    if count > 0:  # in case of no transition
        lenstays.append(count)
    lenstays = np.array(lenstays)

    if len(lenstays) > 0:
        smax = np.max(lenstays)
        smin = np.min(lenstays)
        sstd = np.std(lenstays)
        smean = np.mean(lenstays)
    else:
        smax = None
        smin = None
        sstd = None
        smean = None
    return (smax, smin, sstd, smean)

def location_entropy(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    clusters = locationData[locationData["location_label"] >= 1]  # remove outliers/cluster noise
    if len(clusters) > 0:
        # Get percentages for each location
        percents = clusters["location_label"].value_counts(normalize=True)
        entropy = -1 * percents.map(lambda x: x * np.log(x)).sum()
        return entropy
    else:
        return None

def location_entropy_normalized(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    locationData = locationData[locationData["location_label"] >= 1]  # remove outliers/cluster noise
    entropy = location_entropy(locationData)
    unique_clusters = locationData["location_label"].unique()
    num_clusters = len(unique_clusters)
    if num_clusters == 0 or len(locationData) == 0 or entropy is None:
        return None
    else:
        return entropy / num_clusters

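
if __name__ == "__main__":
    # Minimal smoke test with made-up cluster labels (illustrative only, not part
    # of the pipeline). "location_label" follows the convention used above:
    # labels >= 1 are significant places ranked by visit frequency.
    demo = pd.DataFrame({"location_label": [1, 1, 1, 2, 2, 1, 1]})
    print(number_of_significant_places(demo))      # 2 distinct clusters
    print(number_location_transitions(demo))       # 2 transitions (1 -> 2 and 2 -> 1)
    print(len_stay_at_clusters_in_minutes(demo))   # (max, min, std, mean) of consecutive-sample stays
    print(location_entropy(demo))                  # about 0.6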