# 2021-04-09 18:05:25 +02:00
import pandas as pd
import numpy as np
from sklearn . cluster import DBSCAN , OPTICS
def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in meters between two points.

    Coordinates are given in decimal degrees; numpy arrays broadcast as usual.
    """
    # Work in radians for the trigonometric terms
    lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # Haversine formula: central angle from the half-chord length
    half_chord = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    earth_radius_km = 6371  # use 3956 for miles
    # Convert kilometers to meters on the way out
    return earth_radius_km * 2 * np.arcsin(np.sqrt(half_chord)) * 1000
# Just an approximation, but speeds up clustering by a huge amount and doesn't introduce much error over small distances
# Reference: https://jonisalonen.com/2014/computing-distance-between-coordinates-can-be-simple-and-fast/
def meters_to_degrees(distance):
    """Approximate a distance in meters as decimal degrees of latitude.

    meters -> nautical miles (/ 1852) -> degrees (/ 60, since 1 degree = 60 nm).
    """
    return distance / 1852 / 60
# Relabel clusters: -1 denotes the outliers (insignificant or rarely visited locations), 1 denotes the most visited significant location, 2 denotes the 2nd most significant location,...
def label(location_data):
    """Rank cluster labels by total duration and relabel rows accordingly.

    The most visited cluster gets label 1, the second most visited gets 2, etc.
    Outliers keep the label -1.
    """
    # Mask the outlier label with NaN so groupby drops it from the ranking
    ranking = (
        location_data[["cluster_label", "duration"]]
        .replace(-1, np.nan)
        .groupby("cluster_label")[["duration"]]
        .sum()
        .sort_values(by=["duration"], ascending=False)
    )
    # Rank position (1-based) becomes the new cluster label
    ranking["new_cluster_label"] = np.arange(len(ranking)) + 1
    # Outliers keep -1
    ranking.loc[-1, "new_cluster_label"] = -1
    # Attach the new labels to the original rows, then swap the columns
    relabeled = location_data.merge(
        ranking[["new_cluster_label"]], left_on="cluster_label", right_index=True, how="left"
    )
    del relabeled["cluster_label"]
    relabeled.rename(columns={"new_cluster_label": "cluster_label"}, inplace=True)
    return relabeled
def create_clustering_hyperparameters(clustering_algorithm, dbscan_eps, dbscan_minsamples):
    """Build the keyword-argument dict for the chosen clustering algorithm.

    `dbscan_eps` is given in meters and converted to degrees here.
    """
    eps_in_degrees = meters_to_degrees(dbscan_eps)
    if clustering_algorithm == "DBSCAN":
        return {"eps": eps_in_degrees, "min_samples": dbscan_minsamples}
    # OPTICS: cluster_method="dbscan" extracts DBSCAN-style clusters from the reachability plot
    return {
        "max_eps": eps_in_degrees,
        "min_samples": dbscan_minsamples,
        "metric": "euclidean",
        "cluster_method": "dbscan",
    }
# Only stationary samples are clustered, hence moving samples are labeled with NA
def cluster(location_data, clustering_algorithm, **kwargs):
    """Cluster stationary location samples and attach ranked cluster labels.

    Parameters:
        location_data: DataFrame with at least "double_latitude",
            "double_longitude", "is_stationary", and either "duration"
            (minutes) or "duration_in_seconds".
        clustering_algorithm: "DBSCAN" or "OPTICS".
        **kwargs: hyperparameters forwarded to the clusterer; must include
            "min_samples".

    Returns the input rows with a "cluster_label" column added (NA for
    moving samples, -1 for outliers, 1..k ranked by total duration).
    """
    if location_data.empty:
        # Fix: only append the columns that are missing — the original code
        # appended "is_stationary" unconditionally, which duplicated that
        # column whenever the (empty) input already carried it.
        extra_columns = [col for col in ["is_stationary", "cluster_label"] if col not in location_data.columns]
        return pd.DataFrame(columns=location_data.columns.tolist() + extra_columns)
    if "duration" not in location_data.columns:
        # Convert seconds to minutes
        location_data = location_data.assign(duration=location_data["duration_in_seconds"] / 60)
    # Only keep stationary samples for clustering
    stationary_data = location_data[location_data["is_stationary"] == 1][["double_latitude", "double_longitude", "duration"]]
    # Remove duplicate coordinates; the summed duration serves as sample_weight
    # (only available for DBSCAN currently) to reduce memory usage
    stationary_data_dedup = stationary_data.groupby(["double_latitude", "double_longitude"])[["duration"]].sum().reset_index()
    lat_lon_dedup = stationary_data_dedup[["double_latitude", "double_longitude"]].values
    if stationary_data_dedup.shape[0] < kwargs["min_samples"]:
        # Too few distinct points to form any cluster: everything is an outlier
        cluster_results = np.array([-1] * stationary_data_dedup.shape[0])
    elif clustering_algorithm == "DBSCAN":
        clusterer = DBSCAN(**kwargs)
        # Weight each deduplicated point by its accumulated duration
        cluster_results = clusterer.fit_predict(lat_lon_dedup, sample_weight=stationary_data_dedup["duration"])
    else:  # OPTICS (no sample_weight support)
        clusterer = OPTICS(**kwargs)
        cluster_results = clusterer.fit_predict(lat_lon_dedup)
    # Add cluster labels to the deduplicated points
    stationary_data_dedup["cluster_label"] = cluster_results
    # Propagate labels back to all original rows, then rank labels by duration
    location_data_with_labels = label(location_data.merge(stationary_data_dedup[["double_latitude", "double_longitude", "cluster_label"]], how="left", on=["double_latitude", "double_longitude"]))
    return location_data_with_labels