82 lines
4.2 KiB
Python
82 lines
4.2 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
from sklearn.cluster import DBSCAN, OPTICS
|
|
|
|
|
|
|
|
# Calculate the great-circle distance (in meters) between two points on the earth (specified in decimal degrees)
|
|
def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in meters between two points on the earth.

    All coordinates are given in decimal degrees. Each argument may be a
    scalar or an array-like; the four arguments are broadcast against each
    other, so scalars and arrays can be mixed freely (e.g. the distance from
    every row of a table to one fixed point).

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s)
    lon2, lat2 : longitude and latitude of the second point(s)

    Returns
    -------
    A NumPy scalar for scalar inputs, otherwise an ndarray with the
    broadcast shape of the inputs.
    """
    # Radius of earth in kilometers. Use 3956 for miles
    r = 6371
    # Convert decimal degrees to radians. Each argument is converted on its own
    # (instead of stacking all four into one array) so that their shapes only
    # need to be broadcastable, not identical.
    lon1, lat1, lon2, lat2 = (np.radians(np.asarray(value)) for value in (lon1, lat1, lon2, lat2))
    # Haversine formula
    a = np.sin((lat2 - lat1) / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (nearly)
    # antipodal points, which would make arcsin return NaN.
    a = np.minimum(a, 1.0)
    # The trailing factor converts kilometers to meters
    return r * 2 * np.arcsin(np.sqrt(a)) * 1000
|
|
|
|
# Just an approximation, but speeds up clustering by a huge amount and doesn't introduce much error over small distances
|
|
# Reference: https://jonisalonen.com/2014/computing-distance-between-coordinates-can-be-simple-and-fast/
|
|
def meters_to_degrees(distance):
    """Approximate a distance given in meters as an angle in degrees.

    The conversion goes through nautical miles: 1852 meters per nautical
    mile, and 60 nautical miles per degree.
    """
    meters_per_nautical_mile = 1852
    nautical_miles_per_degree = 60
    # Meters -> nautical miles -> degrees
    nautical_miles = distance / meters_per_nautical_mile
    return nautical_miles / nautical_miles_per_degree
|
|
|
|
# Relabel clusters by total duration: -1 denotes the outliers (insignificant or rarely visited locations), 1 denotes the significant location with the longest total duration, 2 denotes the location with the 2nd longest total duration, ...
|
|
def label(location_data):
    """Relabel clusters by descending total duration.

    Parameters
    ----------
    location_data : DataFrame with a "cluster_label" column (-1 marks
        outliers, missing values mark unclustered rows) and a "duration"
        column.

    Returns
    -------
    A new DataFrame in which "cluster_label" has been replaced: the cluster
    with the largest summed duration becomes 1, the next one 2, and so on.
    Outliers keep the label -1 and rows without a label stay missing. The
    replaced column is placed last.
    """
    # Exclude outliers (cluster_label = -1) while calculating the total duration of locations in a cluster.
    # Only the label column is masked, so a duration that happens to equal -1 is not discarded.
    durations = location_data[["cluster_label", "duration"]].assign(
        cluster_label=lambda frame: frame["cluster_label"].replace(-1, np.nan)
    )
    label2duration = durations.groupby("cluster_label")[["duration"]].sum().sort_values(by=["duration"], ascending=False)
    # Add the row number as the new cluster label
    label2duration["new_cluster_label"] = np.arange(len(label2duration)) + 1
    # Still use -1 to denote the outliers
    label2duration.loc[-1, "new_cluster_label"] = -1
    # Merge the new cluster label with the original location data
    location_data = location_data.merge(label2duration[["new_cluster_label"]], left_on="cluster_label", right_index=True, how="left")

    # Swap the old labels for the new ones
    return location_data.drop(columns=["cluster_label"]).rename(columns={"new_cluster_label": "cluster_label"})
|
|
|
|
def create_clustering_hyperparameters(clustering_algorithm, dbscan_eps, dbscan_minsamples):
    """Build the keyword arguments for the selected clustering algorithm.

    `dbscan_eps` is passed through `meters_to_degrees` before being used as
    the neighborhood radius. "DBSCAN" yields `eps`/`min_samples`; any other
    value is treated as OPTICS and yields `max_eps`/`min_samples` together
    with a euclidean metric and the "dbscan" cluster method.
    """
    eps_in_degrees = meters_to_degrees(dbscan_eps)

    if clustering_algorithm == "DBSCAN":
        return {"eps": eps_in_degrees, "min_samples": dbscan_minsamples}

    # OPTICS
    return {
        "max_eps": eps_in_degrees,
        "min_samples": dbscan_minsamples,
        "metric": "euclidean",
        "cluster_method": "dbscan",
    }
|
|
|
|
# Only stationary samples are clustered, hence moving samples are labeled with NA
|
|
def cluster(location_data, clustering_algorithm, **kwargs):
    """Cluster the stationary location samples and attach a ranked cluster label.

    Parameters
    ----------
    location_data : DataFrame with "double_latitude", "double_longitude" and
        "is_stationary" columns, plus either "duration" or
        "duration_in_seconds" (the latter is divided by 60 to derive
        "duration").
    clustering_algorithm : "DBSCAN" selects DBSCAN; any other value selects
        OPTICS.
    **kwargs : forwarded unchanged to the clusterer constructor; must contain
        "min_samples".

    Returns
    -------
    The location data with a "cluster_label" column as produced by `label`.
    Rows whose coordinates were not part of the clustering input get a
    missing label. For empty input, an empty DataFrame with the input
    columns plus "is_stationary" and "cluster_label" is returned.

    NOTE(review): labels are merged back on coordinates only, so a
    non-stationary row that shares exact coordinates with a stationary row
    also receives that row's label - confirm this is intended.
    """
    if location_data.empty:
        # Append the output columns only when they are missing, so the empty
        # result never carries duplicated column names.
        output_columns = location_data.columns.tolist()
        output_columns += [column for column in ("is_stationary", "cluster_label") if column not in output_columns]
        return pd.DataFrame(columns=output_columns)

    if "duration" not in location_data.columns:
        # Convert second to minute
        location_data = location_data.assign(duration=location_data["duration_in_seconds"] / 60)

    coordinate_columns = ["double_latitude", "double_longitude"]

    # Only keep stationary samples for clustering
    stationary_data = location_data[location_data["is_stationary"] == 1][coordinate_columns + ["duration"]]

    # Remove duplicates and apply sample_weight (only available for DBSCAN currently) to reduce memory usage
    stationary_data_dedup = stationary_data.groupby(coordinate_columns)[["duration"]].sum().reset_index()
    lat_lon_dedup = stationary_data_dedup[coordinate_columns].values

    if stationary_data_dedup.shape[0] < kwargs["min_samples"]:
        # Fewer distinct points than min_samples: mark every point as an outlier without running the clusterer
        cluster_results = np.array([-1] * stationary_data_dedup.shape[0])
    elif clustering_algorithm == "DBSCAN":
        clusterer = DBSCAN(**kwargs)
        # The summed duration of each distinct coordinate is used as its weight
        cluster_results = clusterer.fit_predict(lat_lon_dedup, sample_weight=stationary_data_dedup["duration"])
    else:  # OPTICS
        clusterer = OPTICS(**kwargs)
        cluster_results = clusterer.fit_predict(lat_lon_dedup)

    # Add cluster labels
    stationary_data_dedup["cluster_label"] = cluster_results
    labeled_coordinates = stationary_data_dedup[coordinate_columns + ["cluster_label"]]
    location_data_with_labels = label(location_data.merge(labeled_coordinates, how="left", on=coordinate_columns))

    return location_data_with_labels
|