rapids/src/features/phone_locations/doryab/doryab_clustering.py

import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN, OPTICS

# Calculate the great-circle distance (in meters) between two points on the earth (specified in decimal degrees)
def haversine(lon1, lat1, lon2, lat2):
    # Radius of earth in kilometers. Use 3956 for miles
    r = 6371
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2])
    # Haversine formula; the result is multiplied by 1000 to convert kilometers to meters
    a = np.sin((lat2 - lat1) / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2
    distance = r * 2 * np.arcsin(np.sqrt(a)) * 1000
    return distance
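
# Sanity check: haversine(0, 0, 1, 0) ~= 111195, i.e. two points one degree of
# longitude apart on the equator are about 111.2 km apart.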

# Just an approximation, but it speeds up clustering by a huge amount and doesn't introduce much error over small distances
# Reference: https://jonisalonen.com/2014/computing-distance-between-coordinates-can-be-simple-and-fast/
def meters_to_degrees(distance):
    # Convert meters to nautical miles (1 nautical mile = 1 arcminute of latitude)
    distance = distance / 1852
    # Convert nautical miles to degrees (60 arcminutes per degree)
    distance = distance / 60
    return distance
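
# Example: meters_to_degrees(100) ~= 0.0009, since one degree of latitude spans
# 1852 * 60 = 111120 meters.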

# Relabel clusters: -1 denotes the outliers (insignificant or rarely visited locations), 1 denotes the most visited significant location, 2 denotes the second most visited, and so on
def label(location_data):
    # Exclude outliers (cluster_label = -1) while calculating the total duration of locations in a cluster
    label2duration = location_data[["cluster_label", "duration"]].replace(-1, np.nan).groupby("cluster_label")[["duration"]].sum().sort_values(by=["duration"], ascending=False)
    # Add the row number as the new cluster label
    label2duration["new_cluster_label"] = np.arange(len(label2duration)) + 1
    # Still use -1 to denote the outliers
    label2duration.loc[-1, "new_cluster_label"] = -1
    # Merge the new cluster label with the original location data
    location_data = location_data.merge(label2duration[["new_cluster_label"]], left_on="cluster_label", right_index=True, how="left")
    del location_data["cluster_label"]
    location_data.rename(columns={"new_cluster_label": "cluster_label"}, inplace=True)
    return location_data
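
# Illustration (hypothetical numbers): if raw cluster labels 0 and 1 accumulate 45 and
# 10 minutes respectively, label() renames 0 -> 1 (most visited) and 1 -> 2, while
# outliers keep the label -1 and unclustered samples keep NA.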

def create_clustering_hyperparameters(clustering_algorithm, dbscan_eps, dbscan_minsamples):
    if clustering_algorithm == "DBSCAN":
        hyperparameters = {"eps": meters_to_degrees(dbscan_eps), "min_samples": dbscan_minsamples}
    else:  # OPTICS: eps is reused as max_eps, and clusters are extracted DBSCAN-style
        hyperparameters = {"max_eps": meters_to_degrees(dbscan_eps), "min_samples": dbscan_minsamples, "metric": "euclidean", "cluster_method": "dbscan"}
    return hyperparameters
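
# Example: create_clustering_hyperparameters("DBSCAN", 10, 5) returns
# {"eps": ~9.0e-05, "min_samples": 5}, i.e. a 10-meter radius expressed in degrees.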

# Only stationary samples are clustered, hence moving samples are labeled with NA
def cluster(location_data, clustering_algorithm, **kwargs):
    if location_data.empty:
        return pd.DataFrame(columns=location_data.columns.tolist() + ["is_stationary", "cluster_label"])
    if "duration" not in location_data.columns:
        # Convert seconds to minutes
        location_data = location_data.assign(duration=location_data["duration_in_seconds"] / 60)
    # Only keep stationary samples for clustering
    stationary_data = location_data[location_data["is_stationary"] == 1][["double_latitude", "double_longitude", "duration"]]
    # Collapse duplicate coordinates, summing their durations; the summed duration serves as the sample weight (only supported by DBSCAN currently), which reduces memory usage
    stationary_data_dedup = stationary_data.groupby(["double_latitude", "double_longitude"])[["duration"]].sum().reset_index()
    lat_lon_dedup = stationary_data_dedup[["double_latitude", "double_longitude"]].values
    if stationary_data_dedup.shape[0] < kwargs["min_samples"]:
        # Too few distinct points to form any cluster: mark everything as an outlier
        cluster_results = np.array([-1] * stationary_data_dedup.shape[0])
    elif clustering_algorithm == "DBSCAN":
        clusterer = DBSCAN(**kwargs)
        cluster_results = clusterer.fit_predict(lat_lon_dedup, sample_weight=stationary_data_dedup["duration"])
    else:  # OPTICS
        clusterer = OPTICS(**kwargs)
        cluster_results = clusterer.fit_predict(lat_lon_dedup)
    # Attach cluster labels to the deduplicated points, propagate them back to all samples, and rank clusters by total duration
    stationary_data_dedup["cluster_label"] = cluster_results
    location_data_with_labels = label(location_data.merge(stationary_data_dedup[["double_latitude", "double_longitude", "cluster_label"]], how="left", on=["double_latitude", "double_longitude"]))
    return location_data_with_labels
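
# A minimal usage sketch (not part of the RAPIDS pipeline): the column names match what
# cluster() expects, but the coordinates, durations, and parameter values are made up.
if __name__ == "__main__":
    sample = pd.DataFrame({
        "double_latitude": [40.4443, 40.4444, 40.4443, 40.4500, 40.4501, 40.4600],
        "double_longitude": [-79.9436, -79.9437, -79.9436, -79.9500, -79.9501, -79.9600],
        "duration_in_seconds": [600, 1200, 900, 300, 300, 600],
        "is_stationary": [1, 1, 1, 1, 1, 0],  # the last sample is moving and stays unlabeled
    })
    # Two nearby point groups should come out as clusters 1 and 2, ranked by total duration
    hyperparameters = create_clustering_hyperparameters("DBSCAN", dbscan_eps=100, dbscan_minsamples=2)
    labeled = cluster(sample, "DBSCAN", **hyperparameters)
    print(labeled[["double_latitude", "double_longitude", "is_stationary", "cluster_label"]])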