# 2021-04-09 18:05:25 +02:00
import pandas as pd
import numpy as np
from sklearn . cluster import DBSCAN , OPTICS
def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in meters between two points.

    Coordinates are given in decimal degrees; numpy arrays broadcast as usual.
    """
    # Work in radians for the trigonometric terms
    lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # Haversine formula: central angle from the half-chord length
    half_chord = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    earth_radius_km = 6371  # use 3956 for miles
    # Convert kilometers to meters on the way out
    return earth_radius_km * 2 * np.arcsin(np.sqrt(half_chord)) * 1000
# Just an approximation, but speeds up clustering by a huge amount and doesn't introduce much error over small distances
# Reference: https://jonisalonen.com/2014/computing-distance-between-coordinates-can-be-simple-and-fast/
def meters_to_degrees(distance):
    """Approximate a distance in meters as decimal degrees of latitude.

    meters -> nautical miles (/ 1852) -> degrees (/ 60, since 1 degree = 60 nm).
    """
    return distance / 1852 / 60
# Relabel clusters: -1 denotes the outliers (insignificant or rarely visited locations), 1 denotes the most visited significant location, 2 denotes the 2nd most significant location,...
def label(location_data):
    """Rank cluster labels by total duration and relabel rows accordingly.

    The most visited cluster gets label 1, the second most visited gets 2, etc.
    Outliers keep the label -1.
    """
    # Mask the outlier label with NaN so groupby drops it from the ranking
    ranking = (
        location_data[["cluster_label", "duration"]]
        .replace(-1, np.nan)
        .groupby("cluster_label")[["duration"]]
        .sum()
        .sort_values(by=["duration"], ascending=False)
    )
    # Rank position (1-based) becomes the new cluster label
    ranking["new_cluster_label"] = np.arange(len(ranking)) + 1
    # Outliers keep -1
    ranking.loc[-1, "new_cluster_label"] = -1
    # Attach the new labels to the original rows, then swap the columns
    relabeled = location_data.merge(
        ranking[["new_cluster_label"]], left_on="cluster_label", right_index=True, how="left"
    )
    del relabeled["cluster_label"]
    relabeled.rename(columns={"new_cluster_label": "cluster_label"}, inplace=True)
    return relabeled
def create_clustering_hyperparameters(clustering_algorithm, dbscan_eps, dbscan_minsamples):
    """Build the keyword-argument dict for the chosen clustering algorithm.

    `dbscan_eps` is given in meters and converted to degrees here.
    """
    eps_in_degrees = meters_to_degrees(dbscan_eps)
    if clustering_algorithm == "DBSCAN":
        return {"eps": eps_in_degrees, "min_samples": dbscan_minsamples}
    # OPTICS: cluster_method="dbscan" extracts DBSCAN-style clusters from the reachability plot
    return {
        "max_eps": eps_in_degrees,
        "min_samples": dbscan_minsamples,
        "metric": "euclidean",
        "cluster_method": "dbscan",
    }
# Only stationary samples are clustered, hence moving samples are labeled with NA
def cluster(location_data, clustering_algorithm, **kwargs):
    """Cluster stationary location samples and attach ranked cluster labels.

    Parameters:
        location_data: DataFrame with at least "double_latitude",
            "double_longitude", "is_stationary", and either "duration"
            (minutes) or "duration_in_seconds".
        clustering_algorithm: "DBSCAN" or "OPTICS".
        **kwargs: hyperparameters forwarded to the clusterer; must include
            "min_samples".

    Returns the input rows with a "cluster_label" column added (NA for
    moving samples, -1 for outliers, 1..k ranked by total duration).
    """
    if location_data.empty:
        # Fix: only append the columns that are missing — the original code
        # appended "is_stationary" unconditionally, which duplicated that
        # column whenever the (empty) input already carried it.
        extra_columns = [col for col in ["is_stationary", "cluster_label"] if col not in location_data.columns]
        return pd.DataFrame(columns=location_data.columns.tolist() + extra_columns)
    if "duration" not in location_data.columns:
        # Convert seconds to minutes
        location_data = location_data.assign(duration=location_data["duration_in_seconds"] / 60)
    # Only keep stationary samples for clustering
    stationary_data = location_data[location_data["is_stationary"] == 1][["double_latitude", "double_longitude", "duration"]]
    # Remove duplicate coordinates; the summed duration serves as sample_weight
    # (only available for DBSCAN currently) to reduce memory usage
    stationary_data_dedup = stationary_data.groupby(["double_latitude", "double_longitude"])[["duration"]].sum().reset_index()
    lat_lon_dedup = stationary_data_dedup[["double_latitude", "double_longitude"]].values
    if stationary_data_dedup.shape[0] < kwargs["min_samples"]:
        # Too few distinct points to form any cluster: everything is an outlier
        cluster_results = np.array([-1] * stationary_data_dedup.shape[0])
    elif clustering_algorithm == "DBSCAN":
        clusterer = DBSCAN(**kwargs)
        # Weight each deduplicated point by its accumulated duration
        cluster_results = clusterer.fit_predict(lat_lon_dedup, sample_weight=stationary_data_dedup["duration"])
    else:  # OPTICS (no sample_weight support)
        clusterer = OPTICS(**kwargs)
        cluster_results = clusterer.fit_predict(lat_lon_dedup)
    # Add cluster labels to the deduplicated points
    stationary_data_dedup["cluster_label"] = cluster_results
    # Propagate labels back to all original rows, then rank labels by duration
    location_data_with_labels = label(location_data.merge(stationary_data_dedup[["double_latitude", "double_longitude", "cluster_label"]], how="left", on=["double_latitude", "double_longitude"]))
    return location_data_with_labels