From 38fadbf202aab3662ca7aea90e74987c05f4004d Mon Sep 17 00:00:00 2001
From: JulioV <JulioV@users.noreply.github.com>
Date: Thu, 14 Jan 2021 14:22:51 -0500
Subject: [PATCH] Feature/doryab location clustering (#111)

* Added OPTICS, a lightweight clustering algorithm.

* Changed the error message for inconsistent parameters in CONFIG

* Removing hardcoded values and changing default EPS value in the clustering algorithm.

* Added Observation in Doryab Feature docs.

Co-authored-by: nikunjgoel95 <nikunjgoel2009@gmail.com>
---
 config.yaml                                 |  3 +-
 docs/features/phone-locations.md            |  6 +++-
 src/features/phone_locations/doryab/main.py | 37 +++++++++++++--------
 3 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/config.yaml b/config.yaml
index 4b97b3ec..ac871093 100644
--- a/config.yaml
+++ b/config.yaml
@@ -209,13 +209,14 @@ PHONE_LOCATIONS:
     DORYAB:
       COMPUTE: False
       FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
-      DBSCAN_EPS: 10 # meters
+      DBSCAN_EPS: 100 # meters
       DBSCAN_MINSAMPLES: 5
       THRESHOLD_STATIC : 1 # km/h
       MAXIMUM_GAP_ALLOWED: 300
       MINUTES_DATA_USED: False
       SAMPLING_FREQUENCY: 0
       CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT
+      CLUSTERING_ALGORITHM: DBSCAN # DBSCAN,OPTICS
       SRC_FOLDER: "doryab" # inside src/features/phone_locations
       SRC_LANGUAGE: "python"
 
diff --git a/docs/features/phone-locations.md b/docs/features/phone-locations.md
index 223a224d..1529525e 100644
--- a/docs/features/phone-locations.md
+++ b/docs/features/phone-locations.md
@@ -107,6 +107,7 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 | `[MINUTES_DATA_USED]`     | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
 | `[SAMPLING_FREQUENCY]`     | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations.
 | `[CLUSTER_ON]`             | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings).
+| `[CLUSTERING_ALGORITHM]`   | The original Doryab et al. implementation uses `DBSCAN`; `OPTICS` is also available and produces similar (but not identical) clustering results with lower memory consumption.
 
 
 Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
@@ -140,4 +141,7 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
     Significant locations are determined using DBSCAN clustering on locations that a patient visit over the course of the period of data collection.
 
     **The Circadian Calculation**
-    For a detailed description of how this is calculated, see [Canzian et al](../../citation#doryab-locations).
\ No newline at end of file
+    For a detailed description of how this is calculated, see [Canzian et al](../../citation#doryab-locations).
+
+    **Fine Tuning Clustering Parameters**
+    Based on an experiment in which we collected fused location data for 7 days (mean accuracy of 86, SD of 350.874635), we determined that `EPS/MAX_EPS`=100 produced the clustering results closest to reality. Higher values (>100) missed some significant places, such as a short grocery visit, while lower values (<100) picked up traffic lights and stop signs encountered while driving as significant locations. We recommend that you set `EPS` (the `[DBSCAN_EPS]` parameter) based on the accuracy of your location data: the more accurate your data is, the lower you should be able to set it.
\ No newline at end of file
diff --git a/src/features/phone_locations/doryab/main.py b/src/features/phone_locations/doryab/main.py
index f46f993e..42c63c99 100644
--- a/src/features/phone_locations/doryab/main.py
+++ b/src/features/phone_locations/doryab/main.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
 from astropy.timeseries import LombScargle
-from sklearn.cluster import DBSCAN
+from sklearn.cluster import DBSCAN,OPTICS
 from math import radians, cos, sin, asin, sqrt
 
 def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
@@ -14,6 +14,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     maximum_gap_allowed = provider["MAXIMUM_GAP_ALLOWED"]
     sampling_frequency = provider["SAMPLING_FREQUENCY"]
     cluster_on = provider["CLUSTER_ON"]
+    clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
     
     minutes_data_used = provider["MINUTES_DATA_USED"]
     if(minutes_data_used):
@@ -24,19 +25,25 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))
 
-    
+    if clustering_algorithm == "DBSCAN":
+        hyperparameters = {'eps' : distance_to_degrees(dbscan_eps), 'min_samples': dbscan_minsamples}
+    elif clustering_algorithm == "OPTICS":
+        hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric':'euclidean', 'cluster_method' : 'dbscan'} 
+    else:
+        raise ValueError("config[PHONE_LOCATIONS][DORYAB][CLUSTERING ALGORITHM] only accepts DBSCAN or OPTICS but you provided ",clustering_algorithm)
 
     if location_data.empty:
         location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     else:
         if cluster_on == "PARTICIPANT_DATASET":
-            location_data = cluster_and_label(location_data, eps= distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)
+            location_data = cluster_and_label(location_data,clustering_algorithm,threshold_static,**hyperparameters)
             location_data = filter_data_by_segment(location_data, time_segment)
+
         elif cluster_on == "TIME_SEGMENT":
             location_data = filter_data_by_segment(location_data, time_segment)
-            location_data = cluster_and_label(location_data, eps= distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)
+            location_data = cluster_and_label(location_data,clustering_algorithm,threshold_static,**hyperparameters)
         else:
-            raise ValueError("Incorrect Clustering technique in Config")
+            raise ValueError("config[PHONE_LOCATIONS][DORYAB][CLUSTER_ON] only accepts PARTICIPANT_DATASET or TIME_SEGMENT but you provided ",cluster_on)
 
         if location_data.empty:
             location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
@@ -237,7 +244,7 @@ def circadian_movement(locationData):
     energy_latitude, energy_longitude = circadian_movement_energies(locationData)
     return np.log10(energy_latitude + energy_longitude)
 
-def cluster_and_label(df,**kwargs):
+def cluster_and_label(df,clustering_algorithm,threshold_static,**kwargs):
     """
 
     :param df:   a df with columns "latitude", "longitude", and "datetime"
@@ -252,16 +259,18 @@ def cluster_and_label(df,**kwargs):
         if not isinstance(df.index, pd.DatetimeIndex):
             location_data = df.set_index("local_date_time")
 
-        stationary = mark_moving(location_data,1)
-
-        #return degrees(arcminutes=nautical(meters= d))
-        #nautical miles = m ÷ 1,852
-        clusterer = DBSCAN(**kwargs)
+        stationary = mark_moving(location_data,threshold_static)
 
         counts_df = stationary[["double_latitude" ,"double_longitude"]].groupby(["double_latitude" ,"double_longitude"]).size().reset_index()
         counts = counts_df[0]
         lat_lon = counts_df[["double_latitude","double_longitude"]].values
-        cluster_results = clusterer.fit_predict(lat_lon, sample_weight= counts)
+
+        if clustering_algorithm == "DBSCAN":
+            clusterer = DBSCAN(**kwargs)
+            cluster_results = clusterer.fit_predict(lat_lon, sample_weight= counts)
+        else:
+            clusterer = OPTICS(**kwargs)
+            cluster_results = clusterer.fit_predict(lat_lon)
 
         #Need to extend labels back to original df without weights
         counts_df["location_label"] = cluster_results
@@ -302,7 +311,7 @@ def rank_count_map(clusters):
     return lambda x: label_to_rank.get(x, -1)
 
 
-def mark_moving(df, v):
+def mark_moving(df, threshold_static):
 
     if not df.index.is_monotonic:
         df = df.sort_index()
@@ -310,7 +319,7 @@ def mark_moving(df, v):
     distance = haversine(df.double_longitude,df.double_latitude,df.double_longitude.shift(-1),df.double_latitude.shift(-1))/ 1000
     time = (df.timestamp.diff(-1) * -1) / (1000*60*60)
     
-    df['stationary_or_not'] = np.where((distance / time) < v,1,0)   # 1 being stationary,0 for moving 
+    df['stationary_or_not'] = np.where((distance / time) < threshold_static,1,0)   # 1 being stationary,0 for moving 
 
     return df