Feature/doryab location clustering (#111)

* Added OPTICS, a lightweight clustering algorithm.

* Changed the error message for inconsistent parameters in CONFIG.

* Removed hardcoded values and changed the default EPS value in the clustering algorithm.

* Added an observation on fine tuning clustering parameters to the Doryab feature docs.

Co-authored-by: nikunjgoel95 <nikunjgoel2009@gmail.com>
JulioV 2021-01-14 14:22:51 -05:00 committed by GitHub
parent 22f2bfd211
commit 38fadbf202
3 changed files with 30 additions and 16 deletions


@@ -209,13 +209,14 @@ PHONE_LOCATIONS:
     DORYAB:
       COMPUTE: False
       FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
-      DBSCAN_EPS: 10 # meters
+      DBSCAN_EPS: 100 # meters
       DBSCAN_MINSAMPLES: 5
       THRESHOLD_STATIC : 1 # km/h
       MAXIMUM_GAP_ALLOWED: 300
       MINUTES_DATA_USED: False
       SAMPLING_FREQUENCY: 0
       CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT
+      CLUSTERING_ALGORITHM: DBSCAN # DBSCAN,OPTICS
       SRC_FOLDER: "doryab" # inside src/features/phone_locations
       SRC_LANGUAGE: "python"


@@ -107,6 +107,7 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 | `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes; the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough. |
 | `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations. |
 | `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset, or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings). |
+| `[CLUSTERING_ALGORITHM]` | The original Doryab et al. implementation uses `DBSCAN`; `OPTICS` is also available, with similar (but not identical) clustering results and lower memory consumption. |

 Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
@@ -140,4 +141,7 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 Significant locations are determined using DBSCAN clustering on the locations that a patient visits over the course of the period of data collection.

 **The Circadian Calculation**
 For a detailed description of how this is calculated, see [Canzian et al](../../citation#doryab-locations).
+
+**Fine Tuning Clustering Parameters**
+Based on an experiment in which we collected fused location data for 7 days with a mean accuracy of 86 meters (SD = 350.87 meters), we determined that `EPS/MAX_EPS` = 100 produced clustering results closer to reality. Higher values (>100) missed some significant places, such as a short grocery visit, while lower values (<100) picked up traffic lights and stop signs while driving as significant locations. We recommend setting `EPS` based on the accuracy of your location data; the more accurate your data, the lower you should be able to set `EPS`.
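`EPS`/`MAX_EPS` is configured in meters, but DBSCAN and OPTICS cluster raw latitude/longitude values in degrees, so the provider converts the value with its `distance_to_degrees` helper. A minimal sketch of that conversion, inferred from the nautical-mile comments removed in the source diff below (the repo's actual helper may differ):

    def distance_to_degrees(d_meters):
        # Inferred sketch: 1 arcminute of latitude is ~1 nautical mile (~1852 m),
        # and 60 arcminutes make one degree of latitude.
        nautical_miles = d_meters / 1852
        arcminutes = nautical_miles
        return arcminutes / 60

    print(distance_to_degrees(100))  # ~0.0009 degrees, the eps/max_eps actually used

At `EPS` = 100 meters, the clusterers therefore operate at roughly a 0.0009-degree radius.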


@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
 from astropy.timeseries import LombScargle
-from sklearn.cluster import DBSCAN
+from sklearn.cluster import DBSCAN, OPTICS
 from math import radians, cos, sin, asin, sqrt

 def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
@@ -14,6 +14,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
     maximum_gap_allowed = provider["MAXIMUM_GAP_ALLOWED"]
     sampling_frequency = provider["SAMPLING_FREQUENCY"]
     cluster_on = provider["CLUSTER_ON"]
+    clustering_algorithm = provider["CLUSTERING_ALGORITHM"]

     minutes_data_used = provider["MINUTES_DATA_USED"]
     if minutes_data_used:
@@ -24,19 +25,25 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))

+    if clustering_algorithm == "DBSCAN":
+        hyperparameters = {"eps": distance_to_degrees(dbscan_eps), "min_samples": dbscan_minsamples}
+    elif clustering_algorithm == "OPTICS":
+        hyperparameters = {"max_eps": distance_to_degrees(dbscan_eps), "min_samples": 2, "metric": "euclidean", "cluster_method": "dbscan"}
+    else:
+        raise ValueError(f"config[PHONE_LOCATIONS][DORYAB][CLUSTERING_ALGORITHM] only accepts DBSCAN or OPTICS but you provided {clustering_algorithm}")
+
     if location_data.empty:
         location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     else:
         if cluster_on == "PARTICIPANT_DATASET":
-            location_data = cluster_and_label(location_data, eps=distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)
+            location_data = cluster_and_label(location_data, clustering_algorithm, threshold_static, **hyperparameters)
             location_data = filter_data_by_segment(location_data, time_segment)
         elif cluster_on == "TIME_SEGMENT":
             location_data = filter_data_by_segment(location_data, time_segment)
-            location_data = cluster_and_label(location_data, eps=distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)
+            location_data = cluster_and_label(location_data, clustering_algorithm, threshold_static, **hyperparameters)
         else:
-            raise ValueError("Incorrect Clustering technique in Config")
+            raise ValueError(f"config[PHONE_LOCATIONS][DORYAB][CLUSTER_ON] only accepts PARTICIPANT_DATASET or TIME_SEGMENT but you provided {cluster_on}")

         if location_data.empty:
             location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
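For context, here is what the two hyperparameter dictionaries above expand to when splatted into scikit-learn's clusterers; a self-contained sketch with assumed values (`DBSCAN_EPS` = 100 meters, `DBSCAN_MINSAMPLES` = 5):

    from sklearn.cluster import DBSCAN, OPTICS

    # Assumed values; distance_to_degrees(100) ~= 0.0009 degrees.
    eps_degrees = 100 / 1852 / 60

    dbscan = DBSCAN(**{"eps": eps_degrees, "min_samples": 5})
    optics = OPTICS(**{"max_eps": eps_degrees, "min_samples": 2,
                       "metric": "euclidean", "cluster_method": "dbscan"})

With `cluster_method="dbscan"`, OPTICS extracts DBSCAN-style clusters from its reachability ordering, which is why the docs describe its results as similar but not identical to DBSCAN's.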
@@ -237,7 +244,7 @@ def circadian_movement(locationData):
     energy_latitude, energy_longitude = circadian_movement_energies(locationData)
     return np.log10(energy_latitude + energy_longitude)

-def cluster_and_label(df, **kwargs):
+def cluster_and_label(df, clustering_algorithm, threshold_static, **kwargs):
     """
     :param df: a df with columns "latitude", "longitude", and "datetime"
@@ -252,16 +259,18 @@ def cluster_and_label(df, **kwargs):
     if not isinstance(df.index, pd.DatetimeIndex):
         location_data = df.set_index("local_date_time")

-    stationary = mark_moving(location_data, 1)
+    stationary = mark_moving(location_data, threshold_static)

-    # return degrees(arcminutes=nautical(meters=d))
-    # nautical miles = m ÷ 1,852
-    clusterer = DBSCAN(**kwargs)

     counts_df = stationary[["double_latitude", "double_longitude"]].groupby(["double_latitude", "double_longitude"]).size().reset_index()
     counts = counts_df[0]
     lat_lon = counts_df[["double_latitude", "double_longitude"]].values
-    cluster_results = clusterer.fit_predict(lat_lon, sample_weight=counts)
+
+    if clustering_algorithm == "DBSCAN":
+        clusterer = DBSCAN(**kwargs)
+        cluster_results = clusterer.fit_predict(lat_lon, sample_weight=counts)
+    else:
+        clusterer = OPTICS(**kwargs)
+        cluster_results = clusterer.fit_predict(lat_lon)

     # Need to extend labels back to original df without weights
     counts_df["location_label"] = cluster_results
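A toy illustration (hypothetical coordinates) of the difference between the two branches above: DBSCAN accepts the per-coordinate visit counts as `sample_weight`, while scikit-learn's `OPTICS.fit` takes no sample weights, so OPTICS clusters the de-duplicated points unweighted, presumably why the code pins `min_samples` to 2 for OPTICS:

    import numpy as np
    from sklearn.cluster import DBSCAN, OPTICS

    lat_lon = np.array([[40.4443, -79.9436],   # two nearby fixes
                        [40.4444, -79.9437],
                        [40.5000, -80.0000]])  # one far-away fix
    counts = np.array([10, 5, 1])              # visits per unique coordinate
    eps_degrees = 0.0009                       # ~100 m

    dbscan_labels = DBSCAN(eps=eps_degrees, min_samples=5).fit_predict(lat_lon, sample_weight=counts)
    optics_labels = OPTICS(max_eps=eps_degrees, min_samples=2, metric="euclidean",
                           cluster_method="dbscan").fit_predict(lat_lon)
    print(dbscan_labels, optics_labels)        # noise points are labeled -1

Here both algorithms label the two nearby fixes as cluster 0 and the far-away fix as noise (-1); with skewed visit counts the weighted and unweighted results can diverge.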
@@ -302,7 +311,7 @@ def rank_count_map(clusters):
     return lambda x: label_to_rank.get(x, -1)

-def mark_moving(df, v):
+def mark_moving(df, threshold_static):
     if not df.index.is_monotonic:
         df = df.sort_index()
@@ -310,7 +319,7 @@ def mark_moving(df, v):
     distance = haversine(df.double_longitude, df.double_latitude, df.double_longitude.shift(-1), df.double_latitude.shift(-1)) / 1000
     time = (df.timestamp.diff(-1) * -1) / (1000 * 60 * 60)

-    df['stationary_or_not'] = np.where((distance / time) < v, 1, 0)  # 1 for stationary, 0 for moving
+    df['stationary_or_not'] = np.where((distance / time) < threshold_static, 1, 0)  # 1 for stationary, 0 for moving

     return df
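To make the stationary test concrete, a worked example with assumed numbers (not from the repo): two consecutive fixes 50 m apart with a 60 s timestamp gap imply 3 km/h, which exceeds the default `THRESHOLD_STATIC` of 1 km/h, so the row is marked as moving:

    distance_km = 0.050                    # haversine(...) / 1000, assumed 50 m step
    time_hours = 60000 / (1000 * 60 * 60)  # 60 s gap in ms, converted to hours
    speed_kmh = distance_km / time_hours   # 3.0 km/h
    print(1 if speed_kmh < 1 else 0)       # 0 -> moving (threshold_static = 1 km/h)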