Merge branch 'feature/location_doryab_home_location' into develop

commit 0b57b80e54
@@ -212,6 +212,7 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
         files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
         files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
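As a quick illustration of what this new line adds to Snakemake's work list: `expand` fills each wildcard with every value it is given (the pids below are hypothetical):

```python
from snakemake.io import expand  # Snakemake's wildcard-expansion helper

print(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=["p01", "p02"]))
# ['data/interim/p01/phone_locations_processed_with_datetime_with_home.csv',
#  'data/interim/p02/phone_locations_processed_with_datetime_with_home.csv']
```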
config.yaml
@@ -237,10 +237,16 @@ PHONE_LOCATIONS:
     LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED
     FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
     FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
+    HOME_INFERENCE:
+        DBSCAN_EPS: 10 # meters
+        DBSCAN_MINSAMPLES: 5
+        THRESHOLD_STATIC: 1 # km/h
+        CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS
+
     PROVIDERS:
         DORYAB:
             COMPUTE: False
-            FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
+            FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"]
             ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
             DBSCAN_EPS: 10 # meters
             DBSCAN_MINSAMPLES: 5
@@ -249,7 +255,8 @@ PHONE_LOCATIONS:
             MAXIMUM_ROW_DURATION: 60
             MINUTES_DATA_USED: False
             CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET, TIME_SEGMENT
             CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS
+            RADIUS_FOR_HOME: 100
             SRC_FOLDER: "doryab" # inside src/features/phone_locations
             SRC_LANGUAGE: "python"

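The new script introduced later in this diff (`src/data/infer_home_location.py`) turns these `HOME_INFERENCE` values into sklearn hyperparameters. As a rough sketch of that mapping (`make_clusterer` is an illustrative name, not a RAPIDS function; see the full script below for the authoritative version):

```python
from sklearn.cluster import DBSCAN, OPTICS

def make_clusterer(clustering_algorithm, dbscan_eps, dbscan_minsamples):
    eps_degrees = dbscan_eps / 1852 / 60  # meters -> approximate degrees, as in distance_to_degrees()
    if clustering_algorithm == "DBSCAN":
        return DBSCAN(eps=eps_degrees, min_samples=dbscan_minsamples)
    # OPTICS is run in its dbscan cluster_method with a fixed min_samples of 2
    return OPTICS(max_eps=eps_degrees, min_samples=2, metric="euclidean", cluster_method="dbscan")
```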
@@ -5,6 +5,7 @@
 - Add logo
 - Move Citation page to the Setup section
 - Add `config.yaml` validation schema and documentation.
+- Add time at home Doryab location feature and home coordinates to location file
 ## v0.4.3
 - Fix bug when any of the rows from any sensor do not belong to a time segment
 ## v0.4.2
@@ -89,6 +89,7 @@ These features are based on the original implementation by [Doryab et al.](../..
 - data/raw/{pid}/phone_locations_raw.csv
 - data/interim/{pid}/phone_locations_processed.csv
 - data/interim/{pid}/phone_locations_processed_with_datetime.csv
+- data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv
 - data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv
 - data/processed/features/{pid}/phone_locations.csv
 ```
@@ -110,6 +111,7 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 | `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations. |
 | `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings). |
 | `[CLUSTERING_ALGORITHM]` | The original Doryab et al. implementation uses `DBSCAN`; `OPTICS` is also available, with similar (but not identical) clustering results and lower memory consumption. |
+| `[RADIUS_FOR_HOME]` | All location coordinates within this distance (meters) from the home location coordinates are considered a home stay (see the `timeathome` feature). |


 Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
@@ -136,6 +138,7 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 |stdlengthstayatclusters |minutes |Standard deviation of time spent in a cluster (significant location). |
 |locationentropy |nats |Shannon entropy computed over the row count of each cluster (significant location); it is higher the more rows belong to a cluster (i.e., the more time a participant spent at a significant location). |
 |normalizedlocationentropy |nats |Shannon entropy computed over the row count of each cluster (significant location) divided by the number of clusters; it is higher the more rows belong to a cluster (i.e., the more time a participant spent at a significant location). |
+|timeathome |minutes |Time spent at home (see Observations below for a description of how we compute home). |


 !!! note "Assumptions/Observations"
@@ -150,3 +153,6 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:

     **Duration Calculation**
     To calculate the time duration component for our features, we compute the difference between the timestamps of consecutive rows to take into account sampling rate variability. If this time difference is larger than a threshold (300 seconds by default), we replace it with a maximum duration (60 seconds by default, i.e. we assume a participant spent at least 60 seconds in their last known location).
+
+    **Home location**
+    Home is calculated by taking all of a participant's location data logged between 12 am and 6 am, applying a clustering algorithm (`DBSCAN` or `OPTICS`), and using the center of the biggest cluster as that participant's home coordinates.
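The duration capping described above can be sketched in a few lines of pandas (the timestamps below are hypothetical toy data; RAPIDS reads both thresholds from the provider config):

```python
import pandas as pd

MAXIMUM_GAP_ALLOWED = 300  # seconds; gaps at least this long are treated as missing data
MAXIMUM_ROW_DURATION = 60  # seconds; the stand-in duration for such gaps

timestamps = pd.Series([0, 45, 100, 1000, 1030])  # toy timestamps in seconds
duration = timestamps.diff().shift(-1)            # time until the next row
duration.loc[duration >= MAXIMUM_GAP_ALLOWED] = MAXIMUM_ROW_DURATION
print(duration.tolist())  # [45.0, 55.0, 60.0, 30.0, nan] -- the 900 s gap was capped at 60 s
```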
@@ -368,7 +368,7 @@ rule phone_light_r_features:

 rule phone_locations_python_features:
     input:
-        sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv",
+        sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv",
         time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
     params:
         provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()],
@@ -139,6 +139,19 @@ rule phone_locations_processed_with_datetime:
     script:
         "../src/data/readable_datetime.R"

+rule phone_locations_processed_with_datetime_with_home:
+    input:
+        sensor_input = "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
+    params:
+        dbscan_eps = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_EPS"],
+        dbscan_minsamples = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_MINSAMPLES"],
+        threshold_static = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["THRESHOLD_STATIC"],
+        clustering_algorithm = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["CLUSTERING_ALGORITHM"]
+    output:
+        "data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv"
+    script:
+        "../src/data/infer_home_location.py"
+
 rule resample_episodes:
     input:
         "data/interim/{pid}/{sensor}_episodes.csv"
@@ -0,0 +1,137 @@
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN, OPTICS

def filterDatafromDf(origDf):
    return origDf[origDf['local_hour'] <= 6]
def distance_to_degrees(d):
    # Just an approximation, but it speeds up clustering by a huge amount and
    # doesn't introduce much error over small distances
    d = d / 1852
    d = d / 60
    return d
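# Illustrative sanity check: distance_to_degrees(10) = 10 / 1852 / 60 ≈ 9.0e-05 degrees,
# consistent with 1 degree of latitude being about 60 nautical miles (~111 km).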
def cluster_and_label(df, clustering_algorithm, threshold_static, **kwargs):
    """
    :param df: a df with columns "double_latitude", "double_longitude", and "local_date_time"
               or a df with columns "double_latitude", "double_longitude" and a datetime index
    :param kwargs: arguments for sklearn's DBSCAN
    :return: a new df of labeled locations with moving points removed, where the cluster
             labeled "1" is the largest, "2" the second largest, and so on
    """
    if not df.empty:
        location_data = df
        if not isinstance(df.index, pd.DatetimeIndex):
            location_data = df.set_index("local_date_time")

        stationary = mark_moving(location_data, threshold_static)

        counts_df = stationary[["double_latitude", "double_longitude"]].groupby(["double_latitude", "double_longitude"]).size().reset_index()
        counts = counts_df[0]
        lat_lon = counts_df[["double_latitude", "double_longitude"]].values

        if clustering_algorithm == "DBSCAN":
            clusterer = DBSCAN(**kwargs)
            cluster_results = clusterer.fit_predict(lat_lon, sample_weight=counts)
        else:
            clusterer = OPTICS(**kwargs)
            cluster_results = clusterer.fit_predict(lat_lon)

        # Need to extend labels back to original df without weights
        counts_df["location_label"] = cluster_results
        # remove the old count column
        del counts_df[0]

        merged = pd.merge(stationary, counts_df, on=["double_latitude", "double_longitude"])

        # Now compute the label mapping:
        cluster_results = merged["location_label"].values
        valid_clusters = cluster_results[np.where(cluster_results != -1)]
        label_map = rank_count_map(valid_clusters)

        # And remap the labels:
        merged.index = stationary.index
        stationary = stationary.assign(location_label=merged["location_label"].map(label_map).values)
        return stationary
    else:
        return df
def rank_count_map(clusters):
    """ Returns a function that maps each cluster label to its rank,
    such that the most common label maps to 1.

    Used in this context to sort the cluster labels so that the cluster
    with rank 1 is the most visited.

    Labels that cannot be found in the map are assigned -1.
    """
    labels, counts = tuple(np.unique(clusters, return_counts=True))
    sorted_by_count = [x for (y, x) in sorted(zip(counts, labels), reverse=True)]
    label_to_rank = {label: rank + 1 for (label, rank) in [(sorted_by_count[i], i) for i in range(len(sorted_by_count))]}
    return lambda x: label_to_rank.get(x, -1)
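# Illustrative example: rank_count_map(np.array([7, 7, 7, 3, 3, 5])) yields a mapper
# where 7 -> 1 (most visited), 3 -> 2, 5 -> 3, and any unseen label -> -1.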
def mark_moving(df, threshold_static):
    if not df.index.is_monotonic:
        df = df.sort_index()

    distance = haversine(df.double_longitude, df.double_latitude, df.double_longitude.shift(-1), df.double_latitude.shift(-1)) / 1000
    time = (df.timestamp.diff(-1) * -1) / (1000 * 60 * 60)

    df['stationary_or_not'] = np.where((distance / time) < threshold_static, 1, 0)  # 1 for stationary, 0 for moving

    return df
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance (in meters) between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2])

    # haversine formula
    a = np.sin((lat2 - lat1) / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0)**2

    r = 6371  # Radius of earth in kilometers. Use 3956 for miles
    return r * 2 * np.arcsin(np.sqrt(a)) * 1000
# Infer a participant's home location

origDf = pd.read_csv(snakemake.input[0])
filteredDf = filterDatafromDf(origDf)
dbscan_eps = snakemake.params["dbscan_eps"]
dbscan_minsamples = snakemake.params["dbscan_minsamples"]
threshold_static = snakemake.params["threshold_static"]
clustering_algorithm = snakemake.params["clustering_algorithm"]

if clustering_algorithm == "DBSCAN":
    hyperparameters = {'eps': distance_to_degrees(dbscan_eps), 'min_samples': dbscan_minsamples}
elif clustering_algorithm == "OPTICS":
    hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric': 'euclidean', 'cluster_method': 'dbscan'}
else:
    raise ValueError("config[PHONE_LOCATIONS][HOME_INFERENCE][CLUSTERING_ALGORITHM] only accepts DBSCAN or OPTICS but you provided ", clustering_algorithm)

filteredDf = cluster_and_label(filteredDf, clustering_algorithm, threshold_static, **hyperparameters)

origDf['home_latitude'] = filteredDf[filteredDf['location_label'] == 1][['double_latitude', 'double_longitude']].mean()['double_latitude']
origDf['home_longitude'] = filteredDf[filteredDf['location_label'] == 1][['double_latitude', 'double_longitude']].mean()['double_longitude']

distanceFromHome = haversine(origDf.double_longitude, origDf.double_latitude, origDf.home_longitude, origDf.home_latitude)

finalDf = origDf.drop(['home_latitude', 'home_longitude'], axis=1)
finalDf.insert(len(finalDf.columns) - 1, 'distancefromhome', distanceFromHome)
finalDf.to_csv(snakemake.output[0], index=False)
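For context: when Snakemake runs a file through a rule's `script:` directive (as `phone_locations_processed_with_datetime_with_home` does above), it injects a global `snakemake` object exposing the rule's `input`, `params`, and `output`, which is why the script can read them without importing anything. A minimal illustration (values are examples, not RAPIDS output):

```python
# Available as globals inside a script executed via `script:`
sensor_file = snakemake.input[0]             # e.g. ".../phone_locations_processed_with_datetime.csv"
eps_meters = snakemake.params["dbscan_eps"]  # from config.yaml's HOME_INFERENCE section
out_path = snakemake.output[0]
```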
@@ -17,13 +17,14 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     maximum_row_duration = provider["MAXIMUM_ROW_DURATION"]
     cluster_on = provider["CLUSTER_ON"]
     clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
+    radius_from_home = provider["RADIUS_FOR_HOME"]

     minutes_data_used = provider["MINUTES_DATA_USED"]
     if(minutes_data_used):
         requested_features.append("minutesdataused")

     # name of the features this function can compute
-    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused"]
+    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused","timeathome"]
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))

@@ -170,6 +171,11 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     for localDate in stationaryLocations['local_segment'].unique():
         location_features.loc[localDate,"normalizedlocationentropy"] = location_entropy_normalized(stationaryLocations[stationaryLocations['local_segment']==localDate])

+    if "timeathome" in features_to_compute:
+        calculationDf = stationaryLocations[['local_segment','distancefromhome','timeInSeconds']].copy()
+        calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed,'timeInSeconds'] = maximum_row_duration
+        location_features["timeathome"] = calculationDf[calculationDf["distancefromhome"] <= radius_from_home].groupby("local_segment")["timeInSeconds"].sum()/60
+
     location_features = location_features.reset_index()

     return location_features
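To make the `timeathome` aggregation above concrete, here is a toy example (all column values are hypothetical):

```python
import pandas as pd

calculationDf = pd.DataFrame({
    'local_segment': ['2020-05-01', '2020-05-01', '2020-05-01'],
    'distancefromhome': [20.0, 500.0, 80.0],  # meters from the inferred home
    'timeInSeconds': [60, 120, 60]})
radius_from_home = 100  # meters, the [RADIUS_FOR_HOME] parameter

timeathome = calculationDf[calculationDf["distancefromhome"] <= radius_from_home].groupby("local_segment")["timeInSeconds"].sum() / 60
print(timeathome)  # 2020-05-01 -> 2.0 minutes; two of the three rows fall within 100 m of home
```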
@@ -598,6 +598,22 @@ properties:
         FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION:
           type: integer
           exclusiveMinimum: 0
+        HOME_INFERENCE:
+          type: object
+          required: [DBSCAN_EPS, DBSCAN_MINSAMPLES, THRESHOLD_STATIC, CLUSTERING_ALGORITHM]
+          properties:
+            DBSCAN_EPS:
+              type: integer
+              exclusiveMinimum: 0
+            DBSCAN_MINSAMPLES:
+              type: integer
+              exclusiveMinimum: 0
+            THRESHOLD_STATIC:
+              type: integer
+              exclusiveMinimum: 0
+            CLUSTERING_ALGORITHM:
+              type: string
+              enum: ["DBSCAN", "OPTICS"]
         PROVIDERS:
           type: ["null", object]
           properties:
@@ -610,7 +626,7 @@ properties:
                 uniqueItems: True
                 items:
                   type: string
-                  enum: [locationvariance,loglocationvariance,totaldistance,averagespeed,varspeed,circadianmovement,numberofsignificantplaces,numberlocationtransitions,radiusgyration,timeattop1location,timeattop2location,timeattop3location,movingtostaticratio,outlierstimepercent,maxlengthstayatclusters,minlengthstayatclusters,meanlengthstayatclusters,stdlengthstayatclusters,locationentropy,normalizedlocationentropy]
+                  enum: [locationvariance,loglocationvariance,totaldistance,averagespeed,varspeed,circadianmovement,numberofsignificantplaces,numberlocationtransitions,radiusgyration,timeattop1location,timeattop2location,timeattop3location,movingtostaticratio,outlierstimepercent,maxlengthstayatclusters,minlengthstayatclusters,meanlengthstayatclusters,stdlengthstayatclusters,locationentropy,normalizedlocationentropy,timeathome]
               ACCURACY_LIMIT:
                 type: integer
                 exclusiveMinimum: 0
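A schema like this can be applied to `config.yaml` with the `jsonschema` package. A minimal sketch, assuming the schema lives at a path like `tools/config.schema.yaml` (the file names and wiring here are illustrative; RAPIDS runs its validation through its own script):

```python
import yaml  # PyYAML
from jsonschema import Draft7Validator

with open("tools/config.schema.yaml") as f:  # hypothetical schema path
    schema = yaml.safe_load(f)
with open("config.yaml") as f:
    config = yaml.safe_load(f)

# Report every violation (e.g. a THRESHOLD_STATIC of 0 fails exclusiveMinimum)
for error in Draft7Validator(schema).iter_errors(config):
    print("/".join(map(str, error.absolute_path)), "->", error.message)
```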