Removing Sampling Frequency and fixing ROG, location entropy and normalized location entropy.

pull/114/head
nikunjgoel95 2021-02-01 20:57:08 -05:00
parent cc2127e72d
commit e7fc8f44f2
3 changed files with 13 additions and 21 deletions

@@ -245,8 +245,7 @@ PHONE_LOCATIONS:
       THRESHOLD_STATIC : 1 # km/h
       MAXIMUM_GAP_ALLOWED: 300
       MINUTES_DATA_USED: False
-      SAMPLING_FREQUENCY: 0
-      CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT
+      CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT
       CLUSTERING_ALGORITHM: DBSCAN # DBSCAN,OPTICS
       SRC_FOLDER: "doryab" # inside src/features/phone_locations
       SRC_LANGUAGE: "python"
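Why the knob can go: each location fix now carries its own timeInSeconds gap to the next fix (presumably bounded by MAXIMUM_GAP_ALLOWED), so a single global rate is redundant. A rough sketch of the two duration estimates, on hypothetical data (not RAPIDS code):

import pandas as pd

# Hypothetical, unevenly sampled stream of location fixes (timestamps in ms).
fixes = pd.DataFrame({"timestamp": [0, 60_000, 300_000, 360_000]})

# Old estimate: one global rate (median gap, in minutes) applied to every row,
# mirroring the removed getSamplingFrequency helper.
sampling_frequency = (fixes.timestamp.diff() / (1000 * 60)).median()
minutes_old = len(fixes) * sampling_frequency      # 4 rows * 1 min = 4.0

# New estimate: each row carries its own gap to the next fix (timeInSeconds).
time_in_seconds = fixes.timestamp.diff().shift(-1).fillna(0) / 1000
minutes_new = time_in_seconds.sum() / 60           # (60 + 240 + 60) / 60 = 6.0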

@@ -127,8 +127,8 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 |timeattop1location |minutes |Time spent at the most significant location.
 |timeattop2location |minutes |Time spent at the 2nd most significant location.
 |timeattop3location |minutes |Time spent at the 3rd most significant location.
-|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labelled as stationary if its speed (distance/time) to the next coordinate pair is less than 1 km/hr. A higher value represents a more stationary routine. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]`.
-|outlierstimepercent | - | Ratio of the time spent in non-significant clusters to the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]`.
+|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labelled as stationary if its speed (distance/time) to the next coordinate pair is less than 1 km/hr. A higher value represents a more stationary routine. These times are computed using the `timeInSeconds` feature.
+|outlierstimepercent | - | Ratio of the time spent in non-significant clusters to the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. These times are computed using the `timeInSeconds` feature.
 |maxlengthstayatclusters |minutes |Maximum time spent in a cluster (significant location).
 |minlengthstayatclusters |minutes |Minimum time spent in a cluster (significant location).
 |meanlengthstayatclusters |minutes |Average time spent in a cluster (significant location).
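Both rewritten rows describe ratios that are now sums of timeInSeconds rather than row counts multiplied by `[SAMPLING_FREQUENCY]`. A minimal sketch, assuming rows with location_label below 1 are outliers and using a hypothetical is_stationary flag for the speed test:

import pandas as pd

# Hypothetical rows: location_label < 1 marks outliers / cluster noise,
# is_stationary marks fixes moving slower than THRESHOLD_STATIC.
rows = pd.DataFrame({
    "location_label": [1, 1, 2, -1],
    "is_stationary":  [True, True, False, True],
    "timeInSeconds":  [120, 60, 300, 30],
})

total_time = rows["timeInSeconds"].sum()
movingtostaticratio = rows.loc[rows["is_stationary"], "timeInSeconds"].sum() / total_time
outlierstimepercent = rows.loc[rows["location_label"] < 1, "timeInSeconds"].sum() / total_time
print(movingtostaticratio, outlierstimepercent)  # ~0.41 and ~0.06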

@@ -14,7 +14,6 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment
     dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
     threshold_static = provider["THRESHOLD_STATIC"]
     maximum_gap_allowed = provider["MAXIMUM_GAP_ALLOWED"]
-    sampling_frequency = provider["SAMPLING_FREQUENCY"]
     cluster_on = provider["CLUSTER_ON"]
     clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
@@ -56,9 +55,6 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment
     else:
         location_features = pd.DataFrame()
-        if sampling_frequency == 0:
-            sampling_frequency = getSamplingFrequency(location_data)
         if "minutesdataused" in features_to_compute:
             for localDate in location_data["local_segment"].unique():
                 location_features.loc[localDate,"minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate])
@@ -116,7 +112,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment
     if "radiusgyration" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
-            location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)
+            location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment']==localDate])

     preComputedTimeArray = pd.DataFrame()
     for localDate in stationaryLocations["local_segment"].unique():
@@ -181,8 +177,8 @@ def len_stay_timeattopn(locationData):
     if locationData is None or len(locationData) == 0:
         return (None, None, None, None, None, None, None)
-    calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']]
-    calculationDf[calculationDf['timeInSeconds'] >= 300]['timeInSeconds'] = 60
+    calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']].copy()
+    calculationDf.loc[calculationDf.timeInSeconds >= 300,'timeInSeconds'] = 60
     timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60

     if len(timeArray) > 2:
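Besides dropping the sampling rate, this hunk fixes a pandas chained-assignment bug: the old second line filtered first, which returns a copy, so the write to timeInSeconds was silently lost. The new .copy() on the first line also makes the later .loc write unambiguous. A standalone demonstration:

import pandas as pd

df = pd.DataFrame({"timeInSeconds": [100, 400, 600]})

# Chained assignment: the boolean filter materializes a copy, so this
# write is discarded (pandas typically emits a SettingWithCopyWarning).
df[df["timeInSeconds"] >= 300]["timeInSeconds"] = 60
print(df["timeInSeconds"].tolist())  # [100, 400, 600] -- unchanged

# .loc with a row mask writes through to the original frame.
df.loc[df["timeInSeconds"] >= 300, "timeInSeconds"] = 60
print(df["timeInSeconds"].tolist())  # [100, 60, 60]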
@@ -360,7 +356,7 @@ def number_location_transitions(locationData):
     return df[df['boolCol'] == False].shape[0] - 1

-def radius_of_gyration(locationData,sampling_frequency):
+def radius_of_gyration(locationData):
     if locationData is None or len(locationData) == 0:
         return None
     # Center is the centroid, not the home location
@@ -373,10 +369,10 @@ def radius_of_gyration(locationData,sampling_frequency):
         distance = haversine(clusters_centroid.loc[labels].double_longitude,clusters_centroid.loc[labels].double_latitude,
                              centroid_all_clusters.double_longitude,centroid_all_clusters.double_latitude) ** 2
-        time_in_cluster = locationData[locationData["location_label"]==labels].shape[0]* sampling_frequency
+        time_in_cluster = locationData[locationData["location_label"]==labels]['timeInSeconds'].sum()
         rog = rog + (time_in_cluster * distance)
-    time_all_clusters = valid_clusters.shape[0] * sampling_frequency
+    time_all_clusters = valid_clusters['timeInSeconds'].sum()
     if time_all_clusters == 0:
         return 0
     final_rog = (1/time_all_clusters) * rog
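Net effect: the radius of gyration is now weighted by the actual seconds spent in each cluster instead of row counts multiplied by an assumed rate. A compact sketch of the quantity being computed, with hypothetical distances and times:

import numpy as np

def rog_sketch(distances_km, times_sec):
    # distances_km[i]: distance from cluster i's centroid to the centroid
    # of all valid clusters; times_sec[i]: summed timeInSeconds in cluster i.
    t, d = np.asarray(times_sec, float), np.asarray(distances_km, float)
    if t.sum() == 0:
        return 0.0
    return np.sqrt((t * d ** 2).sum() / t.sum())

# 30 min near home (1 km out) vs 10 min at a far cluster (5 km out).
print(rog_sketch([1.0, 5.0], [1800, 600]))  # ~2.65 km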
@@ -400,7 +396,7 @@ def location_entropy(locationData):
     clusters = locationData[locationData["location_label"] >= 1] # remove outliers / cluster noise
     if len(clusters) > 0:
         # Get percentages for each location
-        percents = clusters["location_label"].value_counts(normalize=True)
+        percents = clusters.groupby(['location_label'])['timeInSeconds'].sum() / clusters['timeInSeconds'].sum()
         entropy = -1 * percents.map(lambda x: x * np.log(x)).sum()
         return entropy
     else:
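The entropy fix replaces row-count shares (value_counts(normalize=True)) with time shares, so a cluster that merely gets sampled often no longer dominates the distribution. A sketch contrasting the two on hypothetical data:

import numpy as np
import pandas as pd

clusters = pd.DataFrame({
    "location_label": [1, 1, 1, 2],   # three short fixes vs one long stay
    "timeInSeconds":  [30, 30, 30, 900],
})

by_rows = clusters["location_label"].value_counts(normalize=True)
by_time = clusters.groupby("location_label")["timeInSeconds"].sum() / clusters["timeInSeconds"].sum()

entropy_rows = -(by_rows * np.log(by_rows)).sum()
entropy_time = -(by_time * np.log(by_time)).sum()
print(entropy_rows, entropy_time)  # ~0.562 vs ~0.306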
@@ -416,10 +412,7 @@ def location_entropy_normalized(locationData):
     num_clusters = len(unique_clusters)
     if num_clusters == 0 or len(locationData) == 0 or entropy is None:
         return None
-    elif np.log(num_clusters)==0:
-        return None
-    else:
-        return entropy / num_clusters
+    return entropy / np.log(num_clusters)
-
-def getSamplingFrequency(locationData):
-    return (locationData.timestamp.diff()/(1000*60)).median()
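Dividing by np.log(num_clusters) instead of num_clusters yields the standard normalized Shannon entropy, which lands in [0, 1] because log(K) is the maximum possible entropy over K clusters. A sketch of the fixed formula (keeping an explicit single-cluster guard, since log(1) = 0 and the commit drops the old one):

import numpy as np

def normalized_entropy_sketch(time_shares):
    # time_shares: per-cluster fractions of total time, summing to 1.
    p = np.asarray(time_shares, float)
    k = len(p)
    if k <= 1:
        return None  # log(1) == 0 would divide by zero
    entropy = -(p * np.log(p)).sum()
    return entropy / np.log(k)

print(normalized_entropy_sketch([0.25, 0.25, 0.5]))  # ~0.946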