diff --git a/config.yaml b/config.yaml index e47c2363..b1f8cd99 100644 --- a/config.yaml +++ b/config.yaml @@ -243,7 +243,8 @@ PHONE_LOCATIONS: DBSCAN_EPS: 10 # meters DBSCAN_MINSAMPLES: 5 THRESHOLD_STATIC : 1 # km/h - MAXIMUM_GAP_ALLOWED: 300 + MAXIMUM_ROW_GAP: 300 + MAXIMUM_ROW_DURATION: 60 MINUTES_DATA_USED: False CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS diff --git a/docs/features/phone-locations.md b/docs/features/phone-locations.md index 8445f050..76542263 100644 --- a/docs/features/phone-locations.md +++ b/docs/features/phone-locations.md @@ -104,7 +104,8 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`: | `[DBSCAN_EPS]` | The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. | `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself. | `[THRESHOLD_STATIC]` | It is the threshold value in km/hr which labels a row as Static or Moving. -| `[MAXIMUM_GAP_ALLOWED]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the the phone was not sensing. +| `[MAXIMUM_ROW_GAP]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the phone was not sensing. 
+| `[MAXIMUM_ROW_DURATION]` | The time difference (in seconds) between any two consecutive rows `A` and `B` is considered the time a participant spent at `A`. If this difference is greater than or equal to `[MAXIMUM_ROW_GAP]`, it is replaced with `[MAXIMUM_ROW_DURATION]` (in seconds). | `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough. | `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations. | `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings). 
diff --git a/src/features/phone_locations/doryab/main.py b/src/features/phone_locations/doryab/main.py index 0b9f5bfe..bb850d25 100644 --- a/src/features/phone_locations/doryab/main.py +++ b/src/features/phone_locations/doryab/main.py @@ -13,7 +13,8 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se dbscan_eps = provider["DBSCAN_EPS"] dbscan_minsamples = provider["DBSCAN_MINSAMPLES"] threshold_static = provider["THRESHOLD_STATIC"] - maximum_gap_allowed = provider["MAXIMUM_GAP_ALLOWED"] + maximum_gap_allowed = provider["MAXIMUM_ROW_GAP"] + maximum_row_duration = provider["MAXIMUM_ROW_DURATION"] cluster_on = provider["CLUSTER_ON"] clustering_algorithm = provider["CLUSTERING_ALGORITHM"] @@ -116,7 +117,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se preComputedTimeArray = pd.DataFrame() for localDate in stationaryLocations["local_segment"].unique(): - top1,top2,top3,smax,smin,sstd,smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"]==localDate]) + top1,top2,top3,smax,smin,sstd,smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"]==localDate],maximum_gap_allowed,maximum_row_duration) preComputedTimeArray.loc[localDate,"timeattop1"] = top1 preComputedTimeArray.loc[localDate,"timeattop2"] = top2 preComputedTimeArray.loc[localDate,"timeattop3"] = top3 @@ -173,12 +174,12 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se return location_features -def len_stay_timeattopn(locationData): +def len_stay_timeattopn(locationData,maximum_gap_allowed,maximum_row_duration): if locationData is None or len(locationData) == 0: return (None, None, None,None, None, None, None) calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']].copy() - calculationDf.loc[calculationDf.timeInSeconds >= 300,'timeInSeconds'] = 60 + calculationDf.loc[calculationDf.timeInSeconds >= 
maximum_gap_allowed,'timeInSeconds'] = maximum_row_duration timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60 if len(timeArray) > 2: