Fixing the hard-coded row gap and duration: renaming MAXIMUM_GAP_ALLOWED to MAXIMUM_ROW_GAP and adding MAXIMUM_ROW_DURATION.

pull/114/head
nikunjgoel95 2021-02-02 11:14:23 -05:00
parent e7fc8f44f2
commit 9b248c449d
3 changed files with 9 additions and 6 deletions

View File

@ -243,7 +243,8 @@ PHONE_LOCATIONS:
DBSCAN_EPS: 10 # meters DBSCAN_EPS: 10 # meters
DBSCAN_MINSAMPLES: 5 DBSCAN_MINSAMPLES: 5
THRESHOLD_STATIC : 1 # km/h THRESHOLD_STATIC : 1 # km/h
MAXIMUM_GAP_ALLOWED: 300 MAXIMUM_ROW_GAP: 300
MAXIMUM_ROW_DURATION: 60
MINUTES_DATA_USED: False MINUTES_DATA_USED: False
CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT
CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS

View File

@ -104,7 +104,8 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
| `[DBSCAN_EPS]` | The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. | `[DBSCAN_EPS]` | The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
| `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself. | `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself.
| `[THRESHOLD_STATIC]` | It is the threshold value in km/hr which labels a row as Static or Moving. | `[THRESHOLD_STATIC]` | It is the threshold value in km/hr which labels a row as Static or Moving.
| `[MAXIMUM_GAP_ALLOWED]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the phone was not sensing. | `[MAXIMUM_ROW_GAP]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the phone was not sensing.
| `[MAXIMUM_ROW_DURATION]` | The time difference between any two consecutive rows `A` and `B` is considered as the time a participant spent in `A`. If this difference is greater than or equal to `[MAXIMUM_ROW_GAP]`, it is replaced with `[MAXIMUM_ROW_DURATION]` (in seconds).
| `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough. | `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
| `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations. | `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations.
| `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings). | `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings).

View File

@ -13,7 +13,8 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
dbscan_eps = provider["DBSCAN_EPS"] dbscan_eps = provider["DBSCAN_EPS"]
dbscan_minsamples = provider["DBSCAN_MINSAMPLES"] dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
threshold_static = provider["THRESHOLD_STATIC"] threshold_static = provider["THRESHOLD_STATIC"]
maximum_gap_allowed = provider["MAXIMUM_GAP_ALLOWED"] maximum_gap_allowed = provider["MAXIMUM_ROW_GAP"]
maximum_row_duration = provider["MAXIMUM_ROW_DURATION"]
cluster_on = provider["CLUSTER_ON"] cluster_on = provider["CLUSTER_ON"]
clustering_algorithm = provider["CLUSTERING_ALGORITHM"] clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
@ -116,7 +117,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
preComputedTimeArray = pd.DataFrame() preComputedTimeArray = pd.DataFrame()
for localDate in stationaryLocations["local_segment"].unique(): for localDate in stationaryLocations["local_segment"].unique():
top1,top2,top3,smax,smin,sstd,smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"]==localDate]) top1,top2,top3,smax,smin,sstd,smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"]==localDate],maximum_gap_allowed,maximum_row_duration)
preComputedTimeArray.loc[localDate,"timeattop1"] = top1 preComputedTimeArray.loc[localDate,"timeattop1"] = top1
preComputedTimeArray.loc[localDate,"timeattop2"] = top2 preComputedTimeArray.loc[localDate,"timeattop2"] = top2
preComputedTimeArray.loc[localDate,"timeattop3"] = top3 preComputedTimeArray.loc[localDate,"timeattop3"] = top3
@ -173,12 +174,12 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
return location_features return location_features
def len_stay_timeattopn(locationData): def len_stay_timeattopn(locationData,maximum_gap_allowed,maximum_row_duration):
if locationData is None or len(locationData) == 0: if locationData is None or len(locationData) == 0:
return (None, None, None,None, None, None, None) return (None, None, None,None, None, None, None)
calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']].copy() calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']].copy()
calculationDf.loc[calculationDf.timeInSeconds >= 300,'timeInSeconds'] = 60 calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed,'timeInSeconds'] = maximum_row_duration
timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60 timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60
if len(timeArray) > 2: if len(timeArray) > 2: