Fixing and adding MAXIMUM_ROW_DURATION.
parent
e7fc8f44f2
commit
9b248c449d
|
@ -243,7 +243,8 @@ PHONE_LOCATIONS:
|
||||||
DBSCAN_EPS: 10 # meters
|
DBSCAN_EPS: 10 # meters
|
||||||
DBSCAN_MINSAMPLES: 5
|
DBSCAN_MINSAMPLES: 5
|
||||||
THRESHOLD_STATIC : 1 # km/h
|
THRESHOLD_STATIC : 1 # km/h
|
||||||
MAXIMUM_GAP_ALLOWED: 300
|
MAXIMUM_ROW_GAP: 300
|
||||||
|
MAXIMUM_ROW_DURATION: 60
|
||||||
MINUTES_DATA_USED: False
|
MINUTES_DATA_USED: False
|
||||||
CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT
|
CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT
|
||||||
CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS
|
CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS
|
||||||
|
|
|
@ -104,7 +104,8 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
|
||||||
| `[DBSCAN_EPS]` | The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
|
| `[DBSCAN_EPS]` | The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
|
||||||
| `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself.
|
| `[DBSCAN_MINSAMPLES]` | The number of samples (or total weight) in a neighborhood for a point to be considered as a core point of a cluster. This includes the point itself.
|
||||||
| `[THRESHOLD_STATIC]` | It is the threshold value in km/hr which labels a row as Static or Moving.
|
| `[THRESHOLD_STATIC]` | It is the threshold value in km/hr which labels a row as Static or Moving.
|
||||||
| `[MAXIMUM_GAP_ALLOWED]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the phone was not sensing.
|
| `[MAXIMUM_ROW_GAP]` | The maximum gap (in seconds) allowed between any two consecutive rows for them to be considered part of the same displacement. If this threshold is too high, it can throw speed and distance calculations off for periods when the phone was not sensing.
|
||||||
|
| `[MAXIMUM_ROW_DURATION]` | The time difference between any two consecutive rows `A` and `B` is considered as the time a participant spent in `A`. If this difference is greater than or equal to `[MAXIMUM_ROW_GAP]`, it is replaced with `[MAXIMUM_ROW_DURATION]` (both values are in seconds).
|
||||||
| `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
|
| `[MINUTES_DATA_USED]` | Set to `True` to include an extra column in the final location feature file containing the number of minutes used to compute the features on each time segment. Use this for quality control purposes, the more data minutes exist for a period, the more reliable its features should be. For fused location, a single minute can contain more than one coordinate pair if the participant is moving fast enough.
|
||||||
| `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations.
|
| `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations.
|
||||||
| `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings).
|
| `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings).
|
||||||
|
|
|
@ -13,7 +13,8 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
|
||||||
dbscan_eps = provider["DBSCAN_EPS"]
|
dbscan_eps = provider["DBSCAN_EPS"]
|
||||||
dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
|
dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
|
||||||
threshold_static = provider["THRESHOLD_STATIC"]
|
threshold_static = provider["THRESHOLD_STATIC"]
|
||||||
maximum_gap_allowed = provider["MAXIMUM_GAP_ALLOWED"]
|
maximum_gap_allowed = provider["MAXIMUM_ROW_GAP"]
|
||||||
|
maximum_row_duration = provider["MAXIMUM_ROW_DURATION"]
|
||||||
cluster_on = provider["CLUSTER_ON"]
|
cluster_on = provider["CLUSTER_ON"]
|
||||||
clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
|
clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
|
||||||
|
|
||||||
|
@ -116,7 +117,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
|
||||||
|
|
||||||
preComputedTimeArray = pd.DataFrame()
|
preComputedTimeArray = pd.DataFrame()
|
||||||
for localDate in stationaryLocations["local_segment"].unique():
|
for localDate in stationaryLocations["local_segment"].unique():
|
||||||
top1,top2,top3,smax,smin,sstd,smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"]==localDate])
|
top1,top2,top3,smax,smin,sstd,smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"]==localDate],maximum_gap_allowed,maximum_row_duration)
|
||||||
preComputedTimeArray.loc[localDate,"timeattop1"] = top1
|
preComputedTimeArray.loc[localDate,"timeattop1"] = top1
|
||||||
preComputedTimeArray.loc[localDate,"timeattop2"] = top2
|
preComputedTimeArray.loc[localDate,"timeattop2"] = top2
|
||||||
preComputedTimeArray.loc[localDate,"timeattop3"] = top3
|
preComputedTimeArray.loc[localDate,"timeattop3"] = top3
|
||||||
|
@ -173,12 +174,12 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
|
||||||
|
|
||||||
return location_features
|
return location_features
|
||||||
|
|
||||||
def len_stay_timeattopn(locationData):
|
def len_stay_timeattopn(locationData,maximum_gap_allowed,maximum_row_duration):
|
||||||
if locationData is None or len(locationData) == 0:
|
if locationData is None or len(locationData) == 0:
|
||||||
return (None, None, None,None, None, None, None)
|
return (None, None, None,None, None, None, None)
|
||||||
|
|
||||||
calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']].copy()
|
calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']].copy()
|
||||||
calculationDf.loc[calculationDf.timeInSeconds >= 300,'timeInSeconds'] = 60
|
calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed,'timeInSeconds'] = maximum_row_duration
|
||||||
timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60
|
timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60
|
||||||
|
|
||||||
if len(timeArray) > 2:
|
if len(timeArray) > 2:
|
||||||
|
|
Loading…
Reference in New Issue