Removing SAMPLING_FREQUENCY and fixing radius of gyration (ROG), location entropy, and normalized location entropy.

pull/114/head
nikunjgoel95 2021-02-01 20:57:08 -05:00
parent cc2127e72d
commit e7fc8f44f2
3 changed files with 13 additions and 21 deletions


@@ -245,8 +245,7 @@ PHONE_LOCATIONS:
 THRESHOLD_STATIC : 1 # km/h
 MAXIMUM_GAP_ALLOWED: 300
 MINUTES_DATA_USED: False
-SAMPLING_FREQUENCY: 0
-CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT
+CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT
 CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS
 SRC_FOLDER: "doryab" # inside src/features/phone_locations
 SRC_LANGUAGE: "python"


@@ -127,8 +127,8 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 |timeattop1location |minutes |Time spent at the most significant location.
 |timeattop2location |minutes |Time spent at the 2nd most significant location.
 |timeattop3location |minutes |Time spent at the 3rd most significant location.
-|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labelled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]`
-|outlierstimepercent | - | Ratio between the time spent in non-significant clusters divided by the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]`
+|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labelled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine. These times are computed using the `timeInSeconds` feature.
+|outlierstimepercent | - | Ratio between the time spent in non-significant clusters divided by the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. These times are computed using the `timeInSeconds` feature.
 |maxlengthstayatclusters |minutes |Maximum time spent in a cluster (significant location).
 |minlengthstayatclusters |minutes |Minimum time spent in a cluster (significant location).
 |meanlengthstayatclusters |minutes |Average time spent in a cluster (significant location).
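For readers of the feature docs, a minimal sketch of how these two ratios can be derived once every location row carries a per-row duration. This is illustrative only: `timeInSeconds` and `location_label` are the columns referenced by this commit, while the `speed` column, the default threshold, and the function names are assumptions, not the provider's actual code.

import pandas as pd

def moving_to_static_ratio(location_rows: pd.DataFrame, threshold_static: float = 1.0):
    # Assumed columns: 'speed' in km/h and 'timeInSeconds' as the duration each row covers.
    # A row counts as stationary when its speed to the next coordinate is below the threshold.
    total_time = location_rows["timeInSeconds"].sum()
    if total_time == 0:
        return None
    static_time = location_rows.loc[location_rows["speed"] < threshold_static, "timeInSeconds"].sum()
    return static_time / total_time

def outliers_time_percent(location_rows: pd.DataFrame):
    # Clusters labelled below 1 are noise/non-significant in this provider's labelling.
    total_time = location_rows["timeInSeconds"].sum()
    if total_time == 0:
        return None
    outlier_time = location_rows.loc[location_rows["location_label"] < 1, "timeInSeconds"].sum()
    return outlier_time / total_time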


@@ -14,7 +14,6 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
     threshold_static = provider["THRESHOLD_STATIC"]
     maximum_gap_allowed = provider["MAXIMUM_GAP_ALLOWED"]
-    sampling_frequency = provider["SAMPLING_FREQUENCY"]
     cluster_on = provider["CLUSTER_ON"]
     clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
@@ -56,9 +55,6 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     else:
         location_features = pd.DataFrame()
-        if sampling_frequency == 0:
-            sampling_frequency = getSamplingFrequency(location_data)
         if "minutesdataused" in features_to_compute:
             for localDate in location_data["local_segment"].unique():
                 location_features.loc[localDate,"minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate])
@@ -116,7 +112,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
         if "radiusgyration" in features_to_compute:
             for localDate in stationaryLocations['local_segment'].unique():
-                location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)
+                location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment']==localDate])
         preComputedTimeArray = pd.DataFrame()
         for localDate in stationaryLocations["local_segment"].unique():
@@ -181,8 +177,8 @@ def len_stay_timeattopn(locationData):
     if locationData is None or len(locationData) == 0:
         return (None, None, None,None, None, None, None)
-    calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']]
-    calculationDf[calculationDf['timeInSeconds'] >= 300]['timeInSeconds'] = 60
+    calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']].copy()
+    calculationDf.loc[calculationDf.timeInSeconds >= 300,'timeInSeconds'] = 60
     timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60
     if len(timeArray) > 2:
@@ -360,7 +356,7 @@ def number_location_transitions(locationData):
     return df[df['boolCol'] == False].shape[0] - 1
-def radius_of_gyration(locationData,sampling_frequency):
+def radius_of_gyration(locationData):
     if locationData is None or len(locationData) == 0:
         return None
     # Center is the centroid, not the home location
@@ -373,10 +369,10 @@ def radius_of_gyration(locationData,sampling_frequency):
         distance = haversine(clusters_centroid.loc[labels].double_longitude,clusters_centroid.loc[labels].double_latitude,
                              centroid_all_clusters.double_longitude,centroid_all_clusters.double_latitude) ** 2
-        time_in_cluster = locationData[locationData["location_label"]==labels].shape[0]* sampling_frequency
+        time_in_cluster = locationData[locationData["location_label"]==labels]['timeInSeconds'].sum()
         rog = rog + (time_in_cluster * distance)
-    time_all_clusters = valid_clusters.shape[0] * sampling_frequency
+    time_all_clusters = valid_clusters['timeInSeconds'].sum()
     if time_all_clusters == 0:
         return 0
     final_rog = (1/time_all_clusters) * rog
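With this change, the radius of gyration weights each cluster's squared distance from the overall centroid by the seconds actually spent in that cluster, rather than by row count times SAMPLING_FREQUENCY. A self-contained sketch of that weighting; the function name and inputs are hypothetical, and the final square root follows the standard radius-of-gyration definition rather than a line visible in this hunk.

import numpy as np
import pandas as pd

def time_weighted_rog(seconds_per_cluster: pd.Series, squared_distance_to_centroid: pd.Series):
    # seconds_per_cluster: total timeInSeconds per significant cluster (indexed by location_label)
    # squared_distance_to_centroid: squared haversine distance from each cluster centroid
    # to the centroid of all clusters, aligned on the same index
    total_time = seconds_per_cluster.sum()
    if total_time == 0:
        return 0
    weighted_mean_sq_distance = (seconds_per_cluster * squared_distance_to_centroid).sum() / total_time
    return np.sqrt(weighted_mean_sq_distance)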
@@ -400,7 +396,7 @@ def location_entropy(locationData):
     clusters = locationData[locationData["location_label"] >= 1] # remove outliers/cluster noise
     if len(clusters) > 0:
         # Get percentages for each location
-        percents = clusters["location_label"].value_counts(normalize=True)
+        percents = clusters.groupby(['location_label'])['timeInSeconds'].sum() / clusters['timeInSeconds'].sum()
         entropy = -1 * percents.map(lambda x: x * np.log(x)).sum()
         return entropy
     else:
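Location entropy is now time-weighted as well: each significant cluster's probability is its share of total `timeInSeconds` rather than its share of rows. A small hypothetical check of the new `percents` expression, with invented data:

import numpy as np
import pandas as pd

clusters = pd.DataFrame({
    "location_label": [1, 1, 2],
    "timeInSeconds":  [120, 180, 300],   # cluster 1: 300 s total, cluster 2: 300 s total
})
percents = clusters.groupby(["location_label"])["timeInSeconds"].sum() / clusters["timeInSeconds"].sum()
entropy = -1 * percents.map(lambda x: x * np.log(x)).sum()
print(entropy)   # two equally weighted clusters -> ln(2) ≈ 0.693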
@@ -416,10 +412,7 @@ def location_entropy_normalized(locationData):
     num_clusters = len(unique_clusters)
     if num_clusters == 0 or len(locationData) == 0 or entropy is None:
         return None
-    elif np.log(num_clusters)==0:
-        return None
     else:
-        return entropy / num_clusters
+        return entropy / np.log(num_clusters)
-def getSamplingFrequency(locationData):
-    return (locationData.timestamp.diff()/(1000*60)).median()
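Finally, the normalized variant now divides by the natural log of the number of significant clusters, so an even split of time across two or more clusters yields 1.0. A worked example with hypothetical percentages:

import numpy as np

# Suppose time is split 50% / 25% / 25% across three significant clusters.
percents = np.array([0.50, 0.25, 0.25])
entropy = -np.sum(percents * np.log(percents))   # ≈ 1.040
normalized = entropy / np.log(len(percents))     # entropy / ln(3) ≈ 0.946
print(entropy, normalized)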