Removing Sampling Frequency and fixing ROG, location entropy and normalized location entropy.

parent cc2127e72d
commit e7fc8f44f2
@@ -245,8 +245,7 @@ PHONE_LOCATIONS:
       THRESHOLD_STATIC : 1 # km/h
       MAXIMUM_GAP_ALLOWED: 300
       MINUTES_DATA_USED: False
-      SAMPLING_FREQUENCY: 0
-      CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT
+      CLUSTER_ON: TIME_SEGMENT # PARTICIPANT_DATASET,TIME_SEGMENT
       CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS
       SRC_FOLDER: "doryab" # inside src/features/phone_locations
       SRC_LANGUAGE: "python"

@@ -127,8 +127,8 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 |timeattop1location |minutes |Time spent at the most significant location.|
 |timeattop2location |minutes |Time spent at the 2nd most significant location.|
 |timeattop3location |minutes |Time spent at the 3rd most significant location.|
-|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labelled as stationary if its speed (distance/time) to the next coordinate pair is less than 1 km/h. A higher value represents a more stationary routine. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]`.|
-|outlierstimepercent | - | Ratio between the time spent in non-significant clusters and the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. These times are computed by multiplying the number of rows by `[SAMPLING_FREQUENCY]`.|
+|movingtostaticratio | - | Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labelled as stationary if its speed (distance/time) to the next coordinate pair is less than 1 km/h. A higher value represents a more stationary routine. These times are computed using the `timeInSeconds` feature.|
+|outlierstimepercent | - | Ratio between the time spent in non-significant clusters and the time spent in all clusters (total location sensed time). A higher value represents more time spent in non-significant clusters. These times are computed using the `timeInSeconds` feature.|
 |maxlengthstayatclusters |minutes |Maximum time spent in a cluster (significant location).|
 |minlengthstayatclusters |minutes |Minimum time spent in a cluster (significant location).|
 |meanlengthstayatclusters |minutes |Average time spent in a cluster (significant location).|

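Both ratios are now derived from a per-row `timeInSeconds` column instead of multiplying row counts by a fixed `[SAMPLING_FREQUENCY]`. Below is a minimal sketch of how `movingtostaticratio` could be computed from such a column; the `is_stationary` flag and the sample values are hypothetical, and the derivation of `timeInSeconds` itself (presumably from timestamp gaps bounded by `MAXIMUM_GAP_ALLOWED`) is not shown in this diff.

    import pandas as pd

    # Hypothetical rows: is_stationary marks fixes whose speed to the next fix is
    # below THRESHOLD_STATIC (1 km/h); timeInSeconds is the time credited to each row.
    location_data = pd.DataFrame({
        "is_stationary": [True, True, False, True],
        "timeInSeconds": [60, 300, 45, 120],
    })

    stationary_time = location_data.loc[location_data["is_stationary"], "timeInSeconds"].sum()
    total_time = location_data["timeInSeconds"].sum()
    movingtostaticratio = stationary_time / total_time
    print(movingtostaticratio)  # 480 / 525 ~= 0.914
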
@@ -14,7 +14,6 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
     threshold_static = provider["THRESHOLD_STATIC"]
     maximum_gap_allowed = provider["MAXIMUM_GAP_ALLOWED"]
-    sampling_frequency = provider["SAMPLING_FREQUENCY"]
     cluster_on = provider["CLUSTER_ON"]
     clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
 
@@ -56,9 +55,6 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     else:
         location_features = pd.DataFrame()
 
-    if sampling_frequency == 0:
-        sampling_frequency = getSamplingFrequency(location_data)
-
     if "minutesdataused" in features_to_compute:
         for localDate in location_data["local_segment"].unique():
             location_features.loc[localDate,"minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate])

@@ -116,7 +112,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
 
     if "radiusgyration" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
-            location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)
+            location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment']==localDate])
 
     preComputedTimeArray = pd.DataFrame()
     for localDate in stationaryLocations["local_segment"].unique():

@@ -181,8 +177,8 @@ def len_stay_timeattopn(locationData):
     if locationData is None or len(locationData) == 0:
         return (None, None, None, None, None, None, None)
 
-    calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']]
-    calculationDf[calculationDf['timeInSeconds'] >= 300]['timeInSeconds'] = 60
+    calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']].copy()
+    calculationDf.loc[calculationDf.timeInSeconds >= 300,'timeInSeconds'] = 60
     timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60
 
     if len(timeArray) > 2:

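The two rewritten lines above fix a pandas chained-assignment bug: `df[mask][col] = value` writes into a temporary copy, so the old 300-second cap never actually modified `calculationDf`; taking an explicit `.copy()` and assigning through `.loc` makes the cap take effect. A small illustration with made-up values:

    import pandas as pd

    df = pd.DataFrame({"location_label": [1, 1, 2], "timeInSeconds": [120, 600, 450]})

    # Chained indexing: the assignment lands on a temporary copy and is silently lost.
    df[df["timeInSeconds"] >= 300]["timeInSeconds"] = 60
    print(df["timeInSeconds"].tolist())  # [120, 600, 450] -> unchanged

    # .loc on the frame itself: matching rows are actually capped at 60 seconds.
    df.loc[df["timeInSeconds"] >= 300, "timeInSeconds"] = 60
    print(df["timeInSeconds"].tolist())  # [120, 60, 60]
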
@@ -360,7 +356,7 @@ def number_location_transitions(locationData):
 
     return df[df['boolCol'] == False].shape[0] - 1
 
-def radius_of_gyration(locationData,sampling_frequency):
+def radius_of_gyration(locationData):
     if locationData is None or len(locationData) == 0:
         return None
     # Center is the centroid, not the home location

@@ -373,10 +369,10 @@ def radius_of_gyration(locationData,sampling_frequency):
         distance = haversine(clusters_centroid.loc[labels].double_longitude,clusters_centroid.loc[labels].double_latitude,
                              centroid_all_clusters.double_longitude,centroid_all_clusters.double_latitude) ** 2
 
-        time_in_cluster = locationData[locationData["location_label"]==labels].shape[0]* sampling_frequency
+        time_in_cluster = locationData[locationData["location_label"]==labels]['timeInSeconds'].sum()
         rog = rog + (time_in_cluster * distance)
 
-    time_all_clusters = valid_clusters.shape[0] * sampling_frequency
+    time_all_clusters = valid_clusters['timeInSeconds'].sum()
     if time_all_clusters == 0:
         return 0
     final_rog = (1/time_all_clusters) * rog

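Each cluster now contributes its observed dwell time (the sum of its `timeInSeconds` values) instead of row count times an assumed sampling frequency, so irregular sampling no longer distorts the radius of gyration. A rough sketch of the time-weighted aggregation with made-up squared distances and times; the haversine/centroid steps and the final return statement (not visible in this hunk) are omitted.

    # Hypothetical per-cluster values: squared haversine distance (km^2) from the
    # centroid of all clusters, and dwell time summed from timeInSeconds.
    clusters = [
        {"distance_sq": 4.0,  "time_in_cluster": 3600},   # e.g. home
        {"distance_sq": 25.0, "time_in_cluster": 1800},   # e.g. work
    ]

    time_all_clusters = sum(c["time_in_cluster"] for c in clusters)
    rog = sum(c["time_in_cluster"] * c["distance_sq"] for c in clusters)
    final_rog = (1 / time_all_clusters) * rog if time_all_clusters else 0
    print(final_rog)  # 11.0 -> the time-weighted mean squared distance
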
@@ -400,7 +396,7 @@ def location_entropy(locationData):
     clusters = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise
     if len(clusters) > 0:
         # Get percentages for each location
-        percents = clusters["location_label"].value_counts(normalize=True)
+        percents = clusters.groupby(['location_label'])['timeInSeconds'].sum() / clusters['timeInSeconds'].sum()
         entropy = -1 * percents.map(lambda x: x * np.log(x)).sum()
         return entropy
     else:

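`percents` is now the fraction of total cluster time spent in each significant location rather than the fraction of GPS rows, and the Shannon entropy `-sum(p * ln(p))` is taken over those time shares. A worked example with assumed dwell times:

    import numpy as np
    import pandas as pd

    # Assumed dwell times (seconds) per significant cluster within one time segment.
    clusters = pd.DataFrame({"location_label": [1, 1, 2, 3],
                             "timeInSeconds":  [1800, 1800, 900, 900]})

    percents = clusters.groupby(["location_label"])["timeInSeconds"].sum() / clusters["timeInSeconds"].sum()
    entropy = -1 * percents.map(lambda x: x * np.log(x)).sum()
    print(percents.tolist())  # ~[0.667, 0.167, 0.167] -> time shares, not row shares
    print(round(entropy, 3))  # 0.868; higher when time is spread evenly across clusters
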
@@ -416,10 +412,7 @@ def location_entropy_normalized(locationData):
     num_clusters = len(unique_clusters)
     if num_clusters == 0 or len(locationData) == 0 or entropy is None:
         return None
+    elif np.log(num_clusters)==0:
+        return None
     else:
-        return entropy / num_clusters
-
-
-def getSamplingFrequency(locationData):
-
-    return (locationData.timestamp.diff()/(1000*60)).median()
+        return entropy / np.log(num_clusters)

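The normalized entropy now divides by the maximum possible entropy `ln(num_clusters)` instead of by `num_clusters`, and the new `elif` guards the single-cluster case where `ln(1) == 0` would cause a division by zero; with `sampling_frequency` gone, the `getSamplingFrequency` helper (the median timestamp gap in minutes) is deleted as well. A short check of the new behaviour, reusing the hypothetical entropy value from the example above:

    import numpy as np

    entropy = 0.868   # hypothetical location entropy for one segment
    num_clusters = 3

    if num_clusters == 0 or entropy is None:
        normalized = None
    elif np.log(num_clusters) == 0:   # exactly one cluster: avoid dividing by ln(1) == 0
        normalized = None
    else:
        normalized = entropy / np.log(num_clusters)

    print(round(normalized, 2))  # 0.79 -> entropy as a fraction of its maximum for 3 clusters
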