import pandas as pd
import numpy as np
import warnings
from astropy.timeseries import LombScargle
from sklearn.cluster import DBSCAN, OPTICS

def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):

    location_data = pd.read_csv(sensor_data_files["sensor_data"])
    requested_features = provider["FEATURES"]
    accuracy_limit = provider["ACCURACY_LIMIT"]
    dbscan_eps = provider["DBSCAN_EPS"]
    dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
    threshold_static = provider["THRESHOLD_STATIC"]
    maximum_gap_allowed = provider["MAXIMUM_ROW_GAP"]
    maximum_row_duration = provider["MAXIMUM_ROW_DURATION"]
    cluster_on = provider["CLUSTER_ON"]
    clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
    radius_from_home = provider["RADIUS_FOR_HOME"]

    minutes_data_used = provider["MINUTES_DATA_USED"]
    if minutes_data_used:
        requested_features.append("minutesdataused")

    # names of the features this function can compute
    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused","timeathome"]
    # the subset of requested features this function can compute
    features_to_compute = list(set(requested_features) & set(base_features_names))

    if clustering_algorithm == "DBSCAN":
        hyperparameters = {'eps': distance_to_degrees(dbscan_eps), 'min_samples': dbscan_minsamples}
    elif clustering_algorithm == "OPTICS":
        hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric': 'euclidean', 'cluster_method': 'dbscan'}
    else:
        raise ValueError("config[PHONE_LOCATIONS][DORYAB][CLUSTERING_ALGORITHM] only accepts DBSCAN or OPTICS but you provided {}".format(clustering_algorithm))

    rows_before_accuracy_filter = len(location_data)
    location_data.query("accuracy < @accuracy_limit", inplace=True)
    if rows_before_accuracy_filter > 0 and len(location_data) == 0:
        warnings.warn("Cannot compute Doryab location features because there are no rows with an accuracy value lower than ACCURACY_LIMIT: {}".format(accuracy_limit))

    if location_data.empty:
        location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
    else:
        if cluster_on == "PARTICIPANT_DATASET":
            location_data = cluster_and_label(location_data, clustering_algorithm, threshold_static, **hyperparameters)
            location_data = filter_data_by_segment(location_data, time_segment)
        elif cluster_on == "TIME_SEGMENT":
            location_data = filter_data_by_segment(location_data, time_segment)
            location_data = cluster_and_label(location_data, clustering_algorithm, threshold_static, **hyperparameters)
        else:
            raise ValueError("config[PHONE_LOCATIONS][DORYAB][CLUSTER_ON] only accepts PARTICIPANT_DATASET or TIME_SEGMENT but you provided {}".format(cluster_on))

        if location_data.empty:
            location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
        else:
            location_features = pd.DataFrame()

            if "minutesdataused" in features_to_compute:
                for localDate in location_data["local_segment"].unique():
                    location_features.loc[localDate, "minutesdataused"] = getMinutesData(location_data[location_data["local_segment"] == localDate])

            location_features.index.name = 'local_segment'

            # drop rows with invalid (0, 0) coordinates
            location_data = location_data[(location_data['double_latitude'] != 0.0) & (location_data['double_longitude'] != 0.0)]

            if location_data.empty:
                location_features = pd.DataFrame(columns=["local_segment"] + ["location_" + time_segment + "_" + x for x in features_to_compute])
                location_features = location_features.reset_index(drop=True)
                return location_features

            # duration of each row in seconds: time elapsed until the next sample
            location_data['timeInSeconds'] = (location_data.timestamp.diff(-1) * -1) / 1000

if "locationvariance" in features_to_compute:
|
|
location_features["locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()
|
|
|
|
if "loglocationvariance" in features_to_compute:
|
|
location_features["loglocationvariance"] = (location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None)
|
|
|
|
|
|
            preComputedDistanceandSpeed = pd.DataFrame()
            for localDate in location_data['local_segment'].unique():
                speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_segment'] == localDate], threshold_static, maximum_gap_allowed)
                preComputedDistanceandSpeed.loc[localDate, "distance"] = speeddf['distances'].sum()
                preComputedDistanceandSpeed.loc[localDate, "avgspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].mean()
                preComputedDistanceandSpeed.loc[localDate, "varspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].var()

            if "totaldistance" in features_to_compute:
                for localDate in location_data['local_segment'].unique():
                    location_features.loc[localDate, "totaldistance"] = preComputedDistanceandSpeed.loc[localDate, "distance"]

            if "averagespeed" in features_to_compute:
                for localDate in location_data['local_segment'].unique():
                    location_features.loc[localDate, "averagespeed"] = preComputedDistanceandSpeed.loc[localDate, "avgspeed"]

            if "varspeed" in features_to_compute:
                for localDate in location_data['local_segment'].unique():
                    location_features.loc[localDate, "varspeed"] = preComputedDistanceandSpeed.loc[localDate, "varspeed"]

            if "circadianmovement" in features_to_compute:
                for localDate in location_data['local_segment'].unique():
                    location_features.loc[localDate, "circadianmovement"] = circadian_movement(location_data[location_data['local_segment'] == localDate])

            # rows flagged as stationary by mark_moving()
            stationaryLocations = location_data[location_data['stationary_or_not'] == 1]

            if "numberofsignificantplaces" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "numberofsignificantplaces"] = number_of_significant_places(stationaryLocations[stationaryLocations['local_segment'] == localDate])

            if "numberlocationtransitions" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "numberlocationtransitions"] = number_location_transitions(stationaryLocations[stationaryLocations['local_segment'] == localDate])

            if "radiusgyration" in features_to_compute:
                for localDate in stationaryLocations['local_segment'].unique():
                    location_features.loc[localDate, "radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment'] == localDate])

            preComputedTimeArray = pd.DataFrame()
            for localDate in stationaryLocations["local_segment"].unique():
                top1, top2, top3, smax, smin, sstd, smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"] == localDate], maximum_gap_allowed, maximum_row_duration)
                preComputedTimeArray.loc[localDate, "timeattop1"] = top1
                preComputedTimeArray.loc[localDate, "timeattop2"] = top2
                preComputedTimeArray.loc[localDate, "timeattop3"] = top3
                preComputedTimeArray.loc[localDate, "maxlengthstayatclusters"] = smax
                preComputedTimeArray.loc[localDate, "minlengthstayatclusters"] = smin
                preComputedTimeArray.loc[localDate, "stdlengthstayatclusters"] = sstd
                preComputedTimeArray.loc[localDate, "meanlengthstayatclusters"] = smean

if "timeattop1location" in features_to_compute:
|
|
for localDate in stationaryLocations['local_segment'].unique():
|
|
location_features.loc[localDate,"timeattop1"] = preComputedTimeArray.loc[localDate,"timeattop1"]
|
|
|
|
if "timeattop2location" in features_to_compute:
|
|
for localDate in stationaryLocations['local_segment'].unique():
|
|
location_features.loc[localDate,"timeattop2"] = preComputedTimeArray.loc[localDate,"timeattop2"]
|
|
|
|
if "timeattop3location" in features_to_compute:
|
|
for localDate in stationaryLocations['local_segment'].unique():
|
|
location_features.loc[localDate,"timeattop3"] = preComputedTimeArray.loc[localDate,"timeattop3"]
|
|
|
|
if "movingtostaticratio" in features_to_compute:
|
|
for localDate in stationaryLocations['local_segment'].unique():
|
|
location_features.loc[localDate,"movingtostaticratio"] = (stationaryLocations[stationaryLocations['local_segment']==localDate]['timeInSeconds'].sum()) / (location_data[location_data['local_segment']==localDate]['timeInSeconds'].sum())
|
|
|
|
if "outlierstimepercent" in features_to_compute:
|
|
for localDate in stationaryLocations['local_segment'].unique():
|
|
location_features.loc[localDate,"outlierstimepercent"] = outlier_time_percent_new(stationaryLocations[stationaryLocations['local_segment']==localDate])
|
|
|
|
if "maxlengthstayatclusters" in features_to_compute:
|
|
for localDate in stationaryLocations['local_segment'].unique():
|
|
location_features.loc[localDate,"maxlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"maxlengthstayatclusters"]
|
|
|
|
if "minlengthstayatclusters" in features_to_compute:
|
|
for localDate in stationaryLocations['local_segment'].unique():
|
|
location_features.loc[localDate,"minlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"minlengthstayatclusters"]
|
|
|
|
if "stdlengthstayatclusters" in features_to_compute:
|
|
for localDate in stationaryLocations['local_segment'].unique():
|
|
location_features.loc[localDate,"stdlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"stdlengthstayatclusters"]
|
|
|
|
if "meanlengthstayatclusters" in features_to_compute:
|
|
for localDate in stationaryLocations['local_segment'].unique():
|
|
location_features.loc[localDate,"meanlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"meanlengthstayatclusters"]
|
|
|
|
if "locationentropy" in features_to_compute:
|
|
for localDate in stationaryLocations['local_segment'].unique():
|
|
location_features.loc[localDate,"locationentropy"] = location_entropy(stationaryLocations[stationaryLocations['local_segment']==localDate])
|
|
|
|
if "normalizedlocationentropy" in features_to_compute:
|
|
for localDate in stationaryLocations['local_segment'].unique():
|
|
location_features.loc[localDate,"normalizedlocationentropy"] = location_entropy_normalized(stationaryLocations[stationaryLocations['local_segment']==localDate])
|
|
|
|
if "timeathome" in features_to_compute:
|
|
calculationDf = stationaryLocations[['local_segment','distancefromhome','timeInSeconds']].copy()
|
|
calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed,'timeInSeconds'] = maximum_row_duration
|
|
location_features["timeathome"] = calculationDf[calculationDf["distancefromhome"] <= radius_from_home].groupby("local_segment")["timeInSeconds"].sum()/60
|
|
|
|
location_features = location_features.reset_index()
|
|
|
|
return location_features
|
|
|
|
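# A minimal sketch of the provider dict doryab_features() expects. The keys mirror
# the config entries read above; the values are illustrative assumptions only, not
# recommended settings:
#
#   provider = {
#       "FEATURES": ["locationvariance", "totaldistance", "timeathome"],
#       "ACCURACY_LIMIT": 51,         # meters; rows with worse accuracy are dropped
#       "DBSCAN_EPS": 10,             # meters, converted to degrees before clustering
#       "DBSCAN_MINSAMPLES": 5,
#       "THRESHOLD_STATIC": 1,        # km/h; slower rows are marked stationary
#       "MAXIMUM_ROW_GAP": 300,       # seconds
#       "MAXIMUM_ROW_DURATION": 60,   # seconds
#       "CLUSTER_ON": "PARTICIPANT_DATASET",
#       "CLUSTERING_ALGORITHM": "DBSCAN",
#       "RADIUS_FOR_HOME": 100,       # meters
#       "MINUTES_DATA_USED": False,
#   }
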
def len_stay_timeattopn(locationData, maximum_gap_allowed, maximum_row_duration):
    if locationData is None or len(locationData) == 0:
        return (None, None, None, None, None, None, None)

    calculationDf = locationData[locationData["location_label"] >= 1][['location_label', 'timeInSeconds']].copy()
    calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed, 'timeInSeconds'] = maximum_row_duration
    timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False) / 60

    # timeArray keeps the pre-sort integer labels from reset_index(), so timeArray[0]
    # indexes by label, not position: it is the time at the lowest-numbered (i.e.
    # highest-ranked, most visited) cluster present, not necessarily the longest stay.
    if len(timeArray) == 3:
        return (timeArray[0], timeArray[1], timeArray[2], timeArray.max(), timeArray.min(), timeArray.std(), timeArray.mean())
    elif len(timeArray) == 2:
        return (timeArray[0], timeArray[1], None, timeArray.max(), timeArray.min(), timeArray.std(), timeArray.mean())
    elif len(timeArray) == 1:
        return (timeArray[0], None, None, timeArray.max(), timeArray.min(), timeArray.std(), timeArray.mean())
    else:
        return (None, None, None, timeArray.max(), timeArray.min(), timeArray.std(), timeArray.mean())

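# Illustrative only (assumed numbers): with summed stay times of 30, 20, and 10
# minutes at clusters labeled 1, 2, and 3, len_stay_timeattopn() would return
#   (30.0, 20.0, 10.0, 30.0, 10.0, 10.0, 20.0)
# i.e. (top1, top2, top3, max, min, std, mean).
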
def getMinutesData(locationData):
    # number of distinct (hour, minute) pairs that have at least one location sample
    return locationData[['local_hour', 'local_minute']].drop_duplicates(inplace=False).shape[0]

def distance_to_degrees(d):
    # Just an approximation, but it speeds up clustering by a huge amount and doesn't
    # introduce much error over small distances
    d = d / 1852  # meters to nautical miles
    d = d / 60    # nautical miles to degrees (1 degree of latitude = 60 nautical miles)
    return d

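# Rough sanity check (assumed value): 100 m / 1852 / 60 ~= 0.0009 degrees, which is
# about right for 100 m of latitude anywhere on Earth; the error in longitude grows
# away from the equator, which is tolerable for small DBSCAN_EPS values.
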
def get_all_travel_distances_meters_speed(locationData, threshold, maximum_gap_allowed):

    lat_lon_temp = locationData[locationData['timeInSeconds'] <= maximum_gap_allowed][['double_latitude', 'double_longitude', 'timeInSeconds']]

    if lat_lon_temp.empty:
        return pd.DataFrame({"speed": [], "speedTag": [], "distances": []})

    lat_lon_temp['distances'] = haversine(lat_lon_temp['double_longitude'], lat_lon_temp['double_latitude'], lat_lon_temp['double_longitude'].shift(-1), lat_lon_temp['double_latitude'].shift(-1))
    lat_lon_temp['speed'] = lat_lon_temp['distances'] / lat_lon_temp['timeInSeconds']  # meters/second
    lat_lon_temp['speed'] = lat_lon_temp['speed'].replace(np.inf, np.nan) * 3.6        # km/h

    # rows at or above the (km/h) threshold are tagged as moving
    lat_lon_temp['speedTag'] = np.where(lat_lon_temp['speed'] >= threshold, "Moving", "Static")

    return lat_lon_temp[['speed', 'speedTag', 'distances']]

def vincenty_row(x):
    """
    :param x: a dataframe row with columns _lat_before, _lon_before, _lat_after, _lon_after
    :return: the distance in meters between the two points (requires geopy's vincenty,
             which is not imported here; geopy >= 2.0 replaced it with geodesic)
    """
    try:
        return vincenty((x['_lat_before'], x['_lon_before']), (x['_lat_after'], x['_lon_after'])).meters
    except Exception:
        return 0

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great-circle distance in meters between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2])

    # haversine formula
    a = np.sin((lat2 - lat1) / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2

    r = 6371  # radius of earth in kilometers; use 3956 for miles

    return r * 2 * np.arcsin(np.sqrt(a)) * 1000  # meters

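# Illustrative check: two points roughly 0.01 degrees apart in both coordinates near
# San Francisco (note the lon/lat argument order matching the signature above):
#   haversine(-122.42, 37.77, -122.41, 37.78)  # ~1400 meters
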
def circadian_movement_energies(locationData):
    time = locationData["timestamp"].values / 1000.0  # seconds
    ylat = locationData["double_latitude"].values
    ylong = locationData["double_longitude"].values
    hours_intervals = np.arange(23.5, 24.51, 0.01)  # hours, i.e. periods around one day
    seconds_intervals = hours_intervals * 60 * 60   # seconds
    frequency = 1 / seconds_intervals

    power_latitude = LombScargle(time, ylat).power(frequency=frequency, normalization='psd')
    power_longitude = LombScargle(time, ylong).power(frequency=frequency, normalization='psd')

    energy_latitude = np.sum(power_latitude)
    energy_longitude = np.sum(power_longitude)
    return (energy_latitude, energy_longitude)

def circadian_movement(locationData):
    energy_latitude, energy_longitude = circadian_movement_energies(locationData)
    return np.log10(energy_latitude + energy_longitude)

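# Background note: circadian movement sums Lomb-Scargle spectral power over periods
# between 23.5 and 24.5 hours, so a participant whose coordinates repeat on a
# near-24 h cycle scores higher than one who moves irregularly. The log10 keeps the
# wide dynamic range of the summed power manageable.
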
def cluster_and_label(df, clustering_algorithm, threshold_static, **kwargs):
    """
    :param df: a df with columns "double_latitude", "double_longitude", "timestamp",
               and "local_date_time", or the same columns with a datetime index
    :param kwargs: arguments for sklearn's DBSCAN (or OPTICS)
    :return: a new df with a "stationary_or_not" flag and a "location_label" column,
             where the cluster labeled 1 is the most visited, 2 the second most
             visited, and so on (noise points are labeled -1)
    """
    if not df.empty:
        location_data = df
        if not isinstance(df.index, pd.DatetimeIndex):
            location_data = df.set_index("local_date_time")

        stationary = mark_moving(location_data, threshold_static)

        # cluster unique coordinates, weighting each by how often it occurs
        counts_df = stationary[["double_latitude", "double_longitude"]].groupby(["double_latitude", "double_longitude"]).size().reset_index()
        counts = counts_df[0]
        lat_lon = counts_df[["double_latitude", "double_longitude"]].values

        if clustering_algorithm == "DBSCAN":
            clusterer = DBSCAN(**kwargs)
            cluster_results = clusterer.fit_predict(lat_lon, sample_weight=counts)
        else:
            clusterer = OPTICS(**kwargs)
            cluster_results = clusterer.fit_predict(lat_lon)

        # need to extend labels back to the original df without weights
        counts_df["location_label"] = cluster_results
        # remove the old count column
        del counts_df[0]

        merged = pd.merge(stationary, counts_df, on=["double_latitude", "double_longitude"])

        # now compute the label mapping:
        cluster_results = merged["location_label"].values
        valid_clusters = cluster_results[np.where(cluster_results != -1)]
        label_map = rank_count_map(valid_clusters)

        # and remap the labels so that 1 is the most visited cluster:
        merged.index = stationary.index
        stationary = stationary.assign(location_label=merged["location_label"].map(label_map).values)

        return stationary
    else:
        return df

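# Minimal input sketch (assumed values): cluster_and_label() only needs coordinate,
# timestamp, and local_date_time columns, e.g.
#
#   df = pd.DataFrame({
#       "double_latitude":  [37.7700, 37.7701, 37.8100],
#       "double_longitude": [-122.4200, -122.4201, -122.4100],
#       "timestamp":        [0, 600_000, 1_200_000],  # milliseconds
#       "local_date_time":  pd.to_datetime(["2020-01-01 00:00",
#                                           "2020-01-01 00:10",
#                                           "2020-01-01 00:20"]),
#   })
#   labeled = cluster_and_label(df, "DBSCAN", threshold_static=1,
#                               eps=distance_to_degrees(50), min_samples=2)
#
# With these numbers the first two rows should come back as cluster 1 and the
# distant third row as noise (-1).
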
def rank_count_map(clusters):
    """
    Returns a function that maps each cluster label to its rank by frequency,
    such that the most common label maps to 1, the second most common to 2,
    and so on.

    Used in this context to sort the cluster labels so that the cluster with
    rank 1 is the most visited.

    Labels the function can't find (e.g. noise) map to -1.
    """
    labels, counts = np.unique(clusters, return_counts=True)
    sorted_by_count = [label for (_, label) in sorted(zip(counts, labels), reverse=True)]
    label_to_rank = {label: rank + 1 for rank, label in enumerate(sorted_by_count)}
    return lambda x: label_to_rank.get(x, -1)

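# Illustrative only:
#   rank = rank_count_map(np.array([4, 4, 4, 9, 9, 0]))
#   rank(4) == 1   # most frequent label
#   rank(9) == 2
#   rank(0) == 3
#   rank(7) == -1  # unseen labels fall through to -1
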
def mark_moving(df, threshold_static):

    if not df.index.is_monotonic_increasing:
        df = df.sort_index()

    # distance in km between consecutive rows, and elapsed time in hours
    distance = haversine(df.double_longitude, df.double_latitude, df.double_longitude.shift(-1), df.double_latitude.shift(-1)) / 1000
    time = (df.timestamp.diff(-1) * -1) / (1000 * 60 * 60)

    # speed in km/h compared against THRESHOLD_STATIC: 1 means stationary, 0 means moving
    df['stationary_or_not'] = np.where((distance / time) < threshold_static, 1, 0)

    return df

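# Illustrative threshold check (assumed numbers): two samples 100 m and 10 minutes
# apart travel at 0.1 km / (1/6 h) = 0.6 km/h, so with THRESHOLD_STATIC = 1 km/h the
# row is marked stationary (1); at 2 minutes apart the speed is 3 km/h and it is
# marked moving (0).
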
def number_of_significant_places(locationData):
    # labels >= 1 are valid clusters; -1 marks noise
    uniquelst = locationData[locationData["location_label"] >= 1]["location_label"].unique()
    return len(uniquelst)

def number_location_transitions(locationData):
    # ignores transitions from moving to static and vice versa, but counts transitions
    # from outliers to major location clusters
    if locationData is None or len(locationData) == 0:
        return None

    df = pd.DataFrame()
    df['boolCol'] = (locationData.location_label == locationData.location_label.shift())

    return df[df['boolCol'] == False].shape[0] - 1

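# Illustrative only: for labels [1, 1, 2, 2, -1, 1] the comparison with shift() is
# False at positions 0, 2, 4, and 5, giving 4 - 1 = 3 transitions (the first False
# only marks the start of the series, not a real transition).
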
def radius_of_gyration(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    # the center is the centroid of the cluster centroids, not the home location
    valid_clusters = locationData[locationData["location_label"] != -1]
    centroid_all_clusters = (valid_clusters.groupby('location_label')[['double_latitude', 'double_longitude']].mean()).mean()
    clusters_centroid = valid_clusters.groupby('location_label')[['double_latitude', 'double_longitude']].mean()

    rog = 0
    for labels in clusters_centroid.index:
        distance = haversine(clusters_centroid.loc[labels].double_longitude, clusters_centroid.loc[labels].double_latitude,
                             centroid_all_clusters.double_longitude, centroid_all_clusters.double_latitude) ** 2

        time_in_cluster = locationData[locationData["location_label"] == labels]['timeInSeconds'].sum()
        rog = rog + (time_in_cluster * distance)

    time_all_clusters = valid_clusters['timeInSeconds'].sum()
    if time_all_clusters == 0:
        return 0
    final_rog = (1 / time_all_clusters) * rog

    return np.sqrt(final_rog)

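# In formula form, with t_c the seconds spent in cluster c, d_c the haversine
# distance from cluster c's centroid to the centroid of all cluster centroids, and
# T the total time across clusters:
#
#   radius_of_gyration = sqrt( (1 / T) * sum_c( t_c * d_c^2 ) )
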
def outlier_time_percent_new(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    clustersDf = locationData[["location_label", "timeInSeconds"]]
    numoutliers = clustersDf[clustersDf["location_label"] == -1]["timeInSeconds"].sum()
    numtotal = clustersDf.timeInSeconds.sum()

    # fraction of observed time attributed to noise points (label -1)
    return numoutliers / numtotal if numtotal > 0 else None

def location_entropy(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    clusters = locationData[locationData["location_label"] >= 1]  # remove outliers/cluster noise
    if len(clusters) > 0:
        # proportion of time spent at each location
        percents = clusters.groupby(['location_label'])['timeInSeconds'].sum() / clusters['timeInSeconds'].sum()
        entropy = -1 * percents.map(lambda x: x * np.log(x)).sum()
        return entropy
    else:
        return None

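# Worked example (assumed numbers): spending 75% of stationary time at one cluster
# and 25% at another gives
#   H = -(0.75 * ln 0.75 + 0.25 * ln 0.25) ~= 0.562
# while an even 50/50 split maximizes it at ln 2 ~= 0.693.
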
def location_entropy_normalized(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    locationData = locationData[locationData["location_label"] >= 1]  # remove outliers/cluster noise
    entropy = location_entropy(locationData)
    unique_clusters = locationData["location_label"].unique()
    num_clusters = len(unique_clusters)
    if num_clusters == 0 or len(locationData) == 0 or entropy is None:
        return None
    elif np.log(num_clusters) == 0:
        # a single cluster has zero maximum entropy, so the normalization is undefined
        return None
    else:
        return entropy / np.log(num_clusters)