# rapids/src/features/phone_locations/doryab/main.py

import pandas as pd
import numpy as np
from astropy.timeseries import LombScargle
from sklearn.cluster import DBSCAN
from math import radians, cos, sin, asin, sqrt

def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    location_data = pd.read_csv(sensor_data_files["sensor_data"])
    requested_features = provider["FEATURES"]
    dbscan_eps = provider["DBSCAN_EPS"]
    dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]
    threshold_static = provider["THRESHOLD_STATIC"]
    maximum_gap_allowed = provider["MAXIMUM_GAP_ALLOWED"]
    sampling_frequency = provider["SAMPLING_FREQUENCY"]
    minutes_data_used = provider["MINUTES_DATA_USED"]
    if minutes_data_used:
        requested_features.append("minutesdataused")

    # names of the features this function can compute
    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused"]
    # the subset of requested features this function can compute
    features_to_compute = list(set(requested_features) & set(base_features_names))

    if location_data.empty:
        location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
    else:
        location_data = filter_data_by_segment(location_data, time_segment)

        if location_data.empty:
            location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
        else:
            location_features = pd.DataFrame()

            if sampling_frequency == 0:
                sampling_frequency = getSamplingFrequency(location_data)

            if "minutesdataused" in features_to_compute:
                for localDate in location_data["local_segment"].unique():
                    location_features.loc[localDate,"minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate])

            location_features.index.name = 'local_segment'

            # drop rows with a latitude or longitude of exactly 0.0 (invalid fixes)
            location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)]
            if location_data.empty:
                location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
                location_features = location_features.reset_index(drop=True)
                return location_features

if "locationvariance" in features_to_compute:
location_features["locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()
if "loglocationvariance" in features_to_compute:
location_features["loglocationvariance"] = (location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None)
preComputedDistanceandSpeed = pd.DataFrame()
for localDate in location_data['local_segment'].unique():
distance, speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_segment']==localDate],threshold_static,maximum_gap_allowed)
preComputedDistanceandSpeed.loc[localDate,"distance"] = distance.sum()
preComputedDistanceandSpeed.loc[localDate,"avgspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].mean()
preComputedDistanceandSpeed.loc[localDate,"varspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].var()
if "totaldistance" in features_to_compute:
for localDate in location_data['local_segment'].unique():
location_features.loc[localDate,"totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"]
if "averagespeed" in features_to_compute:
for localDate in location_data['local_segment'].unique():
location_features.loc[localDate,"averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"]
if "varspeed" in features_to_compute:
for localDate in location_data['local_segment'].unique():
location_features.loc[localDate,"varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"]
if "circadianmovement" in features_to_compute:
for localDate in location_data['local_segment'].unique():
location_features.loc[localDate,"circadianmovement"] = circadian_movement(location_data[location_data['local_segment']==localDate])
            # cluster stationary points; eps is DBSCAN_EPS converted from meters to degrees
            newLocationData = cluster_and_label(location_data, eps=distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)

            if "numberofsignificantplaces" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_segment']==localDate])

            if "numberlocationtransitions" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_segment']==localDate])

            if "radiusgyration" in features_to_compute:
                for localDate in newLocationData['local_segment'].unique():
                    location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_segment']==localDate], sampling_frequency)

if "timeattop1location" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],1,sampling_frequency)
if "timeattop2location" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],2,sampling_frequency)
if "timeattop3location" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],3,sampling_frequency)
if "movingtostaticratio" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"movingtostaticratio"] = (newLocationData[newLocationData['local_segment']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_segment']==localDate].shape[0] * sampling_frequency)
if "outlierstimepercent" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
preComputedmaxminCluster = pd.DataFrame()
for localDate in newLocationData['local_segment'].unique():
smax, smin, sstd,smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"] = smax
preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"] = smin
preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"] = sstd
preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"] = smean
if "maxlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"]
if "minlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"]
if "stdlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"]
if "meanlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"]
if "locationentropy" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locationentropy"] = location_entropy(newLocationData[newLocationData['local_segment']==localDate])
if "normalizedlocationentropy" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_segment']==localDate])
location_features = location_features.reset_index()
return location_features
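# Hypothetical invocation sketch (illustrative only: in RAPIDS these arguments are
# assembled by the pipeline from config.yaml, and the parameter values below are
# made up for the example):
#
#   provider = {"FEATURES": ["totaldistance", "radiusgyration"], "DBSCAN_EPS": 10,
#               "DBSCAN_MINSAMPLES": 5, "THRESHOLD_STATIC": 1, "MAXIMUM_GAP_ALLOWED": 300,
#               "SAMPLING_FREQUENCY": 0, "MINUTES_DATA_USED": False}
#   features = doryab_features({"sensor_data": "phone_locations.csv"}, "daily",
#                              provider, filter_data_by_segment)
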
def getMinutesData(locationData):
    # number of distinct (hour, minute) pairs that have at least one location sample
    return locationData[['local_hour','local_minute']].drop_duplicates().shape[0]

def distance_to_degrees(d):
    # Just an approximation, but it speeds up clustering by a huge amount and doesn't
    # introduce much error over small distances
    d = d / 1852  # meters to nautical miles
    d = d / 60    # nautical miles (arcminutes of latitude) to degrees
    return d

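# Example: a DBSCAN_EPS of 10 meters becomes 10 / 1852 / 60 ≈ 9.0e-05 degrees,
# the eps that cluster_and_label() passes to DBSCAN over raw lat/lon values.
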
def get_all_travel_distances_meters_speed(locationData, threshold, maximum_gap_allowed):
    lat_lon_temp = pd.DataFrame()

    lat_lon_temp['_lat_before'] = locationData.double_latitude
    lat_lon_temp['_lat_after'] = locationData.double_latitude.shift(-1)
    lat_lon_temp['_lon_before'] = locationData.double_longitude
    lat_lon_temp['_lon_after'] = locationData.double_longitude.shift(-1)
    lat_lon_temp['time_before'] = pd.to_datetime(locationData['local_time'], format="%H:%M:%S")
    lat_lon_temp['time_after'] = lat_lon_temp['time_before'].shift(-1)

    lat_lon_temp['time_diff'] = lat_lon_temp['time_after'] - lat_lon_temp['time_before']
    lat_lon_temp['timeInSeconds'] = lat_lon_temp['time_diff'].apply(lambda x: x.total_seconds())

    # discard consecutive pairs whose time gap exceeds MAXIMUM_GAP_ALLOWED seconds
    lat_lon_temp = lat_lon_temp[lat_lon_temp['timeInSeconds'] <= maximum_gap_allowed]
    if lat_lon_temp.empty:
        return pd.Series(dtype="float64"), pd.DataFrame({"speed": [], "speedTag": []})

    lat_lon_temp['distances'] = lat_lon_temp.apply(haversine, axis=1)  # meters
    lat_lon_temp['speed'] = lat_lon_temp['distances'] / lat_lon_temp['timeInSeconds']  # m/s
    lat_lon_temp['speed'] = lat_lon_temp['speed'].replace(np.inf, np.nan) * 3.6  # km/h

    distances = lat_lon_temp['distances']
    lat_lon_temp = lat_lon_temp.dropna()
    lat_lon_temp['speedTag'] = np.where(lat_lon_temp['speed'] >= threshold, "Moving", "Static")

    return distances, lat_lon_temp[['speed','speedTag']]

def vincenty_row(x):
    """
    Vincenty distance between the before/after coordinate pair in a dataframe row.
    Currently unused (haversine() is used instead); calling it requires a vincenty
    implementation such as the one in older geopy releases
    (from geopy.distance import vincenty).
    :param x: A row from a dataframe
    :return: The distance in meters between the two points
    """
    try:
        return vincenty((x['_lat_before'], x['_lon_before']), (x['_lat_after'], x['_lon_after'])).meters
    except Exception:
        return 0

def haversine(x):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = x['_lon_before'], x['_lat_before'], x['_lon_after'], x['_lat_after']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # radius of earth in kilometers; use 3956 for miles
    return c * r * 1000  # meters

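# Sanity check: two points 0.001 degrees of latitude apart are
# 6371 km * (pi / 180) * 0.001 ≈ 111 m apart according to haversine().
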
def circadian_movement_energies(locationData):
    time = locationData["timestamp"].values / 1000.0  # seconds
    ylat = locationData["double_latitude"].values
    ylong = locationData["double_longitude"].values

    # scan periods between 23.5 and 24.5 hours around the 24-hour circadian rhythm
    hours_intervals = np.arange(23.5, 24.51, 0.01)  # hours
    seconds_intervals = hours_intervals * 60 * 60  # seconds
    frequency = 1 / seconds_intervals

    power_latitude = LombScargle(time, ylat).power(frequency=frequency, normalization='psd')
    power_longitude = LombScargle(time, ylong).power(frequency=frequency, normalization='psd')

    energy_latitude = np.sum(power_latitude)
    energy_longitude = np.sum(power_longitude)
    return (energy_latitude, energy_longitude)

def circadian_movement(locationData):
    energy_latitude, energy_longitude = circadian_movement_energies(locationData)
    return np.log10(energy_latitude + energy_longitude)

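# Circadian movement (after Doryab et al.): the log of the total spectral power of
# latitude and longitude over ~24-hour periods; higher values indicate mobility
# that follows a more regular daily rhythm.
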
def cluster_and_label(df, **kwargs):
    """
    :param df: a df with columns "double_latitude", "double_longitude", and "local_date_time",
               or a df with those coordinate columns and a datetime index
    :param kwargs: arguments for sklearn's DBSCAN
    :return: a new df of labeled locations with moving points removed, where the cluster
             labeled "1" is the largest, "2" the second largest, and so on
    """
    location_data = df
    if not isinstance(df.index, pd.DatetimeIndex):
        location_data = df.set_index("local_date_time")

    stationary = remove_moving(location_data, 1)  # drop points moving faster than 1 km/h

    clusterer = DBSCAN(**kwargs)

    # cluster unique (lat, lon) pairs, weighting each pair by how often it occurs
    counts_df = stationary[["double_latitude", "double_longitude"]].groupby(["double_latitude", "double_longitude"]).size().reset_index()
    counts = counts_df[0]
    lat_lon = counts_df[["double_latitude", "double_longitude"]].values
    cluster_results = clusterer.fit_predict(lat_lon, sample_weight=counts)

    # extend the labels back to the original df without weights
    counts_df["location_label"] = cluster_results
    del counts_df[0]  # remove the old count column
    merged = pd.merge(stationary, counts_df, on=["double_latitude", "double_longitude"])

    # compute the label mapping so that rank 1 is the most visited cluster
    cluster_results = merged["location_label"].values
    valid_clusters = cluster_results[np.where(cluster_results != -1)]
    label_map = rank_count_map(valid_clusters)

    # and remap the labels (DBSCAN noise stays -1)
    merged.index = stationary.index
    stationary = stationary.assign(location_label=merged["location_label"].map(label_map).values)
    return stationary

def rank_count_map(clusters):
    """Returns a function that maps each cluster label to its rank by frequency,
    such that the most common label maps to 1, the second most common to 2, and
    so on. Labels not seen in `clusters` (e.g. DBSCAN's -1 noise label) map to -1.
    """
    labels, counts = np.unique(clusters, return_counts=True)
    sorted_by_count = [label for (count, label) in sorted(zip(counts, labels), reverse=True)]
    label_to_rank = {label: rank + 1 for rank, label in enumerate(sorted_by_count)}
    return lambda x: label_to_rank.get(x, -1)

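# Example: rank_count_map(np.array([2, 2, 0, 1, 1, 1])) returns a mapper with
# 1 -> 1 (most frequent), 2 -> 2, 0 -> 3, and any unseen label (e.g. -1) -> -1.
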
def remove_moving(df, v):
    if not df.index.is_monotonic_increasing:
        df = df.sort_index()

    lat_lon_temp = pd.DataFrame()
    lat_lon_temp['_lat_before'] = df.double_latitude.shift()
    lat_lon_temp['_lat_after'] = df.double_latitude.shift(-1)
    lat_lon_temp['_lon_before'] = df.double_longitude.shift()
    lat_lon_temp['_lon_after'] = df.double_longitude.shift(-1)

    # speed over the two-sample window around each point, in km/h; keep rows slower than v
    distance = lat_lon_temp.apply(haversine, axis=1) / 1000  # km
    time = ((pd.to_datetime(df.reset_index().local_date_time.shift(-1), format="%Y-%m-%d %H:%M:%S") -
             pd.to_datetime(df.reset_index().local_date_time.shift(), format="%Y-%m-%d %H:%M:%S")) / np.timedelta64(1, 's')).fillna(-1) / (60. * 60)  # hours

    time.index = distance.index.copy()

    return df[(distance / time) < v]

def number_of_significant_places(locationData):
    uniquelst = locationData[locationData["location_label"] >= 1]["location_label"].unique()
    return len(uniquelst)

def number_location_transitions(locationData):
    # ignores transitions from moving to static and vice versa, but counts
    # transitions from outliers to major location clusters
    df = pd.DataFrame()
    df['boolCol'] = (locationData.location_label == locationData.location_label.shift())
    # the first row is always False because of the shift, hence the -1
    return df[df['boolCol'] == False].shape[0] - 1

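# Example: labels [1, 1, 2, 2, 1] yield number_location_transitions == 2
# (one 1 -> 2 transition and one 2 -> 1 transition).
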
def radius_of_gyration(locationData, sampling_frequency):
    if locationData is None or len(locationData) == 0:
        return None

    # the center is the centroid of the cluster centroids, not the home location
    valid_clusters = locationData[locationData["location_label"] != -1]
    clusters_centroid = valid_clusters.groupby('location_label')[['double_latitude','double_longitude']].mean()
    centroid_all_clusters = clusters_centroid.mean()

    rog = 0
    for labels in clusters_centroid.index:
        lat_lon_dict = dict()
        lat_lon_dict['_lon_before'] = clusters_centroid.loc[labels].double_longitude
        lat_lon_dict['_lat_before'] = clusters_centroid.loc[labels].double_latitude
        lat_lon_dict['_lon_after'] = centroid_all_clusters.double_longitude
        lat_lon_dict['_lat_after'] = centroid_all_clusters.double_latitude

        distance = haversine(lat_lon_dict) ** 2
        time_in_cluster = locationData[locationData["location_label"] == labels].shape[0] * sampling_frequency
        rog = rog + (time_in_cluster * distance)

    time_all_clusters = valid_clusters.shape[0] * sampling_frequency
    if time_all_clusters == 0:
        return 0
    final_rog = (1 / time_all_clusters) * rog
    return np.sqrt(final_rog)

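# radius_of_gyration implements sqrt((1 / T) * sum_c t_c * d_c^2), where t_c is the
# time spent in cluster c (rows * sampling_frequency), d_c is the haversine distance
# in meters from cluster c's centroid to the centroid of all cluster centroids, and
# T is the total time spent across all clusters.
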
def time_at_topn_clusters_in_group(locationData, n, sampling_frequency):
    # relevant only for global location features since top3_clusters == top3_clusters_in_group for local
    if locationData is None or len(locationData) == 0:
        return None

    locationData = locationData[locationData["location_label"] >= 1]  # remove outliers / cluster noise
    valcounts = locationData["location_label"].value_counts().to_dict()
    sorted_valcounts = sorted(valcounts.items(), key=lambda kv: (-kv[1], kv[0]))

    if len(sorted_valcounts) >= n:
        topn = sorted_valcounts[n - 1]
        topn_time = topn[1] * sampling_frequency
    else:
        topn_time = None

    return topn_time

def outliers_time_percent(locationData, sampling_frequency):
    if locationData is None or len(locationData) == 0:
        return None
    clusters = locationData["location_label"]
    numoutliers = (clusters == -1).sum() * sampling_frequency  # minutes labeled as DBSCAN noise
    numtotal = len(clusters) * sampling_frequency
    return float(numoutliers) / numtotal

def moving_time_percent(locationData):
    if locationData is None or len(locationData) == 0:
        return None
    lbls = locationData["location_label"]
    nummoving = lbls.isnull().sum()
    numtotal = len(lbls)
    return float(nummoving) / numtotal

def len_stay_at_clusters_in_minutes(locationData, sampling_frequency):
    if locationData is None or len(locationData) == 0:
        return (None, None, None, None)

    # walk the rows and accumulate the length of each consecutive stay at a cluster
    lenstays = []
    count = 0
    prev_loc_label = None
    for row in locationData.iterrows():
        cur_loc_label = row[1]["location_label"]
        if np.isnan(cur_loc_label):
            continue
        elif prev_loc_label is None:
            prev_loc_label = int(cur_loc_label)
            count += 1
        else:
            if prev_loc_label == int(cur_loc_label):
                count += 1
            else:
                lenstays.append(count)
                prev_loc_label = int(cur_loc_label)
                count = 1
    if count > 0:  # in case of no transition
        lenstays.append(count)
    lenstays = np.array(lenstays) * sampling_frequency

    if len(lenstays) > 0:
        smax = np.max(lenstays)
        smin = np.min(lenstays)
        sstd = np.std(lenstays)
        smean = np.mean(lenstays)
    else:
        smax = smin = sstd = smean = None

    return (smax, smin, sstd, smean)

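# Example: labels [1, 1, NaN, 2, 2, 2] with sampling_frequency = 1 minute produce
# stays of [2, 3] minutes, so (smax, smin, sstd, smean) == (3, 2, 0.5, 2.5).
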
def location_entropy(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    clusters = locationData[locationData["location_label"] >= 1]  # remove outliers / cluster noise
    if len(clusters) > 0:
        # get the fraction of samples spent at each location
        percents = clusters["location_label"].value_counts(normalize=True)
        entropy = -1 * percents.map(lambda x: x * np.log(x)).sum()
        return entropy
    else:
        return None

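# location_entropy is the Shannon entropy H = -sum_i p_i * ln(p_i), where p_i is the
# fraction of (stationary, clustered) samples the person spent at cluster i.
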
def location_entropy_normalized(locationData):
    if locationData is None or len(locationData) == 0:
        return None

    locationData = locationData[locationData["location_label"] >= 1]  # remove outliers / cluster noise
    entropy = location_entropy(locationData)
    unique_clusters = locationData["location_label"].unique()
    num_clusters = len(unique_clusters)

    if num_clusters == 0 or len(locationData) == 0 or entropy is None:
        return None
    else:
        return entropy / num_clusters

def getSamplingFrequency(locationData):
    # median time between consecutive samples, in minutes
    return ((pd.to_datetime(locationData['local_time'], format="%H:%M:%S") -
             pd.to_datetime(locationData['local_time'].shift(periods=1), format="%H:%M:%S")).apply(lambda x: x.total_seconds()) / 60).median()