Added Doryab's location features to RAPIDS.

Co-authored-by: JulioV <juliovhz@gmail.com>
pull/95/head
nikunjgoel95 2020-07-16 14:26:43 -04:00
parent dfc0d73d15
commit 024ae1a549
7 changed files with 563 additions and 3 deletions


@@ -115,6 +115,16 @@ if config["CONVERSATION"]["COMPUTE"]:
# TODO add files_to_compute.extend(optional_conversation_input(None)), the Android or iOS table gets processed depending on each participant
files_to_compute.extend(expand("data/processed/{pid}/conversation_{day_segment}.csv",pid=config["PIDS"], day_segment = config["CONVERSATION"]["DAY_SEGMENTS"]))
if config["DORYAB_LOCATION"]["COMPUTE"]:
if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
if config["DORYAB_LOCATION"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["TABLES"]:
files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]))
else:
raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data), which is used to resample fused location data (RESAMPLE_FUSED)")
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"]))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"]))
files_to_compute.extend(expand("data/processed/{pid}/location_doryab_{segment}.csv", pid=config["PIDS"], segment = config["DORYAB_LOCATION"]["DAY_SEGMENTS"]))
if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]:
rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"]
cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"]


@@ -31,7 +31,7 @@ PHONE_VALID_SENSED_BINS:
COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features
BIN_SIZE: 5 # (in minutes)
# Add as many sensor tables as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS.
# If you are extracting screen or Barnett's location features, screen and locations tables are mandatory.
# If you are extracting screen or Barnett's/Doryab's location features, screen and locations tables are mandatory.
TABLES: []
PHONE_VALID_SENSED_DAYS:
@@ -81,6 +81,16 @@ BARNETT_LOCATION:
TIMEZONE: *timezone
MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
DORYAB_LOCATION:
COMPUTE: True
DB_TABLE: locations
DAY_SEGMENTS: *day_segments
FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
LOCATIONS_TO_USE: RESAMPLE_FUSED # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED
DBSCAN_EPS: 10 # meters
DBSCAN_MINSAMPLES: 5
THRESHOLD_STATIC : 1 # km/h
BLUETOOTH:
COMPUTE: False
DB_TABLE: bluetooth


@ -52,7 +52,7 @@ Global Parameters
- ``PHONE_VALID_SENSED_BINS``
Contains three attributes: ``COMPUTE``, ``BIN_SIZE`` and ``TABLES``. See the PHONE_VALID_SENSED_BINS_ section in the ``config.yaml`` file
Set the ``COMPUTE`` flag to True if you want to get this file (``data/interim/{pid}/phone_sensed_bins``). Phone valid sensed bins is a matrix of days x bins where we divide every hour of every day into N bins of size ``BIN_SIZE`` (in minutes). Each bin contains the number of rows that were recorded in that interval by all the sensors listed in ``TABLES``. Add as many sensor tables to ``TABLES`` as you have in your database because valid sensed bins are used to compute ``PHONE_VALID_SENSED_DAYS``, the ``episodepersensedminutes`` feature of :ref:`Screen<screen-sensor-doc>` and to resample fused location data if you configure Barnett's location features to use ``RESAMPLE_FUSED``.
Set the ``COMPUTE`` flag to True if you want to get this file (``data/interim/{pid}/phone_sensed_bins``). Phone valid sensed bins is a matrix of days x bins where we divide every hour of every day into N bins of size ``BIN_SIZE`` (in minutes). Each bin contains the number of rows that were recorded in that interval by all the sensors listed in ``TABLES``. Add as many sensor tables to ``TABLES`` as you have in your database because valid sensed bins are used to compute ``PHONE_VALID_SENSED_DAYS``, the ``episodepersensedminutes`` feature of :ref:`Screen<screen-sensor-doc>` and to resample fused location data if you configure Barnett's/Doryab's location features to use ``RESAMPLE_FUSED``.
The ``COMPUTE`` flag is automatically ignored (set internally to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features.
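As a rough illustration (not part of this change set): with ``BIN_SIZE: 5`` every day is split into ``24 * 60 / 5 = 288`` bins. A minimal pandas sketch of the counting step, assuming a sensor table that already has ``local_date``, ``local_hour`` and ``local_minute`` columns (the function name below is illustrative)::

    import pandas as pd

    BIN_SIZE = 5  # minutes, as in config.yaml

    def sensed_bins_matrix(sensor_data: pd.DataFrame) -> pd.DataFrame:
        # index of the bin within the day: 0 .. (24 * 60 / BIN_SIZE) - 1
        bin_id = (sensor_data["local_hour"] * 60 + sensor_data["local_minute"]) // BIN_SIZE
        # days x bins matrix; each cell counts the rows recorded in that interval
        return (sensor_data.assign(bin=bin_id)
                .pivot_table(index="local_date", columns="bin", aggfunc="size", fill_value=0))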
@@ -632,6 +632,86 @@ In RAPIDS we only expose two parameters for these features (timezone and accuracy)
Significant locations are determined using K-means clustering on pauses longer than 10 minutes. The number of clusters (K) is increased until no two clusters are within 400 meters from each other. After this, pauses within a certain range of a cluster (200 meters by default) will count as a visit to that significant location. This description was adapted from the Supplementary Materials of https://doi.org/10.1093/biostatistics/kxy059.
*The Circadian Calculation*
For a detailed description of how this is calculated, see Canzian, L., & Musolesi, M. (2015, September). Trajectories of depression: unobtrusive monitoring of depressive states by means of smartphone mobility traces analysis. In Proceedings of the 2015 ACM international joint conference on pervasive and ubiquitous computing (pp. 1293-1304). Their procedure was followed using 30-min increments as a bin size. Taken from `Beiwe Summary Statistics`_.
Location (Doryab) Features
""""""""""""""""""""""""""""""
Doryab location features are based on the research paper https://arxiv.org/pdf/1812.10394.pdf
See `Location (Doryab) Config Code`_
**Available Epochs (day_segment):** daily, morning, afternoon, evening, night
**Available Platforms:** Android and iOS
**Snakemake rule chain:**
- Rule ``rules/preprocessing.snakefile/download_dataset``
- Rule ``rules/preprocessing.snakefile/readable_datetime``
- Rule ``rules/preprocessing.snakefile/phone_sensed_bins``
- Rule ``rules/preprocessing.snakefile/resample_fused_location`` (only relevant if ``location_to_use`` is set to ``RESAMPLE_FUSED``).
- Rule ``rules/features.snakefile/location_doryab_features``
.. _location-parameters:
**Location Rule Parameters (location_doryab_features):**
================= ===================
Name Description
================= ===================
day_segment The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``
location_to_use *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``ALL_EXCEPT_FUSED``, or ``RESAMPLE_FUSED``.
features Features to be computed, see table below.
threshold_static Speed threshold (in km/hr) used to label each row as Static or Moving.
dbscan_minsamples The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself.
dbscan_eps The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster, and it is the most important DBSCAN parameter to choose appropriately for your data set and distance function. It is specified in meters and converted to an approximate degree radius internally (see the sketch after this table).
================= ===================
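To make the two DBSCAN parameters concrete, here is a minimal sketch of how they are used by the ``cluster_and_label`` helper added in this commit: ``dbscan_eps`` is given in meters and converted to an approximate radius in degrees before clustering the stationary coordinates, with each unique coordinate pair weighted by how often it occurs (function and variable names below are illustrative)::

    import pandas as pd
    from sklearn.cluster import DBSCAN

    def meters_to_degrees(meters):
        # approximation used by the commit: meters -> nautical miles -> degrees
        return meters / 1852 / 60

    def label_significant_places(stationary: pd.DataFrame, eps_meters=10, min_samples=5):
        # stationary: rows with double_latitude / double_longitude after removing Moving points
        counts = (stationary.groupby(["double_latitude", "double_longitude"])
                            .size().reset_index(name="n"))
        labels = DBSCAN(eps=meters_to_degrees(eps_meters), min_samples=min_samples).fit_predict(
            counts[["double_latitude", "double_longitude"]].values, sample_weight=counts["n"])
        counts["location_label"] = labels  # -1 marks outliers (irrelevant clusters)
        return counts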
.. _location-available-features:
**Available Location Features**
================ ========= =============
Name Units Description
================ ========= =============
locationvariance The sum of the variance of the latitude and longitude features.
loglocationvariance Log of the sum of the variance of the latitude and longitude features.
totaldistance meters Total distance travelled in a day_segment, calculated using the haversine formula.
averagespeed km/hr Average speed of a person in a day_segment, considering only the instances labeled as Moving.
varspeed km/hr Variance of the speed of a person in a day_segment, considering only the instances labeled as Moving.
circadianmovement A continuous metric quantifying a person's circadian routine (the regularity of their 24-hour mobility pattern).
numberofsignificantplaces Number of significant places visited. It is calculated using the DBSCAN clustering algorithm, which takes EPS and MIN_SAMPLES as parameters to identify clusters. Each cluster is a significant place.
numberlocationtransitions Number of movements from one cluster to another in a day_segment.
radiusgyration The Radius of Gyration (rog) is a measure in meters of the area covered by a person over a day. A centroid is calculated for all the places (pauses) visited during a day and a weighted distance between all the places and that centroid is computed. The weights are proportional to the time spent in each place.
timeattop1location minutes Time spent at the most significant location.
timeattop2location minutes Time spent at the 2nd most significant location.
timeattop3location minutes Time spent at the 3rd most significant location.
movingtostaticratio Ratio of time spent Moving versus Static.
outlierstimepercent Proportion of time spent at irrelevant clusters (outliers, DBSCAN label -1) in a day_segment.
maxlengthstayatclusters minutes Maximum time spent in a cluster (significant location).
minlengthstayatclusters minutes Minimum time spent in a cluster (significant location).
meanlengthstayatclusters minutes Average time spent in a cluster (significant location).
stdlengthstayatclusters minutes Standard deviation of time spent in a cluster (significant location).
locationentropy Shannon entropy of the proportion of time spent at each significant location (see the sketch after this table).
normalizedlocationentropy Location entropy divided by the number of significant locations visited.
================ ========= =============
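A minimal sketch of the two entropy features, mirroring ``location_entropy`` and ``location_entropy_normalized`` added in this commit (``labels`` stands in for one day's per-row cluster labels)::

    import numpy as np
    import pandas as pd

    def location_entropy(labels: pd.Series) -> float:
        # proportion of rows (i.e., sensed time) spent at each significant location (labels >= 1)
        p = labels[labels >= 1].value_counts(normalize=True)
        return float(-(p * np.log(p)).sum())

    def normalized_location_entropy(labels: pd.Series) -> float:
        clusters = labels[labels >= 1]
        return location_entropy(clusters) / clusters.nunique()

    labels = pd.Series([1, 1, 1, 2, 2, 3])  # e.g. home, home, home, work, work, gym
    location_entropy(labels)                # ~1.01 (natural log)
    normalized_location_entropy(labels)     # ~0.34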
**Assumptions/Observations:**
*Types of location data to use*
Aware Android and iOS clients can collect location coordinates through the phone's GPS or Google's fused location API. If your Aware client was ONLY configured to use GPS, set ``location_to_use`` to ``ALL``. If your client was configured to use BOTH GPS and fused location, you can use ``ALL``, or set ``location_to_use`` to ``ALL_EXCEPT_FUSED`` to ignore fused coordinates. If your client was configured to use fused location only, set ``location_to_use`` to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data, as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days <phone-valid-sensed-days>`). This is done because Google's API only logs a new location coordinate pair when it is sufficiently different from the previous one.
There are two parameters associated with resampling fused location in the ``RESAMPLE_FUSED_LOCATION`` section of the ``config.yaml`` file. ``CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs across which the last known pair is replicated. For example, if participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second; in other words, we assume we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes. ``TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) caps how long the last known fused location is carried over, even if the phone was sensing data continuously. For example, if participant A went home at 9pm and their phone sensed data without gaps until 11am the next morning, the last known location will only be replicated until 9am. If you have suggestions to modify or improve this imputation, let us know.
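The actual resampling is done by the existing ``resample_fused_location`` rule; the following is only a simplified Python sketch of how the two thresholds interact, assuming one row per sensed minute with coordinates present only where the fused API logged a new pair (all names below are illustrative)::

    import pandas as pd

    CONSECUTIVE_THRESHOLD = 30       # minutes
    TIME_SINCE_VALID_LOCATION = 720  # minutes (12 hours)

    def resample_fused(sensed: pd.DataFrame) -> pd.DataFrame:
        out, last, prev_minute = [], None, None
        for _, row in sensed.iterrows():
            gap = 0 if prev_minute is None else row["minute"] - prev_minute
            if pd.notna(row["double_latitude"]):
                last = row[["minute", "double_latitude", "double_longitude"]]
            elif last is not None and gap <= CONSECUTIVE_THRESHOLD \
                    and row["minute"] - last["minute"] <= TIME_SINCE_VALID_LOCATION:
                row = row.copy()  # replicate the last known coordinate pair
                row[["double_latitude", "double_longitude"]] = last[["double_latitude", "double_longitude"]]
            else:
                last = None  # gap too long: stop carrying the last known pair forward
            prev_minute = row["minute"]
            out.append(row)
        return pd.DataFrame(out)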
*Significant Locations Identified*
(i.e. The clustering method used)
Significant locations are determined by applying DBSCAN clustering to the locations a participant visited over the course of the data collection period.
*The Circadian Calculation*
For a detailed description of how this is calculated, see Canzian, L., & Musolesi, M. (2015, September). Trajectories of depression: unobtrusive monitoring of depressive states by means of smartphone mobility traces analysis. In Proceedings of the 2015 ACM international joint conference on pervasive and ubiquitous computing (pp. 1293-1304). Their procedure was followed using 30-min increments as a bin size. Taken from `Beiwe Summary Statistics`_.
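In ``circadian_movement_energies`` (added by this commit), the calculation amounts to evaluating a Lomb-Scargle periodogram of the latitude and longitude traces at frequencies corresponding to periods between 23.5 and 24.5 hours and summing the resulting power:

.. math::

   E_{lat} = \sum_{f \in F} P_{lat}(f), \qquad E_{lon} = \sum_{f \in F} P_{lon}(f), \qquad
   \text{circadianmovement} = \log_{10}(E_{lat} + E_{lon})

where :math:`F = \{1/T : T \in [23.5\,h, 24.5\,h]\}` (sampled in 0.01-hour steps) and :math:`P` is the Lomb-Scargle power spectral density of the coordinate time series.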


@@ -38,6 +38,12 @@ def optional_location_input(wildcards):
else:
return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"])
def optional_location_doryab_input(wildcards):
if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
else:
return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"])
def optional_steps_sleep_input(wildcards):
if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED":
return "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv"
@@ -115,6 +121,20 @@ rule location_barnett_features:
script:
"../src/features/location_barnett_features.R"
rule location_doryab_features:
input:
locations = optional_location_doryab_input
params:
features = config["DORYAB_LOCATION"]["FEATURES"],
day_segment = "{day_segment}",
dbscan_eps = config["DORYAB_LOCATION"]["DBSCAN_EPS"],
dbscan_minsamples = config["DORYAB_LOCATION"]["DBSCAN_MINSAMPLES"],
threshold_static = config["DORYAB_LOCATION"]["THRESHOLD_STATIC"]
output:
"data/processed/{pid}/location_doryab_{day_segment}.csv"
script:
"../src/features/location_doryab_features.py"
rule bluetooth_features:
input:
expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])


@@ -41,7 +41,12 @@ if(nrow(locations) > 0){
local_date_time = format(utc_date_time, tz = timezone, usetz = F)) %>%
separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
mutate(local_hour = as.numeric(local_hour), local_minute = as.numeric(local_minute)) %>%
mutate(local_hour = as.numeric(local_hour),
local_minute = as.numeric(local_minute),
local_day_segment = case_when(local_hour %in% 0:5 ~ "night",
local_hour %in% 6:11 ~ "morning",
local_hour %in% 12:17 ~ "afternoon",
local_hour %in% 18:23 ~ "evening")) %>%
# Delete resampled rows that exist in the same minute as other original (fused) rows
group_by(local_date, local_hour, local_minute) %>%
mutate(n = n()) %>%


@@ -0,0 +1,419 @@
import pandas as pd
import numpy as np
from astropy.timeseries import LombScargle
from sklearn.cluster import DBSCAN
from math import radians, cos, sin, asin, sqrt
def base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples, threshold_static):
# name of the features this function can compute
base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
# the subset of requested features this function can compute
features_to_compute = list(set(requested_features) & set(base_features_names))
if location_data.empty:
location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute])
else:
if day_segment != "daily":
location_data = location_data[location_data["local_day_segment"] == day_segment]
if location_data.empty:
location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute])
else:
location_features = pd.DataFrame()
location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)]
if "locationvariance" in features_to_compute:
location_features["location_" + day_segment + "_locationvariance"] = location_data.groupby(['local_date'])['double_latitude'].var() + location_data.groupby(['local_date'])['double_longitude'].var()
if "loglocationvariance" in features_to_compute:
location_features["location_" + day_segment + "_loglocationvariance"] = np.log10(location_data.groupby(['local_date'])['double_latitude'].var() + location_data.groupby(['local_date'])['double_longitude'].var())
preComputedDistanceandSpeed = pd.DataFrame()
for localDate in location_data['local_date'].unique():
distance, speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_date']==localDate],threshold_static)
preComputedDistanceandSpeed.loc[localDate,"distance"] = distance.sum()
preComputedDistanceandSpeed.loc[localDate,"avgspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].mean()
preComputedDistanceandSpeed.loc[localDate,"varspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].var()
if "totaldistance" in features_to_compute:
for localDate in location_data['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"]
if "averagespeed" in features_to_compute:
for localDate in location_data['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"]
if "varspeed" in features_to_compute:
for localDate in location_data['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"]
if "circadianmovement" in features_to_compute:
for localDate in location_data['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_circadianmovement"] = circadian_movement(location_data[location_data['local_date']==localDate])
newLocationData = cluster_and_label(location_data, eps= distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)
if "numberofsignificantplaces" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_date']==localDate])
if "numberlocationtransitions" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_date']==localDate])
if "radiusgyration" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_date']==localDate])
if "timeattop1location" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_timeattop1location"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],1)
if "timeattop2location" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_timeattop2location"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],2)
if "timeattop3location" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_timeattop3location"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],3)
if "movingtostaticratio" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_movingtostaticratio"] = (newLocationData[newLocationData['local_date']==localDate].shape[0] / location_data[location_data['local_date']==localDate].shape[0])
if "outlierstimepercent" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_date']==localDate])
preComputedmaxminCluster = pd.DataFrame()
for localDate in newLocationData['local_date'].unique():
smax, smin, sstd,smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_date']==localDate])
preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] = smax
preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] = smin
preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] = sstd
preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] = smean
if "maxlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"]
if "minlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"]
if "stdlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"]
if "meanlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"]
if "locationentropy" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_locationentropy"] = location_entropy(newLocationData[newLocationData['local_date']==localDate])
if "normalizedlocationentropy" in features_to_compute:
for localDate in newLocationData['local_date'].unique():
location_features.loc[localDate,"location_" + day_segment + "_normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_date']==localDate])
location_features = location_features.reset_index()
return location_features
def distance_to_degrees(d):
#Just an approximation, but speeds up clustering by a huge amount and doesn't introduce much error
#over small distances
d = d / 1852 # meters -> nautical miles
d = d / 60 # nautical miles -> degrees (1 degree of latitude is roughly 60 nautical miles)
return d
def get_all_travel_distances_meters_speed(locationData,threshold):
lat_lon_temp = pd.DataFrame()
lat_lon_temp['_lat_before'] = locationData.double_latitude
lat_lon_temp['_lat_after'] = locationData.double_latitude.shift(-1)
lat_lon_temp['_lon_before'] = locationData.double_longitude
lat_lon_temp['_lon_after'] = locationData.double_longitude.shift(-1)
lat_lon_temp['time_before'] = pd.to_datetime(locationData['local_time'], format="%H:%M:%S")
lat_lon_temp['time_after'] = lat_lon_temp['time_before'].shift(-1)
lat_lon_temp['time_diff'] = lat_lon_temp['time_after'] - lat_lon_temp['time_before']
lat_lon_temp['timeInSeconds'] = lat_lon_temp['time_diff'].apply(lambda x: x.total_seconds())
lat_lon_temp = lat_lon_temp[lat_lon_temp['timeInSeconds'] <= 300]
lat_lon_temp['distances'] = lat_lon_temp.apply(haversine, axis=1) # meters
lat_lon_temp['speed'] = (lat_lon_temp['distances'] / lat_lon_temp['timeInSeconds'] ) # meters per second
lat_lon_temp['speed'] = lat_lon_temp['speed'].replace(np.inf, np.nan) * 3.6 # convert m/s to km/h
distances = lat_lon_temp['distances']
lat_lon_temp = lat_lon_temp.dropna()
lat_lon_temp['speedTag'] = np.where(lat_lon_temp['speed'] >= threshold,"Moving","Static")
return distances,lat_lon_temp[['speed','speedTag']]
def vincenty_row(x):
"""
:param x: A row from a dataframe with _lat_before/_lon_before and _lat_after/_lon_after columns
:return: The distance in meters between the two coordinate pairs
"""
try:
# relies on geopy's vincenty (not imported in this file); this helper is unused, haversine() below is used instead
return vincenty((x['_lat_before'], x['_lon_before']),(x['_lat_after'], x['_lon_after'])).meters
except:
return 0
def haversine(x):
"""
Calculate the great circle distance between two points
on the earth (specified in decimal degrees)
"""
# convert decimal degrees to radians
lon1, lat1, lon2, lat2 = x['_lon_before'], x['_lat_before'],x['_lon_after'], x['_lat_after']
lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
# haversine formula
dlon = lon2 - lon1
dlat = lat2 - lat1
a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
c = 2 * asin(sqrt(a))
r = 6371 # Radius of earth in kilometers. Use 3956 for miles
return c * r* 1000
def circadian_movement_energies(locationData):
time = (locationData["timestamp"].values / 1000.0) # seconds
ylat = locationData["double_latitude"].values
ylong = locationData["double_longitude"].values
hours_intervals = np.arange(23.5, 24.51, 0.01) # hours
seconds_intervals = hours_intervals * 60 * 60 # seconds
frequency = 1 / seconds_intervals
power_latitude = LombScargle(time, ylat).power(frequency=frequency, normalization='psd')
power_longitude = LombScargle(time, ylong).power(frequency=frequency, normalization='psd')
energy_latitude = np.sum(power_latitude)
energy_longitude = np.sum(power_longitude)
return (energy_latitude, energy_longitude)
def circadian_movement(locationData):
energy_latitude, energy_longitude = circadian_movement_energies(locationData)
return np.log10(energy_latitude + energy_longitude)
def cluster_and_label(df,**kwargs):
"""
:param df: a df with columns "double_latitude", "double_longitude", and "local_date_time"
or
a df with columns "double_latitude", "double_longitude" and a datetime index
:param kwargs: arguments for sklearn's DBSCAN
:return: a new df of labeled locations with moving points removed, where the cluster
labeled as "1" is the largest, "2" the second largest, and so on
"""
location_data = df
if not isinstance(df.index, pd.DatetimeIndex):
location_data = df.set_index("local_date_time")
stationary = remove_moving(location_data,1)
#return degrees(arcminutes=nautical(meters= d))
#nautical miles = m ÷ 1,852
clusterer = DBSCAN(**kwargs)
counts_df = stationary[["double_latitude" ,"double_longitude"]].groupby(["double_latitude" ,"double_longitude"]).size().reset_index()
counts = counts_df[0]
lat_lon = counts_df[["double_latitude","double_longitude"]].values
cluster_results = clusterer.fit_predict(lat_lon, sample_weight= counts)
#Need to extend labels back to original df without weights
counts_df["location_label"] = cluster_results
# remove the old count column
del counts_df[0]
merged = pd.merge(stationary,counts_df, on = ["double_latitude" ,"double_longitude"])
#Now compute the label mapping:
cluster_results = merged["location_label"].values
valid_clusters = cluster_results[np.where(cluster_results != -1)]
label_map = rank_count_map(valid_clusters)
#And remap the labels:
merged.index = stationary.index
stationary["location_label"] = merged["location_label"].map(label_map)
return stationary
def rank_count_map(clusters):
""" Returns a function which will map each element of a list 'l' to its rank,
such that the most common element maps to 1
Is used in this context to sort the cluster labels so that cluster with rank 1 is the most
visited.
If return_dict, return a mapping dict rather than a function
If a function, if the value can't be found label as -1
"""
labels, counts = tuple(np.unique(clusters, return_counts = True))
sorted_by_count = [x for (y,x) in sorted(zip(counts, labels), reverse = True)]
label_to_rank = {label : rank + 1 for (label, rank) in [(sorted_by_count[i],i) for i in range(len(sorted_by_count))]}
return lambda x: label_to_rank.get(x, -1)
def remove_moving(df, v):
if not df.index.is_monotonic:
df = df.sort_index()
lat_lon_temp = pd.DataFrame()
lat_lon_temp['_lat_before'] = df.double_latitude.shift()
lat_lon_temp['_lat_after'] = df.double_latitude.shift(-1)
lat_lon_temp['_lon_before'] = df.double_longitude.shift()
lat_lon_temp['_lon_after'] = df.double_longitude.shift(-1)
#
distance = lat_lon_temp.apply( haversine, axis = 1) / 1000
time = (pd.to_datetime(df.reset_index().local_date_time.shift(-1),format="%Y-%m-%d %H:%M:%S") - pd.to_datetime(df.reset_index().local_date_time.shift(),format="%Y-%m-%d %H:%M:%S")).fillna(-1) / np.timedelta64(1,'s') / (60.*60)
time.index = distance.index.copy()
return df[(distance / time) < v]
def number_of_significant_places(locationData):
uniquelst = locationData[locationData["location_label"] >= 1]["location_label"].unique()
return len(uniquelst)
def number_location_transitions(locationData):
# ignores transitions from moving to static and vice-versa, but counts transitions from outliers to major location clusters
df = pd.DataFrame()
df['boolCol'] = (locationData.location_label == locationData.location_label.shift())
return df[df['boolCol'] == False].shape[0] - 1
def radius_of_gyration(locationData):
if locationData is None or len(locationData) == 0:
return None
# Center is the centroid, not the home location
valid_clusters = locationData[locationData["location_label"] != -1]
changes_selector = (valid_clusters["location_label"].shift() != valid_clusters["location_label"])
mobility_trace = valid_clusters[changes_selector]
# Average x,y
lat_lon = mobility_trace[["double_latitude", "double_longitude"]].values
center = np.average(lat_lon, axis=0)
norm = np.linalg.norm(lat_lon - center)
if len(lat_lon) != 0:
return np.sqrt(norm) / len(lat_lon)
else:
return 0
def time_at_topn_clusters_in_group(locationData,n): # relevant only for global location features since, for local features, top3_clusters = top3_clusters_in_group
if locationData is None or len(locationData) == 0:
return None
locationData = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise
valcounts = locationData["location_label"].value_counts().to_dict()
sorted_valcounts = sorted(valcounts.items(), key=lambda kv: (-kv[1], kv[0]))
if len(sorted_valcounts) >= n:
topn = sorted_valcounts[n-1]
topn_time = topn[1]
else:
topn_time = None
return topn_time
def outliers_time_percent(locationData):
if locationData is None or len(locationData) == 0:
return None
clusters = locationData["location_label"]
# DBSCAN labels outliers as -1, so this sum equals -1 * (number of outlier rows)
numoutliers = clusters[(clusters == -1)].sum()
numtotal = len(clusters)
return (float(-1*numoutliers) / numtotal)
def moving_time_percent(locationData):
if locationData is None or len(locationData) == 0:
return None
lbls = locationData["location_label"]
nummoving = lbls.isnull().sum()
numtotal = len(lbls)
# print (nummoving)
# print(numtotal)
return (float(nummoving) / numtotal)
def len_stay_at_clusters_in_minutes(locationData):
if locationData is None or len(locationData) == 0:
return (None, None, None,None)
lenstays = []
count = 0
prev_loc_label = None
for row in locationData.iterrows():
cur_loc_label = row[1]["location_label"]
if np.isnan(cur_loc_label):
continue
elif prev_loc_label is None:
prev_loc_label = int(cur_loc_label)
count += 1
else:
if prev_loc_label == int(cur_loc_label):
count += 1
else:
lenstays.append(count)
prev_loc_label = int(cur_loc_label)
count = 0 + 1
if count > 0: # in case of no transition
lenstays.append(count)
lenstays = np.array(lenstays)
if len(lenstays) > 0:
smax = np.max(lenstays)
smin = np.min(lenstays)
sstd = np.std(lenstays)
smean = np.mean(lenstays)
else:
smax = None
smin = None
sstd = None
smean = None
return (smax, smin, sstd, smean)
def location_entropy(locationData):
if locationData is None or len(locationData) == 0:
return None
clusters = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise
if len(clusters) > 0:
# Get percentages for each location
percents = clusters["location_label"].value_counts(normalize=True)
entropy = -1 * percents.map(lambda x: x * np.log(x)).sum()
return entropy
else:
return None
def location_entropy_normalized(locationData):
if locationData is None or len(locationData) == 0:
return None
locationData = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise
entropy = location_entropy(locationData)
unique_clusters = locationData["location_label"].unique()
num_clusters = len(unique_clusters)
if num_clusters == 0 or len(locationData) == 0 or entropy is None:
return None
else:
return entropy / num_clusters


@@ -0,0 +1,16 @@
import pandas as pd
from location_doryab.location_base import base_location_features
location_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time", "local_date"])
day_segment = snakemake.params["day_segment"]
requested_features = snakemake.params["features"]
location_features = pd.DataFrame(columns=["local_date"])
dbscan_eps = snakemake.params["dbscan_eps"]
dbscan_minsamples = snakemake.params["dbscan_minsamples"]
threshold_static = snakemake.params["threshold_static"]
location_features = location_features.merge(base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples,threshold_static), on="local_date", how="outer")
assert len(requested_features) + 1 == location_features.shape[1], "The number of features in the output dataframe (=" + str(location_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your location feature extraction functions"
location_features.to_csv(snakemake.output[0], index=False)