diff --git a/Snakefile b/Snakefile index d55ee453..98c0c303 100644 --- a/Snakefile +++ b/Snakefile @@ -115,6 +115,16 @@ if config["CONVERSATION"]["COMPUTE"]: # TODO add files_to_compute.extend(optional_conversation_input(None)), the Android or iOS table gets processed depending on each participant files_to_compute.extend(expand("data/processed/{pid}/conversation_{day_segment}.csv",pid=config["PIDS"], day_segment = config["CONVERSATION"]["DAY_SEGMENTS"])) +if config["DORYAB_LOCATION"]["COMPUTE"]: + if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": + if config["DORYAB_LOCATION"]["DB_TABLE"] in config["PHONE_VALID_SENSED_BINS"]["TABLES"]: + files_to_compute.extend(expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"])) + else: + raise ValueError("Error: Add your locations table (and as many sensor tables as you have) to [PHONE_VALID_SENSED_BINS][TABLES] in config.yaml. This is necessary to compute phone_sensed_bins (bins of time when the smartphone was sensing data) which is used to resample fused location data (RESAMPLED_FUSED)") + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"])) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["DORYAB_LOCATION"]["DB_TABLE"])) + files_to_compute.extend(expand("data/processed/{pid}/location_doryab_{segment}.csv", pid=config["PIDS"], segment = config["DORYAB_LOCATION"]["DAY_SEGMENTS"])) + if config["PARAMS_FOR_ANALYSIS"]["COMPUTE"]: rows_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["ROWS_NAN_THRESHOLD"] cols_nan_threshold = config["PARAMS_FOR_ANALYSIS"]["COLS_NAN_THRESHOLD"] diff --git a/config.yaml b/config.yaml index df05a52b..409d622f 100644 --- a/config.yaml +++ b/config.yaml @@ -31,7 +31,7 @@ PHONE_VALID_SENSED_BINS: COMPUTE: False # This flag is automatically ignored (set to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features BIN_SIZE: 5 # (in minutes) # Add as many sensor tables as you have, they all improve the computation of PHONE_VALID_SENSED_BINS and PHONE_VALID_SENSED_DAYS. - # If you are extracting screen or Barnett's location features, screen and locations tables are mandatory. + # If you are extracting screen or Barnett's/Doryab's location features, screen and locations tables are mandatory. 
TABLES: [] PHONE_VALID_SENSED_DAYS: @@ -81,6 +81,16 @@ BARNETT_LOCATION: TIMEZONE: *timezone MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates gruped by minute) were used to compute features +DORYAB_LOCATION: + COMPUTE: True + DB_TABLE: locations + DAY_SEGMENTS: *day_segments + FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"] + LOCATIONS_TO_USE: RESAMPLE_FUSED # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED + DBSCAN_EPS: 10 # meters + DBSCAN_MINSAMPLES: 5 + THRESHOLD_STATIC : 1 # km/h + BLUETOOTH: COMPUTE: False DB_TABLE: bluetooth diff --git a/docs/features/extracted.rst b/docs/features/extracted.rst index 48f6366a..83c9ae9a 100644 --- a/docs/features/extracted.rst +++ b/docs/features/extracted.rst @@ -52,7 +52,7 @@ Global Parameters - ``PHONE_VALID_SENSED_BINS`` Contains three attributes: ``COMPUTE``, ``BIN_SIZE`` and ``TABLES``. See the PHONE_VALID_SENSED_BINS_ section in the ``config.yaml`` file - Set the ``COMPUTE`` flag to True if you want to get this file (``data/interim/{pid}/phone_sensed_bins``). Phone valid sensed bins is a matrix of days x bins where we divide every hour of every day into N bins of size ``BIN_SIZE`` (in minutes). Each bin contains the number of rows that were recorded in that interval by all the sensors listed in ``TABLES``. Add as many sensor tables to ``TABLES`` as you have in your database because valid sensed bins are used to compute ``PHONE_VALID_SENSED_DAYS``, the ``episodepersensedminutes`` feature of :ref:`Screen` and to resample fused location data if you configure Barnett's location features to use ``RESAMPLE_FUSED``. + Set the ``COMPUTE`` flag to True if you want to get this file (``data/interim/{pid}/phone_sensed_bins``). Phone valid sensed bins is a matrix of days x bins where we divide every hour of every day into N bins of size ``BIN_SIZE`` (in minutes). Each bin contains the number of rows that were recorded in that interval by all the sensors listed in ``TABLES``. Add as many sensor tables to ``TABLES`` as you have in your database because valid sensed bins are used to compute ``PHONE_VALID_SENSED_DAYS``, the ``episodepersensedminutes`` feature of :ref:`Screen` and to resample fused location data if you configure Barnett's/Doryab's location features to use ``RESAMPLE_FUSED``. The ``COMPUTE`` flag is automatically ignored (set internally to True) if you are extracting PHONE_VALID_SENSED_DAYS or screen or Barnett's location features. @@ -632,6 +632,86 @@ In RAPIDS we only expose two parameters for these features (timezone and accurac Significant locations are determined using K-means clustering on pauses longer than 10 minutes. The number of clusters (K) is increased until no two clusters are within 400 meters from each other. After this, pauses within a certain range of a cluster (200 meters by default) will count as a visit to that significant location. This description was adapted from the Supplementary Materials of https://doi.org/10.1093/biostatistics/kxy059. +*The Circadian Calculation* + +For a detailed description of how this is calculated, see Canzian, L., & Musolesi, M. (2015, September). 
Trajectories of depression: unobtrusive monitoring of depressive states by means of smartphone mobility traces analysis. In Proceedings of the 2015 ACM international joint conference on pervasive and ubiquitous computing (pp. 1293-1304). Their procedure was followed using 30-min increments as a bin size. Taken from `Beiwe Summary Statistics`_. + + +Location (Doryab) Features +"""""""""""""""""""""""""""""" +Doryab location features are based on the research paper https://arxiv.org/pdf/1812.10394.pdf + +See `Location (Doryab) Config Code`_ + +**Available Epochs (day_segment):** daily, morning, afternoon, evening, night + +**Available Platforms:** Android and iOS + +**Snakemake rule chain:** + +- Rule ``rules/preprocessing.snakefile/download_dataset`` +- Rule ``rules/preprocessing.snakefile/readable_datetime`` +- Rule ``rules/preprocessing.snakefile/phone_sensed_bins`` +- Rule ``rules/preprocessing.snakefile/resample_fused_location`` (only relevant if ``location_to_use`` is set to ``RESAMPLE_FUSED``) +- Rule ``rules/features.snakefile/location_doryab_features`` + +.. _location-parameters: + +**Location Rule Parameters (location_doryab_features):** + +================= =================== +Name Description +================= =================== +day_segment The particular ``day_segment`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night``. +location_to_use *Read the Observations section below*. This specifies what type of location data will be used in the analysis. Possible options are ``ALL``, ``ALL_EXCEPT_FUSED`` or ``RESAMPLE_FUSED``. +features Features to be computed, see table below. +threshold_static Threshold speed in km/hr at or above which a row is labeled as Moving (otherwise Static). +dbscan_minsamples The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. +dbscan_eps The maximum distance in meters between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. +================= =================== + +.. _location-available-features: + +**Available Location Features** + +================ ========= ============= +Name Units Description +================ ========= ============= +locationvariance The sum of the variance of the latitude and longitude features. +loglocationvariance Log of the sum of the variance of the latitude and longitude features. +totaldistance meters Total distance travelled in a day_segment, calculated using the haversine formula. +averagespeed km/hr Average speed of a person in a day_segment considering only the instances labeled as Moving. +varspeed km/hr Variance of the speed of a person in a day_segment considering only the instances labeled as Moving. +circadianmovement A continuous metric quantifying a person’s circadian routine. +numberofsignificantplaces Number of significant places visited. It is calculated using the DBSCAN clustering algorithm, which takes EPS and MIN_SAMPLES as parameters to identify clusters. Each cluster is a significant place. +numberlocationtransitions Number of movements from one cluster to another in a day_segment. +radiusgyration The Radius of Gyration (rog) is a measure in meters of the area covered by a person over a day.
A centroid is calculated for all the places (pauses) visited during a day and a weighted distance between all the places and that centroid is computed. The weights are proportional to the time spent in each place. +timeattop1location minutes Time spent at the most significant location. +timeattop2location minutes Time spent at the 2nd most significant location. +timeattop3location minutes Time spent at the 3rd most significant location. +movingtostaticratio Ratio of time spent in Moving versus Static instances. +outlierstimepercent Percentage of time spent at irrelevant clusters (outliers) in a day_segment. +maxlengthstayatclusters minutes Maximum time spent in a cluster (significant location). +minlengthstayatclusters minutes Minimum time spent in a cluster (significant location). +meanlengthstayatclusters minutes Average time spent in a cluster (significant location). +stdlengthstayatclusters minutes Standard deviation of time spent in a cluster (significant location). +locationentropy Shannon entropy of the proportion of time spent at each significant location in a day_segment. +normalizedlocationentropy Location entropy divided by the number of significant locations visited in a day_segment. +================ ========= ============= + +**Assumptions/Observations:** + +*Types of location data to use* + +Aware Android and iOS clients can collect location coordinates through the phone's GPS or Google's fused location API. If your Aware client was ONLY configured to use GPS, set ``location_to_use`` to ``ALL``. If your client was configured to use BOTH GPS and fused location, you can use ``ALL`` or set ``location_to_use`` to ``ALL_EXCEPT_FUSED`` to ignore fused coordinates. If your client was configured to use fused location only, set ``location_to_use`` to ``RESAMPLE_FUSED``. ``RESAMPLE_FUSED`` takes the original fused location coordinates and replicates each pair forward in time as long as the phone was sensing data as indicated by ``phone_sensed_bins`` (see :ref:`Phone valid sensed days `); this is done because Google's API only logs a new location coordinate pair when it is sufficiently different from the previous one. + +There are two parameters associated with resampling fused location in the ``RESAMPLE_FUSED_LOCATION`` section of the ``config.yaml`` file. ``CONSECUTIVE_THRESHOLD`` (in minutes, default 30) controls the maximum gap between any two coordinate pairs across which the last known pair is replicated (for example, if participant A's phone did not collect data between 10:30am and 10:50am and between 11:05am and 11:40am, the last known coordinate pair will be replicated during the first period but not the second; in other words, we assume that we can no longer guarantee the participant stayed at the last known location if the phone did not sense data for more than 30 minutes). ``TIME_SINCE_VALID_LOCATION`` (in minutes, default 720 or 12 hours) ensures the last known fused location won't be carried over longer than this threshold even if the phone was sensing data continuously (for example, if participant A went home at 9pm and their phone was sensing data without gaps until 11am the next morning, the last known location will only be replicated until 9am). If you have suggestions to modify or improve this imputation, let us know. + +*Significant Locations Identified* + +(i.e., the clustering method used) +Significant locations are determined using DBSCAN clustering on the locations that a participant visits over the course of the data collection period. + +*The Circadian Calculation* + +For a detailed description of how this is calculated, see Canzian, L., & Musolesi, M. (2015, September). Trajectories of depression: unobtrusive monitoring of depressive states by means of smartphone mobility traces analysis.
In Proceedings of the 2015 ACM international joint conference on pervasive and ubiquitous computing (pp. 1293-1304). Their procedure was followed using 30-min increments as a bin size. Taken from `Beiwe Summary Statistics`_. diff --git a/rules/features.snakefile b/rules/features.snakefile index 741802cc..e8de9167 100644 --- a/rules/features.snakefile +++ b/rules/features.snakefile @@ -38,6 +38,12 @@ def optional_location_input(wildcards): else: return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BARNETT_LOCATION"]["DB_TABLE"]) +def optional_location_doryab_input(wildcards): + if config["DORYAB_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED": + return expand("data/raw/{{pid}}/{sensor}_resampled.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"]) + else: + return expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["DORYAB_LOCATION"]["DB_TABLE"]) + def optional_steps_sleep_input(wildcards): if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED": return "data/raw/{pid}/fitbit_sleep_summary_with_datetime.csv" @@ -115,6 +121,20 @@ rule location_barnett_features: script: "../src/features/location_barnett_features.R" +rule location_doryab_features: + input: + locations = optional_location_doryab_input + params: + features = config["DORYAB_LOCATION"]["FEATURES"], + day_segment = "{day_segment}", + dbscan_eps = config["DORYAB_LOCATION"]["DBSCAN_EPS"], + dbscan_minsamples = config["DORYAB_LOCATION"]["DBSCAN_MINSAMPLES"], + threshold_static = config["DORYAB_LOCATION"]["THRESHOLD_STATIC"] + output: + "data/processed/{pid}/location_doryab_{day_segment}.csv" + script: + "../src/features/location_doryab_features.py" + rule bluetooth_features: input: expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"]) diff --git a/src/data/resample_fused_location.R b/src/data/resample_fused_location.R index 7210adaf..ccff8a05 100644 --- a/src/data/resample_fused_location.R +++ b/src/data/resample_fused_location.R @@ -41,7 +41,12 @@ if(nrow(locations) > 0){ local_date_time = format(utc_date_time, tz = timezone, usetz = F)) %>% separate(local_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>% separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>% - mutate(local_hour = as.numeric(local_hour), local_minute = as.numeric(local_minute)) %>% + mutate(local_hour = as.numeric(local_hour), + local_minute = as.numeric(local_minute), + local_day_segment = case_when(local_hour %in% 0:5 ~ "night", + local_hour %in% 6:11 ~ "morning", + local_hour %in% 12:17 ~ "afternoon", + local_hour %in% 18:23 ~ "evening")) %>% # Delete resampled rows that exist in the same minute as other original (fused) rows group_by(local_date, local_hour, local_minute) %>% mutate(n = n()) %>% diff --git a/src/features/location_doryab/location_base.py b/src/features/location_doryab/location_base.py new file mode 100644 index 00000000..6e220dc2 --- /dev/null +++ b/src/features/location_doryab/location_base.py @@ -0,0 +1,419 @@ +import pandas as pd +import numpy as np +from astropy.timeseries import LombScargle +from sklearn.cluster import DBSCAN +from math import radians, cos, sin, asin, sqrt + +def base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples, threshold_static): + # name of the features this function can compute + base_features_names = 
["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"] + + # the subset of requested features this function can compute + features_to_compute = list(set(requested_features) & set(base_features_names)) + + + if location_data.empty: + location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute]) + else: + if day_segment != "daily": + location_data = location_data[location_data["local_day_segment"] == day_segment] + + if location_data.empty: + location_features = pd.DataFrame(columns=["local_date"] + ["location_" + day_segment + "_" + x for x in features_to_compute]) + else: + location_features = pd.DataFrame() + + location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)] + + if "locationvariance" in features_to_compute: + location_features["location_" + day_segment + "_locationvariance"] = location_data.groupby(['local_date'])['double_latitude'].var() + location_data.groupby(['local_date'])['double_longitude'].var() + + if "loglocationvariance" in features_to_compute: + location_features["location_" + day_segment + "_loglocationvariance"] = np.log10(location_data.groupby(['local_date'])['double_latitude'].var() + location_data.groupby(['local_date'])['double_longitude'].var()) + + + preComputedDistanceandSpeed = pd.DataFrame() + for localDate in location_data['local_date'].unique(): + distance, speeddf = get_all_travel_distances_meters_speed(location_data[location_data['local_date']==localDate],threshold_static) + preComputedDistanceandSpeed.loc[localDate,"distance"] = distance.sum() + preComputedDistanceandSpeed.loc[localDate,"avgspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].mean() + preComputedDistanceandSpeed.loc[localDate,"varspeed"] = speeddf[speeddf['speedTag'] == 'Moving']['speed'].var() + + if "totaldistance" in features_to_compute: + for localDate in location_data['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"] + + if "averagespeed" in features_to_compute: + for localDate in location_data['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"] + + if "varspeed" in features_to_compute: + for localDate in location_data['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"] + + if "circadianmovement" in features_to_compute: + for localDate in location_data['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_circadianmovement"] = circadian_movement(location_data[location_data['local_date']==localDate]) + + newLocationData = cluster_and_label(location_data, eps= distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples) + + if "numberofsignificantplaces" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_numberofsignificantplaces"] = 
number_of_significant_places(newLocationData[newLocationData['local_date']==localDate]) + + if "numberlocationtransitions" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_date']==localDate]) + + if "radiusgyration" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_date']==localDate]) + + if "timeattop1location" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],1) + + if "timeattop2location" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],2) + + if "timeattop3location" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],3) + + if "movingtostaticratio" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_movingtostaticratio"] = (newLocationData[newLocationData['local_date']==localDate].shape[0] / location_data[location_data['local_date']==localDate].shape[0]) + + if "outlierstimepercent" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_date']==localDate]) + + preComputedmaxminCluster = pd.DataFrame() + for localDate in newLocationData['local_date'].unique(): + smax, smin, sstd,smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_date']==localDate]) + preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] = smax + preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] = smin + preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] = sstd + preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] = smean + + if "maxlengthstayatclusters" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] + + if "minlengthstayatclusters" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] + + if "stdlengthstayatclusters" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + 
"_stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] + + if "meanlengthstayatclusters" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] + + if "locationentropy" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_locationentropy"] = location_entropy(newLocationData[newLocationData['local_date']==localDate]) + + if "normalizedlocationentropy" in features_to_compute: + for localDate in newLocationData['local_date'].unique(): + location_features.loc[localDate,"location_" + day_segment + "_normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_date']==localDate]) + + location_features = location_features.reset_index() + + return location_features + +def distance_to_degrees(d): + #Just an approximation, but speeds up clustering by a huge amount and doesnt introduce much error + #over small distances + d = d / 1852 + d = d / 60 + return d + +def get_all_travel_distances_meters_speed(locationData,threshold): + + lat_lon_temp = pd.DataFrame() + + lat_lon_temp['_lat_before'] = locationData.double_latitude + lat_lon_temp['_lat_after'] = locationData.double_latitude.shift(-1) + lat_lon_temp['_lon_before'] = locationData.double_longitude + lat_lon_temp['_lon_after'] = locationData.double_longitude.shift(-1) + lat_lon_temp['time_before'] = pd.to_datetime(locationData['local_time'], format="%H:%M:%S") + lat_lon_temp['time_after'] = lat_lon_temp['time_before'].shift(-1) + lat_lon_temp['time_diff'] = lat_lon_temp['time_after'] - lat_lon_temp['time_before'] + lat_lon_temp['timeInSeconds'] = lat_lon_temp['time_diff'].apply(lambda x: x.total_seconds()) + + lat_lon_temp = lat_lon_temp[lat_lon_temp['timeInSeconds'] <= 300] + lat_lon_temp['distances'] = lat_lon_temp.apply(haversine, axis=1) # meters + lat_lon_temp['speed'] = (lat_lon_temp['distances'] / lat_lon_temp['timeInSeconds'] ) + lat_lon_temp['speed'] = lat_lon_temp['speed'].replace(np.inf, np.nan) * 3.6 + distances = lat_lon_temp['distances'] + lat_lon_temp = lat_lon_temp.dropna() + lat_lon_temp['speedTag'] = np.where(lat_lon_temp['speed'] >= threshold,"Moving","Static") + + return distances,lat_lon_temp[['speed','speedTag']] + + +def vincenty_row(x): + """ + :param x: A row from a dataframe + :return: The distance in meters between + """ + + try: + return vincenty((x['_lat_before'], x['_lon_before']),(x['_lat_after'], x['_lon_after'])).meters + + except: + return 0 + +def haversine(x): + """ + Calculate the great circle distance between two points + on the earth (specified in decimal degrees) + """ + # convert decimal degrees to radians + lon1, lat1, lon2, lat2 = x['_lon_before'], x['_lat_before'],x['_lon_after'], x['_lat_after'] + lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) + + # haversine formula + dlon = lon2 - lon1 + dlat = lat2 - lat1 + a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 + c = 2 * asin(sqrt(a)) + r = 6371 # Radius of earth in kilometers. 
Use 3956 for miles + return c * r* 1000 + + +def circadian_movement_energies(locationData): + time = (locationData["timestamp"].values / 1000.0) # seconds + ylat = locationData["double_latitude"].values + ylong = locationData["double_longitude"].values + hours_intervals = np.arange(23.5, 24.51, 0.01) # hours + seconds_intervals = hours_intervals * 60 * 60 # seconds + frequency = 1 / seconds_intervals + + power_latitude = LombScargle(time, ylat).power(frequency=frequency, normalization='psd') + power_longitude = LombScargle(time, ylong).power(frequency=frequency, normalization='psd') + + energy_latitude = np.sum(power_latitude) + energy_longitude = np.sum(power_longitude) + return (energy_latitude, energy_longitude) + +def circadian_movement(locationData): + + energy_latitude, energy_longitude = circadian_movement_energies(locationData) + return np.log10(energy_latitude + energy_longitude) + +def cluster_and_label(df,**kwargs): + """ + + :param df: a df with columns "latitude", "longitude", and "datetime" + or + a df with comlumns "latitude","longitude" and a datetime index + :param kwargs: arguments for sklearn's DBSCAN + :return: a new df of labeled locations with moving points removed, where the cluster + labeled as "1" is the largest, "2" the second largest, and so on + """ + location_data = df + if not isinstance(df.index, pd.DatetimeIndex): + location_data = df.set_index("local_date_time") + + stationary = remove_moving(location_data,1) + + #return degrees(arcminutes=nautical(meters= d)) + #nautical miles = m ÷ 1,852 + clusterer = DBSCAN(**kwargs) + + counts_df = stationary[["double_latitude" ,"double_longitude"]].groupby(["double_latitude" ,"double_longitude"]).size().reset_index() + counts = counts_df[0] + lat_lon = counts_df[["double_latitude","double_longitude"]].values + cluster_results = clusterer.fit_predict(lat_lon, sample_weight= counts) + + #Need to extend labels back to original df without weights + counts_df["location_label"] = cluster_results + # remove the old count column + del counts_df[0] + + merged = pd.merge(stationary,counts_df, on = ["double_latitude" ,"double_longitude"]) + + #Now compute the label mapping: + cluster_results = merged["location_label"].values + valid_clusters = cluster_results[np.where(cluster_results != -1)] + label_map = rank_count_map(valid_clusters) + + #And remap the labels: + merged.index = stationary.index + stationary["location_label"] = merged["location_label"].map(label_map) + + return stationary + +def rank_count_map(clusters): + """ Returns a function which will map each element of a list 'l' to its rank, + such that the most common element maps to 1 + + Is used in this context to sort the cluster labels so that cluster with rank 1 is the most + visited. 
+ + If return_dict, return a mapping dict rather than a function + + If a function, if the value can't be found label as -1 + + """ + labels, counts = tuple(np.unique(clusters, return_counts = True)) + sorted_by_count = [x for (y,x) in sorted(zip(counts, labels), reverse = True)] + label_to_rank = {label : rank + 1 for (label, rank) in [(sorted_by_count[i],i) for i in range(len(sorted_by_count))]} + + return lambda x: label_to_rank.get(x, -1) + + +def remove_moving(df, v): + + if not df.index.is_monotonic: + df = df.sort_index() + + lat_lon_temp = pd.DataFrame() + + lat_lon_temp['_lat_before'] = df.double_latitude.shift() + lat_lon_temp['_lat_after'] = df.double_latitude.shift(-1) + lat_lon_temp['_lon_before'] = df.double_longitude.shift() + lat_lon_temp['_lon_after'] = df.double_longitude.shift(-1) + + # + distance = lat_lon_temp.apply( haversine, axis = 1) / 1000 + time = (pd.to_datetime(df.reset_index().local_date_time.shift(-1),format="%Y-%m-%d %H:%M:%S") - pd.to_datetime(df.reset_index().local_date_time.shift(),format="%Y-%m-%d %H:%M:%S")).fillna(-1) / np.timedelta64(1,'s') / (60.*60) + time.index = distance.index.copy() + + return df[(distance / time) < v] + +def number_of_significant_places(locationData): + + uniquelst = locationData[locationData["location_label"] >= 1]["location_label"].unique() + return len(uniquelst) + +def number_location_transitions(locationData): + + # ignores transitions from moving to static and vice-versa, but counts transitions from outliers to major location clusters + df = pd.DataFrame() + + df['boolCol'] = (locationData.location_label == locationData.location_label.shift()) + + return df[df['boolCol'] == False].shape[0] - 1 + +def radius_of_gyration(locationData): + if locationData is None or len(locationData) == 0: + return None + # Center is the centroid, not the home location + valid_clusters = locationData[locationData["location_label"] != -1] + changes_selector = (valid_clusters["location_label"].shift() != valid_clusters["location_label"]) + mobility_trace = valid_clusters[changes_selector] + + # Average x,y + lat_lon = mobility_trace[["double_latitude", "double_longitude"]].values + center = np.average(lat_lon, axis=0) + norm = np.linalg.norm(lat_lon - center) + if len(lat_lon) != 0: + return np.sqrt(norm) / len(lat_lon) + else: + return 0 + +def time_at_topn_clusters_in_group(locationData,n): # relevant only for global location features since, top3_clusters = top3_clusters_in_group for local + + if locationData is None or len(locationData) == 0: + return None + + locationData = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise + valcounts = locationData["location_label"].value_counts().to_dict() + sorted_valcounts = sorted(valcounts.items(), key=lambda kv: (-kv[1], kv[0])) + + if len(sorted_valcounts) >= n: + topn = sorted_valcounts[n-1] + topn_time = topn[1] + else: + topn_time = None + + return topn_time + +def outliers_time_percent(locationData): + if locationData is None or len(locationData) == 0: + return None + clusters = locationData["location_label"] + numoutliers = clusters[(clusters == -1)].sum() + numtotal = len(clusters) + return (float(-1*numoutliers) / numtotal) + + +def moving_time_percent(locationData): + if locationData is None or len(locationData) == 0: + return None + lbls = locationData["location_label"] + nummoving = lbls.isnull().sum() + numtotal = len(lbls) + # print (nummoving) + # print(numtotal) + return (float(nummoving) / numtotal) + +def len_stay_at_clusters_in_minutes(locationData): 
+ if locationData is None or len(locationData) == 0: + return (None, None, None,None) + + lenstays = [] + count = 0 + prev_loc_label = None + for row in locationData.iterrows(): + cur_loc_label = row[1]["location_label"] + if np.isnan(cur_loc_label): + continue + elif prev_loc_label == None: + prev_loc_label = int(cur_loc_label) + count += 1 + else: + if prev_loc_label == int(cur_loc_label): + count += 1 + else: + lenstays.append(count) + prev_loc_label = int(cur_loc_label) + count = 0 + 1 + if count > 0: # in case of no transition + lenstays.append(count) + lenstays = np.array(lenstays) + + if len(lenstays) > 0: + smax = np.max(lenstays) + smin = np.min(lenstays) + sstd = np.std(lenstays) + smean = np.mean(lenstays) + else: + smax = None + smin = None + sstd = None + smean = None + return (smax, smin, sstd, smean) + + +def location_entropy(locationData): + if locationData is None or len(locationData) == 0: + return None + + clusters = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise + if len(clusters) > 0: + # Get percentages for each location + percents = clusters["location_label"].value_counts(normalize=True) + entropy = -1 * percents.map(lambda x: x * np.log(x)).sum() + return entropy + else: + return None + +def location_entropy_normalized(locationData): + if locationData is None or len(locationData) == 0: + return None + + locationData = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise + entropy = location_entropy(locationData) + unique_clusters = locationData["location_label"].unique() + num_clusters = len(unique_clusters) + if num_clusters == 0 or len(locationData) == 0 or entropy is None: + return None + else: + return entropy / num_clusters \ No newline at end of file diff --git a/src/features/location_doryab_features.py b/src/features/location_doryab_features.py new file mode 100644 index 00000000..2a78e98e --- /dev/null +++ b/src/features/location_doryab_features.py @@ -0,0 +1,16 @@ +import pandas as pd +from location_doryab.location_base import base_location_features + +location_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time", "local_date"]) +day_segment = snakemake.params["day_segment"] +requested_features = snakemake.params["features"] +location_features = pd.DataFrame(columns=["local_date"]) +dbscan_eps = snakemake.params["dbscan_eps"] +dbscan_minsamples = snakemake.params["dbscan_minsamples"] +threshold_static = snakemake.params["threshold_static"] + +location_features = location_features.merge(base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples,threshold_static), on="local_date", how="outer") + +assert len(requested_features) + 1 == location_features.shape[1], "The number of features in the output dataframe (=" + str(location_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your location feature extraction functions" + +location_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file
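
Note on the DBSCAN parameters documented above: ``DBSCAN_EPS`` is given in meters in ``config.yaml`` and converted to degrees by ``distance_to_degrees()`` before clustering in ``cluster_and_label()``. The following standalone sketch (not part of the pipeline; the coordinates are made up and ``min_samples`` is lowered only so the tiny sample forms a cluster) shows how that conversion and the clustering fit together::

    import pandas as pd
    from sklearn.cluster import DBSCAN

    def distance_to_degrees(d):
        # meters -> nautical miles -> arc minutes -> degrees (same approximation as location_base.py)
        return d / 1852 / 60

    # Hypothetical stationary coordinates: three fixes a few meters apart and one far away
    coords = pd.DataFrame({
        "double_latitude":  [40.44430, 40.44433, 40.44431, 40.45710],
        "double_longitude": [-79.96080, -79.96081, -79.96079, -79.92960],
    })

    # DBSCAN_EPS: 10 (meters) as in config.yaml; min_samples lowered to 2 for this tiny sample
    clusterer = DBSCAN(eps=distance_to_degrees(10), min_samples=2)
    labels = clusterer.fit_predict(coords[["double_latitude", "double_longitude"]].values)
    print(labels)  # -1 marks outliers/noise; 0, 1, ... are significant places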
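
The ``locationentropy`` and ``normalizedlocationentropy`` features follow ``location_entropy()`` and ``location_entropy_normalized()`` in ``location_base.py``: Shannon entropy (natural log) over the proportion of rows spent at each significant place, optionally divided by the number of places. A minimal sketch with a hypothetical ``location_label`` series (outliers already removed)::

    import numpy as np
    import pandas as pd

    labels = pd.Series([1, 1, 1, 2, 2, 3])                    # cluster label per location row
    percents = labels.value_counts(normalize=True)            # share of rows per significant place
    entropy = -(percents * np.log(percents)).sum()            # Shannon entropy, ~1.011 here
    normalized_entropy = entropy / labels.nunique()           # divided by the number of clusters, ~0.337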
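
The fused-location resampling thresholds described in the docs live in the ``RESAMPLE_FUSED_LOCATION`` section of ``config.yaml`` and are applied by ``src/data/resample_fused_location.R``. The Python snippet below is only an illustration of the carry-forward rule as described in the documentation paragraph (it is not the actual R implementation)::

    CONSECUTIVE_THRESHOLD = 30        # minutes: maximum gap across which the last known pair is replicated
    TIME_SINCE_VALID_LOCATION = 720   # minutes: never carry a fix forward for more than 12 hours

    def carry_forward(gap_minutes, minutes_since_last_fix):
        # Replicate the last known coordinate pair only if both conditions hold
        return (gap_minutes <= CONSECUTIVE_THRESHOLD
                and minutes_since_last_fix <= TIME_SINCE_VALID_LOCATION)

    print(carry_forward(20, 60))    # True: short gap, recent fix
    print(carry_forward(35, 60))    # False: phone silent for more than 30 minutes
    print(carry_forward(20, 800))   # False: last fix is older than 12 hours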