Merge branch 'feature/location_doryab_home_location' into develop

JulioV 2021-02-24 17:51:30 -05:00
commit 0b57b80e54
9 changed files with 192 additions and 5 deletions


@@ -212,6 +212,7 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))


@@ -237,10 +237,16 @@ PHONE_LOCATIONS:
  LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED
  FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
  FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row

  HOME_INFERENCE:
    DBSCAN_EPS: 10 # meters
    DBSCAN_MINSAMPLES: 5
    THRESHOLD_STATIC: 1 # km/h
    CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS

  PROVIDERS:
    DORYAB:
      COMPUTE: False
      FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
      FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"]
      ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
      DBSCAN_EPS: 10 # meters
      DBSCAN_MINSAMPLES: 5
@@ -250,6 +256,7 @@ PHONE_LOCATIONS:
      MINUTES_DATA_USED: False
      CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET, TIME_SEGMENT
      CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS
      RADIUS_FOR_HOME: 100
      SRC_FOLDER: "doryab" # inside src/features/phone_locations
      SRC_LANGUAGE: "python"


@@ -5,6 +5,7 @@
- Add logo
- Move Citation page to the Setup section
- Add `config.yaml` validation schema and documentation.
- Add time at home Doryab location feature and home coordinates to the location file
## v0.4.3
- Fix bug when any of the rows from any sensor do not belong to a time segment
## v0.4.2


@@ -89,6 +89,7 @@ These features are based on the original implementation by [Doryab et al.](../..
- data/raw/{pid}/phone_locations_raw.csv
- data/interim/{pid}/phone_locations_processed.csv
- data/interim/{pid}/phone_locations_processed_with_datetime.csv
- data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv
- data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv
- data/processed/features/{pid}/phone_locations.csv
```
@@ -110,6 +111,7 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
| `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data; see the sketch after this table). This parameter impacts all the time calculations. |
| `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings). |
| `[CLUSTERING_ALGORITHM]` | The original Doryab et al. implementation uses `DBSCAN`; `OPTICS` is also available with similar (but not identical) clustering results and lower memory consumption. |
| `[RADIUS_FOR_HOME]` | All location coordinates within this distance (meters) from the home location coordinates are considered a home stay (see the `timeathome` feature). |
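A minimal sketch of the median-based inference described for `[SAMPLING_FREQUENCY]`, using made-up timestamps (the provider's internal implementation may differ):

```python
import pandas as pd

# four rows one minute apart, then a long gap; timestamps in milliseconds
timestamps = pd.Series([0, 60_000, 120_000, 180_000, 600_000])
sampling_frequency = timestamps.diff().median() / (1000 * 60)  # minutes
print(sampling_frequency)  # 1.0: the long gap does not skew the median
```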
Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
@@ -136,6 +138,7 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
|stdlengthstayatclusters |minutes |Standard deviation of time spent in a cluster (significant location). |
|locationentropy |nats |Shannon entropy computed over the row count of each cluster (significant location); it will be higher the more rows belong to a cluster (i.e. the more time a participant spent at a significant location). |
|normalizedlocationentropy |nats |Shannon entropy computed over the row count of each cluster (significant location) divided by the number of clusters; it will be higher the more rows belong to a cluster (i.e. the more time a participant spent at a significant location). |
|timeathome |minutes |Time spent at home (see Observations below for a description of how we compute home). |
!!! note "Assumptions/Observations"
@@ -150,3 +153,6 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
    **Duration Calculation**
    To calculate the time duration component for our features, we compute the difference between the timestamps of consecutive rows to take into account sampling-rate variability. If this time difference is larger than a threshold (300 seconds by default), we replace it with a maximum duration (60 seconds by default, i.e. we assume a participant spent at least 60 seconds in their last known location).
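A minimal sketch of that duration rule with made-up timestamps (column names are illustrative, not the pipeline's):

```python
import pandas as pd

maximum_gap_allowed = 300  # seconds
maximum_row_duration = 60  # seconds

rows = pd.DataFrame({"timestamp": [0, 30_000, 90_000, 500_000]})  # milliseconds
duration = rows["timestamp"].diff().shift(-1) / 1000  # seconds until the next row
duration = duration.mask(duration > maximum_gap_allowed, maximum_row_duration)
print(duration.tolist())  # [30.0, 60.0, 60.0, nan]: the 410 s gap is capped at 60 s
```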
    **Home location**
    Home is inferred from all of a participant's location data logged between 12 am and 6 am: we apply a clustering algorithm (`DBSCAN` or `OPTICS`) to those rows and take the center of the biggest cluster as the home coordinates for that participant.
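The committed `src/data/infer_home_location.py` (below) is the authoritative implementation; as a self-contained illustration with toy coordinates, the same idea looks like this:

```python
import pandas as pd
from sklearn.cluster import DBSCAN

# three night-time fixes near one spot plus one far-away outlier (toy data)
night = pd.DataFrame({
    "double_latitude":  [40.4440, 40.4441, 40.4442, 40.5000],
    "double_longitude": [-79.9450, -79.9451, -79.9449, -79.9000],
})
night["location_label"] = DBSCAN(eps=0.0005, min_samples=2).fit_predict(
    night[["double_latitude", "double_longitude"]])
clustered = night[night["location_label"] != -1]
biggest = clustered["location_label"].value_counts().idxmax()
home = clustered[clustered["location_label"] == biggest][["double_latitude", "double_longitude"]].mean()
print(home)  # center of the biggest cluster = home coordinates
```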


@@ -368,7 +368,7 @@ rule phone_light_r_features:
rule phone_locations_python_features:
    input:
        sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv",
        sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv",
        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
    params:
        provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()],


@@ -139,6 +139,19 @@ rule phone_locations_processed_with_datetime:
    script:
        "../src/data/readable_datetime.R"

rule phone_locations_processed_with_datetime_with_home:
    input:
        sensor_input = "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
    params:
        dbscan_eps = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_EPS"],
        dbscan_minsamples = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_MINSAMPLES"],
        threshold_static = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["THRESHOLD_STATIC"],
        clustering_algorithm = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["CLUSTERING_ALGORITHM"]
    output:
        "data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv"
    script:
        "../src/data/infer_home_location.py"

rule resample_episodes:
    input:
        "data/interim/{pid}/{sensor}_episodes.csv"


@@ -0,0 +1,137 @@
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN, OPTICS


def filterDatafromDf(origDf):
    # Keep only rows logged between midnight and 6 am (local time)
    return origDf[origDf["local_hour"] <= 6]

def distance_to_degrees(d):
    # Just an approximation, but speeds up clustering by a huge amount and
    # doesn't introduce much error over small distances:
    # meters -> nautical miles -> degrees
    d = d / 1852
    d = d / 60
    return d

def cluster_and_label(df, clustering_algorithm, threshold_static, **kwargs):
    """
    :param df: a df with columns "double_latitude", "double_longitude", and "local_date_time"
               or a df with columns "double_latitude", "double_longitude" and a datetime index
    :param kwargs: arguments for sklearn's DBSCAN/OPTICS
    :return: a new df of labeled locations, where the cluster labeled "1" is the
             most visited, "2" the second most visited, and so on
    """
    if df.empty:
        return df

    location_data = df
    if not isinstance(df.index, pd.DatetimeIndex):
        location_data = df.set_index("local_date_time")

    stationary = mark_moving(location_data, threshold_static)

    # Cluster unique (lat, lon) pairs, weighting each pair by its row count
    counts_df = stationary[["double_latitude", "double_longitude"]].groupby(["double_latitude", "double_longitude"]).size().reset_index()
    counts = counts_df[0]
    lat_lon = counts_df[["double_latitude", "double_longitude"]].values

    if clustering_algorithm == "DBSCAN":
        clusterer = DBSCAN(**kwargs)
        cluster_results = clusterer.fit_predict(lat_lon, sample_weight=counts)
    else:
        clusterer = OPTICS(**kwargs)
        cluster_results = clusterer.fit_predict(lat_lon)

    # Extend the labels back to the original df without weights
    counts_df["location_label"] = cluster_results
    del counts_df[0]  # remove the old count column
    merged = pd.merge(stationary, counts_df, on=["double_latitude", "double_longitude"])

    # Compute the label mapping and remap the labels so that cluster "1" is
    # the most visited, "2" the second most visited, and so on
    cluster_results = merged["location_label"].values
    valid_clusters = cluster_results[np.where(cluster_results != -1)]
    label_map = rank_count_map(valid_clusters)

    merged.index = stationary.index
    stationary = stationary.assign(location_label=merged["location_label"].map(label_map).values)
    return stationary

def rank_count_map(clusters):
    """Returns a function which maps each cluster label to its rank by count,
    so that the most common label maps to 1, the second most common to 2, and
    so on; labels that can't be found map to -1.
    """
    labels, counts = tuple(np.unique(clusters, return_counts=True))
    sorted_by_count = [x for (y, x) in sorted(zip(counts, labels), reverse=True)]
    label_to_rank = {label: rank + 1 for (rank, label) in enumerate(sorted_by_count)}
    return lambda x: label_to_rank.get(x, -1)

def mark_moving(df, threshold_static):
    # Flag each row as stationary (1) or moving (0) by comparing the speed to
    # the next row against threshold_static (km/h)
    if not df.index.is_monotonic_increasing:
        df = df.sort_index()
    distance = haversine(df.double_longitude, df.double_latitude, df.double_longitude.shift(-1), df.double_latitude.shift(-1)) / 1000  # km
    time = (df.timestamp.diff(-1) * -1) / (1000 * 60 * 60)  # hours
    df["stationary_or_not"] = np.where((distance / time) < threshold_static, 1, 0)
    return df

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great-circle distance in meters between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2])
    # haversine formula
    a = np.sin((lat2 - lat1) / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2
    r = 6371  # radius of earth in kilometers. Use 3956 for miles
    return r * 2 * np.arcsin(np.sqrt(a)) * 1000


# Infer a participant's home location
origDf = pd.read_csv(snakemake.input[0])
filteredDf = filterDatafromDf(origDf)

dbscan_eps = snakemake.params["dbscan_eps"]
dbscan_minsamples = snakemake.params["dbscan_minsamples"]
threshold_static = snakemake.params["threshold_static"]
clustering_algorithm = snakemake.params["clustering_algorithm"]

if clustering_algorithm == "DBSCAN":
    hyperparameters = {"eps": distance_to_degrees(dbscan_eps), "min_samples": dbscan_minsamples}
elif clustering_algorithm == "OPTICS":
    hyperparameters = {"max_eps": distance_to_degrees(dbscan_eps), "min_samples": 2, "metric": "euclidean", "cluster_method": "dbscan"}
else:
    raise ValueError(f"config[PHONE_LOCATIONS][HOME_INFERENCE][CLUSTERING_ALGORITHM] only accepts DBSCAN or OPTICS but you provided {clustering_algorithm}")

filteredDf = cluster_and_label(filteredDf, clustering_algorithm, threshold_static, **hyperparameters)

# Home coordinates = center of the most visited night-time cluster
home_coordinates = filteredDf[filteredDf["location_label"] == 1][["double_latitude", "double_longitude"]].mean()
origDf["home_latitude"] = home_coordinates["double_latitude"]
origDf["home_longitude"] = home_coordinates["double_longitude"]

distanceFromHome = haversine(origDf.double_longitude, origDf.double_latitude, origDf.home_longitude, origDf.home_latitude)

finalDf = origDf.drop(["home_latitude", "home_longitude"], axis=1)
finalDf.insert(len(finalDf.columns) - 1, "distancefromhome", distanceFromHome)
finalDf.to_csv(snakemake.output[0], index=False)


@@ -17,13 +17,14 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
    maximum_row_duration = provider["MAXIMUM_ROW_DURATION"]
    cluster_on = provider["CLUSTER_ON"]
    clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
    radius_from_home = provider["RADIUS_FOR_HOME"]

    minutes_data_used = provider["MINUTES_DATA_USED"]
    if minutes_data_used:
        requested_features.append("minutesdataused")

    # names of the features this function can compute
    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused"]
    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused","timeathome"]

    # the subset of requested features this function can compute
    features_to_compute = list(set(requested_features) & set(base_features_names))
@@ -170,6 +171,11 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
        for localDate in stationaryLocations['local_segment'].unique():
            location_features.loc[localDate,"normalizedlocationentropy"] = location_entropy_normalized(stationaryLocations[stationaryLocations['local_segment']==localDate])

    if "timeathome" in features_to_compute:
        # cap rows whose gap to the next fix reached maximum_gap_allowed at
        # maximum_row_duration, then sum the minutes spent within
        # radius_from_home meters of the home coordinates per time segment
        calculationDf = stationaryLocations[['local_segment','distancefromhome','timeInSeconds']].copy()
        calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed,'timeInSeconds'] = maximum_row_duration
        location_features["timeathome"] = calculationDf[calculationDf["distancefromhome"] <= radius_from_home].groupby("local_segment")["timeInSeconds"].sum()/60

    location_features = location_features.reset_index()

    return location_features
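A worked toy example of the `timeathome` aggregation above (all values made up): rows whose duration reaches `maximum_gap_allowed` are capped at `maximum_row_duration`, and only rows within `radius_from_home` meters of home count toward the sum:

```python
import pandas as pd

maximum_gap_allowed, maximum_row_duration, radius_from_home = 300, 60, 100

calculationDf = pd.DataFrame({
    "local_segment":    ["daily#2021-02-24"] * 3,
    "distancefromhome": [12.0, 250.0, 30.0],  # meters
    "timeInSeconds":    [60, 60, 400],
})
calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed, "timeInSeconds"] = maximum_row_duration
timeathome = (calculationDf[calculationDf["distancefromhome"] <= radius_from_home]
              .groupby("local_segment")["timeInSeconds"].sum() / 60)
print(timeathome)  # daily#2021-02-24    2.0  -> (60 + 60) / 60 minutes at home
```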


@@ -598,6 +598,22 @@ properties:
      FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION:
        type: integer
        exclusiveMinimum: 0
      HOME_INFERENCE:
        type: object
        required: [DBSCAN_EPS, DBSCAN_MINSAMPLES, THRESHOLD_STATIC, CLUSTERING_ALGORITHM]
        properties:
          DBSCAN_EPS:
            type: integer
            exclusiveMinimum: 0
          DBSCAN_MINSAMPLES:
            type: integer
            exclusiveMinimum: 0
          THRESHOLD_STATIC:
            type: integer
            exclusiveMinimum: 0
          CLUSTERING_ALGORITHM:
            type: string
            enum: ["DBSCAN", "OPTICS"]
      PROVIDERS:
        type: ["null", object]
        properties:
@@ -610,7 +626,7 @@ properties:
              uniqueItems: True
              items:
                type: string
                enum: [locationvariance,loglocationvariance,totaldistance,averagespeed,varspeed,circadianmovement,numberofsignificantplaces,numberlocationtransitions,radiusgyration,timeattop1location,timeattop2location,timeattop3location,movingtostaticratio,outlierstimepercent,maxlengthstayatclusters,minlengthstayatclusters,meanlengthstayatclusters,stdlengthstayatclusters,locationentropy,normalizedlocationentropy]
                enum: [locationvariance,loglocationvariance,totaldistance,averagespeed,varspeed,circadianmovement,numberofsignificantplaces,numberlocationtransitions,radiusgyration,timeattop1location,timeattop2location,timeattop3location,movingtostaticratio,outlierstimepercent,maxlengthstayatclusters,minlengthstayatclusters,meanlengthstayatclusters,stdlengthstayatclusters,locationentropy,normalizedlocationentropy,timeathome]
            ACCURACY_LIMIT:
              type: integer
              exclusiveMinimum: 0
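As a hedged sketch of how a schema like this can be exercised (the `jsonschema` library and the file paths are assumptions here, not necessarily RAPIDS' actual validation entry point):

```python
import yaml
from jsonschema import Draft7Validator

# assumed paths: the schema shown above and the project's config.yaml
with open("tools/config.schema.yaml") as f:
    schema = yaml.safe_load(f)
with open("config.yaml") as f:
    config = yaml.safe_load(f)

# report every violation, e.g. a THRESHOLD_STATIC of 0 fails exclusiveMinimum
for error in Draft7Validator(schema).iter_errors(config):
    print("/".join(map(str, error.absolute_path)), "->", error.message)
```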