Added the timeathome feature, using infer_home_location.py to generate the interim file.
parent a16ebca563
commit 3d6caea6c4
@@ -212,6 +212,7 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
         files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
         files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
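A rough sketch (not from this commit) of what the added expand() call produces, assuming a hypothetical config["PIDS"] of ["p01", "p02"]:

    from snakemake.io import expand

    targets = expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=["p01", "p02"])
    print(targets)
    # ['data/interim/p01/phone_locations_processed_with_datetime_with_home.csv',
    #  'data/interim/p02/phone_locations_processed_with_datetime_with_home.csv']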
config.yaml
@@ -237,10 +237,16 @@ PHONE_LOCATIONS:
   LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED
   FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
   FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
+  HOME_INFERENCE:
+    DBSCAN_EPS: 10 # meters
+    DBSCAN_MINSAMPLES: 5
+    THRESHOLD_STATIC: 1 # km/h
+    CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS
+
   PROVIDERS:
     DORYAB:
       COMPUTE: False
-      FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
+      FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"]
       ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
       DBSCAN_EPS: 10 # meters
       DBSCAN_MINSAMPLES: 5
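A minimal sketch (not part of the commit) of how the new block is reached once config.yaml is loaded; it assumes PyYAML, which Snakemake already uses for its configuration files:

    import yaml

    with open("config.yaml") as f:
        config = yaml.safe_load(f)

    home_inference = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]
    print(home_inference["DBSCAN_EPS"], home_inference["CLUSTERING_ALGORITHM"])  # 10 DBSCAN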
@@ -249,7 +255,8 @@ PHONE_LOCATIONS:
       MAXIMUM_ROW_DURATION: 60
       MINUTES_DATA_USED: False
       CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET, TIME_SEGMENT
       CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS
+      RADIUS_FOR_HOME: 100
       SRC_FOLDER: "doryab" # inside src/features/phone_locations
       SRC_LANGUAGE: "python"
@@ -110,6 +110,7 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 | `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations. |
 | `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset, or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings). |
 | `[CLUSTERING_ALGORITHM]` | The original Doryab et al. implementation uses `DBSCAN`; `OPTICS` is also available, with similar (but not identical) clustering results and lower memory consumption. |
+| `[RADIUS_FOR_HOME]` | The maximum distance (in meters) from the center of the inferred home coordinates within which a location row is still counted as being at home. |

 Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
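A simplified sketch of the `SAMPLING_FREQUENCY: 0` behaviour described above, using hypothetical millisecond timestamps rather than pipeline data:

    import pandas as pd

    timestamps = pd.Series([0, 60_000, 120_000, 300_000, 360_000])   # ms, gaps of 1, 1, 3, 1 minutes
    sampling_frequency = timestamps.diff().median() / (1000 * 60)    # median gap, in minutes
    print(sampling_frequency)  # 1.0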
@@ -136,6 +137,7 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 |stdlengthstayatclusters |minutes |Standard deviation of time spent in a cluster (significant location). |
 |locationentropy |nats |Shannon entropy computed over the row count of each cluster (significant location); it will be higher the more rows belong to a cluster (i.e. the more time a participant spent at a significant location). |
 |normalizedlocationentropy |nats |Shannon entropy computed over the row count of each cluster (significant location) divided by the number of clusters; it will be higher the more rows belong to a cluster (i.e. the more time a participant spent at a significant location). |
+|timeathome |minutes |Time spent at home. Home is inferred by filtering the data collected between 12 am and 6 am, applying the clustering algorithm, and taking the center of the biggest cluster as the home coordinates. |

 !!! note "Assumptions/Observations"
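A small worked example of the two entropy features described above, assuming three clusters holding 60, 30 and 10 rows (hypothetical counts):

    import numpy as np

    counts = np.array([60, 30, 10])                              # rows per cluster
    p = counts / counts.sum()
    locationentropy = -(p * np.log(p)).sum()                     # ~0.898 nats
    normalizedlocationentropy = locationentropy / len(counts)    # ~0.299 nats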
@@ -368,7 +368,7 @@ rule phone_light_r_features:

 rule phone_locations_python_features:
     input:
-        sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv",
+        sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv",
         time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
     params:
         provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()],
@@ -139,6 +139,19 @@ rule phone_locations_processed_with_datetime:
     script:
         "../src/data/readable_datetime.R"

+rule phone_locations_processed_with_datetime_with_home:
+    input:
+        sensor_input = "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
+    params:
+        dbscan_eps = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_EPS"],
+        dbscan_minsamples = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_MINSAMPLES"],
+        threshold_static = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["THRESHOLD_STATIC"],
+        clustering_algorithm = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["CLUSTERING_ALGORITHM"]
+    output:
+        "data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv"
+    script:
+        "../src/data/infer_home_location.py"
+
 rule resample_episodes:
     input:
         "data/interim/{pid}/{sensor}_episodes.csv"
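A minimal sketch (not part of the commit) of running the new interim script outside Snakemake by faking the snakemake object it expects; the pid p01 is hypothetical, the paths and parameter names follow the rule above, and it assumes the working directory is the repository root:

    from types import SimpleNamespace

    snakemake = SimpleNamespace(
        input=["data/interim/p01/phone_locations_processed_with_datetime.csv"],
        params={"dbscan_eps": 10, "dbscan_minsamples": 5,
                "threshold_static": 1, "clustering_algorithm": "DBSCAN"},
        output=["data/interim/p01/phone_locations_processed_with_datetime_with_home.csv"],
    )
    exec(open("src/data/infer_home_location.py").read())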
@@ -0,0 +1,135 @@
+import pandas as pd
+import numpy as np
+from sklearn.cluster import DBSCAN, OPTICS
+from math import radians, cos, sin, asin, sqrt
+
+def filterDatafromDf(origDf):
+    return origDf[origDf['local_hour'] <= 6]
+
+def distance_to_degrees(d):
+    # Just an approximation, but speeds up clustering by a huge amount and doesn't introduce
+    # much error over small distances
+    d = d / 1852
+    d = d / 60
+    return d
+
+origDf = pd.read_csv(snakemake.input[0])
+filteredDf = filterDatafromDf(origDf)
+
+dbscan_eps = snakemake.params["dbscan_eps"]
+dbscan_minsamples = snakemake.params["dbscan_minsamples"]
+threshold_static = snakemake.params["threshold_static"]
+clustering_algorithm = snakemake.params["clustering_algorithm"]
+
+if clustering_algorithm == "DBSCAN":
+    hyperparameters = {'eps': distance_to_degrees(dbscan_eps), 'min_samples': dbscan_minsamples}
+elif clustering_algorithm == "OPTICS":
+    hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric': 'euclidean', 'cluster_method': 'dbscan'}
+else:
+    raise ValueError("config[PHONE_LOCATIONS][HOME_INFERENCE][CLUSTERING_ALGORITHM] only accepts DBSCAN or OPTICS but you provided ", clustering_algorithm)
+
+def cluster_and_label(df, clustering_algorithm, threshold_static, **kwargs):
+    """
+    :param df: a df with columns "latitude", "longitude", and "datetime"
+               or
+               a df with columns "latitude", "longitude" and a datetime index
+    :param kwargs: arguments for sklearn's DBSCAN
+    :return: a new df of labeled locations with moving points removed, where the cluster
+             labeled as "1" is the largest, "2" the second largest, and so on
+    """
+    if not df.empty:
+        location_data = df
+        if not isinstance(df.index, pd.DatetimeIndex):
+            location_data = df.set_index("local_date_time")
+
+        stationary = mark_moving(location_data, threshold_static)
+
+        counts_df = stationary[["double_latitude", "double_longitude"]].groupby(["double_latitude", "double_longitude"]).size().reset_index()
+        counts = counts_df[0]
+        lat_lon = counts_df[["double_latitude", "double_longitude"]].values
+
+        if clustering_algorithm == "DBSCAN":
+            clusterer = DBSCAN(**kwargs)
+            cluster_results = clusterer.fit_predict(lat_lon, sample_weight=counts)
+        else:
+            clusterer = OPTICS(**kwargs)
+            cluster_results = clusterer.fit_predict(lat_lon)
+
+        # Need to extend labels back to original df without weights
+        counts_df["location_label"] = cluster_results
+        # remove the old count column
+        del counts_df[0]
+
+        merged = pd.merge(stationary, counts_df, on=["double_latitude", "double_longitude"])
+
+        # Now compute the label mapping:
+        cluster_results = merged["location_label"].values
+        valid_clusters = cluster_results[np.where(cluster_results != -1)]
+        label_map = rank_count_map(valid_clusters)
+
+        # And remap the labels:
+        merged.index = stationary.index
+        stationary = stationary.assign(location_label=merged["location_label"].map(label_map).values)
+        stationary.loc[:, "location_label"] = merged["location_label"].map(label_map)
+
+        return stationary
+    else:
+        return df
+
+def rank_count_map(clusters):
+    """ Returns a function which will map each element of a list 'l' to its rank,
+    such that the most common element maps to 1.
+
+    Is used in this context to sort the cluster labels so that the cluster with rank 1 is the most
+    visited.
+
+    If a value can't be found, it is labeled as -1.
+    """
+    labels, counts = tuple(np.unique(clusters, return_counts=True))
+    sorted_by_count = [x for (y, x) in sorted(zip(counts, labels), reverse=True)]
+    label_to_rank = {label: rank + 1 for (label, rank) in [(sorted_by_count[i], i) for i in range(len(sorted_by_count))]}
+    return lambda x: label_to_rank.get(x, -1)
+
+def mark_moving(df, threshold_static):
+    if not df.index.is_monotonic:
+        df = df.sort_index()
+
+    distance = haversine(df.double_longitude, df.double_latitude, df.double_longitude.shift(-1), df.double_latitude.shift(-1)) / 1000
+    time = (df.timestamp.diff(-1) * -1) / (1000 * 60 * 60)
+
+    df['stationary_or_not'] = np.where((distance / time) < threshold_static, 1, 0)  # 1 being stationary, 0 for moving
+
+    return df
+
+def haversine(lon1, lat1, lon2, lat2):
+    """
+    Calculate the great circle distance between two points
+    on the earth (specified in decimal degrees)
+    """
+    # convert decimal degrees to radians
+    lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2])
+
+    # haversine formula
+    a = np.sin((lat2 - lat1) / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0)**2
+
+    r = 6371  # Radius of earth in kilometers. Use 3956 for miles
+
+    return (r * 2 * np.arcsin(np.sqrt(a)) * 1000)
+
+filteredDf = cluster_and_label(filteredDf, clustering_algorithm, threshold_static, **hyperparameters)
+
+origDf['home_latitude'] = filteredDf[filteredDf['location_label'] == 1][['double_latitude', 'double_longitude']].mean()['double_latitude']
+origDf['home_longitude'] = filteredDf[filteredDf['location_label'] == 1][['double_latitude', 'double_longitude']].mean()['double_longitude']
+
+distanceFromHome = haversine(origDf.double_longitude, origDf.double_latitude, origDf.home_longitude, origDf.home_latitude)
+
+finalDf = origDf.drop(['home_latitude', 'home_longitude'], axis=1)
+finalDf.insert(len(finalDf.columns) - 1, 'distancefromhome', distanceFromHome)
+finalDf.to_csv(snakemake.output[0], index=False)
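A small usage sketch (not part of the commit): with the helper functions above available in a session, a toy DataFrame with five identical night-time points and one distant point should end up with the five points in cluster 1 (the inferred home) and the distant point marked as noise. All values below are hypothetical.

    import pandas as pd

    toy = pd.DataFrame({
        "local_date_time": pd.date_range("2020-01-01 01:00", periods=6, freq="5min").astype(str),
        "timestamp": [1577840400000 + i * 300000 for i in range(6)],   # milliseconds
        "double_latitude":  [40.4443] * 5 + [40.5000],
        "double_longitude": [-79.9608] * 5 + [-79.9000],
    })

    labeled = cluster_and_label(toy, "DBSCAN", threshold_static=1,
                                eps=distance_to_degrees(10), min_samples=5)
    print(labeled["location_label"].value_counts())   # expect 5 rows labeled 1 and 1 row labeled -1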
@@ -17,13 +17,14 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     maximum_row_duration = provider["MAXIMUM_ROW_DURATION"]
     cluster_on = provider["CLUSTER_ON"]
     clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
+    radius_from_home = provider["RADIUS_FOR_HOME"]

     minutes_data_used = provider["MINUTES_DATA_USED"]
     if(minutes_data_used):
         requested_features.append("minutesdataused")

     # name of the features this function can compute
-    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused"]
+    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused","timeathome"]
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))

@@ -170,6 +171,11 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
         for localDate in stationaryLocations['local_segment'].unique():
             location_features.loc[localDate,"normalizedlocationentropy"] = location_entropy_normalized(stationaryLocations[stationaryLocations['local_segment']==localDate])

+    if "timeathome" in features_to_compute:
+        calculationDf = stationaryLocations[['local_segment','distancefromhome','timeInSeconds']].copy()
+        calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed,'timeInSeconds'] = maximum_row_duration
+        location_features["timeathome"] = calculationDf[calculationDf["distancefromhome"] <= radius_from_home].groupby("local_segment")["timeInSeconds"].sum()/60
+
     location_features = location_features.reset_index()

     return location_features
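A rough worked example of the timeathome computation added above, using a hypothetical segment name, distances and durations (and assuming no row exceeds maximum_gap_allowed):

    import pandas as pd

    calculationDf = pd.DataFrame({
        "local_segment": ["daily#2020-01-01"] * 4,
        "distancefromhome": [20.0, 35.0, 250.0, 80.0],   # meters from the inferred home
        "timeInSeconds": [60, 60, 60, 60],               # duration attributed to each row
    })
    radius_from_home = 100

    timeathome = (calculationDf[calculationDf["distancefromhome"] <= radius_from_home]
                  .groupby("local_segment")["timeInSeconds"].sum() / 60)
    print(timeathome)   # 3.0 minutes; the 250 m row does not count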