From 3d6caea6c4a18050ba95aa42e6dd14d5e3a20686 Mon Sep 17 00:00:00 2001
From: nikunjgoel95
Date: Thu, 18 Feb 2021 18:37:35 -0500
Subject: [PATCH] Added the timeathome feature, using infer_home_location.py to build an interim file.

---
 Snakefile                                   |   1 +
 config.yaml                                 |  11 +-
 docs/features/phone-locations.md            |   2 +
 rules/features.smk                          |   2 +-
 rules/preprocessing.smk                     |  13 ++
 src/data/infer_home_location.py             | 135 ++++++++++++++++++++
 src/features/phone_locations/doryab/main.py |   8 +-
 7 files changed, 168 insertions(+), 4 deletions(-)
 create mode 100644 src/data/infer_home_location.py

diff --git a/Snakefile b/Snakefile
index 68fc10e7..1e4360a5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -212,6 +212,7 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
         files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
         files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))

diff --git a/config.yaml b/config.yaml
index 67f5344e..43734157 100644
--- a/config.yaml
+++ b/config.yaml
@@ -237,10 +237,16 @@ PHONE_LOCATIONS:
   LOCATIONS_TO_USE: ALL_RESAMPLED # ALL, GPS, ALL_RESAMPLED, OR FUSED_RESAMPLED
   FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
   FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
+  HOME_INFERENCE:
+    DBSCAN_EPS: 10 # meters
+    DBSCAN_MINSAMPLES: 5
+    THRESHOLD_STATIC: 1 # km/h, rows with a speed below this are considered stationary
+    CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS
+
   PROVIDERS:
     DORYAB:
       COMPUTE: False
-      FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
+      FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome"]
       ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
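      DBSCAN_EPS: 10 # meters
      DBSCAN_MINSAMPLES: 5
@@ -249,7 +255,8 @@ PHONE_LOCATIONS:
      MAXIMUM_ROW_DURATION: 60
      MINUTES_DATA_USED: False
      CLUSTER_ON: PARTICIPANT_DATASET # PARTICIPANT_DATASET,TIME_SEGMENT
-      CLUSTERING_ALGORITHM: DBSCAN #DBSCAN,OPTICS
+      CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS
+      RADIUS_FOR_HOME: 100 # meters, rows this close to the inferred home count as at home

      SRC_FOLDER: "doryab" # inside src/features/phone_locations
      SRC_LANGUAGE: "python"

A note on units: `DBSCAN_EPS` is configured in meters, but both clustering algorithms run directly on latitude/longitude degrees, so the script converts it with the small-distance approximation reproduced below (the same `distance_to_degrees` helper that `src/data/infer_home_location.py` defines further down). A quick sanity check, not part of the patch:

```python
# 1 nautical mile (1852 m) subtends 1 arc-minute of latitude;
# 60 arc-minutes make 1 degree.
def distance_to_degrees(d):
    return d / 1852 / 60

print(distance_to_degrees(10))  # DBSCAN_EPS of 10 m -> ~9.0e-05 degrees
```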
diff --git a/docs/features/phone-locations.md b/docs/features/phone-locations.md
index e837e235..8281b97b 100644
--- a/docs/features/phone-locations.md
+++ b/docs/features/phone-locations.md
@@ -110,6 +110,7 @@ Parameters description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 | `[SAMPLING_FREQUENCY]` | Expected time difference between any two location rows in minutes. If set to `0`, the sampling frequency will be inferred automatically as the median of all the differences between any two consecutive row timestamps (recommended if you are using `FUSED_RESAMPLED` data). This parameter impacts all the time calculations.
 | `[CLUSTER_ON]` | Set this flag to `PARTICIPANT_DATASET` to create clusters based on the entire participant's dataset or to `TIME_SEGMENT` to create clusters based on all the instances of the corresponding time segment (e.g. all mornings).
 | `[CLUSTERING_ALGORITHM]` | The original Doryab et al implementation uses `DBSCAN`, `OPTICS` is also available with similar (but not identical) clustering results and lower memory consumption.
+| `[RADIUS_FOR_HOME]` | Maximum distance (in meters) from the inferred home coordinates within which a location row is still counted as being at home.

 Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:

@@ -136,6 +137,7 @@ Features description for `[PHONE_LOCATIONS][PROVIDERS][DORYAB]`:
 |stdlengthstayatclusters |minutes |Standard deviation of time spent in a cluster (significant location).
 |locationentropy |nats |Shannon Entropy computed over the row count of each cluster (significant location), it will be higher the more rows belong to a cluster (i.e. the more time a participant spent at a significant location).
 |normalizedlocationentropy |nats |Shannon Entropy computed over the row count of each cluster (significant location) divided by the number of clusters, it will be higher the more rows belong to a cluster (i.e. the more time a participant spent at a significant location).
+|timeathome |minutes |Time spent at home. Home is inferred by clustering the participant's location rows logged between 12 am and 6 am and taking the center of the largest cluster as the home coordinates; rows within `RADIUS_FOR_HOME` meters of that center count toward this feature.

 !!! note "Assumptions/Observations"
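The `timeathome` row above compresses the whole recipe into one sentence. A minimal sketch of the same steps on made-up coordinates, using the `HOME_INFERENCE` defaults (`DBSCAN_EPS: 10`, `DBSCAN_MINSAMPLES: 5`); the real implementation is `src/data/infer_home_location.py`, added below:

```python
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

# Made-up fixes: five overnight rows near one spot plus one daytime row elsewhere.
df = pd.DataFrame({
    "local_hour":       [1, 2, 3, 4, 5, 13],
    "double_latitude":  [40.44430, 40.44431, 40.44432, 40.44431, 40.44430, 40.45000],
    "double_longitude": [-79.94360, -79.94361, -79.94362, -79.94361, -79.94360, -79.93000],
})

night = df[df["local_hour"] <= 6]  # the 12 am - 6 am filter
labels = DBSCAN(eps=10 / 1852 / 60, min_samples=5).fit_predict(
    night[["double_latitude", "double_longitude"]])

# home = mean coordinates of the biggest non-noise cluster
unique, counts = np.unique(labels[labels != -1], return_counts=True)
home = night[labels == unique[counts.argmax()]][["double_latitude", "double_longitude"]].mean()
print(home)  # ~ (40.44431, -79.94361), the inferred home coordinates
```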
note "Assumptions/Observations" diff --git a/rules/features.smk b/rules/features.smk index c5f43be4..7c8edf9c 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -368,7 +368,7 @@ rule phone_light_r_features: rule phone_locations_python_features: input: - sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime.csv", + sensor_data = "data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: config["PHONE_LOCATIONS"]["PROVIDERS"][wildcards.provider_key.upper()], diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index fcdea91a..983d330b 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -139,6 +139,19 @@ rule phone_locations_processed_with_datetime: script: "../src/data/readable_datetime.R" +rule phone_locations_processed_with_datetime_with_home: + input: + sensor_input = "data/interim/{pid}/phone_locations_processed_with_datetime.csv" + params: + dbscan_eps = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_EPS"], + dbscan_minsamples = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["DBSCAN_MINSAMPLES"], + threshold_static = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["THRESHOLD_STATIC"], + clustering_algorithm = config["PHONE_LOCATIONS"]["HOME_INFERENCE"]["CLUSTERING_ALGORITHM"] + output: + "data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv" + script: + "../src/data/infer_home_location.py" + rule resample_episodes: input: "data/interim/{pid}/{sensor}_episodes.csv" diff --git a/src/data/infer_home_location.py b/src/data/infer_home_location.py new file mode 100644 index 00000000..f7efdc7a --- /dev/null +++ b/src/data/infer_home_location.py @@ -0,0 +1,135 @@ +import pandas as pd +import numpy as np +from sklearn.cluster import DBSCAN,OPTICS +from math import radians, cos, sin, asin, sqrt + +def filterDatafromDf(origDf): + + return origDf[origDf['local_hour']<=6] + +def distance_to_degrees(d): + #Just an approximation, but speeds up clustering by a huge amount and doesnt introduce much error + #over small distances + d = d / 1852 + d = d / 60 + return d + +origDf = pd.read_csv(snakemake.input[0]) +filteredDf = filterDatafromDf(origDf) +dbscan_eps = snakemake.params["dbscan_eps"] +dbscan_minsamples = snakemake.params["dbscan_minsamples"] +threshold_static = snakemake.params["threshold_static"] +clustering_algorithm = snakemake.params["clustering_algorithm"] + +if clustering_algorithm == "DBSCAN": + hyperparameters = {'eps' : distance_to_degrees(dbscan_eps), 'min_samples': dbscan_minsamples} +elif clustering_algorithm == "OPTICS": + hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric':'euclidean', 'cluster_method' : 'dbscan'} +else: + raise ValueError("config[PHONE_LOCATIONS][HOME_INFERENCE][CLUSTERING ALGORITHM] only accepts DBSCAN or OPTICS but you provided ",clustering_algorithm) + +def cluster_and_label(df,clustering_algorithm,threshold_static,**kwargs): + """ + + :param df: a df with columns "latitude", "longitude", and "datetime" + or + a df with comlumns "latitude","longitude" and a datetime index + :param kwargs: arguments for sklearn's DBSCAN + :return: a new df of labeled locations with moving points removed, where the cluster + labeled as "1" is the largest, "2" the second largest, and so on + """ + if not df.empty: + location_data = df + if not isinstance(df.index, pd.DatetimeIndex): + location_data = df.set_index("local_date_time") + + 
diff --git a/src/data/infer_home_location.py b/src/data/infer_home_location.py
new file mode 100644
index 00000000..f7efdc7a
--- /dev/null
+++ b/src/data/infer_home_location.py
@@ -0,0 +1,135 @@
+import pandas as pd
+import numpy as np
+from sklearn.cluster import DBSCAN, OPTICS
+
+def filter_home_hours(origDf):
+    # keep only the overnight rows (local_hour <= 6) used to infer home
+    return origDf[origDf['local_hour'] <= 6]
+
+def distance_to_degrees(d):
+    # Just an approximation (1852 m per arc-minute, 60 arc-minutes per degree),
+    # but it speeds up clustering by a huge amount and doesn't introduce much
+    # error over small distances
+    d = d / 1852
+    d = d / 60
+    return d
+
+origDf = pd.read_csv(snakemake.input[0])
+filteredDf = filter_home_hours(origDf)
+
+dbscan_eps = snakemake.params["dbscan_eps"]
+dbscan_minsamples = snakemake.params["dbscan_minsamples"]
+threshold_static = snakemake.params["threshold_static"]
+clustering_algorithm = snakemake.params["clustering_algorithm"]
+
+if clustering_algorithm == "DBSCAN":
+    hyperparameters = {'eps': distance_to_degrees(dbscan_eps), 'min_samples': dbscan_minsamples}
+elif clustering_algorithm == "OPTICS":
+    hyperparameters = {'max_eps': distance_to_degrees(dbscan_eps), 'min_samples': 2, 'metric': 'euclidean', 'cluster_method': 'dbscan'}
+else:
+    raise ValueError("config[PHONE_LOCATIONS][HOME_INFERENCE][CLUSTERING_ALGORITHM] only accepts DBSCAN or OPTICS but you provided " + clustering_algorithm)
+
+def cluster_and_label(df, clustering_algorithm, threshold_static, **kwargs):
+    """
+    :param df: a df with columns "double_latitude", "double_longitude" and "local_date_time",
+               or a df with columns "double_latitude", "double_longitude" and a datetime index
+    :param kwargs: arguments for sklearn's DBSCAN (or OPTICS)
+    :return: a new df of labeled locations with moving points removed, where the cluster
+             labeled "1" is the largest, "2" the second largest, and so on
+    """
+    if not df.empty:
+        location_data = df
+        if not isinstance(df.index, pd.DatetimeIndex):
+            location_data = df.set_index("local_date_time")
+
+        # mark each row as stationary or moving and drop the moving ones,
+        # so only static fixes vote for the home cluster
+        stationary = mark_moving(location_data, threshold_static)
+        stationary = stationary[stationary["stationary_or_not"] == 1]
+
+        # collapse identical coordinates; the counts act as sample weights
+        counts_df = stationary[["double_latitude", "double_longitude"]].groupby(["double_latitude", "double_longitude"]).size().reset_index()
+        counts = counts_df[0]
+        lat_lon = counts_df[["double_latitude", "double_longitude"]].values
+
+        if clustering_algorithm == "DBSCAN":
+            clusterer = DBSCAN(**kwargs)
+            cluster_results = clusterer.fit_predict(lat_lon, sample_weight=counts)
+        else:
+            clusterer = OPTICS(**kwargs)
+            cluster_results = clusterer.fit_predict(lat_lon)
+
+        # extend the labels back to the original df without weights
+        counts_df["location_label"] = cluster_results
+        del counts_df[0]  # remove the old count column
+
+        merged = pd.merge(stationary, counts_df, on=["double_latitude", "double_longitude"])
+
+        # compute the label mapping (rank 1 = most visited) ...
+        cluster_results = merged["location_label"].values
+        valid_clusters = cluster_results[np.where(cluster_results != -1)]
+        label_map = rank_count_map(valid_clusters)
+
+        # ... and remap the labels
+        merged.index = stationary.index
+        stationary = stationary.assign(location_label=merged["location_label"].map(label_map).values)
+        return stationary
+    else:
+        return df
+
+def rank_count_map(clusters):
+    """Return a function mapping each cluster label to its rank by frequency,
+    so that the most common label maps to 1, the second most common to 2, and
+    so on; labels that cannot be found (noise) map to -1.
+    """
+    labels, counts = tuple(np.unique(clusters, return_counts=True))
+    sorted_by_count = [x for (y, x) in sorted(zip(counts, labels), reverse=True)]
+    label_to_rank = {label: rank + 1 for (rank, label) in enumerate(sorted_by_count)}
+    return lambda x: label_to_rank.get(x, -1)
+
+def mark_moving(df, threshold_static):
+    # label each row 1 (stationary) or 0 (moving) by comparing the speed to the
+    # next fix, in km/h, against threshold_static
+    if not df.index.is_monotonic_increasing:
+        df = df.sort_index()
+
+    distance = haversine(df.double_longitude, df.double_latitude, df.double_longitude.shift(-1), df.double_latitude.shift(-1)) / 1000  # km
+    time = (df.timestamp.diff(-1) * -1) / (1000 * 60 * 60)  # ms -> hours
+
+    df['stationary_or_not'] = np.where((distance / time) < threshold_static, 1, 0)  # 1 = stationary, 0 = moving
+
+    return df
+
+def haversine(lon1, lat1, lon2, lat2):
+    """
+    Calculate the great-circle distance, in meters, between two points
+    on the earth (specified in decimal degrees)
+    """
+    # convert decimal degrees to radians
+    lon1, lat1, lon2, lat2 = np.radians([lon1, lat1, lon2, lat2])
+
+    # haversine formula
+    a = np.sin((lat2 - lat1) / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2
+
+    r = 6371  # radius of earth in kilometers; use 3956 for miles
+
+    return r * 2 * np.arcsin(np.sqrt(a)) * 1000
+
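+# ---------------------------------------------------------------------------
+# Driver: cluster the overnight rows, take the mean coordinates of the largest
+# cluster (location_label == 1) as home, then attach every row's haversine
+# distance to home, in meters, as a new `distancefromhome` column.
+#
+# Sanity check for haversine (illustrative values, not pipeline data):
+#   haversine(0.0, 0.0, 0.0, 0.001) ~= 111.2 meters, i.e. ~111 km per degree
+#   of latitude, which is what the distance_to_degrees approximation assumes.
+# ---------------------------------------------------------------------------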
+filteredDf = cluster_and_label(filteredDf, clustering_algorithm, threshold_static, **hyperparameters)
+
+# home = the mean coordinates of the largest cluster (location_label == 1)
+home = filteredDf[filteredDf['location_label'] == 1][['double_latitude', 'double_longitude']].mean()
+origDf['home_latitude'] = home['double_latitude']
+origDf['home_longitude'] = home['double_longitude']
+
+distanceFromHome = haversine(origDf.double_longitude, origDf.double_latitude, origDf.home_longitude, origDf.home_latitude)
+
+finalDf = origDf.drop(['home_latitude', 'home_longitude'], axis=1)
+finalDf.insert(len(finalDf.columns) - 1, 'distancefromhome', distanceFromHome)
+finalDf.to_csv(snakemake.output[0], index=False)

diff --git a/src/features/phone_locations/doryab/main.py b/src/features/phone_locations/doryab/main.py
index bb850d25..5067274a 100644
--- a/src/features/phone_locations/doryab/main.py
+++ b/src/features/phone_locations/doryab/main.py
@@ -17,13 +17,14 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     maximum_row_duration = provider["MAXIMUM_ROW_DURATION"]
     cluster_on = provider["CLUSTER_ON"]
     clustering_algorithm = provider["CLUSTERING_ALGORITHM"]
+    radius_from_home = provider["RADIUS_FOR_HOME"]
     minutes_data_used = provider["MINUTES_DATA_USED"]
     if(minutes_data_used):
         requested_features.append("minutesdataused")

     # name of the features this function can compute
-    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused"]
+    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused","timeathome"]

     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))
@@ -170,6 +171,11 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
         for localDate in stationaryLocations['local_segment'].unique():
             location_features.loc[localDate,"normalizedlocationentropy"] = location_entropy_normalized(stationaryLocations[stationaryLocations['local_segment']==localDate])

+    if "timeathome" in features_to_compute:
+        calculationDf = stationaryLocations[['local_segment','distancefromhome','timeInSeconds']].copy()
+        # cap rows that span a long sensing gap so a single gap cannot dominate the feature
+        calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed, 'timeInSeconds'] = maximum_row_duration
+        location_features["timeathome"] = calculationDf[calculationDf["distancefromhome"] <= radius_from_home].groupby("local_segment")["timeInSeconds"].sum() / 60
+
     location_features = location_features.reset_index()

     return location_features
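For readers tracing the `timeathome` arithmetic in the hunk above, here is a self-contained sketch of the capping and sum. The segment label and the 300 s `MAXIMUM_GAP_ALLOWED` are assumed values for illustration; only `MAXIMUM_ROW_DURATION: 60` appears in this patch, and `maximum_gap_allowed` is read from the existing provider config outside the hunks shown:

```python
import pandas as pd

# Rows whose duration reaches the (assumed) gap threshold are counted as
# MAXIMUM_ROW_DURATION seconds instead, then at-home durations are summed
# per time segment and converted to minutes.
calculationDf = pd.DataFrame({
    "local_segment": ["daily#2021-02-18"] * 3,
    "distancefromhome": [20.0, 50.0, 500.0],  # meters
    "timeInSeconds": [45, 900, 120],
})
maximum_gap_allowed, maximum_row_duration, radius_from_home = 300, 60, 100

calculationDf.loc[calculationDf.timeInSeconds >= maximum_gap_allowed, "timeInSeconds"] = maximum_row_duration
timeathome = (calculationDf[calculationDf["distancefromhome"] <= radius_from_home]
              .groupby("local_segment")["timeInSeconds"].sum() / 60)
print(timeathome)  # daily#2021-02-18 -> (45 + 60) / 60 = 1.75 minutes
```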