From df7a52cd6c3fc8d48d51e6a402b6a2c51d07c02b Mon Sep 17 00:00:00 2001
From: nikunjgoel95
Date: Wed, 5 Aug 2020 13:02:34 -0400
Subject: [PATCH] Added Sampling Frequency to Doryab Location Features.

---
 config.yaml                                   |  1 +
 rules/features.snakefile                      |  3 +-
 src/features/location_doryab/location_base.py | 46 +++++++++++--------
 src/features/location_doryab_features.py      |  3 +-
 4 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/config.yaml b/config.yaml
index f76e275c..13a8981b 100644
--- a/config.yaml
+++ b/config.yaml
@@ -92,6 +92,7 @@ DORYAB_LOCATION:
     THRESHOLD_STATIC : 1 # km/h
     MAXIMUM_GAP_ALLOWED: 300
     MINUTES_DATA_USED: False
+    SAMPLING_FREQUENCY: 0
 
 BLUETOOTH:
     COMPUTE: False
diff --git a/rules/features.snakefile b/rules/features.snakefile
index f650589a..bea20a37 100644
--- a/rules/features.snakefile
+++ b/rules/features.snakefile
@@ -141,7 +141,8 @@ rule location_doryab_features:
         dbscan_minsamples = config["DORYAB_LOCATION"]["DBSCAN_MINSAMPLES"],
         threshold_static = config["DORYAB_LOCATION"]["THRESHOLD_STATIC"],
         maximum_gap_allowed = config["DORYAB_LOCATION"]["MAXIMUM_GAP_ALLOWED"],
-        minutes_data_used = config["DORYAB_LOCATION"]["MINUTES_DATA_USED"]
+        minutes_data_used = config["DORYAB_LOCATION"]["MINUTES_DATA_USED"],
+        sampling_frequency = config["DORYAB_LOCATION"]["SAMPLING_FREQUENCY"]
     output:
         "data/processed/{pid}/location_doryab_{day_segment}.csv"
     script:
diff --git a/src/features/location_doryab/location_base.py b/src/features/location_doryab/location_base.py
index 64edbb6e..b735cbad 100644
--- a/src/features/location_doryab/location_base.py
+++ b/src/features/location_doryab/location_base.py
@@ -4,7 +4,7 @@ from astropy.timeseries import LombScargle
 from sklearn.cluster import DBSCAN
 from math import radians, cos, sin, asin, sqrt
 
-def base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples, threshold_static, maximum_gap_allowed):
+def base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples, threshold_static, maximum_gap_allowed,sampling_frequency):
     # name of the features this function can compute
     base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused"]
     # the subset of requested features this function can compute
@@ -22,6 +22,9 @@ def base_location_features(location_data, day_segment, requested_features, dbsca
     else:
         location_features = pd.DataFrame()
 
+    if sampling_frequency == 0:
+        sampling_frequency = getSamplingFrequency(location_data)
+
     if "minutesdataused" in features_to_compute:
         for localDate in location_data["local_date"].unique():
             location_features.loc[localDate,"location_" + day_segment + "_minutesdataused"] = getMinutesData(location_data[location_data["local_date"]==localDate])
@@ -72,35 +75,35 @@ def base_location_features(location_data, day_segment, requested_features, dbsca
     if "radiusgyration" in features_to_compute:
         for localDate in newLocationData['local_date'].unique():
-            location_features.loc[localDate,"location_" + day_segment + "_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_date']==localDate])
+            location_features.loc[localDate,"location_" + day_segment + "_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_date']==localDate],sampling_frequency)
 
     if "timeattop1location" in features_to_compute:
         for localDate in newLocationData['local_date'].unique():
-            location_features.loc[localDate,"location_" + day_segment + "_timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],1)
+            location_features.loc[localDate,"location_" + day_segment + "_timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],1,sampling_frequency)
 
     if "timeattop2location" in features_to_compute:
         for localDate in newLocationData['local_date'].unique():
-            location_features.loc[localDate,"location_" + day_segment + "_timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],2)
+            location_features.loc[localDate,"location_" + day_segment + "_timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],2,sampling_frequency)
 
     if "timeattop3location" in features_to_compute:
         for localDate in newLocationData['local_date'].unique():
-            location_features.loc[localDate,"location_" + day_segment + "_timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],3)
+            location_features.loc[localDate,"location_" + day_segment + "_timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_date']==localDate],3,sampling_frequency)
 
     if "movingtostaticratio" in features_to_compute:
         for localDate in newLocationData['local_date'].unique():
-            location_features.loc[localDate,"location_" + day_segment + "_movingtostaticratio"] =  (newLocationData[newLocationData['local_date']==localDate].shape[0] / location_data[location_data['local_date']==localDate].shape[0])
+            location_features.loc[localDate,"location_" + day_segment + "_movingtostaticratio"] =  (newLocationData[newLocationData['local_date']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_date']==localDate].shape[0] * sampling_frequency)
 
     if "outlierstimepercent" in features_to_compute:
         for localDate in newLocationData['local_date'].unique():
-            location_features.loc[localDate,"location_" + day_segment + "_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_date']==localDate])
+            location_features.loc[localDate,"location_" + day_segment + "_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_date']==localDate],sampling_frequency)
 
     preComputedmaxminCluster = pd.DataFrame()
     for localDate in newLocationData['local_date'].unique():
         smax, smin, sstd,smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_date']==localDate])
-        preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] = smax
-        preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] = smin
-        preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] = sstd
-        preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] = smean
+        preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_maxlengthstayatclusters"] = smax * sampling_frequency
+        preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_minlengthstayatclusters"] = smin * sampling_frequency
+        preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_stdlengthstayatclusters"] = sstd * sampling_frequency
+        preComputedmaxminCluster.loc[localDate,"location_" + day_segment + "_meanlengthstayatclusters"] = smean * sampling_frequency
 
     if "maxlengthstayatclusters" in features_to_compute:
         for localDate in newLocationData['local_date'].unique():
@@ -315,7 +318,7 @@ def number_location_transitions(locationData):
 
     return df[df['boolCol'] == False].shape[0] - 1
 
-def radius_of_gyration(locationData):
+def radius_of_gyration(locationData,sampling_frequency):
     if locationData is None or len(locationData) == 0:
         return None
     # Center is the centroid, not the home location
@@ -333,14 +336,14 @@ def radius_of_gyration(locationData):
 
         distance = haversine(lat_lon_dict) ** 2
 
-        time_in_cluster = locationData[locationData["location_label"]==labels].shape[0]
+        time_in_cluster = locationData[locationData["location_label"]==labels].shape[0]* sampling_frequency
         rog = rog + (time_in_cluster * distance)
 
-    time_all_clusters = valid_clusters.shape[0]
+    time_all_clusters = valid_clusters.shape[0] * sampling_frequency
     final_rog = (1/time_all_clusters) * rog
 
     return np.sqrt(final_rog)
 
-def time_at_topn_clusters_in_group(locationData,n): # relevant only for global location features since, top3_clusters = top3_clusters_in_group for local
+def time_at_topn_clusters_in_group(locationData,n,sampling_frequency): # relevant only for global location features since, top3_clusters = top3_clusters_in_group for local
     if locationData is None or len(locationData) == 0:
         return None
@@ -357,12 +360,12 @@ def time_at_topn_clusters_in_group(locationData,n): # relevant only for global
 
     return topn_time
 
-def outliers_time_percent(locationData):
+def outliers_time_percent(locationData,sampling_frequency):
     if locationData is None or len(locationData) == 0:
         return None
     clusters = locationData["location_label"]
-    numoutliers = clusters[(clusters == -1)].sum()
-    numtotal = len(clusters)
+    numoutliers = clusters[(clusters == -1)].sum() * sampling_frequency
+    numtotal = len(clusters) * sampling_frequency
 
     return (float(-1*numoutliers) / numtotal)
 
@@ -438,4 +441,9 @@ def location_entropy_normalized(locationData):
     if num_clusters == 0 or len(locationData) == 0 or entropy is None:
         return None
     else:
-        return entropy / num_clusters
\ No newline at end of file
+        return entropy / num_clusters
+
+
+def getSamplingFrequency(locationData):
+
+    return ((pd.to_datetime(locationData['local_time'], format="%H:%M:%S") - pd.to_datetime(locationData['local_time'].shift(periods=1), format="%H:%M:%S")).apply(lambda x: x.total_seconds())/60).median()
\ No newline at end of file
diff --git a/src/features/location_doryab_features.py b/src/features/location_doryab_features.py
index d0075341..8d749483 100644
--- a/src/features/location_doryab_features.py
+++ b/src/features/location_doryab_features.py
@@ -10,11 +10,12 @@
 dbscan_minsamples = snakemake.params["dbscan_minsamples"]
 threshold_static = snakemake.params["threshold_static"]
 maximum_gap_allowed = snakemake.params["maximum_gap_allowed"]
 minutes_data_used = snakemake.params["minutes_data_used"]
+sampling_frequency = snakemake.params["sampling_frequency"]
 
 if(minutes_data_used):
     requested_features.append("minutesdataused")
 
-base_features = base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples,threshold_static,maximum_gap_allowed)
+base_features = base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples,threshold_static,maximum_gap_allowed,sampling_frequency)
 
 location_features = location_features.merge(base_features, on="local_date", how="outer")
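
Example: with SAMPLING_FREQUENCY left at its default of 0, base_location_features falls back to getSamplingFrequency, which infers the frequency as the median gap in minutes between consecutive samples; each row then counts as that many minutes in the duration-based features. A minimal sketch of that fallback, using made-up timestamps rather than data from this repo:

    import pandas as pd

    # Four samples taken at 5-, 5-, and 10-minute gaps.
    location_data = pd.DataFrame({"local_time": ["10:00:00", "10:05:00", "10:10:00", "10:20:00"]})

    # Same idea as the patch's getSamplingFrequency: median difference
    # between consecutive timestamps, converted to minutes.
    times = pd.to_datetime(location_data["local_time"], format="%H:%M:%S")
    sampling_frequency = (times.diff().dt.total_seconds() / 60).median()

    print(sampling_frequency)  # 5.0 -> each location row counts as 5 minutes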