From faad8f5a8f8b9d0671b5a3d39b4e59ab32f265aa Mon Sep 17 00:00:00 2001
From: nikunjgoel95
Date: Thu, 23 Jul 2020 14:23:32 -0400
Subject: [PATCH] Added minutes of data in a day/epoch.

---
 config.yaml                                   |  1 +
 rules/features.snakefile                      |  3 ++-
 src/features/location_doryab/location_base.py | 15 ++++++++++++---
 src/features/location_doryab_features.py      |  8 +++++++-
 4 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/config.yaml b/config.yaml
index a11034a2..91bbfb7f 100644
--- a/config.yaml
+++ b/config.yaml
@@ -91,6 +91,7 @@ DORYAB_LOCATION:
   DBSCAN_MINSAMPLES: 5
   THRESHOLD_STATIC : 1 # km/h
   MAXIMUM_GAP_ALLOWED: 300
+  MINUTES_DATA_USED: True
 
 BLUETOOTH:
   COMPUTE: False
diff --git a/rules/features.snakefile b/rules/features.snakefile
index 34dc80a3..9288ea83 100644
--- a/rules/features.snakefile
+++ b/rules/features.snakefile
@@ -130,7 +130,8 @@ rule location_doryab_features:
         dbscan_eps = config["DORYAB_LOCATION"]["DBSCAN_EPS"],
         dbscan_minsamples = config["DORYAB_LOCATION"]["DBSCAN_MINSAMPLES"],
         threshold_static = config["DORYAB_LOCATION"]["THRESHOLD_STATIC"],
-        maximum_gap_allowed = config["DORYAB_LOCATION"]["MAXIMUM_GAP_ALLOWED"]
+        maximum_gap_allowed = config["DORYAB_LOCATION"]["MAXIMUM_GAP_ALLOWED"],
+        minutes_data_used = config["DORYAB_LOCATION"]["MINUTES_DATA_USED"]
     output:
         "data/processed/{pid}/location_doryab_{day_segment}.csv"
     script:
diff --git a/src/features/location_doryab/location_base.py b/src/features/location_doryab/location_base.py
index e40603ce..7546d109 100644
--- a/src/features/location_doryab/location_base.py
+++ b/src/features/location_doryab/location_base.py
@@ -6,8 +6,7 @@ from math import radians, cos, sin, asin, sqrt
 
 def base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples, threshold_static, maximum_gap_allowed):
     # name of the features this function can compute
-    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
-
+    base_features_names = ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","minutesdataused"]
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))
 
@@ -23,6 +22,12 @@ def base_location_features(location_data, day_segment, requested_features, dbsca
         else:
             location_features = pd.DataFrame()
 
+            if "minutesdataused" in features_to_compute:
+                for localDate in location_data["local_date"].unique():
+                    location_features.loc[localDate,"location_" + day_segment + "_minutesdataused"] = getMinutesData(location_data[location_data["local_date"]==localDate])
+
+            location_features.index.name = 'local_date'
+
+
             location_data = location_data[(location_data['double_latitude']!=0.0) & (location_data['double_longitude']!=0.0)]
 
             if "locationvariance" in features_to_compute:
@@ -120,12 +125,16 @@ def base_location_features(location_data, day_segment, requested_features, dbsca
             if "normalizedlocationentropy" in features_to_compute:
                 for localDate in newLocationData['local_date'].unique():
                     location_features.loc[localDate,"location_" + day_segment + "_normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_date']==localDate])
-
+
             location_features = location_features.reset_index()
 
     return location_features
 
 
+def getMinutesData(locationData):
+
+    return locationData[['local_hour','local_minute']].drop_duplicates(inplace = False).shape[0]
+
 def distance_to_degrees(d):
     #Just an approximation, but speeds up clustering by a huge amount and doesnt introduce much error
     #over small distances
diff --git a/src/features/location_doryab_features.py b/src/features/location_doryab_features.py
index a053a8d8..d0075341 100644
--- a/src/features/location_doryab_features.py
+++ b/src/features/location_doryab_features.py
@@ -9,8 +9,14 @@ dbscan_eps = snakemake.params["dbscan_eps"]
 dbscan_minsamples = snakemake.params["dbscan_minsamples"]
 threshold_static = snakemake.params["threshold_static"]
 maximum_gap_allowed = snakemake.params["maximum_gap_allowed"]
+minutes_data_used = snakemake.params["minutes_data_used"]
 
-location_features = location_features.merge(base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples,threshold_static,maximum_gap_allowed), on="local_date", how="outer")
+if(minutes_data_used):
+    requested_features.append("minutesdataused")
+
+base_features = base_location_features(location_data, day_segment, requested_features, dbscan_eps, dbscan_minsamples,threshold_static,maximum_gap_allowed)
+
+location_features = location_features.merge(base_features, on="local_date", how="outer")
 
 assert len(requested_features) + 1 == location_features.shape[1], "The number of features in the output dataframe (=" + str(location_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your location feature extraction functions"