From 2d7d3bfccf38b42884020be2120168a5f8d7d783 Mon Sep 17 00:00:00 2001 From: Mingze Cao <29229557+Martinze@users.noreply.github.com> Date: Thu, 9 Apr 2020 12:20:39 -0500 Subject: [PATCH] Refactor location_barnett features: replace "metrics" with "features" Co-authored-by: Meng Li --- config.yaml | 2 +- docs/features/extracted.rst | 24 +++++++++---------- rules/features.snakefile | 6 ++--- ..._metrics.R => location_barnett_features.R} | 14 +++++------ 4 files changed, 23 insertions(+), 23 deletions(-) rename src/features/{location_barnett_metrics.R => location_barnett_features.R} (85%) diff --git a/config.yaml b/config.yaml index bf0b9d10..734fc337 100644 --- a/config.yaml +++ b/config.yaml @@ -69,7 +69,7 @@ RESAMPLE_FUSED_LOCATION: BARNETT_LOCATION: DAY_SEGMENTS: [daily] # These metrics are only available on a daily basis - METRICS: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"] + FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"] LOCATIONS_TO_USE: ALL # ALL, ALL_EXCEPT_FUSED OR RESAMPLE_FUSED ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius TIMEZONE: *timezone diff --git a/docs/features/extracted.rst b/docs/features/extracted.rst index 760c264c..be5b4b13 100644 --- a/docs/features/extracted.rst +++ b/docs/features/extracted.rst @@ -759,7 +759,7 @@ stdlux lux The standard deviation of ambient luminance in lux u Location (Barnett’s) Features """""""""""""""""""""""""""""" Barnett’s location features are based on the concept of flights and pauses. 
GPS coordinates are converted into a -sequence of flights (straight line movements) and pauses (time spent stationary). Data is imputed before metrics +sequence of flights (straight line movements) and pauses (time spent stationary). Data is imputed before features are computed (https://arxiv.org/abs/1606.06328) See `Location (Barnett’s) Config Code`_ @@ -779,7 +779,7 @@ See `Location (Barnett’s) Config Code`_ .. - Apply readable dateime to Sensor dataset: ``expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),`` -- Extract Sensor Metrics: ``expand("data/processed/{pid}/location_barnett.csv", pid=config["PIDS"]),`` +- Extract Sensor Features: ``expand("data/processed/{pid}/location_barnett.csv", pid=config["PIDS"]),`` **Rule Chain:** @@ -799,9 +799,9 @@ See `Location (Barnett’s) Config Code`_ - **Script:** ``src/data/resample_fused_location.R`` - See the resample_fused_location.R_ script. -- **Rule:** ``rules/features.snakefile/location_barnett_metrics`` - See the location_barnett_metrics_ rule. +- **Rule:** ``rules/features.snakefile/location_barnett_features`` - See the location_barnett_features_ rule. - - **Script:** ``src/features/location_barnett_metrics.R`` - See the location_barnett_metrics.R_ script. + - **Script:** ``src/features/location_barnett_features.R`` - See the location_barnett_features.R_ script. .. _location-parameters: @@ -814,14 +814,14 @@ Name Description location_to_use The specifies which of the location data will be use in the analysis. Possible options are ``ALL``, ``ALL_EXCEPT_FUSED`` OR ``RESAMPLE_FUSED`` accuracy_limit This is in meters. The sensor drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius specified. timezone The timezone used to calculate location. -metrics The different measures that can be retrieved from the Location dataset. 
See :ref:`Available Location Metrics <location-available-metrics>` Table below +features The different measures that can be retrieved from the Location dataset. See :ref:`Available Location Features <location-available-features>` Table below ================= =================== -.. _location-available-metrics: +.. _location-available-features: -**Available Location Metrics** +**Available Location Features** -The following table shows a list of the available metrics for Location dataset. +The following table shows a list of the available features for Location dataset. ================ ========= ============= Name Units Description @@ -839,7 +839,7 @@ stdflightdur meters Std flight duration. The standard deviation of probpause Pause probability. The fraction of a day spent in a pause (as opposed to a flight) siglocentropy Significant location entropy. Entropy measurement based on the proportion of time spent at each significant location visited during a day. minsmissing -circdnrtn Circadian routine. A continuous metric that can take any value between 0 and 1, where 0 represents a daily routine completely different from any other sensed days and 1 a routine the same as every other sensed day. +circdnrtn Circadian routine. A continuous feature that can take any value between 0 and 1, where 0 represents a daily routine completely different from any other sensed days and 1 a routine the same as every other sensed day. wkenddayrtn Weekend circadian routine. Same as Circadian routine but computed separately for weekends and weekdays. ================ ========= ============= @@ -1102,7 +1102,7 @@ See `Fitbit: Steps Config Code`_ Name Description ======================= =================== day_segment The particular ``day_segments`` that will be analyzed. The available options are ``daily``, ``morning``, ``afternoon``, ``evening``, ``night`` -features The different measures that can be retrieved from the dataset. 
See :ref:`Available Fitbit: Steps Metrics ` Table below +features The different measures that can be retrieved from the dataset. See :ref:`Available Fitbit: Steps Features ` Table below threshold_active_bout The maximum number of steps per minute necessary for a bout to be ``sedentary``. That is, if the step count per minute is greater than this value the bout has a status of ``active``. ======================= =================== @@ -1182,8 +1182,8 @@ stddurationactivebout minutes Std duration active bout: The standard .. _phone_sensed_bins.R: https://github.com/carissalow/rapids/blob/master/src/data/phone_sensed_bins.R .. _resample_fused_location: https://github.com/carissalow/rapids/blob/765bb462636d5029a05f54d4c558487e3786b90b/rules/preprocessing.snakefile#L67 .. _resample_fused_location.R: https://github.com/carissalow/rapids/blob/master/src/data/resample_fused_location.R -.. _location_barnett_metrics: https://github.com/carissalow/rapids/blob/765bb462636d5029a05f54d4c558487e3786b90b/rules/features.snakefile#L49 -.. _location_barnett_metrics.R: https://github.com/carissalow/rapids/blob/master/src/features/location_barnett_metrics.R +.. _location_barnett_features: https://github.com/carissalow/rapids/blob/765bb462636d5029a05f54d4c558487e3786b90b/rules/features.snakefile#L49 +.. _location_barnett_features.R: https://github.com/carissalow/rapids/blob/master/src/features/location_barnett_features.R .. _`Screen Config Code`: https://github.com/carissalow/rapids/blob/765bb462636d5029a05f54d4c558487e3786b90b/config.yaml#L88 .. _screen_deltas: https://github.com/carissalow/rapids/blob/765bb462636d5029a05f54d4c558487e3786b90b/rules/features.snakefile#L33 .. 
_screen_deltas.R: https://github.com/carissalow/rapids/blob/master/src/features/screen_deltas.R diff --git a/rules/features.snakefile b/rules/features.snakefile index 2e1c72c1..5499010c 100644 --- a/rules/features.snakefile +++ b/rules/features.snakefile @@ -47,12 +47,12 @@ rule google_activity_recognition_deltas: script: "../src/features/google_activity_recognition_deltas.R" -rule location_barnett_metrics: +rule location_barnett_features: input: raw = "data/raw/{pid}/locations_raw.csv", fused = rules.resample_fused_location.output params: - metrics = config["BARNETT_LOCATION"]["METRICS"], + features = config["BARNETT_LOCATION"]["FEATURES"], locations_to_use = config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"], accuracy_limit = config["BARNETT_LOCATION"]["ACCURACY_LIMIT"], timezone = config["BARNETT_LOCATION"]["TIMEZONE"], @@ -60,7 +60,7 @@ rule location_barnett_metrics: output: "data/processed/{pid}/location_barnett_{day_segment}.csv" script: - "../src/features/location_barnett_metrics.R" + "../src/features/location_barnett_features.R" rule bluetooth_features: input: diff --git a/src/features/location_barnett_metrics.R b/src/features/location_barnett_features.R similarity index 85% rename from src/features/location_barnett_metrics.R rename to src/features/location_barnett_features.R index 9f64971f..85bfbf47 100644 --- a/src/features/location_barnett_metrics.R +++ b/src/features/location_barnett_features.R @@ -2,7 +2,7 @@ source("packrat/init.R") library(dplyr) -write_empty_file <- function(file_path, metrics_to_include){ +write_empty_file <- function(file_path, requested_feature){ write.csv(data.frame(local_date= character(), location_barnett_hometime= numeric(), location_barnett_disttravelled= numeric(), @@ -19,7 +19,7 @@ write_empty_file <- function(file_path, metrics_to_include){ location_barnett_minsmissing= numeric(), location_barnett_circdnrtn= numeric(), location_barnett_wkenddayrtn= numeric() - ) %>% select(metrics_to_include), file_path, row.names = F) + ) 
%>% select(requested_feature), file_path, row.names = F) } # Load Ian Barnett's code. Taken from https://scholar.harvard.edu/ibarnett/software/gpsmobility @@ -29,9 +29,9 @@ sapply(file.sources,source,.GlobalEnv) locations_to_use <- snakemake@params[["locations_to_use"]] accuracy_limit <- snakemake@params[["accuracy_limit"]] timezone <- snakemake@params[["timezone"]] -metrics_to_include <- intersect(unlist(snakemake@params["metrics"], use.names = F), +requested_feature <- intersect(unlist(snakemake@params["features"], use.names = F), c("hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","minsmissing","circdnrtn","wkenddayrtn")) -metrics_to_include <- c("local_date", paste("location_barnett", metrics_to_include, sep = "_")) +requested_feature <- c("local_date", paste("location_barnett", requested_feature, sep = "_")) # By deafult we use all raw locations: fused without resampling and not fused (gps, network) location <- read.csv(snakemake@input[["raw"]], stringsAsFactors = F) %>% @@ -50,16 +50,16 @@ if(locations_to_use == "ALL_EXCEPT_FUSED"){ if (nrow(location) > 1){ features <- MobilityFeatures(location, ACCURACY_LIM = accuracy_limit, tz = timezone) if(is.null(features)){ - write_empty_file(snakemake@output[[1]], metrics_to_include) + write_empty_file(snakemake@output[[1]], requested_feature) } else{ # Copy index (dates) as a column outmatrix <- cbind(rownames(features$featavg), features$featavg) outmatrix <- as.data.frame(outmatrix) outmatrix[-1] <- lapply(lapply(outmatrix[-1], as.character), as.numeric) colnames(outmatrix)=c("local_date",tolower(paste("location_barnett", colnames(features$featavg), sep = "_"))) - write.csv(outmatrix %>% select(metrics_to_include), snakemake@output[[1]], row.names = F) + write.csv(outmatrix %>% select(requested_feature), snakemake@output[[1]], row.names = F) } } else { - write_empty_file(snakemake@output[[1]], 
metrics_to_include) + write_empty_file(snakemake@output[[1]], requested_feature) }