rapids/src/features/fitbit_calories_intraday/rapids/main.R

92 lines
5.5 KiB
R

source("renv/activate.R")
library(tidyverse)
library(lubridate)
library(glue)
create_empty_dataframe <- function(episode_type){
integer_columns <- c("countepisode{episode_type}", "starttimefirstepisode{episode_type}", "endtimefirstepisode{episode_type}", "starttimelastepisode{episode_type}", "endtimelastepisode{episode_type}", "starttimelongestepisode{episode_type}", "endtimelongestepisode{episode_type}")
integer_columns <- sapply(integer_columns, function(x) glue(x), simplify = TRUE, USE.NAMES = FALSE)
double_columns <- c()
for(col in c("duration", "calories", "mets"))
for(fun in c("sum", "mean", "min","max","sd"))
double_columns <- c(double_columns, glue("{fun}{col}episode{episode_type}"))
as_tibble(c(sapply(integer_columns, function(x) integer()), sapply(double_columns, function(x) numeric())))
}
longest <- function(duration, time){
position_longest <- min(which(duration == max(duration)))
time[position_longest]
}
episode_type_features <- function(data, episode_type, episode_id_column){
if(nrow(data) == 0)
return(create_empty_dataframe(episode_type))
data %>%
group_by(across(all_of(episode_id_column))) %>%
summarise(duration = (max(timestamp) - min(timestamp)) / 60000 + 1,
mets = sum(mets),
calories = sum(value),
start_time = min(time_since_ref),
end_time = max(time_since_ref) + 1) %>%
summarise("countepisode{episode_type}" := n(),
"starttimefirstepisode{episode_type}" := first(start_time),
"endtimefirstepisode{episode_type}" := first(end_time),
"starttimelastepisode{episode_type}" := last(start_time),
"endtimelastepisode{episode_type}" := last(end_time),
"starttimelongestepisode{episode_type}" := longest(duration, start_time),
"endtimelongestepisode{episode_type}" := longest(duration, end_time),
across(duration, list(sum=sum, avg=mean, min=min,max=max,std=sd), .names = "{.fn}{.col}episode{episode_type}"),
across(calories, list(sum=sum, avg=mean, min=min,max=max,std=sd), .names = "{.fn}{.col}episode{episode_type}"),
across(mets, list(sum=sum, avg=mean, min=min,max=max,std=sd), .names = "{.fn}{.col}episode{episode_type}"))
}
rapids_features <- function(sensor_data_files, time_segment, provider){
calories <- read_csv(snakemake@input[["sensor_data"]],
col_types = cols_only(level="i", mets="d", value="d", local_date_time="T",assigned_segments="c", timestamp="d"))# %>%
MET_THRESHOLD <- provider[["EPISODE_MET_THRESHOLD"]]
MVPA_LABELS <- provider[["EPISODE_MVPA_CATEGORIES"]]
FITBIT_LEVELS <- c("sedentary", "lightlyactive", "fairlyactive", "veryactive")
MVPA_LEVELS <- which(FITBIT_LEVELS %in% MVPA_LABELS) - 1
EPISODE_TIME_THRESHOLD <- provider[["EPISODE_TIME_THRESHOLD"]]
EPISODE_REFERENCE_TIME <- provider[["EPISODE_REFERENCE_TIME"]]
REQUESTED_EPISODES <- provider[["EPISODE_TYPE"]]
REQUESTED_FEATURES <- provider[["FEATURES"]]
calories <- calories %>% filter_data_by_segment(time_segment)
if(nrow(calories) == 0)
return(bind_cols(lapply(REQUESTED_EPISODES, function(episode_type) episode_type_features(calories, episode_type, ""))) %>%
add_column(local_segment = character(), .before = 1) %>%
select(starts_with(c("local_segment", REQUESTED_FEATURES))))
calories <- calories %>%
extract(timestamps_segment, regex = "(\\d*),", into = c("segment_start_ts"), remove = TRUE, convert = TRUE) %>%
arrange(timestamp) %>%
mutate(consecutive = c(0,diff(timestamp) / 60000),
level_diff = c(0, diff(level)),
mvpa_diff = c(1, diff(if_else(level %in% MVPA_LEVELS, 1, 0))),
met_diff = c(1, diff(if_else(mets >= MET_THRESHOLD, 1, 0))),
level_episode_id = cumsum(consecutive > EPISODE_TIME_THRESHOLD | level_diff != 0),
mvpa_episode_id = cumsum(consecutive > EPISODE_TIME_THRESHOLD | mvpa_diff != 0),
met_episode_id = cumsum(consecutive > EPISODE_TIME_THRESHOLD | met_diff != 0),
time_since_ref = case_when(EPISODE_REFERENCE_TIME == "MIDNIGHT" ~ ((hour(local_date_time) *3600) + (minute(local_date_time) * 60) + second(local_date_time))/60,
EPISODE_REFERENCE_TIME == "START_OF_THE_SEGMENT" ~ (timestamp - segment_start_ts) / 60000)
) %>%
select(-consecutive, -level_diff, -mvpa_diff, -met_diff) %>%
group_by(local_segment) %>%
nest() %>%
mutate(sedentary = map(data, ~ episode_type_features(.x %>% filter(level == 0) , "sedentary", "level_episode_id")),
lightlyactive = map(data, ~ episode_type_features(.x %>% filter(level == 1) , "lightlyactive", "level_episode_id")),
fairlyactive = map(data, ~ episode_type_features(.x %>% filter(level == 2) , "fairlyactive", "level_episode_id")),
veryactive = map(data, ~ episode_type_features(.x %>% filter(level == 3) , "veryactive", "level_episode_id")),
mvpa = map(data, ~ episode_type_features(.x %>% filter(level >= 2) , "mvpa", "mvpa_episode_id")),
lowmet = map(data, ~ episode_type_features(.x %>% filter(mets < MET_THRESHOLD) , "lowmet", "met_episode_id")),
highmet = map(data, ~ episode_type_features(.x %>% filter(mets >= MET_THRESHOLD) , "highmet", "met_episode_id"))
) %>%
ungroup() %>%
select(all_of(c("local_segment", REQUESTED_EPISODES))) %>%
unnest(everything(), keep_empty=TRUE) %>%
select(starts_with(c("local_segment", REQUESTED_FEATURES)))
}