Redo resample episodes in R, new battery episodes

pull/103/head
JulioV 2020-09-29 15:18:48 -04:00
parent a6b99259f7
commit 0dafdd1340
3 changed files with 38 additions and 9 deletions

View File

@ -78,7 +78,7 @@ rule resample_episodes:
output:
"data/interim/{pid}/{sensor}_episodes_resampled.csv"
script:
"../src/features/utils/resample_episodes.py"
"../src/features/utils/resample_episodes.R"
rule resample_screen_episodes_with_datetime:
input:

View File

@ -5,20 +5,27 @@ battery <- read.csv(snakemake@input[[1]])
if(nrow(battery) > 0){
# TODO expose this in the config file
threshold_between_rows = 30
threshold_between_rows = 30 * 60000
battery_episodes <- battery %>%
filter(battery_status >= 2 ) %>% # discard unknown states
mutate(start_timestamp = timestamp,
end_timestamp = lead(start_timestamp) - 1,
time_diff = (end_timestamp - start_timestamp) / 1000 / 60,
time_diff = if_else(time_diff > threshold_between_rows, threshold_between_rows, time_diff),
episode_id = 1:n()) %>%
select(episode_id, start_timestamp, end_timestamp, battery_level)
mutate(start_timestamp = timestamp, # a battery level starts as soon as is logged
end_timestamp = lead(timestamp) - 1, # a battery level ends as soon as a new one is logged
time_diff = (end_timestamp - start_timestamp),
# we assume the current level existed until the next row only if that row is logged within [threshold_between_rows] minutes
end_timestamp = if_else(is.na(time_diff) | time_diff > (threshold_between_rows), start_timestamp + (threshold_between_rows), end_timestamp)) %>%
mutate(time_diff = c(1, diff(start_timestamp)),
level_diff = c(1, diff(battery_level)),
status_diff = c(1, diff(battery_status)),
episode_id = cumsum(level_diff != 0 | status_diff != 0 | time_diff > (threshold_between_rows))) %>%
group_by(episode_id) %>%
summarise(battery_level = first(battery_level), battery_status = first(battery_status), start_timestamp=first(start_timestamp), end_timestamp = last(end_timestamp))
} else {
battery_episodes <- data.frame(episode_id = numeric(),
start_timestamp = numeric(),
end_timestamp = character(),
battery_level = character())
battery_level = character(),
battery_status = character())
}
write.csv(battery_episodes, snakemake@output[[1]], row.names = FALSE)

View File

@ -0,0 +1,22 @@
source("renv/activate.R")
library("dplyr")
library("tidyr") # uncount() lives in tidyr; dplyr does not export it

# Resample episodes into one-minute chunks.
#
# Reads a CSV of episodes (snakemake@input[[1]]) that has `start_timestamp`
# and `end_timestamp` columns. The arithmetic below treats both as
# milliseconds with an inclusive end. Every episode is split into
# consecutive, non-overlapping chunks of at most one minute, and each chunk
# is written twice: once with `timestamp` set to the chunk start and once
# with `timestamp` set to the chunk end. All other columns are carried along
# unchanged. The result is written to snakemake@output[[1]].
#
# Using mostly indexing instead of tidyr verbs because it is faster.
minute_ms <- 60000

resampled_episodes <- read.csv(snakemake@input[[1]])

# Chunks are [start + k * 60000, start + k * 60000 + 59999], so the number of
# chunks needed to reach an inclusive end is 1 + (end - start) %/% 60000.
# Dividing by 60001 undercounts and cuts off the tail of the episode.
episode_span <- resampled_episodes[["end_timestamp"]] -
  resampled_episodes[["start_timestamp"]]
resampled_episodes[["n_resamples"]] <- 1 + episode_span %/% minute_ms

# One row per chunk; `nrow` is the 1-based chunk index within its episode.
resampled_episodes <- resampled_episodes %>%
  uncount(n_resamples, .id = "nrow")

chunk_offset <- (resampled_episodes[["nrow"]] - 1) * minute_ms
resampled_episodes[["start_timestamp"]] <-
  resampled_episodes[["start_timestamp"]] + chunk_offset
# Use + 59999 because resampled minutes should not overlap each other; the
# last chunk is clipped to the episode's own end.
resampled_episodes[["end_timestamp"]] <- pmin(
  resampled_episodes[["start_timestamp"]] + (minute_ms - 1),
  resampled_episodes[["end_timestamp"]]
)
resampled_episodes[["nrow"]] <- NULL

# Duplicate every chunk: copy 1 is stamped with the chunk start, copy 2 with
# the chunk end. Building the column from vectors (instead of assigning a
# scalar NA first) also works when the input has zero rows.
resampled_episodes <- resampled_episodes %>%
  uncount(2, .id = "end_flag")
resampled_episodes[["timestamp"]] <- ifelse(
  resampled_episodes[["end_flag"]] == 1,
  resampled_episodes[["start_timestamp"]],
  resampled_episodes[["end_timestamp"]]
)
resampled_episodes[["end_flag"]] <- NULL

write.csv(resampled_episodes, snakemake@output[[1]], row.names = FALSE)