Redo resample episodes in R, new battery episodes
parent
a6b99259f7
commit
0dafdd1340
|
@ -78,7 +78,7 @@ rule resample_episodes:
|
|||
output:
|
||||
"data/interim/{pid}/{sensor}_episodes_resampled.csv"
|
||||
script:
|
||||
"../src/features/utils/resample_episodes.py"
|
||||
"../src/features/utils/resample_episodes.R"
|
||||
|
||||
rule resample_screen_episodes_with_datetime:
|
||||
input:
|
||||
|
|
|
@ -5,20 +5,27 @@ battery <- read.csv(snakemake@input[[1]])
|
|||
|
||||
if(nrow(battery) > 0){
|
||||
# TODO expose this in the config file
|
||||
threshold_between_rows = 30
|
||||
threshold_between_rows = 30 * 60000
|
||||
|
||||
battery_episodes <- battery %>%
|
||||
filter(battery_status >= 2 ) %>% # discard unknown states
|
||||
mutate(start_timestamp = timestamp,
|
||||
end_timestamp = lead(start_timestamp) - 1,
|
||||
time_diff = (end_timestamp - start_timestamp) / 1000 / 60,
|
||||
time_diff = if_else(time_diff > threshold_between_rows, threshold_between_rows, time_diff),
|
||||
episode_id = 1:n()) %>%
|
||||
select(episode_id, start_timestamp, end_timestamp, battery_level)
|
||||
mutate(start_timestamp = timestamp, # a battery level starts as soon as is logged
|
||||
end_timestamp = lead(timestamp) - 1, # a battery level ends as soon as a new one is logged
|
||||
time_diff = (end_timestamp - start_timestamp),
|
||||
# we assume the current level lasted until the next row only if that row was logged within threshold_between_rows milliseconds (30 minutes by default)
|
||||
end_timestamp = if_else(is.na(time_diff) | time_diff > (threshold_between_rows), start_timestamp + (threshold_between_rows), end_timestamp)) %>%
|
||||
mutate(time_diff = c(1, diff(start_timestamp)),
|
||||
level_diff = c(1, diff(battery_level)),
|
||||
status_diff = c(1, diff(battery_status)),
|
||||
episode_id = cumsum(level_diff != 0 | status_diff != 0 | time_diff > (threshold_between_rows))) %>%
|
||||
group_by(episode_id) %>%
|
||||
summarise(battery_level = first(battery_level), battery_status = first(battery_status), start_timestamp=first(start_timestamp), end_timestamp = last(end_timestamp))
|
||||
} else {
|
||||
battery_episodes <- data.frame(episode_id = numeric(),
|
||||
start_timestamp = numeric(),
|
||||
end_timestamp = character(),
|
||||
battery_level = character())
|
||||
battery_level = character(),
|
||||
battery_status = character())
|
||||
}
|
||||
|
||||
write.csv(battery_episodes, snakemake@output[[1]], row.names = FALSE)
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
source("renv/activate.R")
library("dplyr")
library("tidyr") # uncount() lives in tidyr; dplyr does not export it

# Resample episodes into chunks of at most one minute.
#
# Input  (snakemake@input[[1]]): CSV with at least the columns
#   start_timestamp and end_timestamp (same numeric unit; the constants below
#   treat it as milliseconds).
# Output (snakemake@output[[1]]): CSV with the same columns plus `timestamp`.
#   Every episode is split into consecutive, non-overlapping chunks and each
#   chunk is written twice: once with timestamp = chunk start and once with
#   timestamp = chunk end.
#
# Apart from uncount(), plain column indexing is used instead of tidyverse
# verbs because it is faster.

resampled_episodes <- read.csv(snakemake@input[[1]])

# How many chunks each episode is expanded into.
# NOTE(review): the divisor is 60001 rather than 60000, presumably so that an
# episode spanning an exact multiple of one minute does not get an extra,
# almost empty chunk - confirm this is intended.
resampled_episodes[["n_resamples"]] <- 1 +
  (resampled_episodes[["end_timestamp"]] -
    resampled_episodes[["start_timestamp"]]) %/% 60001

# One row per chunk; `nrow` is the 1-based position of the chunk inside its
# episode (uncount drops the n_resamples column).
resampled_episodes <- resampled_episodes %>%
  uncount(n_resamples, .id = "nrow")

# Shift each chunk start by a whole number of minutes from the episode start.
chunk_offset <- (resampled_episodes[["nrow"]] - 1) * 60000
resampled_episodes[["start_timestamp"]] <-
  resampled_episodes[["start_timestamp"]] + chunk_offset

# Use +59999 so consecutive chunks do not overlap; the last chunk is capped at
# the end of the original episode.
resampled_episodes[["end_timestamp"]] <- pmin(
  resampled_episodes[["start_timestamp"]] + 59999,
  resampled_episodes[["end_timestamp"]]
)
resampled_episodes[["nrow"]] <- NULL

# Duplicate every chunk: end_flag == 1 marks the start row, 2 the end row.
resampled_episodes <- resampled_episodes %>%
  uncount(2, .id = "end_flag")

# Build the column as a full-length vector. Assigning a scalar NA first and
# filling it by mask fails when the input has zero rows (header-only CSV).
resampled_episodes[["timestamp"]] <- ifelse(
  resampled_episodes[["end_flag"]] == 1,
  resampled_episodes[["start_timestamp"]],
  resampled_episodes[["end_timestamp"]]
)
resampled_episodes[["end_flag"]] <- NULL

write.csv(resampled_episodes, snakemake@output[[1]], row.names = FALSE)
|
Loading…
Reference in New Issue