diff --git a/rules/features.smk b/rules/features.smk
index f2a5440b..d45069f6 100644
--- a/rules/features.smk
+++ b/rules/features.smk
@@ -78,7 +78,7 @@ rule resample_episodes:
     output:
         "data/interim/{pid}/{sensor}_episodes_resampled.csv"
     script:
-        "../src/features/utils/resample_episodes.py"
+        "../src/features/utils/resample_episodes.R"
 
 rule resample_screen_episodes_with_datetime:
     input:
diff --git a/src/features/battery/episodes/battery_episodes.R b/src/features/battery/episodes/battery_episodes.R
index 616f2201..2dc01cdf 100644
--- a/src/features/battery/episodes/battery_episodes.R
+++ b/src/features/battery/episodes/battery_episodes.R
@@ -5,20 +5,27 @@ battery <- read.csv(snakemake@input[[1]])
 
 if(nrow(battery) > 0){
     # TODO expose this in the config file
-    threshold_between_rows = 30
+    threshold_between_rows = 30 * 60000
+
     battery_episodes <- battery %>%
         filter(battery_status >= 2 ) %>% # discard unknown states
-        mutate(start_timestamp = timestamp,
-               end_timestamp = lead(start_timestamp) - 1,
-               time_diff = (end_timestamp - start_timestamp) / 1000 / 60,
-               time_diff = if_else(time_diff > threshold_between_rows, threshold_between_rows, time_diff),
-               episode_id = 1:n()) %>%
-        select(episode_id, start_timestamp, end_timestamp, battery_level)
+        mutate(start_timestamp = timestamp, # a battery level starts as soon as it is logged
+               end_timestamp = lead(timestamp) - 1, # a battery level ends as soon as a new one is logged
+               time_diff = (end_timestamp - start_timestamp),
+               # we assume the current level existed until the next row only if that row is logged within [threshold_between_rows] milliseconds
+               end_timestamp = if_else(is.na(time_diff) | time_diff > (threshold_between_rows), start_timestamp + (threshold_between_rows), end_timestamp)) %>%
+        mutate(time_diff = c(1, diff(start_timestamp)),
+               level_diff = c(1, diff(battery_level)),
+               status_diff = c(1, diff(battery_status)),
+               episode_id = cumsum(level_diff != 0 | status_diff != 0 | time_diff > (threshold_between_rows))) %>%
+        group_by(episode_id) %>%
+        summarise(battery_level = first(battery_level), battery_status = first(battery_status), start_timestamp=first(start_timestamp), end_timestamp = last(end_timestamp))
 } else {
     battery_episodes <- data.frame(episode_id = numeric(),
                                    start_timestamp = numeric(),
                                    end_timestamp = character(),
-                                   battery_level = character())
+                                   battery_level = character(),
+                                   battery_status = character())
 }
 
 write.csv(battery_episodes, snakemake@output[[1]], row.names = FALSE)
diff --git a/src/features/utils/resample_episodes.R b/src/features/utils/resample_episodes.R
new file mode 100644
index 00000000..b651c812
--- /dev/null
+++ b/src/features/utils/resample_episodes.R
@@ -0,0 +1,30 @@
+source("renv/activate.R")
+library("dplyr")
+library("tidyr") # uncount() is exported by tidyr, not dplyr
+
+# Split each episode into consecutive chunks of at most one minute and emit two
+# rows per chunk, one stamped with the chunk start and one with the chunk end.
+# Using mostly indexing instead of tidyverse verbs because it is faster
+resampled_episodes <- read.csv(snakemake@input[[1]])
+# Number of chunks per episode (at least 1 when end_timestamp >= start_timestamp)
+resampled_episodes["n_resamples"] <- 1 + (resampled_episodes["end_timestamp"] - resampled_episodes["start_timestamp"]) %/% 60001
+# Repeat each row n_resamples times; "nrow" is the 1-based index of each copy
+resampled_episodes <- resampled_episodes %>% uncount(n_resamples, .id = "nrow")
+
+# Turn the copy index into an offset from the episode start (multiples of 60000)
+resampled_episodes["nrow"] <- (resampled_episodes["nrow"] - 1) * 60000
+resampled_episodes["start_timestamp"] <- resampled_episodes["start_timestamp"] + resampled_episodes["nrow"]
+# Use +59999 because each resampled minute should not overlap with each other
+resampled_episodes["end_timestamp"] <- pmin(resampled_episodes["start_timestamp"] + 59999, resampled_episodes["end_timestamp"])
+resampled_episodes <- resampled_episodes %>% select(-nrow)
+# Duplicate every chunk; end_flag is 1 for the first copy and 2 for the second
+resampled_episodes <- resampled_episodes %>% uncount(2, .id = "end_flag")
+
+# The first copy is stamped with the chunk start, the second with the chunk end
+resampled_episodes["timestamp"] = NA_real_
+resampled_episodes[resampled_episodes$end_flag ==1, "timestamp"] = resampled_episodes[resampled_episodes$end_flag ==1, "start_timestamp"]
+resampled_episodes[resampled_episodes$end_flag ==2, "timestamp"] = resampled_episodes[resampled_episodes$end_flag ==2, "end_timestamp"]
+resampled_episodes <- resampled_episodes %>% select(-end_flag)
+
+
+write.csv(resampled_episodes, snakemake@output[[1]], row.names = FALSE)