diff --git a/Snakefile b/Snakefile
index 77d225d0..2874e7a4 100644
--- a/Snakefile
+++ b/Snakefile
@@ -35,13 +35,13 @@ if config["PHONE_VALID_SENSED_DAYS"]["COMPUTE"]:
 if config["MESSAGES"]["COMPUTE"]:
     files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"]))
     files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/processed/{pid}/messages_{messages_type}_{day_segment}.csv", pid=config["PIDS"], messages_type = config["MESSAGES"]["TYPES"], day_segment = config["MESSAGES"]["DAY_SEGMENTS"]))
+    files_to_compute.extend(expand("data/processed/{pid}/messages_{messages_type}.csv", pid=config["PIDS"], messages_type = config["MESSAGES"]["TYPES"]))

 if config["CALLS"]["COMPUTE"]:
     files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
     files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
     files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"], day_segment = config["CALLS"]["DAY_SEGMENTS"]))
+    files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"]))

 if config["BARNETT_LOCATION"]["COMPUTE"]:
     if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":
diff --git a/rules/features.smk b/rules/features.smk
index 7098e6c9..a4302710 100644
--- a/rules/features.smk
+++ b/rules/features.smk
@@ -1,12 +1,12 @@
 rule messages_features:
     input:
-        expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])
+        expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"]),
+        day_segments_labels = expand("data/interim/{sensor}_day_segments_labels.csv", sensor=config["MESSAGES"]["DB_TABLE"])
     params:
         messages_type = "{messages_type}",
-        day_segment = "{day_segment}",
         features = lambda wildcards: config["MESSAGES"]["FEATURES"][wildcards.messages_type]
     output:
-        "data/processed/{pid}/messages_{messages_type}_{day_segment}.csv"
+        "data/processed/{pid}/messages_{messages_type}.csv"
     script:
         "../src/features/messages_features.R"
diff --git a/src/features/messages/messages_base.R b/src/features/messages/messages_base.R
index a784a213..1d6cecdf 100644
--- a/src/features/messages/messages_base.R
+++ b/src/features/messages/messages_base.R
@@ -1,17 +1,9 @@
 library('tidyr')
-
-filter_by_day_segment <- function(data, day_segment) {
-    if(day_segment %in% c("morning", "afternoon", "evening", "night"))
-        data <- data %>% filter(local_day_segment == day_segment)
-    else if(day_segment == "daily")
-        return(data)
-    else
-        return(data %>% head(0))
-}
+library('stringr')

 base_messages_features <- function(messages, messages_type, day_segment, requested_features){
     # Output dataframe
-    features = data.frame(local_date = character(), stringsAsFactors = FALSE)
+    features = data.frame(local_segment = character(), stringsAsFactors = FALSE)

     # The name of the features this function can compute
     base_features_names <- c("countmostfrequentcontact", "count", "distinctcontacts", "timefirstmessage", "timelastmessage")
@@ -19,15 +11,20 @@ base_messages_features <- function(messages, messages_type, day_segment, request
     # The subset of requested features this function can compute
     features_to_compute <- intersect(base_features_names, requested_features)

-    # Filter rows that belong to the message type and day segment of interest
-    messages <- messages %>% filter(message_type == ifelse(messages_type == "received", "1", ifelse(messages_type == "sent", 2, NA))) %>%
-        filter_by_day_segment(day_segment)
-
+    # Filter the rows that belong to day_segment, and put the segment full name in a new column for grouping
+    date_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2}"
+    hour_regex = "[0-9]{2}:[0-9]{2}:[0-9]{2}"
+    messages <- messages %>%
+        filter(message_type == ifelse(messages_type == "received", "1", ifelse(messages_type == "sent", 2, NA))) %>%
+        filter(grepl(paste0("\\[", day_segment, "#"),assigned_segments)) %>%
+        mutate(local_segment = str_extract(assigned_segments, paste0("\\[", day_segment, "#", date_regex, "#", hour_regex, "#", date_regex, "#", hour_regex, "\\]")),
+               local_segment = str_sub(local_segment, 2, -2)) # get rid of first and last character([])
+
     # If there are not features or data to work with, return an empty df with appropiate columns names
     if(length(features_to_compute) == 0)
         return(features)
     if(nrow(messages) < 1)
-        return(cbind(features, read.csv(text = paste(paste("messages", messages_type, day_segment, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
+        return(cbind(features, read.csv(text = paste(paste("messages", messages_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))

     for(feature_name in features_to_compute){
         if(feature_name == "countmostfrequentcontact"){
@@ -41,21 +38,21 @@ base_messages_features <- function(messages, messages_type, day_segment, request
                 pull(trace)
             feature <- messages %>%
                 filter(trace == mostfrequentcontact) %>%
-                group_by(local_date) %>%
-                summarise(!!paste("messages", messages_type, day_segment, feature_name, sep = "_") := n()) %>%
+                group_by(local_segment) %>%
+                summarise(!!paste("messages", messages_type, feature_name, sep = "_") := n()) %>%
                 replace(is.na(.), 0)
-            features <- merge(features, feature, by="local_date", all = TRUE)
+            features <- merge(features, feature, by="local_segment", all = TRUE)
         } else {
             feature <- messages %>%
-                group_by(local_date)
+                group_by(local_segment)

             feature <- switch(feature_name,
-                    "count" = feature %>% summarise(!!paste("messages", messages_type, day_segment, feature_name, sep = "_") := n()),
-                    "distinctcontacts" = feature %>% summarise(!!paste("messages", messages_type, day_segment, feature_name, sep = "_") := n_distinct(trace)),
-                    "timefirstmessage" = feature %>% summarise(!!paste("messages", messages_type, day_segment, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
-                    "timelastmessage" = feature %>% summarise(!!paste("messages", messages_type, day_segment, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
+                    "count" = feature %>% summarise(!!paste("messages", messages_type, feature_name, sep = "_") := n()),
+                    "distinctcontacts" = feature %>% summarise(!!paste("messages", messages_type, feature_name, sep = "_") := n_distinct(trace)),
+                    "timefirstmessage" = feature %>% summarise(!!paste("messages", messages_type, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
+                    "timelastmessage" = feature %>% summarise(!!paste("messages", messages_type, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))

-            features <- merge(features, feature, by="local_date", all = TRUE)
+            features <- merge(features, feature, by="local_segment", all = TRUE)
         }
     }
     features <- features %>% mutate_at(vars(contains("countmostfrequentcontact")), list( ~ replace_na(., 0)))
diff --git a/src/features/messages_features.R b/src/features/messages_features.R
index 1f8cabde..4602aa99 100644
--- a/src/features/messages_features.R
+++ b/src/features/messages_features.R
@@ -3,18 +3,24 @@ source("renv/activate.R")

 source("src/features/messages/messages_base.R")
-library(dplyr, warn.conflicts = FALSE)
+library("dplyr", warn.conflicts = FALSE)

 messages <- read.csv(snakemake@input[[1]])
-day_segment <- snakemake@params[["day_segment"]]
+day_segments_labels <- read.csv(snakemake@input[["day_segments_labels"]])
 requested_features <- snakemake@params[["features"]]
 messages_type <- snakemake@params[["messages_type"]]
-features <- data.frame(local_date = character(), stringsAsFactors = FALSE)
+features <- data.frame(local_segment = character(), stringsAsFactors = FALSE)

-# Compute base SMS features
-features <- merge(features, base_messages_features(messages, messages_type, day_segment, requested_features), by="local_date", all = TRUE)
+day_segments <- day_segments_labels %>% pull(label)
+for (day_segment in day_segments)
+    features <- merge(features, base_messages_features(messages, messages_type, day_segment, requested_features), all = TRUE)

 if(ncol(features) != length(requested_features) + 1)
   stop(paste0("The number of features in the output dataframe (=", ncol(features),") does not match the expected value (=", length(requested_features)," + 1). Verify your Messages (SMS) feature extraction functions"))

+features <- features %>% separate(col = local_segment,
+                    into = c("local_segment_label", "local_start_date", "local_start_time", "local_end_date", "local_end_time"),
+                    sep = "#",
+                    remove = FALSE)
+
 write.csv(features, snakemake@output[[1]], row.names = FALSE)
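
As a quick illustration of the segment-matching logic this patch introduces in messages_base.R, the sketch below (not part of the patch; the sample assigned_segments values and the "daily" label are invented for the example) shows how the grepl/str_extract/str_sub chain turns the bracketed segment annotations into the local_segment column used for grouping:

library(dplyr)
library(stringr)

# Hypothetical input: each message carries the segment instances it was assigned to
messages <- data.frame(
  assigned_segments = c(
    "[daily#2020-03-01#00:00:00#2020-03-01#23:59:59]",
    "[morning#2020-03-01#06:00:00#2020-03-01#11:59:59][daily#2020-03-01#00:00:00#2020-03-01#23:59:59]"),
  stringsAsFactors = FALSE)

day_segment <- "daily"
date_regex <- "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2}"
hour_regex <- "[0-9]{2}:[0-9]{2}:[0-9]{2}"

messages <- messages %>%
  # keep only rows assigned to the requested segment label
  filter(grepl(paste0("\\[", day_segment, "#"), assigned_segments)) %>%
  # pull out the full "[label#start_date#start_time#end_date#end_time]" instance
  mutate(local_segment = str_extract(assigned_segments,
           paste0("\\[", day_segment, "#", date_regex, "#", hour_regex, "#", date_regex, "#", hour_regex, "\\]")),
         # drop the surrounding brackets
         local_segment = str_sub(local_segment, 2, -2))

# local_segment is now e.g. "daily#2020-03-01#00:00:00#2020-03-01#23:59:59",
# which messages_features.R later splits on "#" with tidyr::separate()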