From 09753905f220003c31e128cb40ca174c7927ede3 Mon Sep 17 00:00:00 2001 From: JulioV Date: Tue, 31 Mar 2020 13:33:03 -0400 Subject: [PATCH] Refactor SMS features --- src/features/sms/sms_base.R | 54 +++++++++++++++++++++++++++++++++++++ src/features/sms_metrics.R | 50 +++++++--------------------------- 2 files changed, 64 insertions(+), 40 deletions(-) create mode 100644 src/features/sms/sms_base.R diff --git a/src/features/sms/sms_base.R b/src/features/sms/sms_base.R new file mode 100644 index 00000000..08dea7f5 --- /dev/null +++ b/src/features/sms/sms_base.R @@ -0,0 +1,54 @@ +filter_by_day_segment <- function(data, day_segment) { + if(day_segment %in% c("morning", "afternoon", "evening", "night")) + data <- data %>% filter(local_day_segment == day_segment) + return(data) +} + +base_sms_features <- function(sms, sms_type, day_segment, requested_features){ + # Output dataframe + features = data.frame(local_date = character(), stringsAsFactors = FALSE) + + # The name of the features this function can compute + base_features_names <- c("countmostfrequentcontact", "count", "distinctcontacts", "timefirstsms", "timelastsms") + + # The subset of requested features this function can compute + features_to_compute <- intersect(base_features_names, requested_features) + + # Filter rows that belong to the sms type and day segment of interest + sms <- sms %>% filter(message_type == ifelse(sms_type == "received", "1", "2")) %>% + filter_by_day_segment(day_segment) + + # If there are not features or data to work with, return an empty df with appropiate columns names + if(length(features_to_compute) == 0) + return(features) + if(nrow(sms) < 1) + return(cbind(features, read.csv(text = paste(paste("sms", sms_type, day_segment, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE))) + + for(feature_name in features_to_compute){ + if(feature_name == "countmostfrequentcontact"){ + # Get the number of messages for the most frequent contact throughout the study + feature <- sms %>% group_by(trace) %>% + mutate(N=n()) %>% + ungroup() %>% + filter(N == max(N)) %>% + group_by(local_date) %>% + summarise(!!paste("sms", sms_type, day_segment, feature_name, sep = "_") := n()) %>% + replace(is.na(.), 0) + + features <- merge(features, feature, by="local_date", all = TRUE) + } else { + feature <- sms %>% + group_by(local_date) + + feature <- switch(feature_name, + "count" = feature %>% summarise(!!paste("sms", sms_type, day_segment, feature_name, sep = "_") := n()), + "distinctcontacts" = feature %>% summarise(!!paste("sms", sms_type, day_segment, feature_name, sep = "_") := n_distinct(trace)), + "timefirstsms" = feature %>% summarise(!!paste("sms", sms_type, day_segment, feature_name, sep = "_") := first(local_hour) + (first(local_minute)/60)), + "timelastsms" = feature %>% summarise(!!paste("sms", sms_type, day_segment, feature_name, sep = "_") := last(local_hour) + (last(local_minute)/60))) + + features <- merge(features, feature, by="local_date", all = TRUE) + } + } + + return(features) +} \ No newline at end of file diff --git a/src/features/sms_metrics.R b/src/features/sms_metrics.R index c20298a4..60473fd5 100644 --- a/src/features/sms_metrics.R +++ b/src/features/sms_metrics.R @@ -1,50 +1,20 @@ +# If you want to implement extra features, source(..) a new file and duplicate the line "features <- merge(...)", then +# swap base_sms_features(...) for your own function + source("packrat/init.R") - -library(dplyr) - -filter_by_day_segment <- function(data, day_segment) { - if(day_segment %in% c("morning", "afternoon", "evening", "night")) - data <- data %>% filter(local_day_segment == day_segment) - - return(data %>% group_by(local_date)) -} - -compute_sms_feature <- function(sms, metric, day_segment){ - if(metric == "countmostfrequentcontact"){ - # Get the most frequent contact - sms <- sms %>% group_by(trace) %>% - mutate(N=n()) %>% - ungroup() %>% - filter(N == max(N)) - - return(sms %>% - filter_by_day_segment(day_segment) %>% - summarise(!!paste("sms", sms_type, day_segment, metric, sep = "_") := n())) - } else { - sms <- sms %>% filter_by_day_segment(day_segment) - feature <- switch(metric, - "count" = sms %>% summarise(!!paste("sms", sms_type, day_segment, metric, sep = "_") := n()), - "distinctcontacts" = sms %>% summarise(!!paste("sms", sms_type, day_segment, metric, sep = "_") := n_distinct(trace)), - "timefirstsms" = sms %>% summarise(!!paste("sms", sms_type, day_segment, metric, sep = "_") := first(local_hour) + (first(local_minute)/60)), - "timelastsms" = sms %>% summarise(!!paste("sms", sms_type, day_segment, metric, sep = "_") := last(local_hour) + (last(local_minute)/60))) - return(feature) - } -} +source("src/features/sms/sms_base.R") +library(dplyr, warn.conflicts = FALSE) sms <- read.csv(snakemake@input[[1]]) day_segment <- snakemake@params[["day_segment"]] metrics <- snakemake@params[["metrics"]] sms_type <- snakemake@params[["sms_type"]] -features = data.frame(local_date = character(), stringsAsFactors = FALSE) +features <- data.frame(local_date = character(), stringsAsFactors = FALSE) -sms <- sms %>% filter(message_type == ifelse(sms_type == "received", "1", "2")) +# Compute base SMS features +features <- merge(features, base_sms_features(sms, sms_type, day_segment, metrics), by="local_date", all = TRUE) -for(metric in metrics){ - feature <- compute_sms_feature(sms, metric, day_segment) - features <- merge(features, feature, by="local_date", all = TRUE) -} - -if("countmostfrequentcontact" %in% metrics) - features <- features %>% mutate_at(vars(contains('countmostfrequentcontact')), funs(ifelse(is.na(.), 0, .))) +if(ncol(features) != length(metrics) + 1) + stop(paste0("The number of features in the output dataframe (=", ncol(features),") does not match the expected value (=", length(metrics)," + 1). Verify your SMS feature extraction functions")) write.csv(features, snakemake@output[[1]], row.names = FALSE)