From cf9de4dedd6a2581ef572b4c41923193b013d561 Mon Sep 17 00:00:00 2001 From: JulioV Date: Wed, 6 Nov 2019 15:38:08 -0500 Subject: [PATCH] Refactor sms metrics to produce a single file --- Snakefile | 7 +++-- config.yaml | 15 ++++++----- rules/features.snakefile | 8 +++--- src/features/communication_sms_metrics.R | 23 ----------------- src/features/sms_metrics.R | 33 ++++++++++++++++++++++++ 5 files changed, 48 insertions(+), 38 deletions(-) delete mode 100644 src/features/communication_sms_metrics.R create mode 100644 src/features/sms_metrics.R diff --git a/Snakefile b/Snakefile index 26076591..e1937d0d 100644 --- a/Snakefile +++ b/Snakefile @@ -9,11 +9,10 @@ rule all: expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]), expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]), expand("data/interim/{pid}/phone_valid_sensed_days.csv", pid=config["PIDS"]), - expand("data/processed/{pid}/com_sms_{sms_type}_{day_segment}_{metric}.csv", + expand("data/processed/{pid}/sms_{sms_type}_{day_segment}.csv", pid=config["PIDS"], - sms_type = config["COM_SMS"]["SMS_TYPES"], - day_segment = config["COM_SMS"]["DAY_SEGMENTS"], - metric = config["COM_SMS"]["METRICS"]), + sms_type = config["SMS"]["TYPES"], + day_segment = config["SMS"]["DAY_SEGMENTS"]), expand("data/processed/{pid}/call_{call_type}_{segment}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"], diff --git a/config.yaml b/config.yaml index bdf4b451..291525ae 100644 --- a/config.yaml +++ b/config.yaml @@ -22,14 +22,15 @@ DOWNLOAD_DATASET: READABLE_DATETIME: FIXED_TIMEZONE: *timezone -# Communication SMS features config -COM_SMS: - SMS_TYPES : [received, sent] - DAY_SEGMENTS: *day_segments - METRICS: [count, distinctcontacts] +# Communication SMS features config, TYPES and METRICS keys need to match +SMS: + TYPES : [received, sent] + METRICS: + received: [count, distinctcontacts] + sent: [count, distinctcontacts] + DAY_SEGMENTS: *day_segments -# 
Communication call features config -# Separate configurations for missed and taken calls +# Communication call features config, TYPES and METRICS keys need to match CALLS: TYPES: [missed, incoming, outgoing] METRICS: diff --git a/rules/features.snakefile b/rules/features.snakefile index 943b494b..5a9d4ec4 100644 --- a/rules/features.snakefile +++ b/rules/features.snakefile @@ -1,14 +1,14 @@ -rule communication_sms_metrics: +rule sms_metrics: input: "data/raw/{pid}/messages_with_datetime.csv" params: sms_type = "{sms_type}", day_segment = "{day_segment}", - metric = "{metric}" + metrics = lambda wildcards: config["SMS"]["METRICS"][wildcards.sms_type] output: - "data/processed/{pid}/com_sms_{sms_type}_{day_segment}_{metric}.csv" + "data/processed/{pid}/sms_{sms_type}_{day_segment}.csv" script: - "../src/features/communication_sms_metrics.R" + "../src/features/sms_metrics.R" rule call_metrics: input: diff --git a/src/features/communication_sms_metrics.R b/src/features/communication_sms_metrics.R deleted file mode 100644 index 9aac5a85..00000000 --- a/src/features/communication_sms_metrics.R +++ /dev/null @@ -1,23 +0,0 @@ -source("packrat/init.R") - -library(dplyr) - -sms <- read.csv(snakemake@input[[1]]) -day_segment <- snakemake@params[["day_segment"]] -metric <- snakemake@params[["metric"]] -sms_type <- snakemake@params[["sms_type"]] -output_file <- snakemake@output[[1]] - -metrics <- sms %>% filter(message_type == ifelse(sms_type == "received", "1", "2")) - -if(day_segment == "daily"){ - metrics <- metrics %>% group_by(local_date) -} else { - metrics <- metrics %>% filter(day_segment == local_day_segment) %>% group_by(local_date) -} - -metrics <- switch(metric, - "count" = metrics %>% summarise(!!paste("com", "sms", sms_type, day_segment, metric, sep = "_") := n()), - "distinctcontacts" = metrics %>% summarise(!!paste("com", "sms", sms_type, day_segment, metric, sep = "_") := n_distinct(trace))) - -write.csv(na.omit(metrics), output_file, row.names = F) \ No 
newline at end of file
diff --git a/src/features/sms_metrics.R b/src/features/sms_metrics.R
new file mode 100644
index 00000000..fec0c2b6
--- /dev/null
+++ b/src/features/sms_metrics.R
@@ -0,0 +1,57 @@
+source("packrat/init.R")
+
+library(dplyr)
+
+# Keep only the rows of one day segment and group the result by local_date.
+#
+# data: data frame with local_day_segment and local_date columns.
+# day_segment: "morning", "afternoon", "evening" or "night" filters the rows;
+#   any other value (e.g. "daily") keeps every row.
+# Returns `data` grouped by local_date.
+filter_by_day_segment <- function(data, day_segment) {
+  if (day_segment %in% c("morning", "afternoon", "evening", "night")) {
+    data <- data %>% filter(local_day_segment == day_segment)
+  }
+
+  data %>% group_by(local_date)
+}
+
+# Compute one per-day SMS feature.
+#
+# sms: messages already restricted to a single message type.
+# metric: "count" (rows per day) or "distinctcontacts" (distinct trace values
+#   per day); anything else is an error instead of a silent NULL.
+# day_segment: forwarded to filter_by_day_segment().
+# sms_type: label embedded in the output column name. It is an explicit
+#   argument so the function no longer depends on a global variable.
+# Returns a data frame with local_date and one column named
+#   com_sms_<sms_type>_<day_segment>_<metric>.
+compute_sms_feature <- function(sms, metric, day_segment, sms_type) {
+  feature_name <- paste("com", "sms", sms_type, day_segment, metric, sep = "_")
+  grouped <- filter_by_day_segment(sms, day_segment)
+
+  switch(metric,
+    "count" = grouped %>% summarise(!!feature_name := n()),
+    "distinctcontacts" = grouped %>% summarise(!!feature_name := n_distinct(trace)),
+    stop("Unsupported SMS metric: ", metric, call. = FALSE)
+  )
+}
+
+sms <- read.csv(snakemake@input[[1]])
+day_segment <- snakemake@params[["day_segment"]]
+metrics <- snakemake@params[["metrics"]]
+sms_type <- snakemake@params[["sms_type"]]
+
+# "received" selects message_type 1; any other sms_type selects 2.
+message_type_code <- if (sms_type == "received") "1" else "2"
+sms <- sms %>% filter(message_type == message_type_code)
+
+# Seed with an empty local_date column so the output keeps that column even
+# when no metrics are configured, then outer-join one column per metric.
+features <- data.frame(local_date = character(), stringsAsFactors = FALSE)
+for (metric in metrics) {
+  feature <- compute_sms_feature(sms, metric, day_segment, sms_type)
+  features <- merge(features, feature, by = "local_date", all = TRUE)
+}
+
+write.csv(features, snakemake@output[[1]], row.names = FALSE)