diff --git a/Snakefile b/Snakefile
index ae30484c..26076591 100644
--- a/Snakefile
+++ b/Snakefile
@@ -14,16 +14,10 @@ rule all:
                sms_type = config["COM_SMS"]["SMS_TYPES"],
                day_segment = config["COM_SMS"]["DAY_SEGMENTS"],
                metric = config["COM_SMS"]["METRICS"]),
-        expand("data/processed/{pid}/com_call_{call_type}_{segment}_{metric}.csv",
+        expand("data/processed/{pid}/call_{call_type}_{segment}.csv",
                pid=config["PIDS"],
-               call_type = config["COM_CALL"]["CALL_TYPE_MISSED"],
-               segment = config["COM_CALL"]["DAY_SEGMENTS"],
-               metric = config["COM_CALL"]["METRICS_MISSED"]),
-        expand("data/processed/{pid}/com_call_{call_type}_{segment}_{metric}.csv",
-               pid=config["PIDS"],
-               call_type = config["COM_CALL"]["CALL_TYPE_TAKEN"],
-               segment = config["COM_CALL"]["DAY_SEGMENTS"],
-               metric = config["COM_CALL"]["METRICS_TAKEN"]),
+               call_type=config["CALLS"]["TYPES"],
+               segment = config["CALLS"]["DAY_SEGMENTS"]),
         expand("data/processed/{pid}/location_barnett.csv", pid=config["PIDS"]),
         expand("data/processed/{pid}/bluetooth_{segment}.csv",
                pid=config["PIDS"],
diff --git a/config.yaml b/config.yaml
index a1818aba..bdf4b451 100644
--- a/config.yaml
+++ b/config.yaml
@@ -30,12 +30,13 @@ COM_SMS:
 
 # Communication call features config
 # Separate configurations for missed and taken calls
-COM_CALL:
-  CALL_TYPE_MISSED : [missed]
-  CALL_TYPE_TAKEN : [incoming, outgoing]
+CALLS:
+  TYPES: [missed, incoming, outgoing]
+  METRICS:
+    missed: [count, distinctcontacts]
+    incoming: [count, distinctcontacts, meanduration, sumduration, hubermduration, varqnduration, entropyduration]
+    outgoing: [count, distinctcontacts, meanduration, sumduration, hubermduration, varqnduration, entropyduration]
   DAY_SEGMENTS: *day_segments
-  METRICS_MISSED: [count, distinctcontacts]
-  METRICS_TAKEN: [count, distinctcontacts, meanduration, sumduration, hubermduration, varqnduration, entropyduration]
 
 PHONE_VALID_SENSED_DAYS:
   BIN_SIZE: 5 # (in minutes)
diff --git a/rules/features.snakefile b/rules/features.snakefile
index 125cee80..943b494b 100644
--- a/rules/features.snakefile
+++ b/rules/features.snakefile
@@ -10,15 +10,15 @@ rule communication_sms_metrics:
     script:
         "../src/features/communication_sms_metrics.R"
 
-rule communication_call_metrics:
+rule call_metrics:
     input:
         "data/raw/{pid}/calls_with_datetime.csv"
     params:
         call_type = "{call_type}",
         day_segment = "{day_segment}",
-        metric = "{metric}"
+        metrics = lambda wildcards: config["CALLS"]["METRICS"][wildcards.call_type]
     output:
-        "data/processed/{pid}/com_call_{call_type}_{day_segment}_{metric}.csv"
+        "data/processed/{pid}/call_{call_type}_{day_segment}.csv"
     script:
         "../src/features/communication_call_metrics.R"
 
diff --git a/src/features/communication_call_metrics.R b/src/features/communication_call_metrics.R
index 5bc54bb5..3ca52704 100644
--- a/src/features/communication_call_metrics.R
+++ b/src/features/communication_call_metrics.R
@@ -4,27 +4,62 @@ library(dplyr)
 library(entropy)
 library(robustbase)
 
-calls <- read.csv(snakemake@input[[1]])
-day_segment <- snakemake@params[["day_segment"]]
-metric <- snakemake@params[["metric"]]
-type <- snakemake@params[["call_type"]]
-output_file <- snakemake@output[[1]]
+# Restrict `data` to one day segment and group it by local date.
+#
+# data:        data frame with `local_date` and `local_day_segment` columns.
+# day_segment: "morning", "afternoon", "evening" or "night" keeps only the
+#              rows of that segment; any other value keeps every row.
+# Returns `data` grouped by `local_date`.
+filter_by_day_segment <- function(data, day_segment) {
+  if (day_segment %in% c("morning", "afternoon", "evening", "night")) {
+    # `!!` forces the function argument so that a `day_segment` column in
+    # `data` can never shadow it inside filter().
+    data <- data %>% filter(local_day_segment == !!day_segment)
+  }
 
-metrics <- calls %>% filter(call_type == ifelse(type == "incoming", "1", ifelse(type == "outgoing", "2", "3")))
-
-if(day_segment == "daily"){
-  metrics <- metrics %>% group_by(local_date)
-} else {
-  metrics <- metrics %>% filter(day_segment == local_day_segment) %>% group_by(local_date)
+  data %>% group_by(local_date)
 }
 
-metrics <- switch(metric,
-  "count" = metrics %>% summarise(!!paste("com", "call", type, day_segment, metric, sep = "_") := n()),
-  "distinctcontacts" = metrics %>% summarise(!!paste("com", "call", type, day_segment, metric, sep = "_") := n_distinct(trace)),
-  "meanduration" = metrics %>% summarise(!!paste("com", "call", type, day_segment, metric, sep = "_") := mean(call_duration)),
-  "sumduration" = metrics %>% summarise(!!paste("com", "call", type, day_segment, metric, sep = "_") := sum(call_duration)),
-  "hubermduration" = metrics %>% summarise(!!paste("com", "call", type, day_segment, metric, sep = "_") := huberM(call_duration)$mu),
-  "varqnduration" = metrics %>% summarise(!!paste("com", "call", type, day_segment, metric, sep = "_") := Qn(call_duration)),
-  "entropyduration" = metrics %>% summarise(!!paste("com", "call", type, day_segment, metric, sep = "_") := entropy.MillerMadow(call_duration)))
+# Compute one call feature per local date.
+#
+# calls:       data frame of call records (uses `trace` and `call_duration`).
+# metric:      "count", "distinctcontacts", "meanduration", "sumduration",
+#              "hubermduration", "varqnduration" or "entropyduration";
+#              any other value raises an error.
+# day_segment: forwarded to filter_by_day_segment().
+# type:        call type label; only used in the output column name.
+# Returns one row per `local_date` with a single feature column named
+# call_<type>_<day_segment>_<metric>.
+compute_call_feature <- function(calls, metric, day_segment, type) {
+  calls <- calls %>% filter_by_day_segment(day_segment)
+  feature_name <- paste("call", type, day_segment, metric, sep = "_")
+  switch(metric,
+    "count" = calls %>% summarise(!!feature_name := n()),
+    "distinctcontacts" = calls %>% summarise(!!feature_name := n_distinct(trace)),
+    "meanduration" = calls %>% summarise(!!feature_name := mean(call_duration)),
+    "sumduration" = calls %>% summarise(!!feature_name := sum(call_duration)),
+    "hubermduration" = calls %>% summarise(!!feature_name := huberM(call_duration)$mu),
+    "varqnduration" = calls %>% summarise(!!feature_name := Qn(call_duration)),
+    "entropyduration" = calls %>% summarise(!!feature_name := entropy.MillerMadow(call_duration)),
+    stop("Unknown call metric: ", metric, call. = FALSE)
+  )
+}
 
-write.csv(na.omit(metrics), output_file, row.names = F)
+calls <- read.csv(snakemake@input[[1]], stringsAsFactors = FALSE)
+day_segment <- snakemake@params[["day_segment"]]
+metrics <- snakemake@params[["metrics"]]
+type <- snakemake@params[["call_type"]]
+features <- data.frame(local_date = character(), stringsAsFactors = FALSE)
+
+# Keep only the requested call type: "incoming" maps to code "1",
+# "outgoing" to "2", and any other value to "3".
+type_code <- switch(type, "incoming" = "1", "outgoing" = "2", "3")
+calls <- calls %>% filter(call_type == !!type_code)
+
+# Full outer join so a date present for any metric is kept.
+for (metric in metrics) {
+  feature <- compute_call_feature(calls, metric, day_segment, type)
+  features <- merge(features, feature, by = "local_date", all = TRUE)
+}
+
+write.csv(features, snakemake@output[[1]], row.names = FALSE)