Refactor sms metrics to produce a single file

replace/3ec1fc022f2c0c8f6f9183e6a105545c75def296
JulioV 2019-11-06 15:38:08 -05:00
parent 1d1c8e6bf1
commit cf9de4dedd
5 changed files with 48 additions and 38 deletions

View File

@ -9,11 +9,10 @@ rule all:
expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]), expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]), expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]),
expand("data/interim/{pid}/phone_valid_sensed_days.csv", pid=config["PIDS"]), expand("data/interim/{pid}/phone_valid_sensed_days.csv", pid=config["PIDS"]),
expand("data/processed/{pid}/com_sms_{sms_type}_{day_segment}_{metric}.csv", expand("data/processed/{pid}/sms_{sms_type}_{day_segment}.csv",
pid=config["PIDS"], pid=config["PIDS"],
sms_type = config["COM_SMS"]["SMS_TYPES"], sms_type = config["SMS"]["TYPES"],
day_segment = config["COM_SMS"]["DAY_SEGMENTS"], day_segment = config["SMS"]["DAY_SEGMENTS"]),
metric = config["COM_SMS"]["METRICS"]),
expand("data/processed/{pid}/call_{call_type}_{segment}.csv", expand("data/processed/{pid}/call_{call_type}_{segment}.csv",
pid=config["PIDS"], pid=config["PIDS"],
call_type=config["CALLS"]["TYPES"], call_type=config["CALLS"]["TYPES"],

View File

@ -22,14 +22,15 @@ DOWNLOAD_DATASET:
READABLE_DATETIME: READABLE_DATETIME:
FIXED_TIMEZONE: *timezone FIXED_TIMEZONE: *timezone
# Communication SMS features config # Communication SMS features config, TYPES and METRICS keys need to match
COM_SMS: SMS:
SMS_TYPES : [received, sent] TYPES : [received, sent]
METRICS:
received: [count, distinctcontacts]
sent: [count, distinctcontacts]
DAY_SEGMENTS: *day_segments DAY_SEGMENTS: *day_segments
METRICS: [count, distinctcontacts]
# Communication call features config # Communication call features config, TYPES and METRICS keys need to match
# Separate configurations for missed and taken calls
CALLS: CALLS:
TYPES: [missed, incoming, outgoing] TYPES: [missed, incoming, outgoing]
METRICS: METRICS:

View File

@ -1,14 +1,14 @@
rule communication_sms_metrics: rule sms_metrics:
input: input:
"data/raw/{pid}/messages_with_datetime.csv" "data/raw/{pid}/messages_with_datetime.csv"
params: params:
sms_type = "{sms_type}", sms_type = "{sms_type}",
day_segment = "{day_segment}", day_segment = "{day_segment}",
metric = "{metric}" metrics = lambda wildcards: config["SMS"]["METRICS"][wildcards.sms_type]
output: output:
"data/processed/{pid}/com_sms_{sms_type}_{day_segment}_{metric}.csv" "data/processed/{pid}/sms_{sms_type}_{day_segment}.csv"
script: script:
"../src/features/communication_sms_metrics.R" "../src/features/sms_metrics.R"
rule call_metrics: rule call_metrics:
input: input:

View File

@ -1,23 +0,0 @@
source("packrat/init.R")
library(dplyr)
# Compute a single per-day SMS metric for one message type and one day
# segment, then write it to the Snakemake output file.
sms <- read.csv(snakemake@input[[1]])
day_segment <- snakemake@params[["day_segment"]]
metric <- snakemake@params[["metric"]]
sms_type <- snakemake@params[["sms_type"]]
output_file <- snakemake@output[[1]]

# "received" maps to message_type "1"; every other sms_type maps to "2".
# sms_type is a scalar, so a plain if/else is used instead of ifelse().
message_code <- if (sms_type == "received") "1" else "2"
metrics <- sms %>% filter(message_type == !!message_code)

# "daily" keeps every message; any other value keeps only the rows whose
# local_day_segment equals it. `!!` inlines the script-level value so that a
# column named day_segment in the data cannot shadow it inside filter().
if (day_segment != "daily") {
  metrics <- metrics %>% filter(local_day_segment == !!day_segment)
}
metrics <- metrics %>% group_by(local_date)

# Output column name: com_sms_<sms_type>_<day_segment>_<metric>
column_name <- paste("com", "sms", sms_type, day_segment, metric, sep = "_")
metrics <- switch(metric,
  "count" = metrics %>%
    summarise(!!column_name := n()),
  "distinctcontacts" = metrics %>%
    summarise(!!column_name := n_distinct(trace)),
  # Without a default, switch() returns NULL and an empty file is written
  # silently; fail loudly on a misconfigured metric instead.
  stop("Unsupported SMS metric: ", metric, call. = FALSE)
)

write.csv(na.omit(metrics), output_file, row.names = FALSE)

View File

@ -0,0 +1,33 @@
source("packrat/init.R")
library(dplyr)
#' Restrict rows to a day segment and group them by local date
#'
#' @param data Data frame with a `local_date` column and, when a segment
#'   filter applies, a `local_day_segment` column.
#' @param day_segment Segment name. "morning", "afternoon", "evening" and
#'   "night" keep only the rows whose `local_day_segment` equals it; any other
#'   value keeps every row.
#' @return `data`, possibly filtered, grouped by `local_date`.
filter_by_day_segment <- function(data, day_segment) {
  if (day_segment %in% c("morning", "afternoon", "evening", "night")) {
    # `!!` inlines the argument's value so that a column named `day_segment`
    # in `data` cannot shadow it inside filter().
    data <- data %>% filter(local_day_segment == !!day_segment)
  }
  data %>% group_by(local_date)
}
#' Compute one per-day SMS feature
#'
#' @param sms Data frame of SMS rows; passed through
#'   `filter_by_day_segment()`, and `trace` is read for "distinctcontacts".
#' @param metric Either "count" (number of rows per `local_date`) or
#'   "distinctcontacts" (number of distinct `trace` values per `local_date`).
#' @param day_segment Day segment forwarded to `filter_by_day_segment()` and
#'   embedded in the output column name.
#' @return A data frame with `local_date` and one feature column named
#'   `com_sms_<sms_type>_<day_segment>_<metric>`.
compute_sms_feature <- function(sms, metric, day_segment) {
  grouped <- sms %>% filter_by_day_segment(day_segment)

  # NOTE: `sms_type` is not an argument; it is resolved from the enclosing
  # (script-level) environment and must be defined before this is called.
  feature_name <- paste("com", "sms", sms_type, day_segment, metric, sep = "_")

  switch(metric,
    "count" = grouped %>%
      summarise(!!feature_name := n()),
    "distinctcontacts" = grouped %>%
      summarise(!!feature_name := n_distinct(trace)),
    # Without a default, switch() returns NULL and the caller fails later
    # with an unrelated message; report the misconfigured metric here.
    stop("Unsupported SMS metric: ", metric, call. = FALSE)
  )
}
# Script body: compute every configured metric for one SMS type and one day
# segment, and write them side by side (one row per local_date) to a single
# CSV file.
sms <- read.csv(snakemake@input[[1]])
day_segment <- snakemake@params[["day_segment"]]
metrics <- snakemake@params[["metrics"]]
# Kept under this exact script-level name: compute_sms_feature() reads it.
sms_type <- snakemake@params[["sms_type"]]

# "received" maps to message_type "1"; every other sms_type maps to "2".
# sms_type is a scalar, so a plain if/else is used instead of ifelse().
message_code <- if (sms_type == "received") "1" else "2"
sms <- sms %>% filter(message_type == !!message_code)

# Start from an empty frame keyed by local_date and outer-join each metric's
# column onto it, so the output has one column per metric.
features <- data.frame(local_date = character(), stringsAsFactors = FALSE)
for (metric in metrics) {
  feature <- compute_sms_feature(sms, metric, day_segment)
  features <- merge(features, feature, by = "local_date", all = TRUE)
}

write.csv(features, snakemake@output[[1]], row.names = FALSE)