Refactor PHONE_CALLS RAPIDS provider to compute features based on call episodes or events

pull/167/head
Meng Li 2021-09-01 18:54:39 -04:00
parent 2e553dc9e7
commit a8a178486b
10 changed files with 53 additions and 8 deletions

View File

@ -45,7 +45,12 @@ for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
# Queue the target files for every PHONE_CALLS provider whose COMPUTE flag is enabled.
for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"]))
# NOTE(review): phone_calls_with_datetime.csv is requested here and again in the else branch below — confirm whether this unconditional request is intended.
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
# Only the RAPIDS provider with FEATURES_TYPE == "EPISODES" requests the episode files; every other case requests the event-level file.
if (provider == "RAPIDS") and (config["PHONE_CALLS"]["PROVIDERS"][provider]["FEATURES_TYPE"] == "EPISODES"):
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
else:
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
# Per-provider feature file; its name carries the script language derived from SRC_SCRIPT and the lower-cased provider key.
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
# Processed per-sensor and all-sensor feature files for each participant.
files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))

View File

@ -181,6 +181,7 @@ PHONE_CALLS:
PROVIDERS:
RAPIDS:
COMPUTE: False
FEATURES_TYPE: EPISODES # EVENTS or EPISODES
CALL_TYPES: [missed, incoming, outgoing]
FEATURES:
missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]

View File

@ -1,4 +1,6 @@
# Change Log
## v1.6.0
- Refactor PHONE_CALLS RAPIDS provider to compute features based on call episodes or events
## v1.5.0
- Update Barnett location features with faster Python implementation
- Fix rounding bug in data yield features

View File

@ -26,6 +26,7 @@ Parameters description for `[PHONE_CALLS][PROVIDERS][RAPIDS]`:
| Key                        | Description |
|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|`[COMPUTE]`| Set to `True` to extract `PHONE_CALLS` features from the `RAPIDS` provider|
|`[FEATURES_TYPE]`| Set to `EPISODES` to extract features based on call episodes, or to `EVENTS` to extract features based on individual call events.|
| `[CALL_TYPES]` | The particular call_type that will be analyzed. The options for this parameter are incoming, outgoing or missed. |
| `[FEATURES]` | Features to be computed for `outgoing`, `incoming`, and `missed` calls. Note that the same features are available for both incoming and outgoing calls, while missed calls have their own set of features. See the tables below. |
@ -60,4 +61,4 @@ Features description for `[PHONE_CALLS][PROVIDERS][RAPIDS]` missed calls:
!!! note "Assumptions/Observations"
1. Traces for iOS calls are unique even for the same contact calling a participant more than once, which renders `countmostfrequentcontact` meaningless and `distinctcontacts` equal to the total number of traces.
2. `[CALL_TYPES]` and `[FEATURES]` keys in `config.yaml` need to match. For example, `[CALL_TYPES]` `outgoing` matches the `[FEATURES]` key `outgoing`
3. iOS calls data is transformed to match Android calls data format. See our [algorithm](algorithms/phone-algorithms.md#phone-calls)
3. iOS calls data is transformed to match Android calls data format.

View File

@ -27,6 +27,12 @@ def get_locations_python_input(wildcards):
else:
return "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
def get_calls_input(wildcards):
    """Return the calls file that a phone-calls feature rule should read.

    Args:
        wildcards: Snakemake wildcards of the requesting rule; only
            ``provider_key`` is read (compared case-insensitively).

    Returns:
        The episodes file path pattern when the provider is RAPIDS and its
        ``FEATURES_TYPE`` is ``"EPISODES"``; otherwise the event-level
        calls file path pattern.
    """
    events_file = "data/raw/{pid}/phone_calls_with_datetime.csv"
    episodes_file = "data/interim/{pid}/phone_calls_episodes_resampled_with_datetime.csv"

    # Other providers never read the RAPIDS settings, so bail out before touching them.
    if wildcards.provider_key.upper() != "RAPIDS":
        return events_file

    features_type = config["PHONE_CALLS"]["PROVIDERS"]["RAPIDS"]["FEATURES_TYPE"]
    if features_type != "EPISODES":
        return events_file
    return episodes_file
def find_features_files(wildcards):
feature_files = []
for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():

View File

@ -264,9 +264,17 @@ rule phone_bluetooth_r_features:
script:
"../src/features/entry.R"
rule calls_python_features:
# Runs calls_episodes.py to produce the per-participant phone_calls_episodes.csv file.
# NOTE(review): both `sensor_data` and `calls` are listed as inputs — confirm the script needs both.
rule calls_episodes:
input:
sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
calls = "data/raw/{pid}/phone_calls_raw.csv"
output:
"data/interim/{pid}/phone_calls_episodes.csv"
script:
"../src/features/phone_calls/episodes/calls_episodes.py"
rule phone_calls_python_features:
input:
sensor_data = get_calls_input,
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -277,9 +285,9 @@ rule calls_python_features:
script:
"../src/features/entry.py"
rule calls_r_features:
rule phone_calls_r_features:
input:
sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
sensor_data = get_calls_input,
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],

View File

@ -0,0 +1,7 @@
import pandas as pd
# Convert each raw call row into one episode with an explicit start and end timestamp.
raw_calls = pd.read_csv(snakemake.input["calls"])
episodes = raw_calls.rename(columns={"timestamp": "start_timestamp"})

# presumably call_duration is in seconds and timestamps are in milliseconds — TODO confirm
duration_offset = episodes["call_duration"] * 1000
episodes["end_timestamp"] = episodes["start_timestamp"] + duration_offset

# The row index of the raw file doubles as the episode identifier.
episodes["episode_id"] = episodes.index

episode_columns = [
    "episode_id",
    "device_id",
    "call_type",
    "trace",
    "start_timestamp",
    "end_timestamp",
]
episodes[episode_columns].to_csv(snakemake.output[0], index=False)

View File

@ -7,7 +7,7 @@ Mode <- function(v) {
uniqv[which.max(tabulate(match(v, uniqv)))]
}
call_features_of_type <- function(calls, call_type, time_segment, requested_features){
call_features_of_type <- function(calls, features_type, call_type, time_segment, requested_features){
# Output dataframe
features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
@ -22,6 +22,15 @@ call_features_of_type <- function(calls, call_type, time_segment, requested_feat
if(nrow(calls) < 1)
return(cbind(features, read.csv(text = paste(paste(call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
if(features_type == "EPISODES"){
calls <- calls %>%
mutate(call_duration = (end_timestamp - start_timestamp) / 1000) %>%
separate(local_start_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
mutate(local_hour = as.numeric(local_hour),
local_minute = as.numeric(local_minute))
}
for(feature_name in features_to_compute){
if(feature_name == "countmostfrequentcontact"){
# Get the number of calls for the most frequent contact throughout the study
@ -62,6 +71,8 @@ call_features_of_type <- function(calls, call_type, time_segment, requested_feat
rapids_features <- function(sensor_data_files, time_segment, provider){
calls_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
calls_data <- calls_data %>% filter_data_by_segment(time_segment)
features_type <- provider[["FEATURES_TYPE"]]
call_types = provider[["CALL_TYPES"]]
call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment"))
@ -74,7 +85,7 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
requested_features <- provider[["FEATURES"]][[call_type]]
calls_of_type <- calls_data %>% filter(call_type == call_type_label)
features <- call_features_of_type(calls_of_type, call_type, time_segment, requested_features)
features <- call_features_of_type(calls_of_type, features_type, call_type, time_segment, requested_features)
call_features <- merge(call_features, features, all=TRUE)
}
call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0)))

View File

@ -1,4 +1,5 @@
library("stringr")
library('purrr')
rapids_log_tag <- "RAPIDS:"

View File

@ -489,6 +489,9 @@ properties:
allOf:
- $ref: "#/definitions/PROVIDER"
- properties:
FEATURES_TYPE:
type: string
enum: [EVENTS, EPISODES]
CALL_TYPES:
type: array
items: