Refactor PHONE_CALLS RAPIDS provider to compute features based on call episodes or events

2021-09-01 18:54:39 -04:00 · 2021-09-01 18:54:39 -04:00 · a8a178486b
parent 2e553dc9e7
commit a8a178486b
10 changed files with 53 additions and 8 deletions
--- a/7
+++ b/7
@ -45,7 +45,12 @@ for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
 for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
    if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
        files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"]))
-        files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
+        if (provider == "RAPIDS") and (config["PHONE_CALLS"]["PROVIDERS"][provider]["FEATURES_TYPE"] == "EPISODES"):
            files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes.csv", pid=config["PIDS"]))
            files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes_resampled.csv", pid=config["PIDS"]))
            files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
        else:
            files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
--- a/config.yaml
+++ b/config.yaml
@ -181,6 +181,7 @@ PHONE_CALLS:
  PROVIDERS:
    RAPIDS:
      COMPUTE: False
      FEATURES_TYPE: EPISODES # EVENTS or EPISODES
      CALL_TYPES: [missed, incoming, outgoing]
      FEATURES:
        missed:  [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
--- a/docs/change-log.md
+++ b/docs/change-log.md
@ -1,4 +1,6 @@
 # Change Log
 ## v1.6.0
 - Refactor PHONE_CALLS RAPIDS provider to compute features based on call episodes or events
 ## v1.5.0
 - Update Barnett location features with faster Python implementation
 - Fix rounding bug in data yield features
--- a/docs/features/phone-calls.md
+++ b/docs/features/phone-calls.md
@ -26,6 +26,7 @@ Parameters description for `[PHONE_CALLS][PROVIDERS][RAPIDS]`:
 | Key&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;        | Description |
 |-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 |`[COMPUTE]`| Set to `True` to extract `PHONE_CALLS` features from the `RAPIDS` provider|
 |`[FEATURES_TYPE]`| Set to `EPISODES` to extract features based on call episodes or `EVENTS` to extract features based on events.|
 | `[CALL_TYPES]`   | The call types that will be analyzed. The options for this parameter are `incoming`, `outgoing`, and `missed`.                                                                                                                                                 |
 | `[FEATURES]`    | Features to be computed for `outgoing`, `incoming`, and `missed` calls. Note that the same features are available for both incoming and outgoing calls, while missed calls have their own set of features. See the tables below. |
@ -60,4 +61,4 @@ Features description for `[PHONE_CALLS][PROVIDERS][RAPIDS]` missed calls:
 !!! note "Assumptions/Observations"
    1. Traces for iOS calls are unique even for the same contact calling a participant more than once, which renders `countmostfrequentcontact` meaningless and `distinctcontacts` equal to the total number of traces.
    2. `[CALL_TYPES]` and `[FEATURES]` keys in `config.yaml` need to match. For example, `[CALL_TYPES]` `outgoing` matches the `[FEATURES]` key `outgoing`
-    3. iOS calls data is transformed to match Android calls data format. See our [algorithm](algorithms/phone-algorithms.md#phone-calls)
+    3. iOS calls data is transformed to match Android calls data format.
--- a/rules/common.smk
+++ b/rules/common.smk
@ -27,6 +27,12 @@ def get_locations_python_input(wildcards):
    else:
        return "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
 def get_calls_input(wildcards):
     """Return the path template of the calls CSV used to compute features.

     The resampled call-episodes file is selected only when the requested
     provider is RAPIDS and its FEATURES_TYPE is set to "EPISODES"; every
     other provider or setting falls back to the raw calls file with
     datetime columns.

     wildcards: Snakemake wildcards object; only `provider_key` is read.
     """
     events_file = "data/raw/{pid}/phone_calls_with_datetime.csv"
     episodes_file = "data/interim/{pid}/phone_calls_episodes_resampled_with_datetime.csv"

     # Providers other than RAPIDS never consult the RAPIDS configuration
     if not (wildcards.provider_key.upper() == "RAPIDS"):
         return events_file

     rapids_provider = config["PHONE_CALLS"]["PROVIDERS"]["RAPIDS"]
     if rapids_provider["FEATURES_TYPE"] == "EPISODES":
         return episodes_file
     return events_file
 def find_features_files(wildcards):
    feature_files = []
    for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():
--- a/rules/features.smk
+++ b/rules/features.smk
@ -264,9 +264,17 @@ rule phone_bluetooth_r_features:
    script:
        "../src/features/entry.R"
-rule calls_python_features:
+rule calls_episodes:
    input:
-        sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
+        calls = "data/raw/{pid}/phone_calls_raw.csv"
    output:
        "data/interim/{pid}/phone_calls_episodes.csv"
    script:
        "../src/features/phone_calls/episodes/calls_episodes.py"
 rule phone_calls_python_features:
    input:
        sensor_data = get_calls_input,
        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
    params:
        provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
@ -277,9 +285,9 @@ rule calls_python_features:
    script:
        "../src/features/entry.py"
-rule calls_r_features:
+rule phone_calls_r_features:
    input:
-        sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
+        sensor_data = get_calls_input,
        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
    params:
        provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
--- a/src/features/phone_calls/episodes/calls_episodes.py
+++ b/src/features/phone_calls/episodes/calls_episodes.py
@ -0,0 +1,7 @@
 import pandas as pd
 # Build one episode per raw call record and write it to the rule's output.
 call_events = pd.read_csv(snakemake.input["calls"])
 episodes = call_events.rename(columns={"timestamp": "start_timestamp"})

 # presumably call_duration is in seconds and timestamps are in milliseconds,
 # hence the factor of 1000 -- TODO confirm against the raw calls schema
 duration_offset = episodes["call_duration"] * 1000
 episodes["end_timestamp"] = episodes["start_timestamp"] + duration_offset

 # The row index of the raw file doubles as the episode identifier
 episodes["episode_id"] = episodes.index

 output_columns = ["episode_id", "device_id", "call_type", "trace", "start_timestamp", "end_timestamp"]
 episodes[output_columns].to_csv(snakemake.output[0], index=False)
--- a/src/features/phone_calls/rapids/main.R
+++ b/src/features/phone_calls/rapids/main.R
@ -7,7 +7,7 @@ Mode <- function(v) {
  uniqv[which.max(tabulate(match(v, uniqv)))]
 }
-call_features_of_type <- function(calls, call_type, time_segment, requested_features){
+call_features_of_type <- function(calls, features_type, call_type, time_segment, requested_features){
    # Output dataframe
    features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
@ -22,6 +22,15 @@ call_features_of_type <- function(calls, call_type, time_segment, requested_feat
    if(nrow(calls) < 1)
        return(cbind(features, read.csv(text = paste(paste(call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
    if(features_type == "EPISODES"){
        calls <- calls %>% 
            mutate(call_duration = (end_timestamp - start_timestamp) / 1000) %>% 
            separate(local_start_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
            separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
            mutate(local_hour = as.numeric(local_hour),
                local_minute = as.numeric(local_minute))
    }
    for(feature_name in features_to_compute){
        if(feature_name == "countmostfrequentcontact"){
            # Get the number of calls for the most frequent contact throughout the study
@ -62,6 +71,8 @@ call_features_of_type <- function(calls, call_type, time_segment, requested_feat
 rapids_features <- function(sensor_data_files, time_segment, provider){
    calls_data <-  read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
    calls_data <- calls_data %>% filter_data_by_segment(time_segment)
    features_type <- provider[["FEATURES_TYPE"]]
    call_types = provider[["CALL_TYPES"]]
    call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment"))
@ -74,7 +85,7 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
        requested_features <- provider[["FEATURES"]][[call_type]]
        calls_of_type <- calls_data %>% filter(call_type == call_type_label)
-        features <- call_features_of_type(calls_of_type, call_type, time_segment, requested_features)
+        features <- call_features_of_type(calls_of_type, features_type, call_type, time_segment, requested_features)
        call_features <- merge(call_features, features, all=TRUE)
    }
    call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0)))
--- a/src/features/utils/utils.R
+++ b/src/features/utils/utils.R
@ -1,4 +1,5 @@
 library("stringr")
 library('purrr')
 rapids_log_tag <- "RAPIDS:"
--- a/tools/config.schema.yaml
+++ b/tools/config.schema.yaml
@ -489,6 +489,9 @@ properties:
            allOf:
                - $ref: "#/definitions/PROVIDER"
                - properties:
                    FEATURES_TYPE:
                      type: string
                      enum: [EVENTS, EPISODES]
                    CALL_TYPES:
                      type: array
                      items: