diff --git a/Snakefile b/Snakefile
index cc1ed4ba..b49392bb 100644
--- a/Snakefile
+++ b/Snakefile
@@ -45,7 +45,12 @@ for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
 for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
     if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
         files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"]))
-        files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
+        if (provider == "RAPIDS") and (config["PHONE_CALLS"]["PROVIDERS"][provider].get("FEATURES_TYPE") == "EPISODES"):
+            files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes.csv", pid=config["PIDS"]))
+            files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes_resampled.csv", pid=config["PIDS"]))
+            files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
+        else:
+            files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
         files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
diff --git a/config.yaml b/config.yaml
index 9a72becc..c73fd07e 100644
--- a/config.yaml
+++ b/config.yaml
@@ -181,6 +181,7 @@ PHONE_CALLS:
   PROVIDERS:
     RAPIDS:
       COMPUTE: False
+      FEATURES_TYPE: EPISODES # EVENTS or EPISODES
       CALL_TYPES: [missed, incoming, outgoing]
       FEATURES:
         missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
diff --git a/docs/change-log.md b/docs/change-log.md
index 48445966..a692df9d 100644
--- a/docs/change-log.md
+++ b/docs/change-log.md
@@ -1,4 +1,6 @@
 # Change Log
+## v1.6.0
+- Refactor PHONE_CALLS RAPIDS provider to compute features based on call episodes or events
 ## v1.5.0
 - Update Barnett location features with faster Python implementation
 - Fix rounding bug in data yield features
diff --git a/docs/features/phone-calls.md b/docs/features/phone-calls.md
index a29ce345..4cded272 100644
--- a/docs/features/phone-calls.md
+++ b/docs/features/phone-calls.md
@@ -26,6 +26,7 @@ Parameters description for `[PHONE_CALLS][PROVIDERS][RAPIDS]`:
 | Key                        | Description |
 |-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 |`[COMPUTE]`| Set to `True` to extract `PHONE_CALLS` features from the `RAPIDS` provider|
+|`[FEATURES_TYPE]`| Set to `EPISODES` to extract features based on call episodes or `EVENTS` to extract features based on events. When the key is absent, features are computed from events.|
 | `[CALL_TYPES]` | The particular call_type that will be analyzed. The options for this parameter are incoming, outgoing or missed. |
 | `[FEATURES]` | Features to be computed for `outgoing`, `incoming`, and `missed` calls. Note that the same features are available for both incoming and outgoing calls, while missed calls has its own set of features. See the tables below. |
 
@@ -60,4 +61,4 @@ Features description for `[PHONE_CALLS][PROVIDERS][RAPIDS]` missed calls:
 !!! note "Assumptions/Observations"
     1. Traces for iOS calls are unique even for the same contact calling a participant more than once which renders `countmostfrequentcontact` meaningless and `distinctcontacts` equal to the total number of traces.
     2. `[CALL_TYPES]` and `[FEATURES]` keys in `config.yaml` need to match. For example, `[CALL_TYPES]` `outgoing` matches the `[FEATURES]` key `outgoing`
-    3. iOS calls data is transformed to match Android calls data format. See our [algorithm](algorithms/phone-algorithms.md#phone-calls)
+    3. iOS calls data is transformed to match Android calls data format.
diff --git a/rules/common.smk b/rules/common.smk
index 8ff4a191..f71b32c4 100644
--- a/rules/common.smk
+++ b/rules/common.smk
@@ -27,6 +27,15 @@ def get_locations_python_input(wildcards):
     else:
         return "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
 
+def get_calls_input(wildcards):
+    # Episode-based RAPIDS features read the resampled episodes file; every other
+    # provider or FEATURES_TYPE reads the calls-with-datetime file under data/raw.
+    # .get() keeps configs that have no FEATURES_TYPE key on that original input.
+    if (wildcards.provider_key.upper() == "RAPIDS") and (config["PHONE_CALLS"]["PROVIDERS"]["RAPIDS"].get("FEATURES_TYPE") == "EPISODES"):
+        return "data/interim/{pid}/phone_calls_episodes_resampled_with_datetime.csv"
+    else:
+        return "data/raw/{pid}/phone_calls_with_datetime.csv"
+
 def find_features_files(wildcards):
     feature_files = []
     for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():
diff --git a/rules/features.smk b/rules/features.smk
index d6e435a7..5f806adb 100644
--- a/rules/features.smk
+++ b/rules/features.smk
@@ -264,9 +264,17 @@ rule phone_bluetooth_r_features:
     script:
         "../src/features/entry.R"
 
-rule calls_python_features:
+rule calls_episodes:
     input:
-        sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
+        calls = "data/raw/{pid}/phone_calls_raw.csv"
+    output:
+        "data/interim/{pid}/phone_calls_episodes.csv"
+    script:
+        "../src/features/phone_calls/episodes/calls_episodes.py"
+
+rule phone_calls_python_features:
+    input:
+        sensor_data = get_calls_input,
         time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
     params:
         provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
@@ -277,9 +285,9 @@ rule calls_python_features:
     script:
         "../src/features/entry.py"
 
-rule calls_r_features:
+rule phone_calls_r_features:
     input:
-        sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
+        sensor_data = get_calls_input,
         time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
     params:
         provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
diff --git a/src/features/phone_calls/episodes/calls_episodes.py b/src/features/phone_calls/episodes/calls_episodes.py
new file mode 100644
index 00000000..96da269f
--- /dev/null
+++ b/src/features/phone_calls/episodes/calls_episodes.py
@@ -0,0 +1,11 @@
+import pandas as pd
+
+# Turn every raw call row into one episode that starts at the call timestamp and
+# ends call_duration later. NOTE(review): the * 1000 presumes call_duration is in
+# seconds and timestamp in milliseconds -- confirm against the raw calls schema.
+calls = pd.read_csv(snakemake.input["calls"]).rename(columns={"timestamp": "start_timestamp"})
+calls["end_timestamp"] = calls["start_timestamp"] + calls["call_duration"] * 1000
+# The default row index of the freshly read frame doubles as the episode identifier.
+calls["episode_id"] = calls.index
+
+calls[["episode_id", "device_id", "call_type", "trace", "start_timestamp", "end_timestamp"]].to_csv(snakemake.output[0], index=False)
diff --git a/src/features/phone_calls/rapids/main.R b/src/features/phone_calls/rapids/main.R
index 39cdfc45..a10dcd7b 100644
--- a/src/features/phone_calls/rapids/main.R
+++ b/src/features/phone_calls/rapids/main.R
@@ -7,7 +7,7 @@ Mode <- function(v) {
   uniqv[which.max(tabulate(match(v, uniqv)))]
 }
 
-call_features_of_type <- function(calls, call_type, time_segment, requested_features){
+call_features_of_type <- function(calls, features_type, call_type, time_segment, requested_features){
     # Output dataframe
     features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
 
@@ -22,6 +22,17 @@ call_features_of_type <- function(calls, call_type, time_segment, requested_feat
     if(nrow(calls) < 1)
         return(cbind(features, read.csv(text = paste(paste(call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
 
+    # Episode rows: derive call_duration from the start/end timestamps and local_hour/local_minute
+    # from local_start_date_time. isTRUE() also covers a provider without FEATURES_TYPE (NULL).
+    if(isTRUE(features_type == "EPISODES")){
+        calls <- calls %>%
+            mutate(call_duration = (end_timestamp - start_timestamp) / 1000) %>%
+            separate(local_start_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
+            separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
+            mutate(local_hour = as.numeric(local_hour),
+                local_minute = as.numeric(local_minute))
+    }
+
     for(feature_name in features_to_compute){
         if(feature_name == "countmostfrequentcontact"){
             # Get the number of messages for the most frequent contact throughout the study
@@ -62,6 +73,8 @@ call_features_of_type <- function(calls, call_type, time_segment, requested_feat
 rapids_features <- function(sensor_data_files, time_segment, provider){
     calls_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
     calls_data <- calls_data %>% filter_data_by_segment(time_segment)
+
+    features_type <- provider[["FEATURES_TYPE"]]
     call_types = provider[["CALL_TYPES"]]
     call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment"))
 
@@ -74,7 +87,7 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
         requested_features <- provider[["FEATURES"]][[call_type]]
         calls_of_type <- calls_data %>% filter(call_type == call_type_label)
 
-        features <- call_features_of_type(calls_of_type, call_type, time_segment, requested_features)
+        features <- call_features_of_type(calls_of_type, features_type, call_type, time_segment, requested_features)
         call_features <- merge(call_features, features, all=TRUE)
     }
     call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0)))
diff --git a/src/features/utils/utils.R b/src/features/utils/utils.R
index 9a3dabce..74437515 100644
--- a/src/features/utils/utils.R
+++ b/src/features/utils/utils.R
@@ -1,4 +1,5 @@
 library("stringr")
+library("purrr")
 
 rapids_log_tag <- "RAPIDS:"
 
diff --git a/tools/config.schema.yaml b/tools/config.schema.yaml
index 5ea33a24..4c3638c1 100644
--- a/tools/config.schema.yaml
+++ b/tools/config.schema.yaml
@@ -489,6 +489,9 @@ properties:
             allOf:
               - $ref: "#/definitions/PROVIDER"
               - properties:
+                  FEATURES_TYPE:
+                    type: string
+                    enum: [EVENTS, EPISODES]
                   CALL_TYPES:
                     type: array
                     items: