Refactor PHONE_CALLS RAPIDS provider to compute features based on call episodes or events
parent
2e553dc9e7
commit
a8a178486b
|
@ -45,7 +45,12 @@ for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
|
|||
for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
|
||||
if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
|
||||
if (provider == "RAPIDS") and (config["PHONE_CALLS"]["PROVIDERS"][provider]["FEATURES_TYPE"] == "EPISODES"):
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes_resampled.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
|
||||
else:
|
||||
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
|
|
|
@ -181,6 +181,7 @@ PHONE_CALLS:
|
|||
PROVIDERS:
|
||||
RAPIDS:
|
||||
COMPUTE: False
|
||||
FEATURES_TYPE: EPISODES # EVENTS or EPISODES
|
||||
CALL_TYPES: [missed, incoming, outgoing]
|
||||
FEATURES:
|
||||
missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
# Change Log
|
||||
## v1.6.0
|
||||
- Refactor PHONE_CALLS RAPIDS provider to compute features based on call episodes or events
|
||||
## v1.5.0
|
||||
- Update Barnett location features with faster Python implementation
|
||||
- Fix rounding bug in data yield features
|
||||
|
|
|
@ -26,6 +26,7 @@ Parameters description for `[PHONE_CALLS][PROVIDERS][RAPIDS]`:
|
|||
| Key | Description |
|
||||
|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
|`[COMPUTE]`| Set to `True` to extract `PHONE_CALLS` features from the `RAPIDS` provider|
|
||||
|`[FEATURES_TYPE]`| Set to `EPISODES` to extract features based on call episodes or `EVENTS` to extract features based on events.|
|
||||
| `[CALL_TYPES]` | The particular call_type that will be analyzed. The options for this parameter are incoming, outgoing or missed. |
|
||||
| `[FEATURES]` | Features to be computed for `outgoing`, `incoming`, and `missed` calls. Note that the same features are available for both incoming and outgoing calls, while missed calls have their own set of features. See the tables below. |
|
||||
|
||||
|
@ -60,4 +61,4 @@ Features description for `[PHONE_CALLS][PROVIDERS][RAPIDS]` missed calls:
|
|||
!!! note "Assumptions/Observations"
|
||||
1. Traces for iOS calls are unique even when the same contact calls a participant more than once, which renders `countmostfrequentcontact` meaningless and makes `distinctcontacts` equal to the total number of traces.
|
||||
2. `[CALL_TYPES]` and `[FEATURES]` keys in `config.yaml` need to match. For example, `[CALL_TYPES]` `outgoing` matches the `[FEATURES]` key `outgoing`
|
||||
3. iOS calls data is transformed to match Android calls data format. See our [algorithm](algorithms/phone-algorithms.md#phone-calls)
|
||||
3. iOS calls data is transformed to match Android calls data format.
|
||||
|
|
|
@ -27,6 +27,12 @@ def get_locations_python_input(wildcards):
|
|||
else:
|
||||
return "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
|
||||
|
||||
def get_calls_input(wildcards):
    """Choose the sensor-data file that feeds the PHONE_CALLS feature rules.

    Returns the resampled call-episodes file when the requested provider is
    RAPIDS and its FEATURES_TYPE setting equals "EPISODES"; any other provider
    or setting gets the raw calls file with datetime columns. The "{pid}"
    placeholder is returned unexpanded.
    """
    episodes_path = "data/interim/{pid}/phone_calls_episodes_resampled_with_datetime.csv"
    events_path = "data/raw/{pid}/phone_calls_with_datetime.csv"

    # FEATURES_TYPE is only looked up for RAPIDS, so other providers never touch that config key.
    if wildcards.provider_key.upper() != "RAPIDS":
        return events_path

    rapids_settings = config["PHONE_CALLS"]["PROVIDERS"]["RAPIDS"]
    if rapids_settings["FEATURES_TYPE"] == "EPISODES":
        return episodes_path
    return events_path
|
||||
|
||||
def find_features_files(wildcards):
|
||||
feature_files = []
|
||||
for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():
|
||||
|
|
|
@ -264,9 +264,17 @@ rule phone_bluetooth_r_features:
|
|||
script:
|
||||
"../src/features/entry.R"
|
||||
|
||||
rule calls_python_features:
|
||||
# Runs calls_episodes.py to produce one phone_calls_episodes.csv per participant ({pid}).
# NOTE(review): both the raw calls file and the with-datetime file are declared as inputs;
# confirm the script actually needs `sensor_data`.
rule calls_episodes:
    input:
        sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
        calls = "data/raw/{pid}/phone_calls_raw.csv"
    output:
        "data/interim/{pid}/phone_calls_episodes.csv"
    script:
        "../src/features/phone_calls/episodes/calls_episodes.py"
|
||||
|
||||
rule phone_calls_python_features:
|
||||
input:
|
||||
sensor_data = get_calls_input,
|
||||
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
|
||||
params:
|
||||
provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||
|
@ -277,9 +285,9 @@ rule calls_python_features:
|
|||
script:
|
||||
"../src/features/entry.py"
|
||||
|
||||
rule calls_r_features:
|
||||
rule phone_calls_r_features:
|
||||
input:
|
||||
sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
|
||||
sensor_data = get_calls_input,
|
||||
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
|
||||
params:
|
||||
provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
import pandas as pd
|
||||
|
||||
# Turn every raw call row into one episode with an explicit start and end timestamp.
episode_columns = ["episode_id", "device_id", "call_type", "trace", "start_timestamp", "end_timestamp"]

calls = pd.read_csv(snakemake.input["calls"])
calls = calls.rename(columns={"timestamp": "start_timestamp"})

# call_duration is scaled by 1000 before being added to the start timestamp
# (presumably seconds -> milliseconds; confirm against the raw data schema).
duration_offset = calls["call_duration"] * 1000
calls["end_timestamp"] = calls["start_timestamp"] + duration_offset

# The row index assigned by read_csv doubles as the episode identifier.
calls["episode_id"] = calls.index

calls[episode_columns].to_csv(snakemake.output[0], index=False)
|
|
@ -7,7 +7,7 @@ Mode <- function(v) {
|
|||
uniqv[which.max(tabulate(match(v, uniqv)))]
|
||||
}
|
||||
|
||||
call_features_of_type <- function(calls, call_type, time_segment, requested_features){
|
||||
call_features_of_type <- function(calls, features_type, call_type, time_segment, requested_features){
|
||||
# Output dataframe
|
||||
features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
|
||||
|
||||
|
@ -22,6 +22,15 @@ call_features_of_type <- function(calls, call_type, time_segment, requested_feat
|
|||
if(nrow(calls) < 1)
|
||||
return(cbind(features, read.csv(text = paste(paste(call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
|
||||
|
||||
if(features_type == "EPISODES"){
|
||||
calls <- calls %>%
|
||||
mutate(call_duration = (end_timestamp - start_timestamp) / 1000) %>%
|
||||
separate(local_start_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
|
||||
separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
|
||||
mutate(local_hour = as.numeric(local_hour),
|
||||
local_minute = as.numeric(local_minute))
|
||||
}
|
||||
|
||||
for(feature_name in features_to_compute){
|
||||
if(feature_name == "countmostfrequentcontact"){
|
||||
# Get the number of calls for the most frequent contact throughout the study
|
||||
|
@ -62,6 +71,8 @@ call_features_of_type <- function(calls, call_type, time_segment, requested_feat
|
|||
rapids_features <- function(sensor_data_files, time_segment, provider){
|
||||
calls_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
|
||||
calls_data <- calls_data %>% filter_data_by_segment(time_segment)
|
||||
|
||||
features_type <- provider[["FEATURES_TYPE"]]
|
||||
call_types = provider[["CALL_TYPES"]]
|
||||
call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment"))
|
||||
|
||||
|
@ -74,7 +85,7 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
|||
requested_features <- provider[["FEATURES"]][[call_type]]
|
||||
calls_of_type <- calls_data %>% filter(call_type == call_type_label)
|
||||
|
||||
features <- call_features_of_type(calls_of_type, call_type, time_segment, requested_features)
|
||||
features <- call_features_of_type(calls_of_type, features_type, call_type, time_segment, requested_features)
|
||||
call_features <- merge(call_features, features, all=TRUE)
|
||||
}
|
||||
call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0)))
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
library("stringr")
|
||||
library('purrr')
|
||||
|
||||
rapids_log_tag <- "RAPIDS:"
|
||||
|
||||
|
|
|
@ -489,6 +489,9 @@ properties:
|
|||
allOf:
|
||||
- $ref: "#/definitions/PROVIDER"
|
||||
- properties:
|
||||
FEATURES_TYPE:
|
||||
type: string
|
||||
enum: [EVENTS, EPISODES]
|
||||
CALL_TYPES:
|
||||
type: array
|
||||
items:
|
||||
|
|
Loading…
Reference in New Issue