Refactor PHONE_CALLS RAPIDS provider to compute features based on call episodes or events
parent
2e553dc9e7
commit
a8a178486b
|
@ -45,7 +45,12 @@ for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
|
||||||
for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
|
for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
|
||||||
if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
|
if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||||
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
|
if (provider == "RAPIDS") and (config["PHONE_CALLS"]["PROVIDERS"][provider]["FEATURES_TYPE"] == "EPISODES"):
|
||||||
|
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes.csv", pid=config["PIDS"]))
|
||||||
|
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes_resampled.csv", pid=config["PIDS"]))
|
||||||
|
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
|
||||||
|
else:
|
||||||
|
files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
|
files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
|
||||||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||||
|
|
|
@ -181,6 +181,7 @@ PHONE_CALLS:
|
||||||
PROVIDERS:
|
PROVIDERS:
|
||||||
RAPIDS:
|
RAPIDS:
|
||||||
COMPUTE: False
|
COMPUTE: False
|
||||||
|
FEATURES_TYPE: EPISODES # EVENTS or EPISODES
|
||||||
CALL_TYPES: [missed, incoming, outgoing]
|
CALL_TYPES: [missed, incoming, outgoing]
|
||||||
FEATURES:
|
FEATURES:
|
||||||
missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
|
missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
# Change Log
|
# Change Log
|
||||||
|
## v1.6.0
|
||||||
|
- Refactor PHONE_CALLS RAPIDS provider to compute features based on call episodes or events
|
||||||
## v1.5.0
|
## v1.5.0
|
||||||
- Update Barnett location features with faster Python implementation
|
- Update Barnett location features with faster Python implementation
|
||||||
- Fix rounding bug in data yield features
|
- Fix rounding bug in data yield features
|
||||||
|
|
|
@ -26,6 +26,7 @@ Parameters description for `[PHONE_CALLS][PROVIDERS][RAPIDS]`:
|
||||||
| Key | Description |
|
| Key | Description |
|
||||||
|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
|`[COMPUTE]`| Set to `True` to extract `PHONE_CALLS` features from the `RAPIDS` provider|
|
|`[COMPUTE]`| Set to `True` to extract `PHONE_CALLS` features from the `RAPIDS` provider|
|
||||||
|
|`[FEATURES_TYPE]`| Set to `EPISODES` to extract features based on call episodes or `EVENTS` to extract features based on events.|
|
||||||
| `[CALL_TYPES]` | The particular call_type that will be analyzed. The options for this parameter are incoming, outgoing or missed. |
|
| `[CALL_TYPES]` | The particular call_type that will be analyzed. The options for this parameter are incoming, outgoing or missed. |
|
||||||
| `[FEATURES]` | Features to be computed for `outgoing`, `incoming`, and `missed` calls. Note that the same features are available for both incoming and outgoing calls, while missed calls have their own set of features. See the tables below. |
|
| `[FEATURES]` | Features to be computed for `outgoing`, `incoming`, and `missed` calls. Note that the same features are available for both incoming and outgoing calls, while missed calls have their own set of features. See the tables below. |
|
||||||
|
|
||||||
|
@ -60,4 +61,4 @@ Features description for `[PHONE_CALLS][PROVIDERS][RAPIDS]` missed calls:
|
||||||
!!! note "Assumptions/Observations"
|
!!! note "Assumptions/Observations"
|
||||||
1. Traces for iOS calls are unique even when the same contact calls a participant more than once, which renders `countmostfrequentcontact` meaningless and makes `distinctcontacts` equal to the total number of traces.
|
1. Traces for iOS calls are unique even when the same contact calls a participant more than once, which renders `countmostfrequentcontact` meaningless and makes `distinctcontacts` equal to the total number of traces.
|
||||||
2. `[CALL_TYPES]` and `[FEATURES]` keys in `config.yaml` need to match. For example, `[CALL_TYPES]` `outgoing` matches the `[FEATURES]` key `outgoing`
|
2. `[CALL_TYPES]` and `[FEATURES]` keys in `config.yaml` need to match. For example, `[CALL_TYPES]` `outgoing` matches the `[FEATURES]` key `outgoing`
|
||||||
3. iOS calls data is transformed to match Android calls data format. See our [algorithm](algorithms/phone-algorithms.md#phone-calls)
|
3. iOS calls data is transformed to match Android calls data format.
|
||||||
|
|
|
@ -27,6 +27,12 @@ def get_locations_python_input(wildcards):
|
||||||
else:
|
else:
|
||||||
return "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
|
return "data/interim/{pid}/phone_locations_processed_with_datetime.csv"
|
||||||
|
|
||||||
|
def get_calls_input(wildcards):
    """Pick the sensor data file that feeds the phone calls feature rules.

    Args:
        wildcards: Snakemake wildcards object; only `provider_key` is read.

    Returns:
        The resampled call-episodes path when the provider is RAPIDS and its
        `FEATURES_TYPE` is "EPISODES"; otherwise the per-event calls path.
        Both returned strings contain the literal `{pid}` placeholder.
    """
    if wildcards.provider_key.upper() == "RAPIDS":
        rapids_provider = config["PHONE_CALLS"]["PROVIDERS"]["RAPIDS"]
        # A config written before FEATURES_TYPE existed has no such key; fall
        # back to EVENTS (the per-event file) instead of raising KeyError.
        if rapids_provider.get("FEATURES_TYPE", "EVENTS") == "EPISODES":
            return "data/interim/{pid}/phone_calls_episodes_resampled_with_datetime.csv"
    return "data/raw/{pid}/phone_calls_with_datetime.csv"
|
||||||
|
|
||||||
def find_features_files(wildcards):
|
def find_features_files(wildcards):
|
||||||
feature_files = []
|
feature_files = []
|
||||||
for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():
|
for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():
|
||||||
|
|
|
@ -264,9 +264,17 @@ rule phone_bluetooth_r_features:
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.R"
|
"../src/features/entry.R"
|
||||||
|
|
||||||
rule calls_python_features:
|
rule calls_episodes:
|
||||||
input:
|
input:
|
||||||
sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
|
calls = "data/raw/{pid}/phone_calls_raw.csv"
|
||||||
|
output:
|
||||||
|
"data/interim/{pid}/phone_calls_episodes.csv"
|
||||||
|
script:
|
||||||
|
"../src/features/phone_calls/episodes/calls_episodes.py"
|
||||||
|
|
||||||
|
rule phone_calls_python_features:
|
||||||
|
input:
|
||||||
|
sensor_data = get_calls_input,
|
||||||
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
|
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
|
||||||
params:
|
params:
|
||||||
provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
|
provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||||
|
@ -277,9 +285,9 @@ rule calls_python_features:
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.py"
|
"../src/features/entry.py"
|
||||||
|
|
||||||
rule calls_r_features:
|
rule phone_calls_r_features:
|
||||||
input:
|
input:
|
||||||
sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
|
sensor_data = get_calls_input,
|
||||||
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
|
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
|
||||||
params:
|
params:
|
||||||
provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
|
provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
import pandas as pd

# Columns written to the episodes file, in output order.
EPISODE_COLUMNS = ["episode_id", "device_id", "call_type", "trace", "start_timestamp", "end_timestamp"]

# Every raw call row becomes one episode whose start is the call's timestamp.
raw_calls = pd.read_csv(snakemake.input["calls"])
episodes = raw_calls.rename(columns={"timestamp": "start_timestamp"})

# NOTE(review): presumably call_duration is in seconds and timestamps are in
# milliseconds, hence the factor of 1000 -- confirm against the raw data schema.
episodes["end_timestamp"] = episodes["start_timestamp"] + episodes["call_duration"] * 1000

# The row index of the raw file doubles as the episode identifier.
episodes["episode_id"] = episodes.index

episodes[EPISODE_COLUMNS].to_csv(snakemake.output[0], index=False)
|
|
@ -7,7 +7,7 @@ Mode <- function(v) {
|
||||||
uniqv[which.max(tabulate(match(v, uniqv)))]
|
uniqv[which.max(tabulate(match(v, uniqv)))]
|
||||||
}
|
}
|
||||||
|
|
||||||
call_features_of_type <- function(calls, call_type, time_segment, requested_features){
|
call_features_of_type <- function(calls, features_type, call_type, time_segment, requested_features){
|
||||||
# Output dataframe
|
# Output dataframe
|
||||||
features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
|
features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
|
||||||
|
|
||||||
|
@ -22,6 +22,15 @@ call_features_of_type <- function(calls, call_type, time_segment, requested_feat
|
||||||
if(nrow(calls) < 1)
|
if(nrow(calls) < 1)
|
||||||
return(cbind(features, read.csv(text = paste(paste(call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
|
return(cbind(features, read.csv(text = paste(paste(call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
|
||||||
|
|
||||||
|
if(features_type == "EPISODES"){
|
||||||
|
calls <- calls %>%
|
||||||
|
mutate(call_duration = (end_timestamp - start_timestamp) / 1000) %>%
|
||||||
|
separate(local_start_date_time, c("local_date","local_time"), "\\s", remove = FALSE) %>%
|
||||||
|
separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
|
||||||
|
mutate(local_hour = as.numeric(local_hour),
|
||||||
|
local_minute = as.numeric(local_minute))
|
||||||
|
}
|
||||||
|
|
||||||
for(feature_name in features_to_compute){
|
for(feature_name in features_to_compute){
|
||||||
if(feature_name == "countmostfrequentcontact"){
|
if(feature_name == "countmostfrequentcontact"){
|
||||||
# Get the number of calls for the most frequent contact throughout the study
|
# Get the number of calls for the most frequent contact throughout the study
|
||||||
|
@ -62,6 +71,8 @@ call_features_of_type <- function(calls, call_type, time_segment, requested_feat
|
||||||
rapids_features <- function(sensor_data_files, time_segment, provider){
|
rapids_features <- function(sensor_data_files, time_segment, provider){
|
||||||
calls_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
|
calls_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
|
||||||
calls_data <- calls_data %>% filter_data_by_segment(time_segment)
|
calls_data <- calls_data %>% filter_data_by_segment(time_segment)
|
||||||
|
|
||||||
|
features_type <- provider[["FEATURES_TYPE"]]
|
||||||
call_types = provider[["CALL_TYPES"]]
|
call_types = provider[["CALL_TYPES"]]
|
||||||
call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment"))
|
call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment"))
|
||||||
|
|
||||||
|
@ -74,7 +85,7 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
||||||
requested_features <- provider[["FEATURES"]][[call_type]]
|
requested_features <- provider[["FEATURES"]][[call_type]]
|
||||||
calls_of_type <- calls_data %>% filter(call_type == call_type_label)
|
calls_of_type <- calls_data %>% filter(call_type == call_type_label)
|
||||||
|
|
||||||
features <- call_features_of_type(calls_of_type, call_type, time_segment, requested_features)
|
features <- call_features_of_type(calls_of_type, features_type, call_type, time_segment, requested_features)
|
||||||
call_features <- merge(call_features, features, all=TRUE)
|
call_features <- merge(call_features, features, all=TRUE)
|
||||||
}
|
}
|
||||||
call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0)))
|
call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0)))
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
library("stringr")
|
library("stringr")
|
||||||
|
library('purrr')
|
||||||
|
|
||||||
rapids_log_tag <- "RAPIDS:"
|
rapids_log_tag <- "RAPIDS:"
|
||||||
|
|
||||||
|
|
|
@ -489,6 +489,9 @@ properties:
|
||||||
allOf:
|
allOf:
|
||||||
- $ref: "#/definitions/PROVIDER"
|
- $ref: "#/definitions/PROVIDER"
|
||||||
- properties:
|
- properties:
|
||||||
|
FEATURES_TYPE:
|
||||||
|
type: string
|
||||||
|
enum: [EVENTS, EPISODES]
|
||||||
CALL_TYPES:
|
CALL_TYPES:
|
||||||
type: array
|
type: array
|
||||||
items:
|
items:
|
||||||
|
|
Loading…
Reference in New Issue