diff --git a/Snakefile b/Snakefile index 9f86b897..15e32de3 100644 --- a/Snakefile +++ b/Snakefile @@ -46,7 +46,6 @@ for provider in config["PHONE_CALLS"]["PROVIDERS"].keys(): if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/phone_calls_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/phone_calls_with_datetime_unified.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) diff --git a/docs/datastreams/aware-mysql.md b/docs/datastreams/aware-mysql.md index ae06a873..56e4d227 100644 --- a/docs/datastreams/aware-mysql.md +++ b/docs/datastreams/aware-mysql.md @@ -216,6 +216,81 @@ Stream columns named `FLAG_TO_MUTATE` means they are extracted based on the `MUT This sensor is not supported by iOS devices. +??? 
info "PHONE_CALLS" + + === "ANDROID" + + **RAPIDS_COLUMN_MAPPINGS** + + | RAPIDS column | Stream column | + |----------------------|---------------------| + | TIMESTAMP | timestamp | + | DEVICE_ID | device_id | + | CALL_TYPE | call_type | + | CALL_DURATION | call_duration | + | TRACE | trace | + + **MUTATION** + + - **COLUMN_MAPPINGS** (None) + - **SCRIPTS** (None) + + === "IOS" + + **RAPIDS_COLUMN_MAPPINGS** + + | RAPIDS column | Stream column | + |----------------------|---------------------| + | TIMESTAMP | timestamp | + | DEVICE_ID | device_id | + | CALL_TYPE | FLAG_TO_MUTATE | + | CALL_DURATION | call_duration | + | TRACE | trace | + + **MUTATION** + + - **COLUMN_MAPPINGS** + + | Script column | Stream column | + |----------------------|---------------------| + | CALL_TYPE | call_type | + + + - **SCRIPTS** + + ```bash + src/data/streams/mutations/phone/aware/calls_ios_unification.R + ``` + + !!! note + + We transform iOS call logs into Android's format. iOS stores call status: 1=incoming, 2=connected, 3=dialing, 4=disconnected, as opposed to Android's events: 1=incoming, 2=outgoing, 3=missed. 
+ + We follow this algorithm to convert iOS call data (there are some inaccuracies in the way we handle sequences, see new rules below): + + - Search for the disconnected (4) status as it is common to all calls + - Group all events that preceded every status 4 + - We convert every 1,2,4 (or 2,1,4) sequence to an incoming call + - We convert every 3,2,4 (or 2,3,4) sequence to an outgoing call + - We convert every 1,4 sequence to a missed call (3) and every 3,4 sequence to an outgoing call (2), both with a duration of 0 + - We set the duration of the call to be the sum of every status (dialing/ringing to hang up) as opposed to the duration of the last status (pick up to hang up) + + **Tested with an Android (OnePlus 7T) and an iPhone XR** + + |Call type | Android (duration) | iOS (duration) | New Rule| + |---------|----------|--------|------| + |Outgoing missed ended by me | 2 (0) | 3,4 (0,X) | 3,4 is converted to 2 with duration 0| + |Outgoing missed ended by them|2(0)|3,2,4 (0,X,X2)| 3,2,4 is converted to 2 with duration X2*| + |Incoming missed ended by me|NA**|1,4 (0,X)|1,4 is converted to 3 with duration 0| + |Incoming missed ended by them|3(0)|1,4 (0,X)|1,4 is converted to 3 with duration 0| + |Outgoing answered|2(X excluding dialing time)|3,2,4 (0,X,X2)|3,2,4 is converted to 2 with duration X2| + |Incoming answered|1(X excluding dialing time)|1,2,4 (0,X,X2)|1,2,4 is converted to 1 with duration X2| + + .* There is no way to differentiate an outgoing missed call ended by them from an outgoing answered call because the phone goes directly to voice mail and it counts as call time (essentially the voice mail answered). + + .** Android does not record incoming missed calls ended by the participant, just those ended by the person calling or ignored by the participant. + + ??? 
info "PHONE_CONVERSATION" === "ANDROID" diff --git a/docs/datastreams/mandatory-phone-format.md b/docs/datastreams/mandatory-phone-format.md index 83d176f6..ec570d68 100644 --- a/docs/datastreams/mandatory-phone-format.md +++ b/docs/datastreams/mandatory-phone-format.md @@ -57,6 +57,17 @@ This is a description of the format RAPIDS needs to process data for the followi | BT_RSSI | The RSSI dB to the scanned device | +??? info "PHONE_CALLS" + + | RAPIDS column | Description | + |--------------------|---------------------------------------------------------------------------| + | TIMESTAMP | An UNIX timestamp (13 digits) when a row of data was logged | + | DEVICE_ID | A string that uniquely identifies a device | + | CALL_TYPE | An integer that denotes call type: 1 = incoming, 2 = outgoing, 3 = missed | + | CALL_DURATION | Length of the call session | + | TRACE | SHA-1 one-way source/target of the call | + + ??? info "PHONE_CONVERSATION" | RAPIDS column | Description | diff --git a/docs/features/phone-calls.md b/docs/features/phone-calls.md index f96ef060..64b57e81 100644 --- a/docs/features/phone-calls.md +++ b/docs/features/phone-calls.md @@ -16,7 +16,6 @@ Sensor parameters description for `[PHONE_CALLS]`: ```bash - data/raw/{pid}/phone_calls_raw.csv - data/raw/{pid}/phone_calls_with_datetime.csv - - data/raw/{pid}/phone_calls_with_datetime_unified.csv - data/interim/{pid}/phone_calls_features/phone_calls_{language}_{provider_key}.csv - data/processed/features/{pid}/phone_calls.csv ``` diff --git a/rules/features.smk b/rules/features.smk index faed95a9..94548a75 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -264,7 +264,7 @@ rule phone_bluetooth_r_features: rule calls_python_features: input: - sensor_data = "data/raw/{pid}/phone_calls_with_datetime_unified.csv", + sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv", time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: provider = lambda wildcards: 
config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
@@ -277,7 +277,7 @@ rule calls_python_features:
 
 rule calls_r_features:
     input:
-        sensor_data = "data/raw/{pid}/phone_calls_with_datetime_unified.csv",
+        sensor_data = "data/raw/{pid}/phone_calls_with_datetime.csv",
         time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
     params:
         provider = lambda wildcards: config["PHONE_CALLS"]["PROVIDERS"][wildcards.provider_key.upper()],
diff --git a/src/data/streams/aware_mysql/format.yaml b/src/data/streams/aware_mysql/format.yaml
index 34c9aa62..a6d1c151 100644
--- a/src/data/streams/aware_mysql/format.yaml
+++ b/src/data/streams/aware_mysql/format.yaml
@@ -93,6 +93,30 @@ PHONE_BLUETOOTH:
       COLUMN_MAPPINGS:
       SCRIPTS: # List any python or r scripts that mutate your raw data
 
+PHONE_CALLS:
+  ANDROID:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      CALL_TYPE: call_type
+      CALL_DURATION: call_duration
+      TRACE: trace
+    MUTATION:
+      COLUMN_MAPPINGS:
+      SCRIPTS: # List any python or r scripts that mutate your raw data
+  IOS:
+    RAPIDS_COLUMN_MAPPINGS:
+      TIMESTAMP: timestamp
+      DEVICE_ID: device_id
+      CALL_TYPE: FLAG_TO_MUTATE
+      CALL_DURATION: call_duration
+      TRACE: trace
+    MUTATION:
+      COLUMN_MAPPINGS:
+        CALL_TYPE: call_type
+      SCRIPTS:
+        - "src/data/streams/mutations/phone/aware/calls_ios_unification.R"
+
 PHONE_CONVERSATION:
   ANDROID:
     RAPIDS_COLUMN_MAPPINGS:
diff --git a/src/data/streams/mutations/phone/aware/calls_ios_unification.R b/src/data/streams/mutations/phone/aware/calls_ios_unification.R
new file mode 100644
index 00000000..8f642cf6
--- /dev/null
+++ b/src/data/streams/mutations/phone/aware/calls_ios_unification.R
@@ -0,0 +1,80 @@
+source("renv/activate.R")
+library("dplyr", warn.conflicts = FALSE)
+
+
+# Convert iOS call status events into Android-style call records.
+#
+# ios_calls: data frame of iOS call events. trace, timestamp, call_type and call_duration
+#   are always read; device_id or the date time columns are read depending on which
+#   summarise branch runs below.
+# Returns a data frame with one row per recognised call sequence, call_type recoded to
+#   Android's values and rows sorted by timestamp. Unrecognised sequences are dropped.
+unify_ios_calls <- function(ios_calls){
+    # Android’s call types 1=incoming, 2=outgoing, 3=missed
+    # iOS' call status 1=incoming, 2=connected, 3=dialing, 4=disconnected
+    # iOS' call types based on call status: (1,2,4)=incoming=1, (3,2,4)=outgoing=2, (1,4)=missed=3, (3,4)=outgoing with duration 0=2
+    # Sometimes (due to a possible bug in Aware) sequences get logged on the exact same timestamp, thus 3-item sequences can be 2,3,4 or 3,2,4
+    # Even though iOS stores the duration of ringing/dialing for missed calls, we set it to 0 to match Android
+
+    ios_calls <- ios_calls %>%
+        arrange(trace, timestamp, call_type) %>%
+        group_by(trace) %>%
+        # search for the disconnect event, as it is common to outgoing, received and missed calls
+        mutate(completed_call = ifelse(call_type == 4, 2, 0),
+               # assign the same ID to all events before a 4
+               completed_call = cumsum(c(1, head(completed_call, -1) != tail(completed_call, -1))),
+               # hack to match ID of last event (4) to that of the previous rows
+               completed_call = ifelse(call_type == 4, completed_call - 1, completed_call)) %>%
+        # summarise every disconnect-terminated sequence on its own; grouping by trace alone
+        # collapses all calls that share a trace into one sequence that is then discarded
+        group_by(trace, completed_call)
+
+    # We check utc_date_time and local_date_time exist because sometimes we call this function from
+    # download_dataset to unify multi-platform participants. At that point such time columns are missing
+    if("utc_date_time" %in% colnames(ios_calls) && "local_date_time" %in% colnames(ios_calls)){
+        ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), # collapse all events before a 4
+                                             # sanity check, timestamp_diff should be equal or close to duration sum
+                                             # timestamp_diff = trunc((last(timestamp) - first(timestamp)) / 1000)
+                                             # use call_duration = last(call_duration) if you want duration from pick up to hang up
+                                             # use call_duration = sum(call_duration) if you want duration from dialing/ringing to hang up
+                                             call_duration = last(call_duration),
+                                             timestamp = first(timestamp),
+                                             utc_date_time = first(utc_date_time),
+                                             local_date_time = first(local_date_time),
+                                             local_date = first(local_date),
+                                             local_time = first(local_time),
+                                             local_hour = first(local_hour),
+                                             local_minute = first(local_minute),
+                                             local_timezone = first(local_timezone),
+                                             assigned_segments = first(assigned_segments))
+    }
+    else {
+        # NOTE(review): this branch sums the durations while the branch above keeps the last one - confirm the difference is intended
+        ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), call_duration = sum(call_duration), timestamp = first(timestamp), device_id = first(device_id))
+    }
+    ios_calls <- ios_calls %>% mutate(call_type = case_when(
+            call_type_sequence == "1,2,4" | call_type_sequence == "2,1,4" ~ 1, # incoming
+            call_type_sequence == "1,4" ~ 3, # missed
+            call_type_sequence == "3,2,4" | call_type_sequence == "2,3,4" ~ 2, # outgoing
+            call_type_sequence == "3,4" ~ 4, # outgoing missed, we create this temp missed state to assign a duration of 0 below
+            TRUE ~ -1), # other, call sequences without a disconnect (4) event are discarded
+            # assign a duration of 0 to incoming and outgoing missed calls
+            call_duration = ifelse(call_type == 3 | call_type == 4, 0, call_duration),
+            # get rid of the temp missed call type, set to 2 to match Android. See https://github.com/carissalow/rapids/issues/79
+            call_type = ifelse(call_type == 4, 2, call_type)
+        ) %>%
+        # discard sequences without an event 4 (disconnect)
+        filter(call_type > 0) %>%
+        ungroup() %>%
+        arrange(timestamp)
+
+    # drop the helper columns that were only needed to build and classify the sequences
+    ios_calls <- select(ios_calls, -call_type_sequence, -completed_call)
+
+    return(ios_calls)
+}
+
+# Mutation entry point: returns unify_ios_calls(data); stream_parameters is not used
+main <- function(data, stream_parameters){
+    return(unify_ios_calls(data))
+}
diff --git a/src/data/streams/rapids_columns.yaml b/src/data/streams/rapids_columns.yaml
index 43701057..4c974ee0 100644
--- a/src/data/streams/rapids_columns.yaml
+++ b/src/data/streams/rapids_columns.yaml
@@ -33,6 +33,13 @@ PHONE_BLUETOOTH:
   - BT_NAME
   - BT_RSSI
 
+PHONE_CALLS:
+  - TIMESTAMP
+  - DEVICE_ID
+  - CALL_TYPE
+  - CALL_DURATION
+  - TRACE
+
 PHONE_CONVERSATION:
   - TIMESTAMP
   - DEVICE_ID