Migrate calls to new provider file structure

2020-08-28 17:50:49 -04:00 · 2020-08-28 17:50:49 -04:00 · e269062439
parent 011b9736d5
commit e269062439
6 changed files with 157 additions and 25 deletions
--- a/12
+++ b/12
@ -37,11 +37,13 @@ if config["MESSAGES"]["COMPUTE"]:
    files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"]))
    files_to_compute.extend(expand("data/processed/{pid}/messages_{messages_type}.csv", pid=config["PIDS"], messages_type = config["MESSAGES"]["TYPES"]))

-if config["CALLS"]["COMPUTE"]:
-    files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"]))
+for provider in config["CALLS"]["PROVIDERS"].keys():
+    if config["CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
+        files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
+        files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
+        files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
+        files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="CALLS".lower()))
+        files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="CALLS".lower()))

 if config["BLUETOOTH"]["COMPUTE"]:
    files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"]))
--- a/config.yaml
+++ b/config.yaml
@ -52,14 +52,18 @@ MESSAGES:

 # Communication call features config, TYPES and FEATURES keys need to match
 CALLS:
-  COMPUTE: False
  DB_TABLE: calls
-  TYPES: [missed, incoming, outgoing]
-  FEATURES:
-    missed:  [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
-    incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
-    outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
-  DAY_SEGMENTS: *day_segments
+  PROVIDERS:
+    RAPIDS:
+      COMPUTE: True
+      CALL_TYPES: [missed, incoming, outgoing]
+      FEATURES:
+        missed:  [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
+        incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
+        outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
+      DAY_SEGMENTS: *day_segments
+      SRC_LANGUAGE: "r"
+      SRC_FOLDER: "rapids"

 APPLICATION_GENRES:
  CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store)
@ -73,26 +77,25 @@ LOCATIONS:
  FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
  FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
  TIMEZONE: *timezone
-  
  PROVIDERS:
    DORYAB:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
      DBSCAN_EPS: 10 # meters
      DBSCAN_MINSAMPLES: 5
      THRESHOLD_STATIC : 1 # km/h
      MAXIMUM_GAP_ALLOWED: 300
-      MINUTES_DATA_USED: True
+      MINUTES_DATA_USED: False
      SAMPLING_FREQUENCY: 0
      SRC_FOLDER: "doryab"
      SRC_LANGUAGE: "python"

    BARNETT:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
      ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
      TIMEZONE: *timezone
-      MINUTES_DATA_USED: True # Use this for quality control purposes, how many minutes of data (location coordinates gruped by minute) were used to compute features
+      MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
      SRC_FOLDER: "barnett"
      SRC_LANGUAGE: "r"

--- a/rules/features.smk
+++ b/rules/features.smk
@ -18,17 +18,29 @@ rule messages_features:
    script:
        "../src/features/messages_features.R"

-rule call_features:
-    input: 
-        expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"]),
-        day_segments_labels = expand("data/interim/{sensor}_day_segments_labels.csv", sensor=config["CALLS"]["DB_TABLE"])
+rule calls_python_features:  # runs calls_entry.py for one participant (pid) and one provider (provider_key)
+    input:
+        sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"]),  # expand() result; calls_entry.py reads element [0]
+        day_segments_labels = "data/interim/day_segments_labels.csv"
    params:
-        call_type = "{call_type}",
-        features = lambda wildcards: config["CALLS"]["FEATURES"][wildcards.call_type]
+        provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key],  # config section of the provider named by the wildcard
+        provider_key = "{provider_key}"
    output:
-        "data/processed/{pid}/calls_{call_type}.csv"
+        "data/interim/{pid}/calls_features/calls_python_{provider_key}.csv"
    script:
-        "../src/features/call_features.R"
+        "../src/features/calls/calls_entry.py"
+
+rule calls_r_features:  # runs calls_entry.R for one participant (pid) and one provider (provider_key)
+    input:
+        sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"]),  # {{pid}} stays a wildcard; only {sensor} is expanded here
+        day_segments_labels = "data/interim/day_segments_labels.csv"
+    params:
+        provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key],  # config section of the provider named by the wildcard
+        provider_key = "{provider_key}"
+    output:
+        "data/interim/{pid}/calls_features/calls_r_{provider_key}.csv"
+    script:
+        "../src/features/calls/calls_entry.R"

 rule battery_deltas:
    input:
--- a/src/features/calls/calls_entry.R
+++ b/src/features/calls/calls_entry.R
@ -0,0 +1,13 @@
+source("renv/activate.R")
+source("src/features/utils/utils.R")
+library("dplyr")
+library("tidyr")
+
+# Snakemake entry point for R-language CALLS providers: read the rule's inputs
+# and params, delegate to fetch_provider_features() (presumably defined in the
+# sourced src/features/utils/utils.R) and write the returned features as CSV.
+sensor_data_file <- snakemake@input[["sensor_data"]]
+day_segments_file <- snakemake@input[["day_segments_labels"]]
+# `[[` extracts the element itself; single-bracket `[` would hand over a
+# one-element list instead of the value, unlike the Python entry script.
+provider <- snakemake@params[["provider"]]
+provider_key <- snakemake@params[["provider_key"]]
+
+sensor_features <- fetch_provider_features(provider, provider_key, "calls", sensor_data_file, day_segments_file)
+
+write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)
--- a/src/features/calls/calls_entry.py
+++ b/src/features/calls/calls_entry.py
@ -0,0 +1,18 @@
+import pandas as pd
+from importlib import import_module, util
+from pathlib import Path
+
+# import fetch_provider_features from src/features/utils/utils.py
+spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py"))
+mod = util.module_from_spec(spec)
+spec.loader.exec_module(mod)
+fetch_provider_features = getattr(mod,  "fetch_provider_features")
+
+sensor_data_file = snakemake.input["sensor_data"][0]
+day_segments_file = snakemake.input["day_segments_labels"]
+provider = snakemake.params["provider"]
+provider_key = snakemake.params["provider_key"]
+
+sensor_features = fetch_provider_features(provider, provider_key, "calls", sensor_data_file, day_segments_file)
+
+sensor_features.to_csv(snakemake.output[0], index=False)
--- a/src/features/calls/rapids/main.R
+++ b/src/features/calls/rapids/main.R
@ -0,0 +1,84 @@
+library('tidyr')
+library('stringr')
+library('entropy')
+
+# Return the most frequent value of `v`; on ties the value that appears first wins.
+Mode <- function(v) {
+  distinct_values <- unique(v)
+  # Map each element to its position in distinct_values and count hits per position
+  occurrences <- tabulate(match(v, distinct_values))
+  distinct_values[which.max(occurrences)]
+}
+
+# Compute the requested call features of a single call type, one row per local_segment.
+#
+# calls:              calls to summarise (rapids_features passes the rows of one call type)
+# call_type:          label embedded in the output column names: calls_rapids_<call_type>_<feature>
+# day_segment:        accepted as part of the signature; not used inside this function
+# requested_features: feature names asked for; names this function does not support are ignored
+#
+# Returns a data frame with a local_segment column plus one column per computed feature.
+call_features_of_type <- function(calls, call_type, day_segment, requested_features){
+    # Features this function knows how to compute
+    supported_features <- c("count", "distinctcontacts", "meanduration", "sumduration", "minduration", "maxduration", "stdduration", "modeduration", "entropyduration", "timefirstcall", "timelastcall", "countmostfrequentcontact")
+    # Requested features that are also supported
+    selected_features <- intersect(supported_features, requested_features)
+
+    # Accumulator that every feature column gets merged into
+    result <- data.frame(local_segment = character(), stringsAsFactors = FALSE)
+
+    # Nothing to compute: return the bare local_segment frame
+    if(length(selected_features) == 0)
+        return(result)
+
+    # No calls: return zero rows that still carry the expected column names
+    if(nrow(calls) < 1){
+        header_line <- paste(paste("calls_rapids", call_type, selected_features, sep = "_"), collapse = ",")
+        return(cbind(result, read.csv(text = header_line, stringsAsFactors = FALSE)))
+    }
+
+    # Shared grouping for every per-segment summary below
+    calls_by_segment <- calls %>% group_by(local_segment)
+
+    for(feature_name in selected_features){
+        column_name <- paste("calls_rapids", call_type, feature_name, sep = "_")
+
+        if(feature_name == "countmostfrequentcontact"){
+            # Contact with the largest number of calls across all rows of `calls`
+            most_frequent_contact <- calls %>%
+                group_by(trace) %>%
+                mutate(N=n()) %>%
+                ungroup() %>%
+                filter(N == max(N)) %>%
+                head(1) %>% # several contacts may tie on the number of calls; keep the first row only
+                pull(trace)
+            # Number of calls with that contact in each segment
+            feature <- calls %>%
+                filter(trace == most_frequent_contact) %>%
+                group_by(local_segment) %>%
+                summarise(!!column_name := n()) %>%
+                replace(is.na(.), 0)
+        } else {
+            feature <- switch(feature_name,
+                "count" = calls_by_segment %>% summarise(!!column_name := n()),
+                "distinctcontacts" = calls_by_segment %>% summarise(!!column_name := n_distinct(trace)),
+                "meanduration" = calls_by_segment %>% summarise(!!column_name := mean(call_duration)),
+                "sumduration" = calls_by_segment %>% summarise(!!column_name := sum(call_duration)),
+                "minduration" = calls_by_segment %>% summarise(!!column_name := min(call_duration)),
+                "maxduration" = calls_by_segment %>% summarise(!!column_name := max(call_duration)),
+                "stdduration" = calls_by_segment %>% summarise(!!column_name := sd(call_duration)),
+                "modeduration" = calls_by_segment %>% summarise(!!column_name := Mode(call_duration)),
+                "entropyduration" = calls_by_segment %>% summarise(!!column_name := entropy.MillerMadow(call_duration)),
+                "timefirstcall" = calls_by_segment %>% summarise(!!column_name := first(local_hour) * 60 + first(local_minute)),
+                "timelastcall" = calls_by_segment %>% summarise(!!column_name := last(local_hour) * 60 + last(local_minute)))
+        }
+
+        # Full outer join so segments missing from one feature are kept
+        result <- merge(result, feature, by="local_segment", all = TRUE)
+    }
+
+    # Segments without calls to the most frequent contact count as 0 instead of NA
+    return(result %>% mutate_at(vars(contains("countmostfrequentcontact")), list( ~ replace_na(., 0))))
+}
+
+# Compute the RAPIDS call features for one day segment.
+#
+# calls:       calls data; restricted to `day_segment` through filter_data_by_segment()
+# day_segment: day segment forwarded to the filter and to call_features_of_type()
+# provider:    provider config; CALL_TYPES and FEATURES[[call_type]] are read from it
+#
+# Returns a data frame with a local_segment column and the feature columns of
+# every configured call type merged together.
+# Stops with an error if a configured call type is not incoming, outgoing or missed.
+rapids_features <- function(calls, day_segment, provider){
+    calls <- calls %>% filter_data_by_segment(day_segment)
+    call_types <- provider[["CALL_TYPES"]]
+    call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment"))
+
+    # The position in this vector is the label compared against the call_type column:
+    # incoming = "1", outgoing = "2", missed = "3"
+    known_call_types <- c("incoming", "outgoing", "missed")
+
+    for(call_type in call_types){
+        # match() yields NA for anything that is not a known call type
+        call_type_label <- as.character(match(call_type, known_call_types))
+        if(is.na(call_type_label))
+            stop(paste0("Call type can only be incoming, outgoing or missed but instead you typed: ", call_type, " in config[CALLS][PROVIDERS][RAPIDS][CALL_TYPES]"))
+
+        requested_features <- provider[["FEATURES"]][[call_type]]
+        # Inside filter() `call_type` resolves to the data column (assuming `calls`
+        # has one), which takes precedence over the loop variable of the same name
+        calls_of_type <- calls %>% filter(call_type == call_type_label)
+
+        features <- call_features_of_type(calls_of_type, call_type, day_segment, requested_features)
+        call_features <- merge(call_features, features, all=TRUE)
+    }
+
+    return(call_features)
+}