diff --git a/Snakefile b/Snakefile
index e56c59a9..cb45efc0 100644
--- a/Snakefile
+++ b/Snakefile
@@ -37,11 +37,13 @@ if config["MESSAGES"]["COMPUTE"]:
     files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["MESSAGES"]["DB_TABLE"]))
     files_to_compute.extend(expand("data/processed/{pid}/messages_{messages_type}.csv", pid=config["PIDS"], messages_type = config["MESSAGES"]["TYPES"]))
 
-if config["CALLS"]["COMPUTE"]:
-    files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"]))
+for provider in config["CALLS"]["PROVIDERS"].keys():
+    if config["CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
+        files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
+        files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
+        files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
+        files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["CALLS"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="CALLS".lower()))
+        files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="CALLS".lower()))
 
 if config["BLUETOOTH"]["COMPUTE"]:
     files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["BLUETOOTH"]["DB_TABLE"]))
diff --git a/config.yaml b/config.yaml
index 3884b70c..671c501d 100644
--- a/config.yaml
+++ b/config.yaml
@@ -52,14 +52,18 @@ MESSAGES:
 
 # Communication call features config, TYPES and FEATURES keys need to match
 CALLS:
-  COMPUTE: False
   DB_TABLE: calls
-  TYPES: [missed, incoming, outgoing]
-  FEATURES:
-    missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
-    incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
-    outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
-  DAY_SEGMENTS: *day_segments
+  PROVIDERS:
+    RAPIDS:
+      COMPUTE: True
+      CALL_TYPES: [missed, incoming, outgoing]
+      FEATURES:
+        missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
+        incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
+        outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
+      DAY_SEGMENTS: *day_segments
+      SRC_LANGUAGE: "r"
+      SRC_FOLDER: "rapids"
 
 APPLICATION_GENRES:
   CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
@@ -73,26 +77,25 @@ LOCATIONS:
   FUSED_RESAMPLED_CONSECUTIVE_THRESHOLD: 30 # minutes, only replicate location samples to the next sensed bin if the phone did not stop collecting data for more than this threshold
   FUSED_RESAMPLED_TIME_SINCE_VALID_LOCATION: 720 # minutes, only replicate location samples to consecutive sensed bins if they were logged within this threshold after a valid location row
   TIMEZONE: *timezone
-  PROVIDERS:
   DORYAB:
-    COMPUTE: True
+    COMPUTE: False
     FEATURES: ["locationvariance","loglocationvariance","totaldistance","averagespeed","varspeed","circadianmovement","numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","meanlengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy"]
     DBSCAN_EPS: 10 # meters
     DBSCAN_MINSAMPLES: 5
     THRESHOLD_STATIC : 1 # km/h
     MAXIMUM_GAP_ALLOWED: 300
-    MINUTES_DATA_USED: True
+    MINUTES_DATA_USED: False
     SAMPLING_FREQUENCY: 0
     SRC_FOLDER: "doryab"
     SRC_LANGUAGE: "python"
 
   BARNETT:
-    COMPUTE: True
+    COMPUTE: False
     FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
     ACCURACY_LIMIT: 51 # meters, drops location coordinates with an accuracy higher than this. This number means there's a 68% probability the true location is within this radius
     TIMEZONE: *timezone
-    MINUTES_DATA_USED: True # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
+    MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
     SRC_FOLDER: "barnett"
     SRC_LANGUAGE: "r"
 
 
diff --git a/rules/features.smk b/rules/features.smk
index 55703574..fcd3adcf 100644
--- a/rules/features.smk
+++ b/rules/features.smk
@@ -18,17 +18,29 @@ rule messages_features:
     script:
         "../src/features/messages_features.R"
 
-rule call_features:
-    input:
-        expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"]),
-        day_segments_labels = expand("data/interim/{sensor}_day_segments_labels.csv", sensor=config["CALLS"]["DB_TABLE"])
+rule calls_python_features:
+    input:
+        sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"]),
+        day_segments_labels = "data/interim/day_segments_labels.csv"
     params:
-        call_type = "{call_type}",
-        features = lambda wildcards: config["CALLS"]["FEATURES"][wildcards.call_type]
+        provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key],
+        provider_key = "{provider_key}"
     output:
-        "data/processed/{pid}/calls_{call_type}.csv"
+        "data/interim/{pid}/calls_features/calls_python_{provider_key}.csv"
     script:
-        "../src/features/call_features.R"
+        "../src/features/calls/calls_entry.py"
+
+rule calls_r_features:
+    input:
+        sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"]),
+        day_segments_labels = "data/interim/day_segments_labels.csv"
+    params:
+        provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key],
+        provider_key = "{provider_key}"
+    output:
"data/interim/{pid}/calls_features/calls_r_{provider_key}.csv" + script: + "../src/features/calls/calls_entry.R" rule battery_deltas: input: diff --git a/src/features/calls/calls_entry.R b/src/features/calls/calls_entry.R new file mode 100644 index 00000000..bea2c7cb --- /dev/null +++ b/src/features/calls/calls_entry.R @@ -0,0 +1,13 @@ +source("renv/activate.R") +source("src/features/utils/utils.R") +library("dplyr") +library("tidyr") + +sensor_data_file <- snakemake@input[["sensor_data"]] +day_segments_file <- snakemake@input[["day_segments_labels"]] +provider <- snakemake@params["provider"][["provider"]] +provider_key <- snakemake@params["provider_key"] + +sensor_features <- fetch_provider_features(provider, provider_key, "calls", sensor_data_file, day_segments_file) + +write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/calls/calls_entry.py b/src/features/calls/calls_entry.py new file mode 100644 index 00000000..828c4718 --- /dev/null +++ b/src/features/calls/calls_entry.py @@ -0,0 +1,18 @@ +import pandas as pd +from importlib import import_module, util +from pathlib import Path + +# import fetch_provider_features from src/features/utils/utils.py +spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) +mod = util.module_from_spec(spec) +spec.loader.exec_module(mod) +fetch_provider_features = getattr(mod, "fetch_provider_features") + +sensor_data_file = snakemake.input["sensor_data"][0] +day_segments_file = snakemake.input["day_segments_labels"] +provider = snakemake.params["provider"] +provider_key = snakemake.params["provider_key"] + +sensor_features = fetch_provider_features(provider, provider_key, "calls", sensor_data_file, day_segments_file) + +sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/calls/rapids/main.R b/src/features/calls/rapids/main.R new file mode 100644 index 00000000..1ff01884 --- /dev/null +++ b/src/features/calls/rapids/main.R @@ -0,0 +1,84 @@ +library('tidyr') +library('stringr') +library('entropy') + +Mode <- function(v) { + uniqv <- unique(v) + uniqv[which.max(tabulate(match(v, uniqv)))] +} + +call_features_of_type <- function(calls, call_type, day_segment, requested_features){ + # Output dataframe + features = data.frame(local_segment = character(), stringsAsFactors = FALSE) + + # The name of the features this function can compute + base_features_names <- c("count", "distinctcontacts", "meanduration", "sumduration", "minduration", "maxduration", "stdduration", "modeduration", "entropyduration", "timefirstcall", "timelastcall", "countmostfrequentcontact") + # The subset of requested features this function can compute + features_to_compute <- intersect(base_features_names, requested_features) + + # If there are not features or data to work with, return an empty df with appropiate columns names + if(length(features_to_compute) == 0) + return(features) + if(nrow(calls) < 1) + return(cbind(features, read.csv(text = paste(paste("calls_rapids", call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE))) + + for(feature_name in features_to_compute){ + if(feature_name == "countmostfrequentcontact"){ + # Get the number of messages for the most frequent contact throughout the study + mostfrequentcontact <- calls %>% + group_by(trace) %>% + mutate(N=n()) %>% + ungroup() %>% + filter(N == max(N)) %>% + head(1) %>% # if there are multiple contacts with the same amount of messages pick the first 
+        pull(trace)
+      feature <- calls %>%
+        filter(trace == mostfrequentcontact) %>%
+        group_by(local_segment) %>%
+        summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := n()) %>%
+        replace(is.na(.), 0)
+      features <- merge(features, feature, by="local_segment", all = TRUE)
+    } else {
+      feature <- calls %>%
+        group_by(local_segment)
+
+      feature <- switch(feature_name,
+        "count" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := n()),
+        "distinctcontacts" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := n_distinct(trace)),
+        "meanduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := mean(call_duration)),
+        "sumduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := sum(call_duration)),
+        "minduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := min(call_duration)),
+        "maxduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := max(call_duration)),
+        "stdduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := sd(call_duration)),
+        "modeduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := Mode(call_duration)),
+        "entropyduration" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := entropy.MillerMadow(call_duration)),
+        "timefirstcall" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
+        "timelastcall" = feature %>% summarise(!!paste("calls_rapids", call_type, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
+
+      features <- merge(features, feature, by="local_segment", all = TRUE)
+    }
+  }
+  features <- features %>% mutate_at(vars(contains("countmostfrequentcontact")), list( ~ replace_na(., 0)))
+  return(features)
+}
+
+rapids_features <- function(calls, day_segment, provider){
+  calls <- calls %>% filter_data_by_segment(day_segment)
+  call_types = provider[["CALL_TYPES"]]
+  call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment"))
+
+  for(call_type in call_types){
+    # Filter rows that belong to the call type and day segment of interest
+    call_type_label = ifelse(call_type == "incoming", "1", ifelse(call_type == "outgoing", "2", ifelse(call_type == "missed", "3", NA)))
+    if(is.na(call_type_label))
+      stop(paste("Call type can only be incoming, outgoing or missed but instead you typed: ", call_type, " in config[CALLS][CALL_TYPES]"))
+
+    requested_features <- provider[["FEATURES"]][[call_type]]
+    calls_of_type <- calls %>% filter(call_type == call_type_label)
+
+    features <- call_features_of_type(calls_of_type, call_type, day_segment, requested_features)
+    call_features <- merge(call_features, features, all=TRUE)
+  }
+
+  return(call_features)
+}
\ No newline at end of file
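
Note: both entry scripts delegate to fetch_provider_features(), defined in src/features/utils/utils.R and utils.py, which this diff does not include. As a reading aid, the R sketch below shows one way such a dispatcher could work; only the function name and call signature come from the diff. The provider main.R layout, the <SRC_FOLDER>_features() naming convention (matching rapids_features() in main.R above), and the `label` column of the day segments file are assumptions, not the actual implementation.

# Hypothetical sketch of fetch_provider_features(); not the real utils.R code.
library("dplyr")

fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_data_file, day_segments_file){
  sensor_features <- data.frame(local_segment = character(), stringsAsFactors = FALSE)
  sensor_data <- read.csv(sensor_data_file, stringsAsFactors = FALSE)
  day_segments_labels <- read.csv(day_segments_file, stringsAsFactors = FALSE)

  # Assumed convention: each provider ships src/features/<sensor_key>/<SRC_FOLDER>/main.R
  # exposing a <SRC_FOLDER>_features(sensor_data, day_segment, provider) entry point,
  # e.g. rapids_features() in src/features/calls/rapids/main.R.
  source(file.path("src/features", sensor_key, provider[["SRC_FOLDER"]], "main.R"))
  features_function <- match.fun(paste0(provider[["SRC_FOLDER"]], "_features"))

  # Compute features once per day segment label (column name assumed) and stack the results.
  for(day_segment in day_segments_labels$label){
    features <- features_function(sensor_data, day_segment, provider)
    sensor_features <- bind_rows(sensor_features, features)
  }
  return(sensor_features)
}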