From 6f451e05ac3abe5c24283902235fc6f7a69ecad1 Mon Sep 17 00:00:00 2001 From: junos Date: Wed, 15 Dec 2021 12:58:02 +0100 Subject: [PATCH] Bring back application_name. This column still needs to be in the data, so add it in app_add_name.py. Later, join categories by package hash. --- src/data/application_categories.R | 7 ++++++- src/data/streams/aware_postgresql/format.yaml | 8 ++++++-- src/data/streams/mutations/phone/straw/app_add_name.py | 5 +++++ src/data/streams/rapids_columns.yaml | 2 ++ 4 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 src/data/streams/mutations/phone/straw/app_add_name.py diff --git a/src/data/application_categories.R b/src/data/application_categories.R index b9fa35ef..83dab3ee 100644 --- a/src/data/application_categories.R +++ b/src/data/application_categories.R @@ -29,6 +29,7 @@ get_genre <- function(apps){ apps <- read.csv(snakemake@input[[1]], stringsAsFactors = F) genre_catalogue <- data.frame() catalogue_source <- snakemake@params[["catalogue_source"]] +package_names_hashed <- snakemake@params[["package_names_hashed"]] update_catalogue_file <- snakemake@params[["update_catalogue_file"]] scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]] apps_with_genre <- data.frame(matrix(ncol=length(colnames(apps)) + 1,nrow=0, dimnames=list(NULL, c(colnames(apps), "genre")))) @@ -38,7 +39,11 @@ if(nrow(apps) > 0){ apps_with_genre <- apps %>% mutate(genre = NA_character_) } else if(catalogue_source == "FILE"){ genre_catalogue <- read.csv(snakemake@params[["catalogue_file"]], colClasses = c("character", "character")) - apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name") + if (package_names_hashed) { + apps_with_genre <- left_join(apps, genre_catalogue, by = "package_hash") + } else { + apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name") + } } if(catalogue_source == "GOOGLE" || (catalogue_source == "FILE" && scrape_missing_genres)){ diff --git a/src/data/streams/aware_postgresql/format.yaml b/src/data/streams/aware_postgresql/format.yaml index a6bf17c1..cbd983a4 100644 --- a/src/data/streams/aware_postgresql/format.yaml +++ b/src/data/streams/aware_postgresql/format.yaml @@ -67,10 +67,12 @@ PHONE_APPLICATIONS_FOREGROUND: TIMESTAMP: timestamp DEVICE_ID: device_id PACKAGE_NAME: package_hash + APPLICATION_NAME: FLAG_TO_MUTATE IS_SYSTEM_APP: is_system_app MUTATION: COLUMN_MAPPINGS: - SCRIPTS: # List any python or r scripts that mutate your raw data + SCRIPTS: + - src/data/streams/mutations/phone/straw/app_add_name.py PHONE_APPLICATIONS_NOTIFICATIONS: ANDROID: @@ -78,11 +80,13 @@ PHONE_APPLICATIONS_NOTIFICATIONS: TIMESTAMP: timestamp DEVICE_ID: device_id PACKAGE_NAME: package_hash + APPLICATION_NAME: FLAG_TO_MUTATE SOUND: sound VIBRATE: vibrate MUTATION: COLUMN_MAPPINGS: - SCRIPTS: # List any python or r scripts that mutate your raw data + SCRIPTS: + - src/data/streams/mutations/phone/straw/app_add_name.py PHONE_BATTERY: ANDROID: diff --git a/src/data/streams/mutations/phone/straw/app_add_name.py b/src/data/streams/mutations/phone/straw/app_add_name.py new file mode 100644 index 00000000..88aee733 --- /dev/null +++ b/src/data/streams/mutations/phone/straw/app_add_name.py @@ -0,0 +1,5 @@ +import pandas as pd + +def main(data, stream_parameters): + data["application_name"] = "hashed" + return(data) \ No newline at end of file diff --git a/src/data/streams/rapids_columns.yaml b/src/data/streams/rapids_columns.yaml index 40ecc98c..06dc8f99 100644 --- a/src/data/streams/rapids_columns.yaml +++ b/src/data/streams/rapids_columns.yaml @@ -27,12 +27,14 @@ PHONE_APPLICATIONS_FOREGROUND: - TIMESTAMP - DEVICE_ID - PACKAGE_NAME + - APPLICATION_NAME - IS_SYSTEM_APP PHONE_APPLICATIONS_NOTIFICATIONS: - TIMESTAMP - DEVICE_ID - PACKAGE_NAME + - APPLICATION_NAME - SOUND - VIBRATE