Completely remove PACKAGE_NAMES_HASHED and instead provide a differently structured file.

master
junos 2023-04-18 22:58:42 +02:00
parent 5307c71df0
commit 1cc7339fc8
4 changed files with 3 additions and 12 deletions

View File

@ -116,7 +116,6 @@ PHONE_APPLICATIONS_FOREGROUND:
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
# Refer to data/external/play_store_categories_count.csv for a list of categories (genres) and their frequency.
PACKAGE_NAMES_HASHED: True
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE; if CATALOGUE_SOURCE is equal to GOOGLE, all scraped genres will be saved to CATALOGUE_FILE
SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
PROVIDERS:

View File

@ -1,4 +1,4 @@
package_hash,genre
package_name,genre
98a5c1a9c7717f791cb4083199ff5c91a958df844a47dc89c7319b2bb824ac94,Personalization
c9112978f6b1c96c767496a15a6dbb9c8dccabe847c31ecc9e5f706de24342a6,Communication
650ab12d7007ee573df2291f3a9207442e3897a9a5b2f5068ad685c0d04751ea,Tools

1 package_hash package_name genre
2 98a5c1a9c7717f791cb4083199ff5c91a958df844a47dc89c7319b2bb824ac94 Personalization
3 c9112978f6b1c96c767496a15a6dbb9c8dccabe847c31ecc9e5f706de24342a6 Communication
4 650ab12d7007ee573df2291f3a9207442e3897a9a5b2f5068ad685c0d04751ea Tools

View File

@ -184,8 +184,7 @@ rule phone_application_categories:
catalogue_source = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_SOURCE"],
catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_FILE"],
update_catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["UPDATE_CATALOGUE_FILE"],
scrape_missing_genres = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"],
package_names_hashed = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["PACKAGE_NAMES_HASHED"]
scrape_missing_genres = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"]
output:
"data/raw/{pid}/phone_applications_{type}_with_datetime_with_categories.csv"
script:

View File

@ -29,24 +29,17 @@ get_genre <- function(apps){
apps <- read.csv(snakemake@input[[1]], stringsAsFactors = F)
genre_catalogue <- data.frame()
catalogue_source <- snakemake@params[["catalogue_source"]]
package_names_hashed <- snakemake@params[["package_names_hashed"]]
update_catalogue_file <- snakemake@params[["update_catalogue_file"]]
scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]]
apps_with_genre <- data.frame(matrix(ncol=length(colnames(apps)) + 1,nrow=0, dimnames=list(NULL, c(colnames(apps), "genre"))))
if (length(package_names_hashed) == 0) {package_names_hashed <- FALSE}
if(nrow(apps) > 0){
if(catalogue_source == "GOOGLE"){
apps_with_genre <- apps %>% mutate(genre = NA_character_)
} else if(catalogue_source == "FILE"){
genre_catalogue <- read.csv(snakemake@params[["catalogue_file"]], colClasses = c("character", "character"))
if (package_names_hashed) {
apps_with_genre <- left_join(apps, genre_catalogue, by = "package_hash")
} else {
apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
}
}
if(catalogue_source == "GOOGLE" || (catalogue_source == "FILE" && scrape_missing_genres)){
apps_without_genre <- (apps_with_genre %>% filter(is.na(genre)) %>% distinct(package_name))$package_name