Completely remove PACKAGE_NAMES_HASHED and instead provide a differently structured file.
parent
5307c71df0
commit
1cc7339fc8
|
@ -116,7 +116,6 @@ PHONE_APPLICATIONS_FOREGROUND:
|
|||
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
|
||||
CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
|
||||
# Refer to data/external/play_store_categories_count.csv for a list of categories (genres) and their frequency.
|
||||
PACKAGE_NAMES_HASHED: True
|
||||
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE; if CATALOGUE_SOURCE is equal to GOOGLE, all scraped genres will be saved to CATALOGUE_FILE
|
||||
SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
|
||||
PROVIDERS:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package_hash,genre
|
||||
package_name,genre
|
||||
98a5c1a9c7717f791cb4083199ff5c91a958df844a47dc89c7319b2bb824ac94,Personalization
|
||||
c9112978f6b1c96c767496a15a6dbb9c8dccabe847c31ecc9e5f706de24342a6,Communication
|
||||
650ab12d7007ee573df2291f3a9207442e3897a9a5b2f5068ad685c0d04751ea,Tools
|
||||
|
|
|
|
@ -184,8 +184,7 @@ rule phone_application_categories:
|
|||
catalogue_source = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_SOURCE"],
|
||||
catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_FILE"],
|
||||
update_catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["UPDATE_CATALOGUE_FILE"],
|
||||
scrape_missing_genres = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"],
|
||||
package_names_hashed = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["PACKAGE_NAMES_HASHED"]
|
||||
scrape_missing_genres = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"]
|
||||
output:
|
||||
"data/raw/{pid}/phone_applications_{type}_with_datetime_with_categories.csv"
|
||||
script:
|
||||
|
|
|
@ -29,23 +29,16 @@ get_genre <- function(apps){
|
|||
apps <- read.csv(snakemake@input[[1]], stringsAsFactors = F)
|
||||
genre_catalogue <- data.frame()
|
||||
catalogue_source <- snakemake@params[["catalogue_source"]]
|
||||
package_names_hashed <- snakemake@params[["package_names_hashed"]]
|
||||
update_catalogue_file <- snakemake@params[["update_catalogue_file"]]
|
||||
scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]]
|
||||
apps_with_genre <- data.frame(matrix(ncol=length(colnames(apps)) + 1,nrow=0, dimnames=list(NULL, c(colnames(apps), "genre"))))
|
||||
|
||||
if (length(package_names_hashed) == 0) {package_names_hashed <- FALSE}
|
||||
|
||||
if(nrow(apps) > 0){
|
||||
if(catalogue_source == "GOOGLE"){
|
||||
apps_with_genre <- apps %>% mutate(genre = NA_character_)
|
||||
} else if(catalogue_source == "FILE"){
|
||||
genre_catalogue <- read.csv(snakemake@params[["catalogue_file"]], colClasses = c("character", "character"))
|
||||
if (package_names_hashed) {
|
||||
apps_with_genre <- left_join(apps, genre_catalogue, by = "package_hash")
|
||||
} else {
|
||||
apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
|
||||
}
|
||||
apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
|
||||
}
|
||||
|
||||
if(catalogue_source == "GOOGLE" || (catalogue_source == "FILE" && scrape_missing_genres)){
|
||||
|
|
Loading…
Reference in New Issue