Completely remove PACKAGE_NAMES_HASHED and instead provide a differently structured file.
parent
5307c71df0
commit
1cc7339fc8
|
@ -116,7 +116,6 @@ PHONE_APPLICATIONS_FOREGROUND:
|
||||||
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
|
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
|
||||||
CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
|
CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
|
||||||
# Refer to data/external/play_store_categories_count.csv for a list of categories (genres) and their frequency.
|
# Refer to data/external/play_store_categories_count.csv for a list of categories (genres) and their frequency.
|
||||||
PACKAGE_NAMES_HASHED: True
|
|
||||||
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
|
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
|
||||||
SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
|
SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
|
||||||
PROVIDERS:
|
PROVIDERS:
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
package_hash,genre
|
package_name,genre
|
||||||
98a5c1a9c7717f791cb4083199ff5c91a958df844a47dc89c7319b2bb824ac94,Personalization
|
98a5c1a9c7717f791cb4083199ff5c91a958df844a47dc89c7319b2bb824ac94,Personalization
|
||||||
c9112978f6b1c96c767496a15a6dbb9c8dccabe847c31ecc9e5f706de24342a6,Communication
|
c9112978f6b1c96c767496a15a6dbb9c8dccabe847c31ecc9e5f706de24342a6,Communication
|
||||||
650ab12d7007ee573df2291f3a9207442e3897a9a5b2f5068ad685c0d04751ea,Tools
|
650ab12d7007ee573df2291f3a9207442e3897a9a5b2f5068ad685c0d04751ea,Tools
|
||||||
|
|
|
|
@ -184,8 +184,7 @@ rule phone_application_categories:
|
||||||
catalogue_source = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_SOURCE"],
|
catalogue_source = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_SOURCE"],
|
||||||
catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_FILE"],
|
catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_FILE"],
|
||||||
update_catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["UPDATE_CATALOGUE_FILE"],
|
update_catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["UPDATE_CATALOGUE_FILE"],
|
||||||
scrape_missing_genres = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"],
|
scrape_missing_genres = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"]
|
||||||
package_names_hashed = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["PACKAGE_NAMES_HASHED"]
|
|
||||||
output:
|
output:
|
||||||
"data/raw/{pid}/phone_applications_{type}_with_datetime_with_categories.csv"
|
"data/raw/{pid}/phone_applications_{type}_with_datetime_with_categories.csv"
|
||||||
script:
|
script:
|
||||||
|
|
|
@ -29,24 +29,17 @@ get_genre <- function(apps){
|
||||||
apps <- read.csv(snakemake@input[[1]], stringsAsFactors = F)
|
apps <- read.csv(snakemake@input[[1]], stringsAsFactors = F)
|
||||||
genre_catalogue <- data.frame()
|
genre_catalogue <- data.frame()
|
||||||
catalogue_source <- snakemake@params[["catalogue_source"]]
|
catalogue_source <- snakemake@params[["catalogue_source"]]
|
||||||
package_names_hashed <- snakemake@params[["package_names_hashed"]]
|
|
||||||
update_catalogue_file <- snakemake@params[["update_catalogue_file"]]
|
update_catalogue_file <- snakemake@params[["update_catalogue_file"]]
|
||||||
scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]]
|
scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]]
|
||||||
apps_with_genre <- data.frame(matrix(ncol=length(colnames(apps)) + 1,nrow=0, dimnames=list(NULL, c(colnames(apps), "genre"))))
|
apps_with_genre <- data.frame(matrix(ncol=length(colnames(apps)) + 1,nrow=0, dimnames=list(NULL, c(colnames(apps), "genre"))))
|
||||||
|
|
||||||
if (length(package_names_hashed) == 0) {package_names_hashed <- FALSE}
|
|
||||||
|
|
||||||
if(nrow(apps) > 0){
|
if(nrow(apps) > 0){
|
||||||
if(catalogue_source == "GOOGLE"){
|
if(catalogue_source == "GOOGLE"){
|
||||||
apps_with_genre <- apps %>% mutate(genre = NA_character_)
|
apps_with_genre <- apps %>% mutate(genre = NA_character_)
|
||||||
} else if(catalogue_source == "FILE"){
|
} else if(catalogue_source == "FILE"){
|
||||||
genre_catalogue <- read.csv(snakemake@params[["catalogue_file"]], colClasses = c("character", "character"))
|
genre_catalogue <- read.csv(snakemake@params[["catalogue_file"]], colClasses = c("character", "character"))
|
||||||
if (package_names_hashed) {
|
|
||||||
apps_with_genre <- left_join(apps, genre_catalogue, by = "package_hash")
|
|
||||||
} else {
|
|
||||||
apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
|
apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if(catalogue_source == "GOOGLE" || (catalogue_source == "FILE" && scrape_missing_genres)){
|
if(catalogue_source == "GOOGLE" || (catalogue_source == "FILE" && scrape_missing_genres)){
|
||||||
apps_without_genre <- (apps_with_genre %>% filter(is.na(genre)) %>% distinct(package_name))$package_name
|
apps_without_genre <- (apps_with_genre %>% filter(is.na(genre)) %>% distinct(package_name))$package_name
|
||||||
|
|
Loading…
Reference in New Issue