Completely remove PACKAGE_NAMES_HASHED and instead provide a differently structured file.

2023-04-18 22:58:42 +02:00 · 2023-04-18 22:58:42 +02:00 · 1cc7339fc8
parent 5307c71df0
commit 1cc7339fc8
4 changed files with 3 additions and 12 deletions
--- a/config.yaml
+++ b/config.yaml
@@ -116,7 +116,6 @@ PHONE_APPLICATIONS_FOREGROUND:
    CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store)
    CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
    # Refer to data/external/play_store_categories_count.csv for a list of categories (genres) and their frequency.
-    PACKAGE_NAMES_HASHED: True
    UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
    SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
  PROVIDERS:
--- a/data/external/play_store_application_genre_catalogue.csv
+++ b/data/external/play_store_application_genre_catalogue.csv
@@ -1,4 +1,4 @@
-package_hash,genre
+package_name,genre
 98a5c1a9c7717f791cb4083199ff5c91a958df844a47dc89c7319b2bb824ac94,Personalization
 c9112978f6b1c96c767496a15a6dbb9c8dccabe847c31ecc9e5f706de24342a6,Communication
 650ab12d7007ee573df2291f3a9207442e3897a9a5b2f5068ad685c0d04751ea,Tools
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@@ -184,8 +184,7 @@ rule phone_application_categories:
        catalogue_source = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_SOURCE"],
        catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["CATALOGUE_FILE"],
        update_catalogue_file = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["UPDATE_CATALOGUE_FILE"],
-        scrape_missing_genres = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"],
-        package_names_hashed = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["PACKAGE_NAMES_HASHED"]
+        scrape_missing_genres = lambda wildcards: config["PHONE_APPLICATIONS_" + str(wildcards.type).upper()]["APPLICATION_CATEGORIES"]["SCRAPE_MISSING_CATEGORIES"]
    output:
        "data/raw/{pid}/phone_applications_{type}_with_datetime_with_categories.csv"
    script:
--- a/src/data/application_categories.R
+++ b/src/data/application_categories.R
@@ -29,24 +29,17 @@ get_genre <- function(apps){
 apps <- read.csv(snakemake@input[[1]], stringsAsFactors = F)
 genre_catalogue <- data.frame()
 catalogue_source <- snakemake@params[["catalogue_source"]]
-package_names_hashed <- snakemake@params[["package_names_hashed"]]
 update_catalogue_file <- snakemake@params[["update_catalogue_file"]]
 scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]]
 apps_with_genre <- data.frame(matrix(ncol=length(colnames(apps)) + 1,nrow=0, dimnames=list(NULL, c(colnames(apps), "genre"))))

-if (length(package_names_hashed) == 0) {package_names_hashed <- FALSE}
-
 if(nrow(apps) > 0){
  if(catalogue_source == "GOOGLE"){
    apps_with_genre <- apps %>% mutate(genre = NA_character_)
  } else if(catalogue_source == "FILE"){
    genre_catalogue <- read.csv(snakemake@params[["catalogue_file"]], colClasses = c("character", "character"))
-    if (package_names_hashed) {
-      apps_with_genre <- left_join(apps, genre_catalogue, by = "package_hash")
-    } else {
    apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
  }
-  }

  if(catalogue_source == "GOOGLE" || (catalogue_source == "FILE" && scrape_missing_genres)){
    apps_without_genre <- (apps_with_genre %>% filter(is.na(genre)) %>% distinct(package_name))$package_name