library(conflicted)
library(yaml)
library(RPostgreSQL)
library(tidyverse)
conflicts_prefer(
  dplyr::filter,
  dplyr::lag
)
library(magrittr)

# Database connection ----

# The database password lives in a YAML credentials file outside this script.
credentials <- yaml.load_file("../rapids/credentials.yaml")
pw <- credentials$PSQL_STRAW$password

# PostgreSQL driver object.
drv <- RPostgres::Postgres()

# Open the connection; `con` is reused by every database call below.
con <- RPostgres::dbConnect(
  drv,
  dbname = "staw",
  host = "eol.ijs.si",
  port = 5432,
  user = "staw_db",
  password = pw
)

# Do not keep the password in the workspace once the connection is open.
rm(pw, credentials)

# Fetch the application categories ----

# Confirm that the app_categories table is present in the database.
dbExistsTable(con, "app_categories")

# Pull the whole table into a local tibble.
df_app_categories <- collect(tbl(con, "app_categories"))

head(df_app_categories)

# Number of applications per Play Store genre.
table(df_app_categories[["play_store_genre"]])

# Explore the "not_found" entries ----

# Recorded observation: all "not_found" have an HTTP status of 404.
filter(df_app_categories, play_store_genre == "not_found") %>%
  group_by(play_store_response) %>%
  count()

# Recorded observation: all "not_found" apps are unique.
filter(df_app_categories, play_store_genre == "not_found") %>%
  group_by(package_name) %>%
  count() %>%
  arrange(desc(n))

# Flag packages from phone manufacturers, custom ROMs and the OS ----

manufacturers <- c(
  "samsung", "oneplus", "huawei", "xiaomi", "lge",
  "motorola", "miui", "lenovo", "oppo", "mediatek"
)
custom_rom <- c(
  "coloros", "lineageos", "myos", "cyanogenmod", "foundation.e"
)
other <- c("android", "wssyncmldm")

# A single alternation pattern; a package is flagged when any term occurs
# anywhere in its name.
grep_pattern <- paste0(c(manufacturers, custom_rom, other), collapse = "|")
rows_os_manufacturer <- grepl(
  pattern = grep_pattern,
  x = df_app_categories[["package_name"]]
)

# Explore what remains after excluding above.
df_app_categories %>%
  filter(!rows_os_manufacturer, play_store_genre == "not_found")

# Clean up the genres ----

# Collapse the duplicated "Education" labels and file every application that
# was not found in the Play Store under "System". Afterwards drop the readable
# package name and expose the hash under the `package_name` column instead.
df_app_categories <- df_app_categories %>%
  mutate(
    play_store_genre = case_when(
      play_store_genre %in%
        c("Education,Education", "EducationEducation") ~ "Education",
      play_store_genre == "not_found" ~ "System",
      .default = play_store_genre
    )
  ) %>%
  select(!package_name) %>%
  rename(
    genre = play_store_genre,
    package_name = package_hash
  )

# Number of applications per cleaned genre.
table(df_app_categories[["genre"]])

# Export ----

# Genre frequencies, most common first.
df_app_categories %>%
  count(genre, sort = TRUE) %>%
  write_csv("play_store_categories_count.csv")

# Catalogue mapping each (hashed) package name to its genre.
write_csv(
  select(df_app_categories, package_name, genre),
  file = "play_store_application_genre_catalogue.csv"
)

# Release the database connection.
dbDisconnect(con)