library(conflicted)
library(yaml)
library(RPostgreSQL)
library(tidyverse)
conflicts_prefer(
  dplyr::filter,
  dplyr::lag
)
library(magrittr)

# Setup: credentials and database connection ----

# Read the database password from the credentials file.
credentials <- yaml.load_file("../rapids/credentials.yaml")
pw <- credentials$PSQL_STRAW$password

# Load the PostgreSQL driver.
drv <- RPostgres::Postgres()

# Create a connection to the Postgres database.
# Note that "con" is used for every later query against the database.
con <- RPostgres::dbConnect(drv,
  dbname = "staw",
  host = "eol.ijs.si",
  port = 5432,
  user = "staw_db",
  password = pw
)

rm(pw, credentials) # Remove the password from the workspace.

# Explore the app_categories table ----

# Check that the app_categories table exists.
dbExistsTable(con, "app_categories")

# Pull the whole table into memory; everything below works on this copy.
df_app_categories <- tbl(con, "app_categories") %>%
  collect()

head(df_app_categories)
table(df_app_categories$play_store_genre)

df_app_categories %>%
  filter(play_store_genre == "not_found") %>%
  count(play_store_response)
# All "not_found" have an HTTP status of 404.

df_app_categories %>%
  filter(play_store_genre == "not_found") %>%
  count(package_name) %>%
  arrange(desc(n))
# All "not_found" apps are unique.

# Exclude phone manufacturers, custom ROM names and similar.
manufacturers <- c(
  "samsung",
  "oneplus",
  "huawei",
  "xiaomi",
  "lge",
  "motorola",
  "miui",
  "lenovo",
  "oppo",
  "mediatek"
)

custom_rom <- c("coloros", "lineageos", "myos", "cyanogenmod", "foundation.e")

other <- c("android", "wssyncmldm")

# Single alternation regex; it is reused when the apps are classified.
grep_pattern <- paste(c(manufacturers, custom_rom, other), collapse = "|")

# Match against the lower-cased package name so that this exploratory mask
# agrees with the case-insensitive matching used for the classification.
# grepl() yields FALSE (not NA) for missing names, which keeps the row
# subsetting below well defined.
rows_os_manufacturer <- grepl(
  grep_pattern,
  tolower(df_app_categories$package_name)
)

# Explore what remains after excluding above.
df_app_categories[!rows_os_manufacturer, ] %>%
  filter(play_store_genre == "not_found")

# Also check the relationship between is_system_app and System category.
tbl(con, "applications") %>%
  filter(is_system_app, play_store_genre != "System") %>%
  count()
# They are perfectly correlated.
# Manually classify apps ----

#' Assign a genre to apps whose Play Store lookup returned "not_found"
#'
#' Rules are checked in order against the lower-cased package name and the
#' first matching rule wins. Genres other than "not_found" (including `NA`)
#' are returned untouched, so the function can be applied to a whole column.
#'
#' @param package_name Character vector of package names.
#' @param genre Character vector of current genres, same length as
#'   `package_name`.
#' @param system_pattern Regular expression identifying OS, manufacturer and
#'   custom ROM packages; matches are labelled "System".
#' @return Character vector of genres, the same length as `genre`. Entries
#'   that match no rule stay "not_found".
classify_not_found <- function(package_name, genre, system_pattern) {
  # Lower-case once instead of once per rule.
  pkg <- str_to_lower(package_name)

  tools_pattern <- paste(
    c(
      "qr", "barcode", "archimedes", "mixplorer", "winrar",
      "filemanager", "shot", "faceunlock", "signin", "milink"
    ),
    collapse = "|"
  )

  case_when(
    # Guard first: only "not_found" rows may be reclassified. The explicit
    # is.na() keeps missing genres missing instead of letting them fall
    # through to the package-name rules.
    is.na(genre) | genre != "not_found" ~ genre,
    str_detect(pkg, system_pattern) ~ "System",
    str_detect(pkg, "straw") ~ "STRAW",
    str_detect(pkg, "chromium") ~ "Communication", # Same as chrome.
    str_detect(pkg, "skype") ~ "Communication", # Skype Lite not classified.
    str_detect(pkg, "imsservice") ~ "Communication", # IP Multimedia Subsystem
    str_detect(pkg, "covid|empatica") ~ "Medical",
    str_detect(pkg, "libri|tachiyomi") ~ "Books & Reference",
    str_detect(pkg, "bricks|chess") ~ "Casual",
    str_detect(pkg, "weather") ~ "Weather",
    str_detect(pkg, "excel") ~ "Productivity",
    str_detect(pkg, tools_pattern) ~ "Tools",
    str_detect(pkg, "stupeflix") ~ "Photography",
    str_detect(pkg, "anyme") ~ "Entertainment",
    str_detect(pkg, "vanced") ~ "Video Players & Editors",
    str_detect(pkg, "music|radio|dolby") ~ "Music & Audio",
    str_detect(pkg, "tensorflow|object_detection") ~ "Education",
    .default = genre
  )
}

# A single mutate() over all rows replaces the former logical-index
# sub-assignment, which fails when play_store_genre contains NA.
df_app_categories <- df_app_categories %>%
  mutate(
    play_store_genre = classify_not_found(
      package_name,
      play_store_genre,
      grep_pattern
    )
  )

# Explore what remains after classifying above.
df_app_categories %>%
  filter(play_store_genre == "not_found")
# After this, 13 applications remain, which I will classify as "Other".
# Clean up genre labels and export ----

# Repair the duplicated "Education" labels and fold every app that is still
# 'not_found' into "Other". Then drop the plain package name, so that the
# hash is exposed under the package_name column and the genre column gets
# its short name.
df_app_categories <- df_app_categories %>%
  mutate(
    play_store_genre = case_when(
      play_store_genre %in%
        c("Education,Education", "EducationEducation") ~ "Education",
      play_store_genre == "not_found" ~ "Other",
      .default = play_store_genre
    )
  ) %>%
  select(-package_name) %>%
  rename(
    package_name = package_hash,
    genre = play_store_genre
  )

table(df_app_categories$genre)

# Number of apps per genre, most frequent genre first.
genre_counts <- df_app_categories %>%
  count(genre) %>%
  arrange(desc(n))
write_csv(genre_counts, "play_store_categories_count.csv")

# Catalogue mapping each (hashed) package name to its genre.
df_app_categories %>%
  select(package_name, genre) %>%
  write_csv(file = "play_store_application_genre_catalogue.csv")

dbDisconnect(con)