library(conflicted)
library(yaml)
# NOTE(review): the connection code below uses RPostgres::Postgres() /
# RPostgres::dbConnect(), not this package — confirm whether RPostgreSQL is
# still needed or whether library(RPostgres) was intended here.
library(RPostgreSQL)
library(tidyverse)
|
2023-04-18 15:49:33 +02:00
|
|
|
conflicts_prefer(
|
|
|
|
dplyr::filter,
|
|
|
|
dplyr::lag
|
|
|
|
)
|
2023-04-18 15:34:06 +02:00
|
|
|
library(magrittr)
|
|
|
|
|
|
|
|
# Read the database password from a credentials file kept outside this
# project directory (../rapids/credentials.yaml), so it is never committed.
credentials <- yaml.load_file("../rapids/credentials.yaml")
pw <- credentials$PSQL_STRAW$password
|
|
|
|
|
2023-04-18 15:49:33 +02:00
|
|
|
# load the PostgreSQL driver
|
2023-04-18 15:34:06 +02:00
|
|
|
drv <- RPostgres::Postgres()
|
|
|
|
|
|
|
|
# creates a connection to the postgres database
|
|
|
|
# note that "con" will be used later in each connection to the database
|
2023-04-18 15:49:33 +02:00
|
|
|
con <- RPostgres::dbConnect(drv,
|
|
|
|
dbname = "staw",
|
|
|
|
host = "eol.ijs.si", port = 5432,
|
|
|
|
user = "staw_db", password = pw
|
|
|
|
)
|
rm(pw, credentials) # remove the password and credentials from the session

# Sanity-check the connection by probing for a known table.
# (Original comment mentioned the bluetooth table; the probe actually
# targets app_categories.)
dbExistsTable(con, "app_categories")
|
|
|
|
|
# Materialise the entire app_categories table into a local tibble.
df_app_categories <- collect(tbl(con, "app_categories"))

# Quick look at the data and at the distribution of Play Store genres.
head(df_app_categories)
table(df_app_categories$play_store_genre)
|
|
|
|
|
# Inspect the HTTP responses recorded for apps whose genre lookup failed.
df_app_categories %>%
  filter(play_store_genre == "not_found") %>%
  group_by(play_store_response) %>%
  count()

# All "not_found" have an HTTP status of 404.

# Check whether any failed package name appears more than once.
df_app_categories %>%
  filter(play_store_genre == "not_found") %>%
  group_by(package_name) %>%
  count() %>%
  arrange(desc(n))

# All "not_found" apps are unique.
|
|
|
|
|
|
|
|
# Exclude phone manufacturers, custom ROM names and similar: package names
# matching these terms are treated as preinstalled system software below.
manufacturers <- c(
  "samsung",
  "oneplus",
  "huawei",
  "xiaomi",
  "lge",
  "motorola",
  "miui",
  "lenovo",
  "oppo",
  "mediatek"
)

custom_rom <- c("coloros", "lineageos", "myos", "cyanogenmod", "foundation.e")

other <- c("android", "wssyncmldm")

# Build a single alternation pattern from all terms. Each term is meant to
# match literally, so escape the regex metacharacter in "foundation.e" —
# an unescaped "." would match ANY character (e.g. "foundationXe").
escaped_terms <- gsub(".", "\\.", c(manufacturers, custom_rom, other), fixed = TRUE)
grep_pattern <- paste(escaped_terms, collapse = "|")
|
|
|
|
|
|
|
|
# Flag rows whose package name matches a manufacturer / ROM / system term.
# Match case-insensitively for consistency with the classification step,
# which applies the same pattern to str_to_lower(package_name).
rows_os_manufacturer <- grepl(grep_pattern, df_app_categories$package_name,
  ignore.case = TRUE
)
|
|
|
|
|
|
|
|
# Explore what remains after excluding the system/manufacturer packages.
df_app_categories %>%
  filter(!rows_os_manufacturer, play_store_genre == "not_found")
|
|
|
|
|
# Manually classify apps whose Play Store genre lookup failed ("not_found").
# The classified subset is assigned back into the matching rows of the data
# frame; this relies on filter() preserving row order and mutate() preserving
# column order, so the RHS rows/columns line up exactly with the LHS subset —
# fragile, handle with care if columns are ever added or reordered.
df_app_categories[df_app_categories$play_store_genre == "not_found",] <-
  df_app_categories %>%
  filter(play_store_genre == "not_found") %>%
  mutate(
    play_store_genre =
      case_when(
        # System software: manufacturers / ROMs / other, via the pattern built above.
        str_detect(str_to_lower(package_name), grep_pattern) ~ "System",
        str_detect(str_to_lower(package_name), "straw") ~ "STRAW",
        str_detect(str_to_lower(package_name), "chromium") ~ "Communication", # Same as chrome.
        str_detect(str_to_lower(package_name), "skype") ~ "Communication", # Skype Lite not classified.
        str_detect(str_to_lower(package_name), "imsservice") ~ "Communication", # IP Multimedia Subsystem
        str_detect(str_to_lower(package_name), paste(c("covid", "empatica"), collapse = "|")) ~ "Medical",
        str_detect(str_to_lower(package_name), paste(c("libri", "tachiyomi"), collapse = "|")) ~ "Books & Reference",
        str_detect(str_to_lower(package_name), paste(c("bricks", "chess"), collapse = "|")) ~ "Casual",
        str_detect(str_to_lower(package_name), "weather") ~ "Weather",
        str_detect(str_to_lower(package_name), "excel") ~ "Productivity",
        str_detect(str_to_lower(package_name), paste(c("qr", "barcode", "archimedes", "mixplorer", "winrar", "filemanager", "shot", "faceunlock", "signin", "milink"), collapse = "|")) ~ "Tools",
        str_detect(str_to_lower(package_name), "stupeflix") ~ "Photography",
        str_detect(str_to_lower(package_name), "anyme") ~ "Entertainment",
        str_detect(str_to_lower(package_name), "vanced") ~ "Video Players & Editors",
        str_detect(str_to_lower(package_name), paste(c("music", "radio", "dolby"), collapse = "|")) ~ "Music & Audio",
        str_detect(str_to_lower(package_name), paste(c("tensorflow", "object_detection"), collapse = "|")) ~ "Education",
        # Anything not matched above keeps its original "not_found" genre.
        .default = play_store_genre
      )
  )
|
|
|
|
|
|
|
|
# Explore what remains unclassified after the manual classification above.
df_app_categories %>%
  filter(play_store_genre == "not_found")

# After this, 13 applications remain, which I will classify as "Other".
|
|
|
|
|
# Correct known data-entry mistakes in the genre column, classify the
# remaining "not_found" apps as "Other", then drop the plain package name and
# promote the hashed identifier (package_hash) to the package_name column.
# Idiom fix: the original wrapped case_when() in an immediately-invoked
# anonymous function `{function(x) {...}}(play_store_genre)`; a direct
# case_when() on the column is equivalent and the idiomatic dplyr form.
df_app_categories %<>%
  mutate(
    play_store_genre = case_when(
      play_store_genre == "Education,Education" ~ "Education",
      play_store_genre == "EducationEducation" ~ "Education",
      play_store_genre == "not_found" ~ "Other",
      .default = play_store_genre
    )
  ) %>%
  select(-package_name) %>%
  rename(
    genre = play_store_genre,
    package_name = package_hash
  )
|
2023-04-18 16:10:11 +02:00
|
|
|
|
|
|
|
# Distribution of genres after the corrections above.
table(df_app_categories$genre)

# Count apps per genre, most frequent first, and save the summary to CSV.
df_app_categories %>%
  group_by(genre) %>%
  count() %>%
  arrange(desc(n)) %>%
  write_csv("play_store_categories_count.csv")
|
|
|
|
|
2023-04-18 16:10:11 +02:00
|
|
|
write_csv(
|
2023-04-19 09:29:40 +02:00
|
|
|
x = select(df_app_categories, c(package_name, genre)),
|
2023-04-18 16:10:11 +02:00
|
|
|
file = "play_store_application_genre_catalogue.csv"
|
2023-04-18 15:49:33 +02:00
|
|
|
)
|
|
|
|
|
# Close the database connection now that all queries are done.
dbDisconnect(con)
|