stress_at_work_analysis/presentation/ApplicationCategories.R

109 lines
2.5 KiB
R
Raw Normal View History

2023-04-18 15:34:06 +02:00
library(conflicted)
library(yaml)
library(RPostgreSQL)
library(tidyverse)
2023-04-18 15:49:33 +02:00
conflicts_prefer(
dplyr::filter,
dplyr::lag
)
2023-04-18 15:34:06 +02:00
library(magrittr)
# Database connection ----
# The password is read from a YAML credentials file so that it never has to be
# written into this script.
credentials <- yaml.load_file("../rapids/credentials.yaml")
pw <- credentials$PSQL_STRAW$password

# Instantiate the RPostgres driver object.
drv <- RPostgres::Postgres()

# Open the connection; `con` is the handle used by every database call below.
con <- RPostgres::dbConnect(
  drv,
  dbname = "staw",
  host = "eol.ijs.si",
  port = 5432,
  user = "staw_db",
  password = pw
)

# The secrets are no longer needed once connected, so drop them from the
# workspace.
rm(credentials, pw)
# Check that the "app_categories" table is present in the database.
dbExistsTable(con, "app_categories")

# Download the complete table into a local data frame.
df_app_categories <- collect(tbl(con, "app_categories"))

# Preview the first rows and tabulate the values of `play_store_genre`.
head(df_app_categories)
table(df_app_categories[["play_store_genre"]])
2023-04-19 09:47:43 +02:00
# Inspect the rows whose genre is "not_found".

# Which Play Store responses do these rows carry?
# `count(col)` is used rather than `group_by(col) %>% count()` so that the
# result is a plain, ungrouped tibble.
df_app_categories %>%
  filter(play_store_genre == "not_found") %>%
  count(play_store_response)
# All "not_found" have an HTTP status of 404.

# Does any package name occur more than once among them?
# `sort = TRUE` lists the most frequent package names first.
df_app_categories %>%
  filter(play_store_genre == "not_found") %>%
  count(package_name, sort = TRUE)
# All "not_found" apps are unique.
# Exclude phone manufacturers, custom ROM names and similar.
# Every entry below is a regular-expression fragment; the fragments are joined
# into a single alternation that is later matched against package names.
manufacturers <- c(
  "samsung",
  "oneplus",
  "huawei",
  "xiaomi",
  "lge",
  "motorola",
  "miui",
  "lenovo",
  "oppo",
  "mediatek"
)
# The dot in "foundation.e" is escaped so that it matches a literal "." only;
# left unescaped it would match any single character.
custom_rom <- c(
  "coloros",
  "lineageos",
  "myos",
  "cyanogenmod",
  "foundation\\.e"
)
other <- c("android", "wssyncmldm")
# Build one pattern of the form "samsung|oneplus|...|wssyncmldm".
grep_pattern <- paste(c(manufacturers, custom_rom, other), collapse = "|")
# Flag every row whose package name matches the exclusion pattern.
rows_os_manufacturer <- grepl(
  pattern = grep_pattern,
  x = df_app_categories$package_name
)
# Explore what remains after excluding above.
df_app_categories %>%
  filter(!rows_os_manufacturer, play_store_genre == "not_found")
2023-04-18 15:49:33 +02:00
# Correct some mistakes and classify 'not_found'.
# The duplicated "Education" labels are collapsed into a single genre and
# "not_found" is relabelled as "System"; every other genre is kept unchanged.
df_app_categories <- df_app_categories %>%
  mutate(
    play_store_genre = case_when(
      play_store_genre %in% c("Education,Education", "EducationEducation") ~
        "Education",
      play_store_genre == "not_found" ~ "System",
      .default = play_store_genre
    )
  ) %>%
  # Drop the original `package_name`; `package_hash` takes over that name.
  select(-package_name) %>%
  rename(
    genre = play_store_genre,
    package_name = package_hash
  )
2023-04-18 16:10:11 +02:00
# Tabulate the cleaned genres.
table(df_app_categories[["genre"]])

# Save the number of rows per genre, most frequent genre first.
df_app_categories %>%
  count(genre, sort = TRUE) %>%
  write_csv("play_store_categories_count.csv")

# Save the package-to-genre catalogue.
write_csv(
  select(df_app_categories, package_name, genre),
  file = "play_store_application_genre_catalogue.csv"
)

# Close the database connection.
dbDisconnect(con)