diff --git a/presentation/ApplicationCategories.R b/presentation/ApplicationCategories.R index 4a696b5..3e47b23 100644 --- a/presentation/ApplicationCategories.R +++ b/presentation/ApplicationCategories.R @@ -34,7 +34,45 @@ df_app_categories <- tbl(con, "app_categories") %>% head(df_app_categories) table(df_app_categories$play_store_genre) +df_app_categories %>% + filter(play_store_genre == "not_found") %>% + group_by(play_store_response) %>% + count() +# All "not_found" have an HTTP status of 404. + +df_app_categories %>% + filter(play_store_genre == "not_found") %>% + group_by(package_name) %>% + count() %>% + arrange(desc(n)) +# All "not_found" apps are unique. + +# Exclude phone manufacturers, custom ROM names and similar. +manufacturers <- c( + "samsung", + "oneplus", + "huawei", + "xiaomi", + "lge", + "motorola", + "miui", + "lenovo", + "oppo", + "mediatek" +) +custom_rom <- c("coloros", "lineageos", "myos", "cyanogenmod", "foundation.e") +other <- c("android", "wssyncmldm") + +grep_pattern <- paste(c(manufacturers, custom_rom, other), collapse = "|") + +rows_os_manufacturer <- grepl(grep_pattern, df_app_categories$package_name) + +# Explore what remains after excluding above. +df_app_categories[!rows_os_manufacturer, ] %>% + filter(play_store_genre == "not_found") + # Correct some mistakes +# And classify 'not_found' df_app_categories %<>% mutate( play_store_genre = { @@ -48,16 +86,17 @@ df_app_categories %<>% } }(play_store_genre) ) %>% - select(-package_name) %>% + select(-package_name) %>% rename( genre = play_store_genre, - package_name = package_hash) + package_name = package_hash + ) table(df_app_categories$genre) -df_app_categories %>% - group_by(genre) %>% - count() %>% +df_app_categories %>% + group_by(genre) %>% + count() %>% arrange(desc(n)) %>% write_csv("play_store_categories_count.csv")