Clean up categories.

ml_pipeline
junos 2023-04-18 15:49:33 +02:00
parent d092e17e33
commit 0b16aa6fe4
1 changed files with 25 additions and 7 deletions

View File

@ -2,22 +2,26 @@ library(conflicted)
library(yaml) library(yaml)
library(RPostgreSQL) library(RPostgreSQL)
library(tidyverse) library(tidyverse)
conflicts_prefer(dplyr::filter, conflicts_prefer(
dplyr::lag) dplyr::filter,
dplyr::lag
)
library(magrittr) library(magrittr)
# read the password from file # read the password from file
credentials <- yaml.load_file("../rapids/credentials.yaml") credentials <- yaml.load_file("../rapids/credentials.yaml")
pw <- credentials$PSQL_STRAW$password pw <- credentials$PSQL_STRAW$password
#load the PostgreSQL driver # load the PostgreSQL driver
drv <- RPostgres::Postgres() drv <- RPostgres::Postgres()
# creates a connection to the postgres database # creates a connection to the postgres database
# note that "con" will be used later in each connection to the database # note that "con" will be used later in each connection to the database
con <- RPostgres::dbConnect(drv, dbname = "staw", con <- RPostgres::dbConnect(drv,
host = "eol.ijs.si", port = 5432, dbname = "staw",
user = "staw_db", password = pw) host = "eol.ijs.si", port = 5432,
user = "staw_db", password = pw
)
rm(pw, credentials) # removes the password rm(pw, credentials) # removes the password
@ -30,4 +34,18 @@ df_app_categories <- tbl(con, "app_categories") %>%
head(df_app_categories) head(df_app_categories)
table(df_app_categories$play_store_genre) table(df_app_categories$play_store_genre)
# Correct some mistakes in the Play Store genre labels:
#   * "Education,Education" and "EducationEducation" are collapsed to
#     "Education" (presumably duplicated renderings of one genre -- confirm
#     against the data source);
#   * "not_found" is relabelled as "System".
# Every other value is passed through unchanged via `.default`; NA also falls
# through to `.default`, because neither `%in%` nor an NA comparison matches.
# `case_when()` is applied to the column directly: wrapping it in an
# immediately-invoked anonymous function added nothing.
df_app_categories <- df_app_categories %>%
  mutate(
    play_store_genre = case_when(
      play_store_genre %in% c("Education,Education", "EducationEducation") ~
        "Education",
      play_store_genre == "not_found" ~ "System",
      .default = play_store_genre
    )
  )
dbDisconnect(con) dbDisconnect(con)