Add category (genre) to foreground apps
parent
34c4586e4d
commit
9d00f14f7f
|
@ -96,6 +96,7 @@ packrat/*
|
|||
# exclude data from source control by default
|
||||
data/external/*
|
||||
!/data/external/.gitkeep
|
||||
!/data/external/stachl_application_genre_catalogue.csv
|
||||
data/raw/*
|
||||
!/data/raw/.gitkeep
|
||||
data/interim/*
|
||||
|
|
|
@ -10,6 +10,7 @@ rule all:
|
|||
expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["FITBIT_TABLE"]),
|
||||
expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
|
||||
expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]),
|
||||
expand("data/interim/{pid}/applications_foreground_with_datetime_with_genre.csv", pid=config["PIDS"]),
|
||||
expand("data/processed/{pid}/screen_deltas.csv", pid=config["PIDS"]),
|
||||
expand("data/processed/{pid}/plugin_google_activity_recognition_deltas.csv", pid=config["PIDS"]),
|
||||
expand("data/interim/{pid}/phone_valid_sensed_days.csv", pid=config["PIDS"]),
|
||||
|
|
|
@ -44,6 +44,12 @@ CALLS:
|
|||
outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, hubermduration, varqnduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
|
||||
DAY_SEGMENTS: *day_segments
|
||||
|
||||
APPLICATION_GENRES:
|
||||
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
|
||||
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
|
||||
UPDATE_CATALOGUE_FILE: false # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE; if CATALOGUE_SOURCE is equal to GOOGLE, all scraped genres will be saved to CATALOGUE_FILE
|
||||
SCRAPE_MISSING_GENRES: false # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
|
||||
|
||||
PHONE_VALID_SENSED_DAYS:
|
||||
BIN_SIZE: 5 # (in minutes)
|
||||
MIN_VALID_HOURS: 20 # (out of 24)
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -69,6 +69,19 @@ rule resample_fused_location:
|
|||
script:
|
||||
"../src/data/resample_fused_location.R"
|
||||
|
||||
# Adds a genre column to a participant's foreground-application records.
# Reads data/raw/{pid}/applications_foreground_with_datetime.csv, forwards the
# four APPLICATION_GENRES settings from the config as params, and runs
# src/data/application_genres.R to write the *_with_genre.csv file in data/interim.
rule application_genres:
|
||||
input:
|
||||
"data/raw/{pid}/applications_foreground_with_datetime.csv"
|
||||
params:
|
||||
catalogue_source = config["APPLICATION_GENRES"]["CATALOGUE_SOURCE"],
|
||||
catalogue_file = config["APPLICATION_GENRES"]["CATALOGUE_FILE"],
|
||||
update_catalogue_file = config["APPLICATION_GENRES"]["UPDATE_CATALOGUE_FILE"],
|
||||
scrape_missing_genres = config["APPLICATION_GENRES"]["SCRAPE_MISSING_GENRES"]
|
||||
output:
|
||||
"data/interim/{pid}/applications_foreground_with_datetime_with_genre.csv"
|
||||
script:
|
||||
"../src/data/application_genres.R"
|
||||
|
||||
rule fitbit_heartrate_with_datetime:
|
||||
input:
|
||||
"data/raw/{pid}/fitbit_data_raw.csv"
|
||||
|
@ -98,3 +111,4 @@ rule fitbit_sleep_with_datetime:
|
|||
"data/raw/{pid}/fitbit_sleep_with_datetime.csv"
|
||||
script:
|
||||
"../src/data/fitbit_sleep_with_datetime.py"
|
||||
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
source("packrat/init.R")
|
||||
|
||||
library(tidyr)
|
||||
library(dplyr)
|
||||
library(stringr)
|
||||
library("rvest")
|
||||
|
||||
# Look up the Google Play Store genre of each application package.
#
# For every package name in `apps`, the Play Store details page is downloaded
# to a temporary file and the text of the first element carrying
# itemprop="genre" is extracted. The genre is lower-cased and stripped of
# whitespace and ampersands.
#
# Args:
#   apps: character vector of package names.
#
# Returns:
#   A data.frame with character columns `package_name` and `genre`, one row
#   per element of `apps`. `genre` is "unknown" when the page cannot be
#   downloaded or parsed, when its title is "Not Found", or when it has no
#   genre element.
get_genre <- function(apps) {
  # paste0() recycles a zero-length argument to "", which would produce one
  # bogus URL for an empty request, so answer that case up front.
  if (length(apps) == 0) {
    return(data.frame(
      package_name = character(0),
      genre = character(0),
      stringsAsFactors = FALSE
    ))
  }

  urls <- paste0("https://play.google.com/store/apps/details?id=", apps)
  genres <- rep("unknown", length(apps))

  for (i in seq_along(urls)) {
    # A unique temporary file avoids littering the working directory and
    # avoids clashes between jobs that scrape the same package concurrently.
    destfile <- tempfile(fileext = ".html")

    genres[i] <- tryCatch(
      {
        download.file(urls[i], destfile, quiet = TRUE)
        page <- read_html(destfile)
        page_title <- html_text(html_nodes(page, "title"))
        genre <- html_text(html_nodes(page, xpath = '//*[@itemprop="genre"]'))

        if (any(page_title == "Not Found") || length(genre) == 0) {
          "unknown"
        } else {
          # removes white spaces or ampersands
          tolower(str_remove_all(genre[1], "[\\s&]+"))
        }
      },
      # A failed download or an unreadable page must not abort the whole run.
      error = function(e) "unknown"
    )

    unlink(destfile)
  }

  data.frame(package_name = apps, genre = genres, stringsAsFactors = FALSE)
}
|
||||
|
||||
# Annotate the foreground-application records in the input CSV with a genre.
# With CATALOGUE_SOURCE = FILE the genres are joined from the catalogue CSV
# (and, if requested, missing ones are scraped); with GOOGLE every genre is
# scraped through get_genre().
apps <- read.csv(snakemake@input[[1]], stringsAsFactors = FALSE)
genre_catalogue <- data.frame()
catalogue_source <- snakemake@params[["catalogue_source"]]
catalogue_file <- snakemake@params[["catalogue_file"]]
update_catalogue_file <- snakemake@params[["update_catalogue_file"]]
scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]]

if (catalogue_source == "GOOGLE") {
  # Nothing is known yet: every package is treated as missing its genre.
  apps_with_genre <- apps %>% mutate(genre = NA_character_)
} else if (catalogue_source == "FILE") {
  genre_catalogue <- read.csv(
    catalogue_file,
    colClasses = c("character", "character")
  )
  apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
} else {
  # Fail early with a clear message instead of a later "object not found".
  stop(
    "CATALOGUE_SOURCE must be either GOOGLE or FILE, got: ", catalogue_source,
    call. = FALSE
  )
}

if (catalogue_source == "GOOGLE" ||
  (catalogue_source == "FILE" && scrape_missing_genres)) {
  missing_genre_rows <- apps_with_genre %>%
    filter(is.na(genre)) %>%
    distinct(package_name)
  apps_without_genre <- missing_genre_rows$package_name

  # Scrape only when at least one package still lacks a genre.
  if (length(apps_without_genre) > 0) {
    updated_apps <- get_genre(apps_without_genre)

    # Keep the catalogue genre where present, otherwise use the scraped one.
    apps_with_genre <- left_join(
      apps_with_genre,
      updated_apps,
      by = "package_name"
    ) %>%
      mutate(genre = coalesce(genre.x, genre.y)) %>%
      select(-genre.x, -genre.y)

    if (update_catalogue_file) {
      # In GOOGLE mode genre_catalogue is still empty at this point, so the
      # file is replaced by the genres scraped during this run.
      genre_catalogue <- bind_rows(genre_catalogue, updated_apps) %>%
        distinct()
      write.csv(genre_catalogue, file = catalogue_file, row.names = FALSE)
    }
  }
}

write.csv(apps_with_genre, snakemake@output[[1]], row.names = FALSE)
|
Loading…
Reference in New Issue