Added dataset download rule

replace/61cae70687bed0beeb403707f3112f7319de52ef
JulioV 2019-10-24 12:11:24 -04:00
parent 41c233e4ed
commit 1b1799f9d8
7 changed files with 249 additions and 0 deletions

6
.gitignore vendored
View File

@ -87,3 +87,9 @@ target/
# Mypy cache
.mypy_cache/
.Rprofile
# Packrat: ignore the installed libraries and everything else under packrat/,
# but keep the files needed to recreate the environment on another machine.
# init.R is tracked because src/data/download_dataset.R sources it at startup.
packrat/lib*/
packrat/*
!packrat/packrat.lock
!packrat/packrat.opts
!packrat/init.R
.snakemake/

29
Snakefile 100644
View File

@ -0,0 +1,29 @@
# Workflow settings (SENSORS, PIDS) and the preprocessing rules.
configfile: "config.yaml"
include: "rules/preprocessing.snakefile"

# Default target: one raw CSV for every participant/sensor combination
# listed in the config file.
rule all:
    input:
        expand(
            "data/raw/{pid}/{sensor}.csv",
            pid=config["PIDS"],
            sensor=config["SENSORS"]
        )
# --- Packrat Rules --- #
## Taken from https://github.com/lachlandeer/snakemake-econ-r
## packrat_install: installs packrat onto machine
## (downloaded from CRAN over HTTPS so the package cannot be altered in transit)
rule packrat_install:
    shell:
        "R -e 'install.packages(\"packrat\", repos=\"https://cran.us.r-project.org\")'"
## packrat_init: initializes a packrat environment for this project
## by calling packrat::init() in a one-off R session
rule packrat_init:
shell:
"R -e 'packrat::init()'"
## packrat_snap: looks for new R packages used in the project files and
## archives them, by calling packrat::snapshot() in a one-off R session
rule packrat_snap:
shell:
"R -e 'packrat::snapshot()'"
## packrat_restore: installs the archived packages onto a new machine
## by calling packrat::restore() in a one-off R session
rule packrat_restore:
shell:
"R -e 'packrat::restore()'"

7
config.yaml 100644
View File

@ -0,0 +1,7 @@
# Valid database table names
SENSORS: [messages]
# Participants to include in the analysis.
# For every participant ID listed here you must create a file at
# data/external/<pid> (e.g. data/external/p01) whose first line is
# that participant's device_id
PIDS: [p01, p02]

View File

@ -0,0 +1,155 @@
PackratFormat: 1.4
PackratVersion: 0.5.0
RVersion: 3.6.1
Repos: CRAN=https://cran.rstudio.com/
Package: BH
Source: CRAN
Version: 1.69.0-1
Hash: 15f597ed227897f4f793b6161260f4b9
Package: DBI
Source: CRAN
Version: 1.0.0
Hash: 6abedd7919c4457604c0aa44529a6683
Package: R6
Source: CRAN
Version: 2.4.0
Hash: 948a547c484e5bea15eee76cc53cce3f
Package: RMySQL
Source: CRAN
Version: 0.10.17
Hash: 095e258676be1decbe4ee1bf3c164284
Requires: DBI
Package: Rcpp
Source: CRAN
Version: 1.0.2
Hash: d04e441a8d398e3d3a71f294c07fa94d
Package: assertthat
Source: CRAN
Version: 0.2.1
Hash: 622be49032fe50bd42e96aaef613e209
Package: backports
Source: CRAN
Version: 1.1.5
Hash: 35ad64fcf2063e2a52031b0f1a59d8f0
Package: cli
Source: CRAN
Version: 1.1.0
Hash: bc4e54014c2049f2605ad0c3ba0cce6b
Requires: assertthat, crayon
Package: crayon
Source: CRAN
Version: 1.3.4
Hash: ff2840dd9b0d563fc80377a5a45510cd
Package: digest
Source: CRAN
Version: 0.6.22
Hash: 824be063463b3709782ef29a3e8d7079
Package: dplyr
Source: CRAN
Version: 0.8.3
Hash: 201287c2b23cff8b2ef156ec8b1e57f2
Requires: BH, R6, Rcpp, assertthat, glue, magrittr, pkgconfig, plogr,
rlang, tibble, tidyselect
Package: ellipsis
Source: CRAN
Version: 0.3.0
Hash: 30b58109e4d7c6184a9c2e32f9ae38c6
Requires: rlang
Package: fansi
Source: CRAN
Version: 0.4.0
Hash: f147621f72b561485bfffcae78c4f5d5
Package: glue
Source: CRAN
Version: 1.3.1
Hash: 660bbbe3803c7cf7c9489a7d99a9c0ed
Package: magrittr
Source: CRAN
Version: 1.5
Hash: bdc4d48c3135e8f3b399536ddf160df4
Package: packrat
Source: CRAN
Version: 0.5.0
Hash: 498643e765d1442ba7b1160a1df3abf9
Package: pillar
Source: CRAN
Version: 1.4.2
Hash: 28ff1862b4e0c8761efca442e80a63d8
Requires: cli, crayon, fansi, rlang, utf8, vctrs
Package: pkgconfig
Source: CRAN
Version: 2.0.3
Hash: 5ff5f2361851a49534c96caa2a8071c7
Package: plogr
Source: CRAN
Version: 0.2.0
Hash: 81a8008a5e7858552503935f1abe48aa
Package: purrr
Source: CRAN
Version: 0.3.3
Hash: d4f497f8a97ef6c7182a87b2476748d1
Requires: magrittr, rlang
Package: rlang
Source: CRAN
Version: 0.4.0
Hash: eabda67321fe1d477ea641ddd5d84f00
Package: stringi
Source: CRAN
Version: 1.4.3
Hash: ed2a82fc7cc668c1345223d938cdfaf2
Package: stringr
Source: CRAN
Version: 1.4.0
Hash: 67da32dbb2a7a16f2ef124336358e54a
Requires: glue, magrittr, stringi
Package: tibble
Source: CRAN
Version: 2.1.3
Hash: f59680d81ddc45fa3fcb8c07686d1d89
Requires: cli, crayon, fansi, pillar, pkgconfig, rlang
Package: tidyselect
Source: CRAN
Version: 0.2.5
Hash: 9ab4ed03f4b7bbdbd1db9d7a920aae1a
Requires: Rcpp, glue, purrr, rlang
Package: utf8
Source: CRAN
Version: 1.1.4
Hash: f3f97ce59092abc8ed3fd098a59e236c
Package: vctrs
Source: CRAN
Version: 0.2.0
Hash: daf77cb3dbcacd7fb619cb3748dc215f
Requires: backports, digest, ellipsis, glue, rlang, zeallot
Package: zeallot
Source: CRAN
Version: 0.1.0
Hash: 10b2ed48e202b4db421ae864041dc4b2

View File

@ -0,0 +1,19 @@
auto.snapshot: FALSE
use.cache: FALSE
print.banner.on.startup: auto
vcs.ignore.lib: TRUE
vcs.ignore.src: FALSE
external.packages:
local.repos:
load.external.packages.on.startup: TRUE
ignored.packages:
ignored.directories:
data
inst
quiet.package.installation: TRUE
snapshot.recommended.packages: FALSE
snapshot.fields:
Imports
Depends
LinkingTo
symlink.system.packages: TRUE

View File

@ -0,0 +1,9 @@
# Download one sensor table for one participant.
# The script receives data/external/{pid} as its first input, the CSV path
# as its first output, and "AAPECS" as its first param (which it passes to
# dbConnect() as the `group` argument).
rule download_dataset:
    input:
        "data/external/{pid}"
    output:
        "data/raw/{pid}/{sensor}.csv"
    params:
        group="AAPECS"
    script:
        "../src/data/download_dataset.R"

View File

@ -0,0 +1,24 @@
# Download one sensor table for one participant from the MySQL database and
# write it to a CSV file. Meant to be run through Snakemake's `script:`
# directive, which provides the `snakemake` object read at the bottom of
# this file.

# Activate the project's packrat library when running under Snakemake.
if (exists("snakemake")) {
  source("packrat/init.R")
}

library(RMySQL)
library(stringr)
library(dplyr)

# Query every row of one sensor table that belongs to the participant's
# device, sort the rows by timestamp, drop duplicated rows and save the
# result as CSV.
#
# participant_file: path to a file whose first line is the device_id.
# group:            `group` argument handed to dbConnect().
# sensor_file:      CSV path to write; its base name without the extension
#                   is the database table that gets queried.
# settings_file:    `default.file` argument handed to dbConnect().
#
# Returns the written data frame invisibly. Stops when the participant file
# does not provide a device_id.
download_sensor_data <- function(participant_file, group, sensor_file,
                                 settings_file = "./.env") {
  # trimws() removes stray spaces or a trailing carriage return that would
  # otherwise end up inside the SQL pattern.
  device_id <- trimws(readLines(participant_file, n = 1L, warn = FALSE))
  if (length(device_id) != 1L || !nzchar(device_id)) {
    stop("No device_id found in ", participant_file, call. = FALSE)
  }
  sensor <- tools::file_path_sans_ext(basename(sensor_file))

  connection <- dbConnect(
    MySQL(),
    default.file = settings_file,
    group = group
  )
  # Release the connection even when the query, the sort or the write fails.
  on.exit(dbDisconnect(connection), add = TRUE)

  # Neither value is pasted into the statement raw: the table name is
  # backtick-quoted as a MySQL identifier and the device_id is escaped by
  # the driver before being placed inside the string literal.
  table_name <- paste0("`", gsub("`", "``", sensor, fixed = TRUE), "`")
  query <- paste0(
    "SELECT * FROM ", table_name,
    " WHERE device_id LIKE '", dbEscapeStrings(connection, device_id), "'"
  )

  sensor_data <- dbGetQuery(connection, query)
  # drop = FALSE keeps a data frame even if the table has a single column.
  sensor_data <- sensor_data[order(sensor_data$timestamp), , drop = FALSE]

  # Dropping duplicates on all columns except for _id. Because distinct() is
  # given an explicit column list, the _id column itself is not part of the
  # output.
  sensor_data <- sensor_data %>%
    distinct(!!!syms(setdiff(names(sensor_data), "_id")))

  # NOTE(review): quote = FALSE means a value containing a comma or a line
  # break would corrupt the CSV; left unchanged so the file format consumed
  # by later steps stays the same -- confirm no sensor stores free text.
  write.table(
    sensor_data,
    sensor_file,
    row.names = FALSE,
    quote = FALSE,
    sep = ","
  )
  invisible(sensor_data)
}

download_sensor_data(
  participant_file = snakemake@input[[1]],
  group = snakemake@params[[1]],
  sensor_file = snakemake@output[[1]]
)