diff --git a/rules/preprocessing.snakefile b/rules/preprocessing.snakefile
index 7f31991f..3cbb2b55 100644
--- a/rules/preprocessing.snakefile
+++ b/rules/preprocessing.snakefile
@@ -11,7 +11,8 @@ rule download_dataset:
         "data/external/{pid}"
     params:
         group = config["DOWNLOAD_DATASET"]["GROUP"],
-        table = "{sensor}"
+        table = "{sensor}",
+        timezone = config["TIMEZONE"]
     output:
         "data/raw/{pid}/{sensor}_raw.csv"
     script:
diff --git a/src/data/download_dataset.R b/src/data/download_dataset.R
index ff73fbae..3b944e61 100644
--- a/src/data/download_dataset.R
+++ b/src/data/download_dataset.R
@@ -7,14 +7,32 @@ library(dplyr)
 participant <- snakemake@input[[1]]
 group <- snakemake@params[["group"]]
 table <- snakemake@params[["table"]]
+timezone <- snakemake@params[["timezone"]]
 sensor_file <- snakemake@output[[1]]
 
 device_ids <- readLines(participant, n=1)
 unified_device_id <- tail(strsplit(device_ids, ",")[[1]], 1)
+
+# Line 4 of the participant file is read as "start_date,end_date" (YYYY/MM/DD);
+# both dates are interpreted as calendar days in `timezone`.
+date_range <- strsplit(readLines(participant, n = 4)[4], ",")[[1]]
+# Convert the day boundaries to epoch milliseconds in R rather than with
+# UNIX_TIMESTAMP() in SQL: MySQL parses that argument in the session time
+# zone, so passing a UTC string is only correct when the session runs in UTC.
+start_ms <- 1000 * as.numeric(as.POSIXct(paste0(date_range[1], " 00:00:00"), format = "%Y/%m/%d %H:%M:%S", tz = timezone))
+# + 999 keeps the last sub-second milliseconds of the end day inside the range.
+end_ms <- 1000 * as.numeric(as.POSIXct(paste0(date_range[2], " 23:59:59"), format = "%Y/%m/%d %H:%M:%S", tz = timezone)) + 999
+
 rmysql.settingsfile <- "./.env"
 
 stopDB <- dbConnect(MySQL(), default.file = rmysql.settingsfile, group = group)
 query <- paste0("SELECT * FROM ", table, " WHERE device_id IN ('", gsub(",", "','", device_ids), "')")
+# Filter by time only when both dates parsed and start precedes end;
+# otherwise every row for the participant's devices is downloaded.
+if (!is.na(start_ms) && !is.na(end_ms) && start_ms < end_ms) {
+    # sprintf("%.0f") prevents scientific notation such as "1.6e+12" in the SQL.
+    query <- paste0(query, " AND timestamp BETWEEN ", sprintf("%.0f", start_ms), " AND ", sprintf("%.0f", end_ms))
+}
 sensor_data <- dbGetQuery(stopDB, query)
 sensor_data <- sensor_data %>%
     arrange(timestamp) %>%
@@ -23,4 +41,4 @@ sensor_data <- sensor_data %>%
 # Droping duplicates on all columns except for _id
 sensor_data <- sensor_data %>% distinct(!!!syms(setdiff(names(sensor_data), "_id")))
 write.csv(sensor_data, sensor_file, row.names = FALSE)
-dbDisconnect(stopDB)
\ No newline at end of file
+dbDisconnect(stopDB)