Cleanup data/

2021-03-14 13:36:22 -04:00 · 2021-03-14 13:36:22 -04:00 · f4b2bd1fb2
parent 42cee67664
commit f4b2bd1fb2
5 changed files with 0 additions and 286 deletions
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@ -1,15 +1,3 @@
-# This rule will disappear
-# It runs restore_sql_file.py with the example SQL dump and the database
-# credentials file as inputs.
-rule restore_sql_file:
-    input:
-        sql_file = "data/external/rapids_example.sql",
-        db_credentials = ".env"
-    params:
-        # Credentials group handed to the script; hard-coded while the config
-        # lookup stays commented out
-        group = "No_GROUP" #config["DATABASE_GROUP"]
-    output:
-        # Marker file produced through touch() once the script has run
-        touch("data/interim/restore_sql_file.done")
-    script:
-        "../src/data/restore_sql_file.py"
-
 rule create_example_participant_files:
    output:
        expand("data/external/participant_files/{pid}.yaml", pid = ["example01", "example02"])
--- a/src/data/init.py
+++ b/src/data/init.py
--- a/src/data/download_fitbit_data.R
+++ b/src/data/download_fitbit_data.R
@ -1,46 +0,0 @@
-source("renv/activate.R")
-library(RMariaDB)
-library("dplyr", warn.conflicts = F)
-library(readr)
-library(stringr)
-library(yaml)
-
-
-# Pull the raw rows of one Fitbit sensor for a single participant, either from
-# a MariaDB table or from a CSV file, relabel them with one device id, drop
-# duplicates and write the result as CSV. Inputs, params and the output path
-# are read from the `snakemake` object.
-participant_file <- snakemake@input[["participant_file"]]
-input_file <- snakemake@input[["input_file"]]
-data_configuration <- snakemake@params[["data_configuration"]]
-source <- data_configuration$SOURCE
-sensor <- snakemake@params[["sensor"]]
-table <- snakemake@params[["table"]]
-sensor_file <- snakemake@output[[1]]
-
-participant <- read_yaml(participant_file)
-if(! "FITBIT" %in% names(participant)){
-  stop(paste("The following participant file does not have a FITBIT section, create one manually or automatically (see the docs):", participant_file))
-}
-device_ids <- participant$FITBIT$DEVICE_IDS
-# Every row is relabelled with the last device id listed for the participant
-unified_device_id <- tail(device_ids, 1)
-# As opposed to phone data, we don't filter by date here because data can still be in JSON format, we need to parse it first
-
-if(source$TYPE == "DATABASE"){
-  dbEngine <- dbConnect(MariaDB(), default.file = "./.env", group = source$DATABASE_GROUP)
-  # NOTE(review): the statement is assembled by string concatenation from
-  # configuration values (table, column, device ids); keep those trusted or
-  # switch to quoted identifiers/parameters
-  query <- paste0("SELECT * FROM ", table, " WHERE ",source$DEVICE_ID_COLUMN," IN ('", paste0(device_ids, collapse = "','"), "')")
-  # Release the connection even when the query fails
-  sensor_data <- tryCatch(dbGetQuery(dbEngine, query), finally = dbDisconnect(dbEngine))
-} else if(source$TYPE == "FILES"){
-  sensor_data <- read_csv_chunked(input_file, callback = DataFrameCallback$new(function(x, pos) subset(x,x[[source$DEVICE_ID_COLUMN]] %in% device_ids)), progress = TRUE, chunk_size = 50000)
-  if(is.null(sensor_data)) # empty file
-    sensor_data <- read.csv(input_file)
-} else {
-  # Fail here instead of with "object 'sensor_data' not found" further down
-  stop(paste("Unsupported SOURCE TYPE, expected DATABASE or FILES:", source$TYPE))
-}
-
-sensor_data <- sensor_data %>%
-  rename(device_id = source$DEVICE_ID_COLUMN) %>%
-  mutate(device_id = unified_device_id) # Unify device_id
-
-# For MoSHI use, we didn't split fitbit sensors into different tables.
-# isTRUE() keeps the condition a single TRUE/FALSE when the flag is missing
-# from the HIDDEN section.
-single_fitbit_table <- "HIDDEN" %in% names(data_configuration) &&
-  isTRUE(data_configuration$HIDDEN$SINGLE_FITBIT_TABLE == TRUE)
-if(single_fitbit_table){
-  sensor_data <- sensor_data %>% filter(fitbit_data_type == str_split(sensor, "_", simplify = TRUE)[[2]])
-}
-
-# Dropping duplicates on all columns except for _id or id
-sensor_data <- sensor_data %>% distinct(!!!syms(setdiff(names(sensor_data), c("_id", "id"))))
-
-write_csv(sensor_data, sensor_file)
--- a/src/data/restore_sql_file.py
+++ b/src/data/restore_sql_file.py
@ -1,28 +0,0 @@
-import pandas as pd
-import configparser
-import subprocess
-import os
-
-# Create the configured database and load the example SQL dump into it,
-# refusing to touch a database that already exists.
-# read database credentials
-group = snakemake.params["group"]
-config = configparser.ConfigParser()
-config.read(snakemake.input["db_credentials"])
-credentials = config[group]
-database = credentials["database"]
-
-# mysql invocations as argument lists: nothing goes through a shell, so
-# credentials containing spaces or shell metacharacters are passed verbatim
-# NOTE(review): the password still travels on the command line (-p<password>)
-# and is visible in the process list; consider an option file instead
-mysql_cmd = ["mysql", "-h", credentials["host"], "-u", credentials["user"], "-p" + credentials["password"]]
-checkdb_cmd = mysql_cmd + ["-e", "use " + database]
-create_cmd = mysql_cmd + ["-e", "CREATE DATABASE IF NOT EXISTS " + database + ";"]
-restore_cmd = mysql_cmd + [database]
-
-try:
-    print("Checking if " + database + " database exists")
-    subprocess.run(checkdb_cmd, check = True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
-except subprocess.CalledProcessError:
-    print(database + " database does not exist")
-    print("Creating " + database + " database")
-    # check=True: a failed create/restore raises instead of printing success
-    subprocess.run(create_cmd, check = True)
-    print(database + " database created")
-    print("Restoring rapids_example.sql")
-    # The dump is fed on stdin, replacing the shell "<" redirection
-    with open("data/external/rapids_example.sql", "rb") as sql_dump:
-        subprocess.run(restore_cmd, stdin = sql_dump, check = True)
-    print("rapids_example.sql restored in " + database + " database")
-else:
-    raise ValueError(database + " DB already exists")
--- a/src/data/unify_utils.R
+++ b/src/data/unify_utils.R
@ -1,200 +0,0 @@
-library("dplyr", warn.conflicts = F)
-library(stringr)
-
-unify_ios_screen <- function(ios_screen){
-    # Android processes UNLOCK (3) to OFF (0) episodes while iOS logs UNLOCK (3) to LOCKED (2),
-    # so LOCKED is recoded as OFF and Android's code can be reused for iOS
-    # a row is kept only when it is one half of an adjacent 3,2 pair
-    paired_events <- filter(ios_screen,
-        (screen_status == 2 & lag(screen_status) == 3) | (screen_status == 3 & lead(screen_status) == 2))
-    # recode LOCKED (2) as OFF (0)
-    mutate(paired_events, screen_status = replace(screen_status, screen_status == 2, 0))
-}
-
-unify_ios_battery <- function(ios_battery){
-    # Only iOS client V1 data needs unifying; V1 never logs battery_status 4,
-    # so any row with that value means the data can be returned untouched
-    status_4_rows <- filter(ios_battery, battery_status == 4)
-    if(nrow(status_4_rows) > 0)
-        return(ios_battery)
-    # Recode 3 as 5 first and only then 1 as 3, so the new 3s are not recoded again
-    ios_battery %>%
-        mutate(battery_status = replace(battery_status, battery_status == 3, 5)) %>%
-        mutate(battery_status = replace(battery_status, battery_status == 1, 3))
-}
-
-# Collapse iOS call status events into one row per `trace` and translate the
-# resulting status sequence into Android-style call_type values.
-# Reads the columns trace, timestamp, call_type and call_duration; the date/time
-# columns listed in the first summarise() are carried over when present.
-# Returns the summarised rows ordered by timestamp.
-unify_ios_calls <- function(ios_calls){
-    # Android’s call types 1=incoming, 2=outgoing, 3=missed
-    # iOS' call status 1=incoming, 2=connected, 3=dialing, 4=disconnected
-    # iOS' call types based on call status: (1,2,4)=incoming=1, (3,2,4)=outgoing=2, (1,4) or (3,4)=missed=3
-    # Sometimes (due to a possible bug in Aware) sequences get logged on the exact same timestamp, thus 3-item sequences can be 2,3,4 or 3,2,4
-    # Even tho iOS stores the duration of ringing/dialing for missed calls, we set it to 0 to match Android
-
-    # NOTE(review): completed_call is computed below, but the data stays grouped by
-    # trace only, so the summarise() calls collapse per trace and never use it.
-    # Confirm whether grouping by completed_call was intended.
-    ios_calls <- ios_calls %>%
-        arrange(trace, timestamp, call_type) %>% 
-        group_by(trace) %>%
-                # search for the disconnect event, as it is common to outgoing, received and missed calls
-        mutate(completed_call = ifelse(call_type == 4, 2, 0), 
-                # assign the same ID to all events before a 4
-                completed_call = cumsum(c(1, head(completed_call, -1) != tail(completed_call, -1))), 
-                # hack to match ID of last event (4) to that of the previous rows
-                completed_call = ifelse(call_type == 4, completed_call - 1, completed_call))
-
-        # We check utc_date_time and local_date_time exist because sometimes we call this function from
-        # download_dataset to unify multi-platform participants. At that point such time columns are missing
-        if("utc_date_time" %in% colnames(ios_calls) && "local_date_time" %in% colnames(ios_calls)){
-            ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), # collapse all events before a 4
-                        # sanity check, timestamp_diff should be equal or close to duration sum
-                        # timestamp_diff = trunc((last(timestamp) - first(timestamp)) / 1000) 
-                        # use call_duration = last(call_duration) if you want duration from pick up to hang up
-                        # use call_duration = sum(call_duration) if you want duration from dialing/ringing to hang up
-                        call_duration = last(call_duration), 
-                        timestamp = first(timestamp),
-                        utc_date_time = first(utc_date_time),
-                        local_date_time = first(local_date_time),
-                        local_date = first(local_date),
-                        local_time = first(local_time),
-                        local_hour = first(local_hour),
-                        local_minute = first(local_minute),
-                        local_timezone = first(local_timezone),
-                        assigned_segments = first(assigned_segments))
-        }
-        else {
-            # NOTE(review): this branch sums call_duration while the branch above takes
-            # the last value; confirm the difference is intentional
-            ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), call_duration = sum(call_duration),  timestamp = first(timestamp))
-        }
-        # Map each status sequence to a call type; anything unrecognised becomes -1 and is filtered out below
-        ios_calls <- ios_calls %>% mutate(call_type = case_when(
-            call_type_sequence == "1,2,4" | call_type_sequence == "2,1,4" ~ 1, # incoming
-            call_type_sequence == "1,4" ~ 3, # missed
-            call_type_sequence == "3,2,4" | call_type_sequence == "2,3,4" ~ 2, # outgoing
-            call_type_sequence == "3,4" ~ 4, # outgoing missed, we create this temp missed state to assign a duration of 0 below
-            TRUE ~ -1), # other, call sequences without a disconnect (4) event are discarded
-            # assign a duration of 0 to incoming and outgoing missed calls
-            call_duration = ifelse(call_type == 3 | call_type == 4, 0, call_duration), 
-            # get rid of the temp missed call type, set to 2 to match Android. See https://github.com/carissalow/rapids/issues/79
-            call_type = ifelse(call_type == 4, 2, call_type) 
-        ) %>% 
-        # discard sequences without an event 4 (disconnect)
-        filter(call_type > 0) %>%
-        ungroup() %>%
-        arrange(timestamp)
-
-    return(ios_calls)
-}
-
-# Strip quotes and square brackets from the `activities` column and reduce the
-# one known multi-activity value ("stationary,automotive") to "automotive".
-# Stops with an error when any other comma-separated combination is present.
-# Returns ios_gar with the cleaned `activities` column.
-clean_ios_activity_column <- function(ios_gar){
-    ios_gar <- ios_gar %>%
-        mutate(activities = str_replace_all(activities, pattern = '("|\\[|\\])', replacement = ""))
-
-    # Distinct values that still hold more than one activity
-    existent_multiple_activities <- ios_gar %>%
-        filter(str_detect(activities, ",")) %>%
-        distinct(activities) %>%
-        pull(activities)
-
-    known_multiple_activities <- c("stationary,automotive")
-    unknown_multiple_activities <- setdiff(existent_multiple_activities, known_multiple_activities)
-    if(length(unknown_multiple_activities) > 0){
-        # Collapse first: stop() would otherwise glue one full message per combination together
-        stop(paste0("There are unkwown combinations of ios activities, you need to implement the decision of the ones to keep: ", paste0(unknown_multiple_activities, collapse = "; ")))
-    }
-
-    ios_gar %>%
-        mutate(activities = str_replace_all(activities, pattern = "stationary,automotive", replacement = "automotive"))
-}
-
-# Make iOS activity recognition rows compatible with the Android version:
-# drops rows with a blank `activities` value, cleans the column with
-# clean_ios_activity_column() and derives `activity_name` and `activity_type`.
-# Returns ios_gar with those two columns added.
-unify_ios_activity_recognition <- function(ios_gar){
-    # We only need to unify Google Activity Recognition data for iOS
-    # discard rows where activities column is blank
-    # A logical mask is used because a negative which() index is empty when no
-    # row is blank, and an empty index selects zero rows (all data was dropped)
-    is_blank <- ios_gar$activities %in% ""
-    ios_gar <- ios_gar[!is_blank, , drop = FALSE]
-    # clean "activities" column of ios_gar
-    ios_gar <- clean_ios_activity_column(ios_gar)
-
-    # make it compatible with android version: generate "activity_name" and "activity_type" columns
-    # NOTE(review): "unknown" gets activity_type 4 but no activity_name (it stays NA); confirm this is intended
-    ios_gar %>%
-        mutate(activity_name = case_when(activities == "automotive" ~ "in_vehicle",
-                                         activities == "cycling" ~ "on_bicycle",
-                                         activities == "walking" ~ "walking",
-                                         activities == "running" ~ "running",
-                                         activities == "stationary" ~ "still"),
-               activity_type = case_when(activities == "automotive" ~ 0,
-                                         activities == "cycling" ~ 1,
-                                         activities == "walking" ~ 7,
-                                         activities == "running" ~ 8,
-                                         activities == "stationary" ~ 3,
-                                         activities == "unknown" ~ 4))
-}
-
-# Convert conversation start/end timestamps from seconds to milliseconds.
-# If any start or end value is at most 9999999999 the data is taken to be in
-# seconds and BOTH columns are multiplied by 1000 for every row.
-# Returns the (possibly converted) conversation data; empty input is returned as is.
-unify_ios_conversation <- function(conversation){
-    if(nrow(conversation) > 0){
-        duration_check <- conversation %>%
-            select(double_convo_start, double_convo_end) %>%
-            mutate(start_is_seconds = double_convo_start <= 9999999999,
-                end_is_seconds = double_convo_end <= 9999999999) # Values smaller than 9999999999 are in seconds instead of milliseconds
-        # na.rm = TRUE: a missing timestamp must not turn the condition below into NA
-        start_end_in_seconds <- sum(duration_check$start_is_seconds, na.rm = TRUE) +
-            sum(duration_check$end_is_seconds, na.rm = TRUE)
-
-        if(start_end_in_seconds > 0) # convert seconds to milliseconds
-            conversation <- conversation %>% mutate(double_convo_start = double_convo_start * 1000, double_convo_end = double_convo_end * 1000)
-    }
-    return(conversation)
-}
-
-# This function is used in download_dataset.R
-# Query one sensor's rows for every device of a participant, unify each
-# device's rows with unify_data() according to its platform, and bind them.
-#   dbEngine: open database connection used for all queries
-#   sensor_table: one table name, or the android/ios pair for activity recognition or conversation
-#   sensor: sensor name forwarded to unify_data()
-#   timestamp_filter: SQL fragment appended after the device_id condition
-#   aware_multiplatform_tables: positions 1:2 = android/ios activity recognition tables,
-#                               positions 3:4 = android/ios conversation tables
-#   device_ids, platforms: devices to fetch and their platforms, or platforms = "multiple"
-# Returns the row-bound data of all devices whose table exists in the database.
-unify_raw_data <- function(dbEngine, sensor_table, sensor, timestamp_filter, aware_multiplatform_tables, device_ids, platforms){
-  # If platforms is 'multiple', fetch each device_id's platform from aware_device, otherwise, use those given by the user
-  if(length(platforms) == 1 && platforms == "multiple"){
-    devices_platforms <- dbGetQuery(dbEngine, paste0("SELECT device_id,brand FROM aware_device WHERE device_id IN ('", paste0(device_ids, collapse = "','"), "')")) %>%
-      mutate(platform = ifelse(brand == "iPhone", "ios", "android"))
-  } else {
-    devices_platforms <- data.frame(device_id = device_ids, platform = platforms)
-  }
-
-  # Get existent tables in database
-  db_name <- dbGetInfo(dbEngine)$dbname
-  available_tables_in_db <- dbGetQuery(dbEngine, paste0("SELECT table_name FROM information_schema.tables WHERE table_schema='", db_name,"'"))[[1]]
-  if(!any(sensor_table %in% available_tables_in_db))
-    stop(paste0("You requested data from these table(s) ", paste0(sensor_table, collapse=", "), " but they don't exist in your database ", db_name))
-  # Parse the table names for activity recognition and conversation plugins because they are different between android and ios
-  ar_tables <- setNames(aware_multiplatform_tables[1:2], c("android", "ios"))
-  conversation_tables <- setNames(aware_multiplatform_tables[3:4], c("android", "ios"))
-
-  participants_sensordata <- list()
-  # seq_len() so that zero devices means zero iterations (1:0 would run twice)
-  for(i in seq_len(nrow(devices_platforms))) {
-    device_id <- devices_platforms$device_id[[i]]
-    # as.character() so a factor column is never used as a positional index below
-    platform <- as.character(devices_platforms$platform[[i]])
-
-    # Handle special cases when tables for the same sensor have different names for Android and iOS (AR and conversation)
-    if(length(sensor_table) == 1)
-      device_table <- sensor_table
-    else if(all(sensor_table == ar_tables))
-      device_table <- ar_tables[[platform]]
-    else if(all(sensor_table == conversation_tables))
-      device_table <- conversation_tables[[platform]]
-    else
-      stop(paste0("Cannot choose a table for platform ", platform, " from: ", paste0(sensor_table, collapse = ", ")))
-
-    if(device_table %in% available_tables_in_db){
-      query <- paste0("SELECT * FROM ", device_table, " WHERE device_id IN ('", device_id, "')", timestamp_filter)
-      sensor_data <- unify_data(dbGetQuery(dbEngine, query), sensor, platform)
-      participants_sensordata <- append(participants_sensordata, list(sensor_data))
-    }else{
-      warning(paste0("Missing ", device_table, " table. We unified the data from ", paste0(devices_platforms$device_id, collapse = " and "), " but without records from this missing table for ", device_id))
-    }
-  }
-  unified_data <- bind_rows(participants_sensordata)
-  return(unified_data)
-
-}
-
-# This function is used in unify_ios_android.R and unify_raw_data function
-# Apply the iOS-specific unification that matches `sensor` when `platform` is
-# "ios"; any other sensor/platform combination returns sensor_data unchanged.
-unify_data <- function(sensor_data, sensor, platform){
-    # sensor and platform are single values, so the scalar && is used in the conditions
-    if(sensor == "phone_calls" && platform == "ios"){
-        sensor_data <- unify_ios_calls(sensor_data)
-    } else if(sensor == "phone_battery" && platform == "ios"){
-        sensor_data <- unify_ios_battery(sensor_data)
-    } else if(sensor == "phone_activity_recognition" && platform == "ios"){
-        sensor_data <- unify_ios_activity_recognition(sensor_data)
-    } else if(sensor == "phone_screen" && platform == "ios"){
-        sensor_data <- unify_ios_screen(sensor_data)
-    } else if(sensor == "phone_conversation" && platform == "ios"){
-        sensor_data <- unify_ios_conversation(sensor_data)
-    }
-    return(sensor_data)
-}