Merge branch 'master' of https://github.com/carissalow/rapids
commit
f292ba3bff
|
@ -1,6 +1,7 @@
|
||||||
source("packrat/init.R")
|
source("packrat/init.R")
|
||||||
|
|
||||||
library(dplyr)
|
library(dplyr)
|
||||||
|
library(tidyr)
|
||||||
|
|
||||||
filter_by_day_segment <- function(data, day_segment) {
|
filter_by_day_segment <- function(data, day_segment) {
|
||||||
if(day_segment %in% c("morning", "afternoon", "evening", "night"))
|
if(day_segment %in% c("morning", "afternoon", "evening", "night"))
|
||||||
|
@ -38,4 +39,6 @@ for(requested_feature in requested_features){
|
||||||
features <- merge(features, feature, by="local_date", all = TRUE)
|
features <- merge(features, feature, by="local_date", all = TRUE)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0)))
|
||||||
|
|
||||||
write.csv(features, snakemake@output[[1]], row.names = FALSE)
|
write.csv(features, snakemake@output[[1]], row.names = FALSE)
|
|
@ -0,0 +1,20 @@
|
||||||
|
configfile: "config.yaml"
|
||||||
|
include: "../rules/packrat.snakefile"
|
||||||
|
include: "../rules/preprocessing.snakefile"
|
||||||
|
include: "../rules/features.snakefile"
|
||||||
|
include: "../rules/models.snakefile"
|
||||||
|
include: "../rules/reports.snakefile"
|
||||||
|
include: "../rules/mystudy.snakefile" # You can add snakefiles with rules tailored to your project
|
||||||
|
|
||||||
|
rule all:
|
||||||
|
input:
|
||||||
|
expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
|
||||||
|
expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
|
||||||
|
expand("data/processed/{pid}/sms_{sms_type}_{day_segment}.csv",
|
||||||
|
pid=config["PIDS"],
|
||||||
|
sms_type = config["SMS"]["TYPES"],
|
||||||
|
day_segment = config["SMS"]["DAY_SEGMENTS"]),
|
||||||
|
|
||||||
|
rule clean:
|
||||||
|
shell:
|
||||||
|
"rm -rf data/raw/* && rm -rf data/interim/* && rm -rf data/processed/* && rm -rf reports/figures/* && rm -rf reports/*.zip && rm -rf reports/compliance/*"
|
|
@ -0,0 +1,2 @@
|
||||||
|
"local_date","sms_received_daily_countmostfrequentcontact","sms_received_daily_count","sms_received_daily_distinctcontacts","sms_received_daily_timefirstsms","sms_received_daily_timelastsms"
|
||||||
|
"2017-03-27",1,1,1,16,16
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
"local_date","sms_sent_daily_countmostfrequentcontact","sms_sent_daily_count","sms_sent_daily_distinctcontacts","sms_sent_daily_timefirstsms","sms_sent_daily_timelastsms"
|
||||||
|
"2017-05-14",1,1,1,18.10,18.45
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
"timestamp","device_id","message_type","trace"
|
||||||
|
1490644832746,"00cf6eff-7d67-4b7d-be0c-9e7596c9640f",1,"8c2037e569c3e5c9574601071b7c8268b3158ff8"
|
||||||
|
1494800874206,"00cf6eff-7d67-4b7d-be0c-9e7596c9640f",2,"e3cbb6efdccf0d9e3c961b019ee0bda4f299d22f"
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
import unittest
|
||||||
|
import hashlib
|
||||||
|
import pandas as pd
|
||||||
|
import utils
|
||||||
|
import yaml
|
||||||
|
import os
|
||||||
|
|
||||||
|
class TestSensorFeatures(unittest.TestCase):
    """Compare the pipeline's sensor feature files against expected results.

    The (actual, expected) file pairs come from
    ``utils.generate_sensor_file_lists`` and are driven by the configuration
    in ``tests/settings/testing_config.yaml``.
    """

    # Hack to run the tests in positional order (not 100% foolproof): this
    # comparator sorts method names in descending order, so
    # test_sensors_files_exist runs before test_sensors_features_calculations.
    unittest.TestLoader.sortTestMethodsUsing = lambda self, a, b: (a < b) - (a > b)

    @classmethod
    def setUpClass(cls):
        """Load the testing configuration once for the whole test case."""
        # The module-level ``configs`` name is still assigned so that any code
        # relying on it keeps working; the tests themselves use cls.configs.
        global configs
        with open(r'tests/settings/testing_config.yaml') as file:
            configs = yaml.full_load(file)
        cls.configs = configs

    def test_sensors_files_exist(self):
        """Every actual output file listed for every sensor must exist."""
        file_lists = utils.generate_sensor_file_lists(self.configs)
        for sensor, file_pairs in file_lists.items():
            for out_file, _ in file_pairs:
                # subTest reports every missing file instead of stopping at
                # the first one.
                with self.subTest(sensor=sensor, file=out_file):
                    self.assertTrue(
                        os.path.exists(out_file),
                        "Missing output file: {}".format(out_file),
                    )

    def test_sensors_features_calculations(self):
        """Each actual feature file must equal its expected counterpart."""
        calc_files = utils.generate_sensor_file_lists(self.configs)
        for sensor, file_pairs in calc_files.items():
            for act_result, exp_result in file_pairs:
                with self.subTest(sensor=sensor, file=act_result):
                    df_act = pd.read_csv(act_result)
                    df_exp = pd.read_csv(exp_result)
                    # ``obj`` is the label pandas puts in its failure message,
                    # so pass the file path rather than a whole DataFrame.
                    pd.testing.assert_frame_equal(df_exp, df_act, obj=act_result)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
unittest.main()
|
|
@ -0,0 +1,124 @@
|
||||||
|
from snakemake.io import expand
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import shutil
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
def setUp():
    """Set up the fake data needed to test the rules and script files.

    Intended to be run once before all tests are run. For every participant
    id in the testing config this removes any existing ``data/raw/<pid>`` and
    ``data/processed/<pid>`` directories, recreates ``data/raw/<pid>`` and
    copies every entry of ``tests/data/raw/<pid>`` into it.

    Returns:
        The configuration loaded from ``tests/settings/testing_config.yaml``.
    """
    # Load the configuration file to get basic parameters
    with open(r'tests/settings/testing_config.yaml') as file:
        configs = yaml.full_load(file)

    # Reset the test data files of every participant
    for pid in configs['PIDS']:
        despath = os.path.join('data/raw/', pid)
        propath = os.path.join('data/processed/', pid)

        # Remove old raw and processed files if they exist
        for stale_dir in (despath, propath):
            if os.path.isdir(stale_dir):
                shutil.rmtree(stale_dir)

        # Create a fresh PID data directory for this round of tests.
        # makedirs also creates data/raw/ itself when it is missing (e.g. on
        # a fresh checkout), where a plain mkdir would fail.
        os.makedirs(despath)

        # Copy the necessary data files
        srcpath = os.path.join('tests/data/raw', pid)
        for srcfile in os.listdir(srcpath):
            shutil.copy(os.path.join(srcpath, srcfile),
                        os.path.join(despath, srcfile))

    return configs
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def generate_file_list(configs, sensor):
    """Generate the files that would be produced for one sensor.

    Args:
        configs: Loaded configuration; must contain ``PIDS`` and a section
            named after the upper-cased ``sensor``.
        sensor: Sensor name used in the file names (e.g. ``"sms"``).

    Returns:
        An iterator of ``(actual_path, expected_path)`` pairs. It is empty
        when the sensor section lacks ``DAY_SEGMENTS`` or ``FEATURES``.
    """
    # File path templates for both actual and expected metric values
    act_str = "data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
    exp_str = "tests/data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"

    sensor_config = configs[sensor.upper()]

    # Both keys are required. The previous check,
    # `'DAY_SEGMENTS' and 'FEATURES' in ...`, only tested 'FEATURES'.
    if 'DAY_SEGMENTS' not in sensor_config or 'FEATURES' not in sensor_config:
        return zip([], [])

    sensor_type = []
    if 'TYPES' in sensor_config:
        sensor_type = [each + '_' for each in sensor_config['TYPES']]
    if not sensor_type:
        # Sensors without types still produce "<sensor>_<day_segment>.csv";
        # an empty list would make expand() generate no paths at all.
        sensor_type = ['']

    wildcards = dict(pid=configs["PIDS"],
                     sensor=sensor,
                     sensor_type=sensor_type,
                     day_segment=sensor_config["DAY_SEGMENTS"])

    act_file_list = expand(act_str, **wildcards)
    exp_file_list = expand(exp_str, **wildcards)

    return zip(act_file_list, exp_file_list)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_sensor_file_lists(configs):
    """Build the actual/expected feature file paths for every sensor.

    Goes through ``configs['SENSORS']`` and selects the sensors whose config
    section has ``DAY_SEGMENTS`` (and optionally ``TYPES``). Sensors without
    a config section, or without ``DAY_SEGMENTS``, are skipped.

    Args:
        configs: Loaded configuration; must contain ``SENSORS`` and, when any
            sensor is selected, ``PIDS``.

    Returns:
        A dictionary keyed by the upper-cased sensor name whose values are
        lists of ``(actual_path, expected_path)`` tuples.
    """
    # File path templates for both actual and expected metric values
    act_str = "data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
    exp_str = "tests/data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"

    sensor_file_lists = {}

    # Loop through all sensors and create the actual and expected file paths
    for sensor in configs['SENSORS']:
        # The 'messages' sensor is configured and named as 'sms'
        if sensor == 'messages':
            sensor = 'sms'
        sensor_cap = sensor.upper()

        # A sensor listed in SENSORS may have no config section of its own;
        # treat that like a section without DAY_SEGMENTS instead of failing.
        sensor_config = configs.get(sensor_cap)
        if not sensor_config or 'DAY_SEGMENTS' not in sensor_config:
            continue

        sensor_type = []
        if 'TYPES' in sensor_config:
            sensor_type = [each + '_' for each in sensor_config['TYPES']]

        wildcards = dict(pid=configs["PIDS"],
                         sensor=sensor,
                         # No types: the file name has no type component
                         sensor_type=sensor_type if sensor_type else '',
                         day_segment=sensor_config["DAY_SEGMENTS"])

        act_file_list = expand(act_str, **wildcards)
        exp_file_list = expand(exp_str, **wildcards)

        sensor_file_lists[sensor_cap] = list(zip(act_file_list, exp_file_list))

    return sensor_file_lists
|
|
@ -0,0 +1,5 @@
|
||||||
|
directory: ./
|
||||||
|
configfile: ./tests/settings/testing_config.yaml
|
||||||
|
snakefile: ./tests/Snakefile
|
||||||
|
cores: 1
|
||||||
|
forcerun: sms_features
|
|
@ -0,0 +1,19 @@
|
||||||
|
# Valid database table name
|
||||||
|
SENSORS: [messages]
|
||||||
|
|
||||||
|
# Test Participant data to include in the unit testing
|
||||||
|
# You must create a file for each participant
|
||||||
|
# named pXXX containing their device_id
|
||||||
|
PIDS: [test01]
|
||||||
|
|
||||||
|
# Global var with common day segments
|
||||||
|
DAY_SEGMENTS: &day_segments
|
||||||
|
[daily]
|
||||||
|
|
||||||
|
# Communication SMS features config, TYPES and FEATURES keys need to match
|
||||||
|
SMS:
|
||||||
|
TYPES : [received, sent]
|
||||||
|
FEATURES:
|
||||||
|
received: [count, distinctcontacts, timefirstsms, timelastsms, countmostfrequentcontact]
|
||||||
|
sent: [count, distinctcontacts, timefirstsms, timelastsms, countmostfrequentcontact]
|
||||||
|
DAY_SEGMENTS: *day_segments
|
Loading…
Reference in New Issue