pull/95/head
Meng Li 2020-04-29 15:05:18 -04:00
commit f292ba3bff
11 changed files with 219 additions and 0 deletions

View File

@ -1,6 +1,7 @@
source("packrat/init.R")
library(dplyr)
library(tidyr)
filter_by_day_segment <- function(data, day_segment) {
if(day_segment %in% c("morning", "afternoon", "evening", "night"))
@ -38,4 +39,6 @@ for(requested_feature in requested_features){
features <- merge(features, feature, by="local_date", all = TRUE)
}
# Replace missing values with 0 in every column whose name contains
# "countscansmostuniquedevice"; all other columns keep their NAs.
features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0)))
# Write the merged feature table to the first Snakemake output path, without row names.
write.csv(features, snakemake@output[[1]], row.names = FALSE)

20
tests/Snakefile 100644
View File

@ -0,0 +1,20 @@
# Configuration file read by this workflow
configfile: "config.yaml"
# Rule definitions pulled in from the rules directory one level above this Snakefile
include: "../rules/packrat.snakefile"
include: "../rules/preprocessing.snakefile"
include: "../rules/features.snakefile"
include: "../rules/models.snakefile"
include: "../rules/reports.snakefile"
include: "../rules/mystudy.snakefile" # You can add snakefiles with rules tailored to your project
# Aggregate target: for every participant and sensor in the config it requests
# the `_raw` and `_with_datetime` CSVs, plus one SMS feature CSV per
# participant, SMS type and day segment.
rule all:
    input:
        expand(
            "data/raw/{pid}/{sensor}_raw.csv",
            pid=config["PIDS"],
            sensor=config["SENSORS"]
        ),
        expand(
            "data/raw/{pid}/{sensor}_with_datetime.csv",
            pid=config["PIDS"],
            sensor=config["SENSORS"]
        ),
        expand(
            "data/processed/{pid}/sms_{sms_type}_{day_segment}.csv",
            pid=config["PIDS"],
            sms_type=config["SMS"]["TYPES"],
            day_segment=config["SMS"]["DAY_SEGMENTS"]
        ),
# Housekeeping rule: deletes the contents of data/raw, data/interim,
# data/processed, reports/figures and reports/compliance, together with any
# reports/*.zip archives.
rule clean:
    shell:
        "rm -rf data/raw/* && rm -rf data/interim/* && rm -rf data/processed/* && rm -rf reports/figures/* && rm -rf reports/*.zip && rm -rf reports/compliance/*"

View File

View File

@ -0,0 +1,2 @@
"local_date","sms_received_daily_countmostfrequentcontact","sms_received_daily_count","sms_received_daily_distinctcontacts","sms_received_daily_timefirstsms","sms_received_daily_timelastsms"
"2017-03-27",1,1,1,16,16
1 local_date sms_received_daily_countmostfrequentcontact sms_received_daily_count sms_received_daily_distinctcontacts sms_received_daily_timefirstsms sms_received_daily_timelastsms
2 2017-03-27 1 1 1 16 16

View File

@ -0,0 +1,2 @@
"local_date","sms_sent_daily_countmostfrequentcontact","sms_sent_daily_count","sms_sent_daily_distinctcontacts","sms_sent_daily_timefirstsms","sms_sent_daily_timelastsms"
"2017-05-14",1,1,1,18.10,18.45
1 local_date sms_sent_daily_countmostfrequentcontact sms_sent_daily_count sms_sent_daily_distinctcontacts sms_sent_daily_timefirstsms sms_sent_daily_timelastsms
2 2017-05-14 1 1 1 18.10 18.45

View File

View File

@ -0,0 +1,3 @@
"timestamp","device_id","message_type","trace"
1490644832746,"00cf6eff-7d67-4b7d-be0c-9e7596c9640f",1,"8c2037e569c3e5c9574601071b7c8268b3158ff8"
1494800874206,"00cf6eff-7d67-4b7d-be0c-9e7596c9640f",2,"e3cbb6efdccf0d9e3c961b019ee0bda4f299d22f"
1 timestamp device_id message_type trace
2 1490644832746 00cf6eff-7d67-4b7d-be0c-9e7596c9640f 1 8c2037e569c3e5c9574601071b7c8268b3158ff8
3 1494800874206 00cf6eff-7d67-4b7d-be0c-9e7596c9640f 2 e3cbb6efdccf0d9e3c961b019ee0bda4f299d22f

View File

@ -0,0 +1,41 @@
import unittest
import hashlib
import pandas as pd
import utils
import yaml
import os
class TestSensorFeatures(unittest.TestCase):
# Hack to run code in positional order (not 100% full proof)
unittest.TestLoader.sortTestMethodsUsing = lambda self, a, b: (a < b) - (a > b)
@classmethod
def setUpClass(cls):
# Runs once to Setup env
global configs
with open(r'tests/settings/testing_config.yaml') as file:
configs = yaml.full_load(file)
def test_sensors_files_exist(self):
# Loop through the file_list dictionary and check if the files exist.
file_lists = utils.generate_sensor_file_lists(configs)
for each in file_lists:
for out_file, _ in file_lists[each]:
self.assertEqual(os.path.exists(out_file), 1)
def test_sensors_features_calculations(self):
calc_files = utils.generate_sensor_file_lists(configs)
for each in calc_files:
for act_result, exp_result in calc_files[each]:
df_act = pd.read_csv(act_result)
df_exp = pd.read_csv(exp_result)
pd.testing.assert_frame_equal(df_exp, df_act, obj=df_exp)
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,124 @@
from snakemake.io import expand
import os
import subprocess
import shutil
import yaml
def setUp():
    """Reset the per-participant test data and return the testing configuration.

    Intended to be run once before all tests. For every participant id listed
    under ``PIDS`` in ``tests/settings/testing_config.yaml`` it:

    * removes ``data/raw/<pid>`` and ``data/processed/<pid>`` if they exist,
    * recreates ``data/raw/<pid>`` (including missing parent directories),
    * copies every file from ``tests/data/raw/<pid>`` into it.

    Returns:
        The parsed configuration as returned by ``yaml.full_load``.
    """
    # Load the configuration file to get basic parameters
    with open(r'tests/settings/testing_config.yaml') as file:
        configs = yaml.full_load(file)

    for pid in configs['PIDS']:
        despath = os.path.join('data/raw/', pid)
        propath = os.path.join('data/processed/', pid)

        # Remove old raw and processed files if they exist
        for stale_dir in (despath, propath):
            if os.path.isdir(stale_dir):
                shutil.rmtree(stale_dir)

        # makedirs instead of mkdir: on a fresh checkout `data/raw` itself may
        # not exist yet, and mkdir would fail on the missing parent.
        os.makedirs(despath, exist_ok=True)

        # Copy the raw fixture files for this participant
        srcpath = os.path.join('tests/data/raw', pid)
        for srcfile in os.listdir(srcpath):
            shutil.copy(os.path.join(srcpath, srcfile),
                        os.path.join(despath, srcfile))

    return configs
def generate_file_list(configs, sensor):
    """Pair the actual and expected feature file paths for one sensor.

    Args:
        configs: Parsed configuration. Must contain ``PIDS`` and a section
            named after the upper-cased sensor.
        sensor: Sensor name as used in the file names (e.g. ``"sms"``).

    Returns:
        A ``zip`` of ``(actual_path, expected_path)`` tuples. It is empty when
        the sensor section lacks ``DAY_SEGMENTS`` or ``FEATURES``.

    Raises:
        KeyError: if ``configs`` has no section for the sensor.
    """
    # File path templates for the actual and the expected metric values
    act_str = "data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
    exp_str = "tests/data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"

    sensor_config = configs[sensor.upper()]

    # Both keys are required. The former `'DAY_SEGMENTS' and 'FEATURES' in ...`
    # only tested FEATURES, because a non-empty string literal is always truthy.
    if 'DAY_SEGMENTS' not in sensor_config or 'FEATURES' not in sensor_config:
        return zip([], [])

    # Sensors without TYPES get a single empty infix; an empty list would make
    # expand() produce no paths at all.
    sensor_type = [each + '_' for each in (sensor_config.get('TYPES') or [])] or ['']

    wildcards = dict(pid=configs["PIDS"],
                     sensor=sensor,
                     sensor_type=sensor_type,
                     day_segment=sensor_config["DAY_SEGMENTS"])
    return zip(expand(act_str, **wildcards), expand(exp_str, **wildcards))
def generate_sensor_file_lists(configs):
    """Build the actual/expected feature file pairs for every configured sensor.

    Goes through ``configs['SENSORS']`` and keeps the sensors whose config
    section has ``DAY_SEGMENTS`` (``TYPES`` is optional). The ``messages``
    sensor is looked up under the ``SMS`` section and named ``sms`` in paths.

    Args:
        configs: Parsed configuration containing ``SENSORS``, ``PIDS`` and the
            per-sensor sections.

    Returns:
        Dict mapping the upper-cased sensor name to a list of
        ``(actual_path, expected_path)`` tuples. Sensors with no config
        section, or without ``DAY_SEGMENTS``, are left out.
    """
    # File path templates for the actual and the expected metric values
    act_str = "data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
    exp_str = "tests/data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"

    sensor_file_lists = {}
    for sensor in configs['SENSORS']:
        # The `messages` table is configured and named as `sms`
        if sensor == 'messages':
            sensor = 'sms'
        sensor_cap = sensor.upper()

        # Skip sensors with no section rather than failing with KeyError
        sensor_config = configs.get(sensor_cap)
        if not sensor_config or 'DAY_SEGMENTS' not in sensor_config:
            continue

        # Each type becomes a "<type>_" infix; without TYPES the infix is empty
        sensor_type = [each + '_' for each in (sensor_config.get('TYPES') or [])] or ['']

        wildcards = dict(pid=configs["PIDS"],
                         sensor=sensor,
                         sensor_type=sensor_type,
                         day_segment=sensor_config["DAY_SEGMENTS"])
        sensor_file_lists[sensor_cap] = list(zip(expand(act_str, **wildcards),
                                                 expand(exp_str, **wildcards)))
    return sensor_file_lists

View File

@ -0,0 +1,5 @@
# Run settings for the test workflow.
# NOTE(review): these keys look like Snakemake command-line options collected
# in a profile — confirm how this file is passed to snakemake.
directory: ./
configfile: ./tests/settings/testing_config.yaml
snakefile: ./tests/Snakefile
cores: 1
forcerun: sms_features

View File

@ -0,0 +1,19 @@
# Valid database table names
SENSORS: [messages]
# Test Participant data to include in the unit testing
# You must create a file for each participant
# named pXXX containing their device_id
PIDS: [test01]
# Global var with common day segments
DAY_SEGMENTS: &day_segments
[daily]
# Communication SMS features config, TYPES and FEATURES keys need to match
SMS:
TYPES : [received, sent]
FEATURES:
received: [count, distinctcontacts, timefirstsms, timelastsms, countmostfrequentcontact]
sent: [count, distinctcontacts, timefirstsms, timelastsms, countmostfrequentcontact]
DAY_SEGMENTS: *day_segments