Merge branch 'master' of https://github.com/carissalow/rapids

2020-04-29 15:05:18 -04:00 · 2020-04-29 15:05:18 -04:00 · f292ba3bff
parent 9ddb50ed59 b78c0504d3
commit f292ba3bff
11 changed files with 219 additions and 0 deletions
--- a/src/features/bluetooth_features.R
+++ b/src/features/bluetooth_features.R
@ -1,6 +1,7 @@
 source("packrat/init.R")
 library(dplyr)
 library(tidyr)
 filter_by_day_segment <- function(data, day_segment) {
  if(day_segment %in% c("morning", "afternoon", "evening", "night"))
@ -38,4 +39,6 @@ for(requested_feature in requested_features){
  features <- merge(features, feature, by="local_date", all = TRUE)
 }
 features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0)))
 write.csv(features, snakemake@output[[1]], row.names = FALSE)
--- a/tests/Snakefile
+++ b/tests/Snakefile
@ -0,0 +1,20 @@
 configfile: "config.yaml"
 include: "../rules/packrat.snakefile"
 include: "../rules/preprocessing.snakefile"
 include: "../rules/features.snakefile"
 include: "../rules/models.snakefile"
 include: "../rules/reports.snakefile"
 include: "../rules/mystudy.snakefile" # You can add snakefiles with rules tailored to your project
 # Aggregate target: lists every file the test workflow is asked to produce.
 # All wildcards are filled from the loaded config (PIDS, SENSORS, SMS).
 rule all:
    input:
        # Raw sensor tables, one CSV per participant and sensor
        expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
        # Same tables after the date-time columns have been added
        expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
        # SMS feature files, one per participant, SMS type and day segment
        expand("data/processed/{pid}/sms_{sms_type}_{day_segment}.csv",
                            pid=config["PIDS"],
                            sms_type = config["SMS"]["TYPES"],
                            day_segment = config["SMS"]["DAY_SEGMENTS"]),
 # Housekeeping rule: deletes everything under data/raw, data/interim,
 # data/processed, reports/figures and reports/compliance, plus any
 # reports/*.zip archives. Paths are relative to the working directory.
 rule clean:
    shell:
        "rm -rf data/raw/* && rm -rf data/interim/* && rm -rf data/processed/* && rm -rf reports/figures/* && rm -rf reports/*.zip && rm -rf reports/compliance/*"
--- a/tests/data/interim/.gitkeep
+++ b/tests/data/interim/.gitkeep
--- a/tests/data/processed/test01/sms_received_daily.csv
+++ b/tests/data/processed/test01/sms_received_daily.csv
@ -0,0 +1,2 @@
 "local_date","sms_received_daily_countmostfrequentcontact","sms_received_daily_count","sms_received_daily_distinctcontacts","sms_received_daily_timefirstsms","sms_received_daily_timelastsms"
 "2017-03-27",1,1,1,16,16
--- a/tests/data/processed/test01/sms_sent_daily.csv
+++ b/tests/data/processed/test01/sms_sent_daily.csv
@ -0,0 +1,2 @@
 "local_date","sms_sent_daily_countmostfrequentcontact","sms_sent_daily_count","sms_sent_daily_distinctcontacts","sms_sent_daily_timefirstsms","sms_sent_daily_timelastsms"
 "2017-05-14",1,1,1,18.10,18.45
--- a/tests/data/raw/.gitkeep
+++ b/tests/data/raw/.gitkeep
--- a/tests/data/raw/test01/messages_raw.csv
+++ b/tests/data/raw/test01/messages_raw.csv
@ -0,0 +1,3 @@
 "timestamp","device_id","message_type","trace"
 1490644832746,"00cf6eff-7d67-4b7d-be0c-9e7596c9640f",1,"8c2037e569c3e5c9574601071b7c8268b3158ff8"
 1494800874206,"00cf6eff-7d67-4b7d-be0c-9e7596c9640f",2,"e3cbb6efdccf0d9e3c961b019ee0bda4f299d22f"
--- a/tests/scripts/test_sensor_features.py
+++ b/tests/scripts/test_sensor_features.py
@ -0,0 +1,41 @@
 import unittest
 import hashlib
 import pandas as pd
 import utils
 import yaml
 import os
 class TestSensorFeatures(unittest.TestCase):
     """Compare the feature files produced by the pipeline with stored fixtures.

     The (actual, expected) file pairs come from
     ``utils.generate_sensor_file_lists`` and are derived from
     ``tests/settings/testing_config.yaml``.
     """

     # Hack to run code in positional order (not 100% foolproof).
     # The comparator returns a positive value when a < b, so test names are
     # sorted in reverse alphabetical order; that places
     # test_sensors_files_exist before test_sensors_features_calculations.
     # NOTE: this patches the TestLoader class itself, not just this test case.
     unittest.TestLoader.sortTestMethodsUsing = lambda self, a, b: (a < b) - (a > b)

     @classmethod
     def setUpClass(cls):
         """Load the testing configuration once for every test in the class."""
         # The configuration is kept in a module-level global because the
         # test methods below read it by that name.
         global configs
         with open(r'tests/settings/testing_config.yaml') as file:
             configs = yaml.full_load(file)

     def test_sensors_files_exist(self):
         """Every expected output file of every sensor must exist on disk."""
         file_lists = utils.generate_sensor_file_lists(configs)
         for sensor_files in file_lists.values():
             for out_file, _ in sensor_files:
                 self.assertTrue(
                     os.path.exists(out_file),
                     msg=f"Missing output file: {out_file}",
                 )

     def test_sensors_features_calculations(self):
         """Computed feature values must equal the stored expected values."""
         calc_files = utils.generate_sensor_file_lists(configs)
         for sensor_files in calc_files.values():
             for act_result, exp_result in sensor_files:
                 df_act = pd.read_csv(act_result)
                 df_exp = pd.read_csv(exp_result)
                 # `obj` is the label pandas uses in its failure message; it
                 # must be a string, so name the file being compared.
                 pd.testing.assert_frame_equal(
                     df_exp,
                     df_act,
                     obj=f"DataFrame[{exp_result}]",
                 )
 # Run the test case through unittest's runner when executed as a script.
 if __name__ == '__main__':
    unittest.main()
--- a/tests/scripts/utils.py
+++ b/tests/scripts/utils.py
@ -0,0 +1,124 @@
 from snakemake.io import expand
 import os
 import subprocess
 import shutil
 import yaml
 def setUp():
     """Reset the per-participant data folders and seed them with test fixtures.

     This utility is intended to be run once before all tests are run. It
     sets up the fake data needed to test the rules and script files. All
     paths are relative to the current working directory.

     For every participant listed under ``PIDS`` in
     ``tests/settings/testing_config.yaml`` it:

     * deletes ``data/raw/<pid>`` and ``data/processed/<pid>`` if present,
     * recreates ``data/raw/<pid>`` (including missing parent folders),
     * copies every entry of ``tests/data/raw/<pid>`` into it.

     Returns:
         The parsed contents of the testing configuration file.
     """
     # Load the configuration file to get basic parameters
     with open(r'tests/settings/testing_config.yaml') as file:
         configs = yaml.full_load(file)
     pids = configs['PIDS']

     # Reset the test data files
     for pid in pids:
         despath = os.path.join('data/raw/', pid)
         propath = os.path.join('data/processed/', pid)

         # Remove old raw and processed folders left over from earlier runs
         for stale_dir in (despath, propath):
             if os.path.isdir(stale_dir):
                 shutil.rmtree(stale_dir)

         # Create a fresh PID data directory for this round of tests.
         # makedirs also creates data/raw itself when it does not exist yet,
         # which a plain mkdir would fail on.
         os.makedirs(despath, exist_ok=True)

         # Copy necessary data files
         srcpath = os.path.join('tests/data/raw', pid)
         for srcfile in os.listdir(srcpath):
             shutil.copy(os.path.join(srcpath, srcfile),
                         os.path.join(despath, srcfile))

     return configs
 def generate_file_list(configs, sensor):
     """Build the (actual, expected) feature-file pairs for a single sensor.

     Args:
         configs: Parsed testing configuration. It must contain ``PIDS`` and
             a section keyed by the upper-cased sensor name.
         sensor: Sensor name as used in the output file names (e.g. ``sms``).

     Returns:
         A list of ``(actual_path, expected_path)`` tuples. The list is empty
         when the sensor section lacks ``DAY_SEGMENTS`` or ``FEATURES``.

     Raises:
         KeyError: If ``configs`` has no section for the sensor.
     """
     # Initialize string of file path for both expected and actual metric values
     act_str = "data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
     exp_str = "tests/data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"

     sensor_cap = sensor.upper()
     sensor_config = configs[sensor_cap]

     # Both keys are required. The former test
     # `'DAY_SEGMENTS' and 'FEATURES' in ...` only ever checked FEATURES and
     # left `sensor_type` unbound when it was false.
     if 'DAY_SEGMENTS' not in sensor_config or 'FEATURES' not in sensor_config:
         return []

     # Sensors without TYPES get one empty prefix so the file names are still
     # generated, matching generate_sensor_file_lists. An empty list would make
     # expand() produce no files at all.
     sensor_type = [each + '_' for each in sensor_config.get('TYPES', [])] or ['']

     act_file_list = expand(act_str, pid=configs["PIDS"],
                                     sensor=sensor,
                                     sensor_type=sensor_type,
                                     day_segment=sensor_config["DAY_SEGMENTS"])
     exp_file_list = expand(exp_str, pid=configs["PIDS"],
                                     sensor=sensor,
                                     sensor_type=sensor_type,
                                     day_segment=sensor_config["DAY_SEGMENTS"])

     # Return a list rather than a one-shot zip iterator so the result can be
     # traversed more than once.
     return list(zip(act_file_list, exp_file_list))
 def generate_sensor_file_lists(configs):
     """Map each configured sensor to its (actual, expected) feature-file pairs.

     Only sensors whose config section defines ``DAY_SEGMENTS`` are included.
     When the section also defines ``TYPES``, one file per type is generated.
     The result is keyed by the upper-cased config section name, and each
     value is a list of ``(actual_path, expected_path)`` tuples.
     """
     # Path templates for the files produced by the pipeline and the fixtures
     actual_template = "data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
     expected_template = "tests/data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"

     file_lists_by_sensor = {}
     for sensor_name in configs['SENSORS']:
         # The 'messages' sensor is configured and named as 'sms' / 'SMS'
         if sensor_name == 'messages':
             sensor_name, config_key = 'sms', 'SMS'
         else:
             config_key = sensor_name.upper()

         if 'DAY_SEGMENTS' not in configs[config_key]:
             continue

         type_prefixes = []
         if 'TYPES' in configs[config_key]:
             type_prefixes = [kind + '_' for kind in configs[config_key]['TYPES']]

         # Without any type the file name simply has no type prefix
         wildcards = dict(pid=configs["PIDS"],
                          sensor=sensor_name,
                          sensor_type=type_prefixes if type_prefixes else '',
                          day_segment=configs[config_key]["DAY_SEGMENTS"])
         actual_files = expand(actual_template, **wildcards)
         expected_files = expand(expected_template, **wildcards)

         file_lists_by_sensor[config_key] = list(zip(actual_files, expected_files))

     return file_lists_by_sensor
--- a/tests/settings/config.yaml
+++ b/tests/settings/config.yaml
@ -0,0 +1,5 @@
 directory: ./
 configfile: ./tests/settings/testing_config.yaml
 snakefile: ./tests/Snakefile
 cores: 1
 forcerun: sms_features
--- a/tests/settings/testing_config.yaml
+++ b/tests/settings/testing_config.yaml
@ -0,0 +1,19 @@
 # Valid database table name
 SENSORS: [messages]
 # Test Participant data to include in the unit testing
 # You must create a file for each participant
 # named pXXX containing their device_id
 PIDS: [test01]
 # Global var with common day segments
 DAY_SEGMENTS: &day_segments
    [daily]
 # Communication SMS features config, TYPES and FEATURES keys need to match
 SMS:
  TYPES : [received, sent]
  FEATURES: 
    received: [count, distinctcontacts, timefirstsms, timelastsms, countmostfrequentcontact]
    sent: [count, distinctcontacts, timefirstsms, timelastsms, countmostfrequentcontact]
  DAY_SEGMENTS: *day_segments
		`@ -0,0 +1,2 @@`
							`"local_date","sms_received_daily_countmostfrequentcontact","sms_received_daily_count","sms_received_daily_distinctcontacts","sms_received_daily_timefirstsms","sms_received_daily_timelastsms"`
							`"2017-03-27",1,1,1,16,16`
		`@ -0,0 +1,2 @@`
							`"local_date","sms_sent_daily_countmostfrequentcontact","sms_sent_daily_count","sms_sent_daily_distinctcontacts","sms_sent_daily_timefirstsms","sms_sent_daily_timelastsms"`
							`"2017-05-14",1,1,1,18.10,18.45`