From 2ee45995f2ba57c8e790c9266ff725188016e756 Mon Sep 17 00:00:00 2001 From: JulioV Date: Thu, 11 Mar 2021 14:30:42 -0500 Subject: [PATCH] Update config docs and create participant files script --- Snakefile | 1 - config.yaml | 16 +++--- docs/setup/configuration.md | 83 +++++++++++++++------------- rules/preprocessing.smk | 2 +- src/data/create_participants_files.R | 46 +++++---------- tools/config.schema.yaml | 29 +++++----- 6 files changed, 81 insertions(+), 96 deletions(-) diff --git a/Snakefile b/Snakefile index db95ad5e..e076c0af 100644 --- a/Snakefile +++ b/Snakefile @@ -253,7 +253,6 @@ for provider in config["FITBIT_SLEEP_SUMMARY"]["PROVIDERS"].keys(): for provider in config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"].keys(): if config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_raw.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_intraday_parsed.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_sleep_intraday_features/fitbit_sleep_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_SLEEP_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) diff --git a/config.yaml b/config.yaml index 51f81372..77aa5d2c 100644 --- a/config.yaml +++ b/config.yaml @@ -11,21 +11,19 @@ PIDS: [test01] # See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files CREATE_PARTICIPANT_FILES: - SOURCE: - TYPE: AWARE_DEVICE_TABLE #AWARE_DEVICE_TABLE or CSV_FILE - DATABASE_GROUP: *database_group - CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required 
format - TIMEZONE: *timezone + CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format PHONE_SECTION: ADD: TRUE DEVICE_ID_COLUMN: device_id # column name IGNORED_DEVICE_IDS: [] FITBIT_SECTION: ADD: FALSE - DEVICE_ID_COLUMN: device_id # column name + DEVICE_ID_COLUMN: fitbit_id # column name IGNORED_DEVICE_IDS: [] EMPATICA_SECTION: - ADD: FALSE + ADD: True + DEVICE_ID_COLUMN: empatica_id # column name + IGNORED_DEVICE_IDS: [] # See https://www.rapids.science/latest/setup/configuration/#time-segments TIME_SEGMENTS: &time_segments @@ -466,10 +464,10 @@ FITBIT_STEPS_INTRADAY: ######################################################################################################################## EMPATICA_DATA_STREAMS: - USE: empatica_zipfiles + USE: empatica_zip # AVAILABLE: - empatica_zipfiles: + empatica_zip: FOLDER: data/external/empatica # Sensors ------ diff --git a/docs/setup/configuration.md b/docs/setup/configuration.md index f016bcee..80602158 100644 --- a/docs/setup/configuration.md +++ b/docs/setup/configuration.md @@ -62,7 +62,8 @@ Participant files link together multiple devices (smartphones and wearables) to LABEL: test01 START_DATE: 2020-04-23 END_DATE: 2020-10-28 - EMPATICA: # Empatica doesn't have a device_id because the devices produce zip files per participant + EMPATICA: + DEVICE_IDS: [empatica1] LABEL: test01 START_DATE: 2020-04-23 END_DATE: 2020-10-28 @@ -91,6 +92,7 @@ Participant files link together multiple devices (smartphones and wearables) to | Key                      | Description | |------------------|-----------------------------------------------------------------------------------------------------------| + | `[DEVICE_IDS]` | An array of the strings that uniquely identify each Empatica device used by this participant. 
Since the most common use case involves having multiple zip files from a single device for each person, set this device id to an arbitrary string (we usually use their `pid`) | | `[LABEL]` | A string that is used in reports and visualizations. | | `[START_DATE]` | A string with format `YYYY-MM-DD` or `YYYY-MM-DD HH:MM:SS`. Only data collected *after* this date time will be included in the analysis. By default, `YYYY-MM-DD` is interpreted as `YYYY-MM-DD 00:00:00`. | | `[END_DATE]` | A string with format `YYYY-MM-DD` or `YYYY-MM-DD HH:MM:SS`. Only data collected *before* this date time will be included in the analysis. By default, `YYYY-MM-DD` is interpreted as `YYYY-MM-DD 00:00:00`. | @@ -102,14 +104,14 @@ You can use a CSV file with a row per participant to automatically create partic In previous versions of RAPIDS, you could create participant files automatically using the `aware_device` table. We deprecated this option but you can still achieve the same results if you export the output of the following SQL query as a CSV file and follow the instructions below: ```sql - SELECT device_id, device_id as fitbit_id, CONCAT("p", _id) as pid, if(brand = "iPhone", "ios", "android") as platform, CONCAT("p", _id) as label, DATE_FORMAT(FROM_UNIXTIME((timestamp/1000)- 86400), "%Y-%m-%d") as start_date, CURRENT_DATE as end_date from aware_device order by _id; + SELECT device_id, device_id as fitbit_id, CONCAT("p", _id) as empatica_id, CONCAT("p", _id) as pid, if(brand = "iPhone", "ios", "android") as platform, CONCAT("p", _id) as label, DATE_FORMAT(FROM_UNIXTIME((timestamp/1000)- 86400), "%Y-%m-%d") as start_date, CURRENT_DATE as end_date from aware_device order by _id; ``` In your `config.yaml`: 1. Set `CSV_FILE_PATH` to a CSV file path that complies with the specs described below 2. Set the devices (`PHONE`, `FITBIT`, `EMPATICA`) `[ADD]` flag to `TRUE` depending on what devices you used in your study. -3. 
Set `[DEVICE_ID_COLUMN]` to the name of the column in your CSV file that uniquely identifies each device (only for `PHONE` and `FITBIT`). +3. Set `[DEVICE_ID_COLUMN]` to the column's name in your CSV file that uniquely identifies each device. ```yaml CREATE_PARTICIPANT_FILES: @@ -122,8 +124,10 @@ CREATE_PARTICIPANT_FILES: ADD: FALSE # or FALSE DEVICE_ID_COLUMN: fitbit_id # column name IGNORED_DEVICE_IDS: [] - EMPATICA_SECTION: # Empatica doesn't have a device_id column because the devices produce zip files per participant - ADD: FALSE # or FALSE + EMPATICA_SECTION: + ADD: FALSE + DEVICE_ID_COLUMN: empatica_id # column name + IGNORED_DEVICE_IDS: [] ``` Your CSV file (`[CSV_FILE_PATH]`) should have the following columns (headers) but the values within each column can be empty: @@ -132,6 +136,7 @@ Your CSV file (`[CSV_FILE_PATH]`) should have the following columns (headers) bu |------------------|-----------------------------------------------------------------------------------------------------------| | phone device id | The name of this column has to match `[PHONE_SECTION][DEVICE_ID_COLUMN]`. Separate multiple ids with `;` | | fitbit device id | The name of this column has to match `[FITBIT_SECTION][DEVICE_ID_COLUMN]`. Separate multiple ids with `;` | +| empatica device id | The name of this column has to match `[EMPATICA_SECTION][DEVICE_ID_COLUMN]`. Since the most common use case involves having multiple zip files from a single device for each person, set this device id to an arbitrary string (we usually use their `pid`) | | pid | Unique identifiers with the format pXXX (your participant files will be named with this string) | | platform | Use `android`, `ios` or `infer` as explained above, separate values with `;` | | label | A human readable string that is used in reports and visualizations. 
| @@ -142,9 +147,9 @@ Your CSV file (`[CSV_FILE_PATH]`) should have the following columns (headers) bu We added white spaces to this example to make it easy to read but you don't have to. ```csv - device_id ,fitbit_id ,pid ,label ,platform ,start_date ,end_date - a748ee1a-1d0b-4ae9-9074-279a2b6ba524;dsadas-2324-fgsf-sdwr-gdfgs4rfsdf43 ,fitbit1 ,p01 ,julio ,android;ios ,2020-01-01 ,2021-01-01 - 4c4cf7a1-0340-44bc-be0f-d5053bf7390c ,fitbit2 ,p02 ,meng ,ios ,2021-01-01 ,2022-01-01 + device_id ,fitbit_id, empatica_id ,pid ,label ,platform ,start_date ,end_date + a748ee1a-1d0b-4ae9-9074-279a2b6ba524;dsadas-2324-fgsf-sdwr-gdfgs4rfsdf43 ,fitbit1 , p01 ,p01 ,julio ,android;ios ,2020-01-01 ,2021-01-01 + 4c4cf7a1-0340-44bc-be0f-d5053bf7390c ,fitbit2 , p02 ,p02 ,meng ,ios ,2021-01-01 ,2022-01-01 ``` Then run @@ -364,8 +369,8 @@ Parameters for `[TIMEZONE]` |`[SINGLE][TZCODE]`| The time zone code from this [list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) to be used across all devices | |`[MULTIPLE][TZCODES_FILE]`| A CSV file containing the time and code from this [list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) visited by each device in the study. Multiple devices can be linked to the same person, read more in [Participants Files](#participant-files) | |`[MULTIPLE][IF_MISSING_TZCODE]`| When a device is missing from `[TZCODES_FILE]` Set this flag to `STOP` to stop RAPIDS execution and show an error, or to `USE_DEFAULT` to assign the time zone specified in `[DEFAULT_TZCODE]` to any such devices | -|`[MULTIPLE][FITBIT][ALLOW_MULTIPLE_TZ_PER_DEVICE]`| You only need to care about this flag if one or more Fitbit devices sensed data in one or more time zone, and you want RAPIDS to take into account this in its feature computation. Read more in "How does RAPIDS handle Fitbit devices?" below. 
| -|`[MULTIPLE][FITBIT][INFER_FROM_SMARTPHONE_TZ]`| You only need to care about this flag if one or more Fitbit devices sensed data in one or more time zone, and you want RAPIDS to take into account this in its feature computation. Read more in "How does RAPIDS handle Fitbit devices?" below. | +|`[MULTIPLE][FITBIT][ALLOW_MULTIPLE_TZ_PER_DEVICE]`| You only need to care about this flag if one or more Fitbit devices sensed data in one or more time zones, and you want RAPIDS to take into account this in its feature computation. Read more in "How does RAPIDS handle Fitbit devices?" below. | +|`[MULTIPLE][FITBIT][INFER_FROM_SMARTPHONE_TZ]`| You only need to care about this flag if one or more Fitbit devices sensed data in one or more time zones, and you want RAPIDS to take into account this in its feature computation. Read more in "How does RAPIDS handle Fitbit devices?" below. | ??? info "Format of `TZCODES_FILE`" `TZCODES_FILE` has three columns and a row for each time zone a device visited (a device can be a smartphone or wearable (Fitbit/Empatica)): @@ -415,7 +420,7 @@ Parameters for `[TIMEZONE]` ??? note "How does RAPIDS handle Fitbit devices?" Fitbit devices are not time zone aware and they always log data with a local date-time string. 
- - When none of the Fitbit devices in your study changed time zones (e.g., `p01` was always in New York and `p02` as always in Amsterdam), you can set a single time zone per Fitbit device id along with a timestamp 0 (you can still assign multiple time zones to smartphone device ids) + - When none of the Fitbit devices in your study changed time zones (e.g., `p01` was always in New York and `p02` was always in Amsterdam), you can set a single time zone per Fitbit device id along with a timestamp 0 (you can still assign multiple time zones to smartphone device ids) ```csv device_id, tzcode, timestamp fitbit123, America/New_York, 0 @@ -426,10 +431,6 @@ Parameters for `[TIMEZONE]` If you want to `ALLOW_MULTIPLE_TZ_PER_DEVICE` you will need to add any time zone changes per device in the `TZCODES_FILE` as explained above. You could obtain this data by hand but if your participants also used a smartphone during your study, you can use their time zone logs. Recall that in RAPIDS every participant is represented with a participant file `pXX.yaml`, this file links together multiple devices and we will use it to know what smartphone time zone data should be applied to Fitbit devices. Thus set `INFER_FROM_SMARTPHONE_TZ` to `TRUE`, if you have included smartphone time zone data in your `TZCODE_FILE` and you want to make a participant's Fitbit data time zone aware with their respective smartphone data. -??? note "How does RAPIDS handle Empatica devices?" - Empatica devices do not have a device id, since the raw data can only be exported in zip files per device that are saved in a folder per participant (e.g. `data/external/empatica/{pid}`). - - Therefore, in your `TZCODES_FILE`, use the participant's ids (PIDs) instead of the device's ids. Remember a person could have used one or more devices with different device ids, but every person only gets a single PID (e.g. `p01`, a.k.a the name of their participant file `p01.yaml`). 
--- ## Data Stream Configuration @@ -441,9 +442,12 @@ Modify the following keys in your `config.yaml` depending on the [data stream](. ```yaml PHONE_DATA_STREAMS: - TYPE: aware_mysql + USE: aware_mysql + + # AVAILABLE: aware_mysql: DATABASE_GROUP: MY_GROUP + aware_csv: FOLDER: data/external/aware_csv ``` @@ -467,98 +471,101 @@ Modify the following keys in your `config.yaml` depending on the [data stream](. === "Fitbit" - Set `[FITBIT_DATA_STREAMS][TYPE]` to the Fitbit data stream you want to process (e.g. `fitbitjson_mysql`) and configure its parameters (e.g. `[DATABASE_GROUP]`). - - Ignore the parameters of streams you are not using (e.g. `[FOLDER]` of `aware_csv`). + Set `[FITBIT_DATA_STREAMS][USE]` to the Fitbit data stream you want to process (e.g. `fitbitjson_mysql`) and configure its parameters (e.g. `[DATABASE_GROUP]`). Ignore the parameters of the other streams you are not using (e.g. `[FOLDER]` of `aware_csv`). + + !!! warning + You will probably have to tell RAPIDS the name of the columns where you stored your Fitbit data. To do this, modify your chosen stream's `format.yaml` column mappings to match your raw data column names. ```yaml FITBIT_DATA_STREAMS: - TYPE: fitbitjson_mysql + USE: fitbitjson_mysql + # AVAILABLE: fitbitjson_mysql: DATABASE_GROUP: MY_GROUP - COLUMN_MAPPINGS_READY: False + SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: False fitbitjson_csv: FOLDER: data/external/fitbit_csv - COLUMN_MAPPINGS_READY: False + SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: False fitbitparsed_mysql: DATABASE_GROUP: MY_GROUP - COLUMN_MAPPINGS_READY: False + SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: False fitbitparsed_csv: FOLDER: data/external/fitbit_csv - COLUMN_MAPPINGS_READY: False + SLEEP_SUMMARY_EPISODE_DAY_ANCHOR: False ``` === "fitbitjson_mysql" - This data stream process Fitbit data inside a JSON column as obtained from the Fitbit API and stored in a MySQL database. 
+ This data stream processes Fitbit data inside a JSON column as obtained from the Fitbit API and stored in a MySQL database. Read more about its column mappings and mutations in [`fitbitjson_mysql`](../../datastreams/fitbitjson-mysql#format). | Key | Description | |---------------------|----------------------------------------------------------------------------------------------------------------------------| | `[DATABASE_GROUP]` | A database credentials group. Read the instructions below to set it up | - | `[COLUMN_MAPPINGS_READY]` | Set this to `True` after you have modified this stream's `format.yaml` column mappings to match your raw data column names: [`fitbitjson_mysql`](../../datastreams/fitbitjson-mysql#format) | + | `[SLEEP_SUMMARY_EPISODE_DAY_ANCHOR]` | One of `start` or `end`. Summary sleep episodes are considered as events based on either the start timestamp or end timestamp (they will belong to the day where they start or end). | --8<---- "docs/snippets/database.md" === "fitbitjson_csv" - This data stream process Fitbit data inside a JSON column as obtained from the Fitbit API and stored in a CSV file. + This data stream processes Fitbit data inside a JSON column as obtained from the Fitbit API and stored in a CSV file. Read more about its column mappings and mutations in [`fitbitjson_csv`](../../datastreams/fitbitjson-csv#format). | Key | Description | |---------------------|----------------------------------------------------------------------------------------------------------------------------| | `[FOLDER]` | Folder where you have to place a CSV file **per** Fitbit sensor. Each file has to contain all the data from every participant you want to process. | - | `[COLUMN_MAPPINGS_READY]` | Set this to `True` after you have modified this stream's `format.yaml` column mappings to match your raw data column names: [`fitbitjson_csv`](../../datastreams/fitbitjson-csv#format) | + | `[SLEEP_SUMMARY_EPISODE_DAY_ANCHOR]` | One of `start` or `end`. 
Summary sleep episodes are considered as events based on either the start timestamp or end timestamp (they will belong to the day where they start or end). | === "fitbitparsed_mysql" - This data stream process Fitbit data stored in multiple columns after being parsed from the JSON column returned by Fitbit API and stored in a MySQL database. + This data stream processes Fitbit data stored in multiple columns after being parsed from the JSON column returned by Fitbit API and stored in a MySQL database. Read more about its column mappings and mutations in [`fitbitparsed_mysql`](../../datastreams/fitbitparsed-mysql#format). | Key | Description | |---------------------|----------------------------------------------------------------------------------------------------------------------------| | `[DATABASE_GROUP]` | A database credentials group. Read the instructions below to set it up | - | `[COLUMN_MAPPINGS_READY]` | Set this to `True` after you have modified this stream's `format.yaml` column mappings to match your raw data column names: [`fitbitparsed_mysql`](../../datastreams/fitbitparsed-mysql#format) | + | `[SLEEP_SUMMARY_EPISODE_DAY_ANCHOR]` | One of `start` or `end`. Summary sleep episodes are considered as events based on either the start timestamp or end timestamp (they will belong to the day where they start or end). | --8<---- "docs/snippets/database.md" === "fitbitparsed_csv" - This data stream process Fitbit data stored in multiple columns (plain text) after being parsed from the JSON column returned by Fitbit API and stored in a CSV file. + This data stream processes Fitbit data stored in multiple columns (plain text) after being parsed from the JSON column returned by Fitbit API and stored in a CSV file. Read more about its column mappings and mutations in [`fitbitparsed_csv`](../../datastreams/fitbitparsed-csv#format). 
| Key | Description | |---------------------|----------------------------------------------------------------------------------------------------------------------------| | `[FOLDER]` | Folder where you have to place a CSV file **per** Fitbit sensor. Each file has to contain all the data from every participant you want to process. | - | `[COLUMN_MAPPINGS_READY]` | Set this to `True` after you have modified this stream's `format.yaml` column mappings to match your raw data column names: [`fitbitparsed_csv`](../../datastreams/fitbitparsed-csv#format) | + | `[SLEEP_SUMMARY_EPISODE_DAY_ANCHOR]` | One of `start` or `end`. Summary sleep episodes are considered as events based on either the start timestamp or end timestamp (they will belong to the day where they start or end). | === "Empatica" - Set `[USE]` to the Empatica data stream you want to use, see the table in [introduction to data streams](../../datastreams/data-streams-introduction). Configure any parameters as inidicated below. + Set `[USE]` to the Empatica data stream you want to use; see the table in [introduction to data streams](../../datastreams/data-streams-introduction). Configure any parameters as indicated below. ```yaml EMPATICA_DATA_STREAMS: - USE: empatica_zipfiles + USE: empatica_zip # AVAILABLE: - empatica_zipfiles: + empatica_zip: FOLDER: data/external/empatica ``` - === "empatica_zipfiles" + === "empatica_zip" | Key | Description | |---------------------|----------------------------------------------------------------------------------------------------------------------------| - | `[FOLDER]` | The relative path to a folder containing one subfolder per participant. The name of a participant folder should match their pid in `config[PIDS]`, for example `p01`. Each participant folder can have one or more zip files with any name; in other words, the sensor data contained in those zip files belongs to a single participant. 
The zip files are [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) generated by Empatica and have a CSV file per sensor (`ACC`, `HR`, `TEMP`, `EDA`, `BVP`, `TAGS`). All CSV files of the same type contained in one or more zip files are uncompressed, parsed, sorted by timestamp, and joinned together.| + | `[FOLDER]` | The relative path to a folder containing one subfolder per participant. The name of a participant folder should match their device_id assigned in their participant file. Each participant folder can have one or more zip files with any name; in other words, the sensor data in those zip files belong to a single participant. The zip files are [automatically](https://support.empatica.com/hc/en-us/articles/201608896-Data-export-and-formatting-from-E4-connect-) generated by Empatica and have a CSV file per sensor (`ACC`, `HR`, `TEMP`, `EDA`, `BVP`, `TAGS`). All CSV files of the same type contained in one or more zip files are uncompressed, parsed, sorted by timestamp, and joined together.| ??? example "Example of an EMPATICA FOLDER" - In the file tree below, we want to process the data of three participants: `p01`, `p02`, and `p03`. `p01` has two zip files, `p02` has only one zip file, and `p03` has three zip files. Each zip will have a CSV file per sensor that are joinned together and process by RAPIDS. These zip files are generated by Empatica. + In the file tree below, we want to process three participants' data: `p01`, `p02`, and `p03`. `p01` has two zip files, `p02` has only one zip file, and `p03` has three zip files. Each zip has a CSV file per sensor that are joined together and processed by RAPIDS. 
+ ```bash data/ # this folder exists in the root RAPIDS folder external/ diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index c6658efc..51068ec1 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -17,7 +17,7 @@ rule create_example_participant_files: rule create_participants_files: input: - participants_file = [] if config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["TYPE"] == "AWARE_DEVICE_TABLE" else config["CREATE_PARTICIPANT_FILES"]["SOURCE"]["CSV_FILE_PATH"] + participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"] params: config = config["CREATE_PARTICIPANT_FILES"] script: diff --git a/src/data/create_participants_files.R b/src/data/create_participants_files.R index adc7daba..20a17f14 100644 --- a/src/data/create_participants_files.R +++ b/src/data/create_participants_files.R @@ -11,43 +11,25 @@ group <- config$SOURCE$DATABASE_GROUP timezone <- config$SOURCE$TIMEZONE phone_device_id_column = config$PHONE_SECTION$DEVICE_ID_COLUMN fitbit_device_id_column = config$FITBIT_SECTION$DEVICE_ID_COLUMN +empatica_device_id_column = config$EMPATICA_SECTION$DEVICE_ID_COLUMN add_phone_section = config$PHONE_SECTION$ADD add_fitbit_section = config$FITBIT_SECTION$ADD add_empatica_section = config$EMPATICA_SECTION$ADD phone_ignored = config$PHONE_SECTION$IGNORED_DEVICE_IDS fitbit_ignored = config$FITBIT_SECTION$IGNORED_DEVICE_IDS +empatica_ignored = config$EMPATICA_SECTION$IGNORED_DEVICE_IDS rmysql.settingsfile <- "./.env" -if(config$SOURCE$TYPE == "AWARE_DEVICE_TABLE"){ - database <- dbConnect(MariaDB(), default.file = rmysql.settingsfile, group = group) - if(config$FITBIT_SECTION$ADD == TRUE){ - query <- paste("SELECT",phone_device_id_column, ",",fitbit_device_id_column," as _temp_fitbit_id, brand, label, timestamp FROM aware_device order by timestamp asc") - fitbit_device_id_column <- "_temp_fitbit_id" - } - else - query <- paste("SELECT ",phone_device_id_column,", brand, label, timestamp FROM aware_device order by timestamp 
asc") - participants <- dbGetQuery(database, query) - dbDisconnect(database) - participants <- participants %>% - mutate(pid = if_else(row_number()<10, paste0("p","0",row_number()), paste0("p", row_number())), - platform = if_else(brand == "iPhone", "ios", "android"), brand = NULL, - label = iconv(if_else(label == "", "EMPTY_LABEL", label), from = "UTF-8", to = "UTF-8", sub=''), - start_date = format(as.POSIXct(timestamp / 1000, origin = "1970-01-01", tz = timezone), "%Y-%m-%d"), - end_date = format(Sys.Date(), "%Y-%m-%d"), - !!phone_device_id_column := if_else(!!rlang::sym(phone_device_id_column) %in% phone_ignored, NA_character_, !!rlang::sym(phone_device_id_column)), - !!fitbit_device_id_column := if_else(!!rlang::sym(fitbit_device_id_column) %in% fitbit_ignored, NA_character_, !!rlang::sym(fitbit_device_id_column))) - -} else if(config$SOURCE$TYPE == "CSV_FILE"){ - participants <- read_csv(config$SOURCE$CSV_FILE_PATH, col_types=cols_only(device_id="c",pid="c",label="c",platform="c", - start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c")) %>% - mutate(start_date = as.character(start_date), end_date = as.character(end_date)) # we read as date to validate format - participants <- participants %>% - mutate(!!phone_device_id_column := str_replace(!!rlang::sym(phone_device_id_column), ";",","), - platform = str_replace(platform, ";",","), - !!phone_device_id_column := if_else(!!rlang::sym(phone_device_id_column) %in% phone_ignored, NA_character_, !!rlang::sym(phone_device_id_column)), - !!fitbit_device_id_column := if_else(!!rlang::sym(fitbit_device_id_column) %in% fitbit_ignored, NA_character_, !!rlang::sym(fitbit_device_id_column))) -} +participants <- read_csv(config$CSV_FILE_PATH, col_types=cols_only(device_id="c",pid="c",label="c",platform="c", + start_date=col_date(format = "%Y-%m-%d"),end_date=col_date(format = "%Y-%m-%d"),fitbit_id="c",empatica_id="c")) %>% + mutate(start_date = as.character(start_date), end_date = 
as.character(end_date)) # we read as date to validate format +participants <- participants %>% +mutate(!!phone_device_id_column := str_replace(!!rlang::sym(phone_device_id_column), ";",","), + platform = str_replace(platform, ";",","), + !!phone_device_id_column := if_else(!!rlang::sym(phone_device_id_column) %in% phone_ignored, NA_character_, !!rlang::sym(phone_device_id_column)), + !!empatica_device_id_column := if_else(!!rlang::sym(empatica_device_id_column) %in% empatica_ignored, NA_character_, !!rlang::sym(empatica_device_id_column)), + !!fitbit_device_id_column := if_else(!!rlang::sym(fitbit_device_id_column) %in% fitbit_ignored, NA_character_, !!rlang::sym(fitbit_device_id_column))) dir.create(file.path("./data/external/participant_files/")) @@ -73,8 +55,8 @@ participants %>% } else lines <- append(lines, empty_fitbit) - if(add_empatica_section == TRUE){ - lines <- append(lines, c("EMPATICA:", + if(add_empatica_section == TRUE && !is.na(row[empatica_device_id_column])){ + lines <- append(lines, c("EMPATICA:", paste0(" DEVICE_IDS: [",row[empatica_device_id_column],"]"), paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date))) } else lines <- append(lines, empty_empatica) @@ -83,7 +65,7 @@ participants %>% writeLines(lines, file_connection) close(file_connection) - }, add_phone_section, add_fitbit_section, phone_device_id_column, fitbit_device_id_column) + }, add_phone_section, add_fitbit_section, phone_device_id_column, fitbit_device_id_column, empatica_device_id_column) file_lines <-readLines("./config.yaml") for (i in 1:length(file_lines)){ diff --git a/tools/config.schema.yaml b/tools/config.schema.yaml index 89667db8..60ce5dee 100644 --- a/tools/config.schema.yaml +++ b/tools/config.schema.yaml @@ -158,22 +158,11 @@ properties: CREATE_PARTICIPANT_FILES: type: object - required: [SOURCE, PHONE_SECTION, FITBIT_SECTION] + required: [CSV_FILE_PATH, PHONE_SECTION, FITBIT_SECTION,EMPATICA_SECTION] properties: - SOURCE: - 
type: object - required: [TYPE] - properties: - TYPE: - type: string - enum: [AWARE_DEVICE_TABLE, CSV_FILE] - DATABASE_GROUP: - type: string - CSV_FILE_PATH: - type: string - pattern: "^.*\\.csv$" - TIMEZONE: - type: string + CSV_FILE_PATH: + type: string + pattern: "^.*\\.csv$" PHONE_SECTION: type: object properties: @@ -195,6 +184,16 @@ properties: type: array items: type: string + EMPATICA_SECTION: + properties: + ADD: + type: boolean + DEVICE_ID_COLUMN: + type: string + IGNORED_DEVICE_IDS: + type: array + items: + type: string TIME_SEGMENTS: type: object