Compare commits
147 Commits
d326a1b09d
...
master
Author | SHA1 | Date |
---|---|---|
junos | 63f5a526fc | |
junos | 1cc7339fc8 | |
junos | 5307c71df0 | |
junos | f261286542 | |
junos | a6bc0a90d1 | |
junos | f161da41f4 | |
junos | 8ffd934fd3 | |
junos | cf6af7c9a4 | |
junos | 4dacb7129d | |
junos | f542a97eab | |
junos | 5cb2dcfb00 | |
junos | 8cef60ba87 | |
junos | 0d634f3622 | |
junos | 00e4f8deae | |
junos | 03687a1ac2 | |
junos | a36da99ccb | |
junos | 1d903f3629 | |
junos | d678be0641 | |
junos | 27b90421bf | |
junos | cb006ed0cf | |
junos | 9ca58ed204 | |
junos | 982fa982f7 | |
junos | f8088172e9 | |
junos | 801fbe1c10 | |
Primoz | 8721b944ca | |
Primoz | 36651a11c8 | |
Primoz | 8ae5ad0e88 | |
Primoz | e825aa7c89 | |
Primoz | 5958948af2 | |
Primoz | 7e37eb9067 | |
Primoz | 4d0497a5e0 | |
Primoz | 75b054d358 | |
Primoz | e27ec0269f | |
Primoz | 9b45188a61 | |
Primoz | 3e6b34babc | |
Primoz | 74fd4dfbd7 | |
Primoz | 7b8538ce51 | |
Primoz | 41a17d35f1 | |
Primoz | 7f5a4e6744 | |
Primoz | 3ce7f2c2a5 | |
Primoz | e40f0fd8dc | |
Primoz | 8af3bdf768 | |
Primoz | 01931b8873 | |
Primoz | 569854ddf5 | |
Primoz | 3b2001f570 | |
junos | 44a87c53eb | |
junos | 8da7bd71b2 | |
junos | 788a81d96f | |
Primoz | 87e5209a9f | |
Primoz | f78aa3e7b3 | |
Primoz | a620def209 | |
Primoz | c498ecb742 | |
Primoz | f088e9586f | |
Primoz | 0aa0e82673 | |
Primoz | 4cfe5a3a98 | |
Primoz | 607da820f2 | |
Primoz | fb577bc9ad | |
Primoz | 6ba4a66deb | |
Primoz | 788ac31190 | |
Primoz | 21eb2665d7 | |
Primoz | a65a85cce9 | |
Primoz | fa961fe2f5 | |
Primoz | 6c8014ba8e | |
Primoz | 5a777ac79f | |
Primoz | 0425403951 | |
Primoz | 887fd7dc72 | |
Primoz | 5a4696c548 | |
Primoz | d2758eef46 | |
Primoz | 2d5d23b615 | |
Primoz | a5480f1369 | |
Primoz | 505c3a86b9 | |
junos | ce04394679 | |
Primoz | c851ab0763 | |
Primoz | a8cd16f88c | |
Primoz | dda4554d46 | |
Primoz | 212cf300f8 | |
Primoz | 9ea39dc557 | |
Primoz | 402059871f | |
Primoz | 094743244d | |
primoz | e1d7607de4 | |
primoz | f371249b99 | |
primoz | 64e41cfa35 | |
primoz | 2c7ac21465 | |
primoz | 2acf6ff9fb | |
primoz | d300f0f8f0 | |
Primoz | fbf6a77dfc | |
Primoz | 5532043b1f | |
Primoz | bb62497ba6 | |
Primoz | 2a8f58f5c8 | |
Primoz | 1471c86c62 | |
Primoz | 6864cfe775 | |
Primoz | c1564f0cae | |
Primoz | 31e36e7400 | |
Primoz | 9cf9e1fe14 | |
Primoz | f62a1302dd | |
Primoz | 5638367999 | |
Primoz | 66451160e9 | |
= | 8c8fe1fec7 | |
= | 075c64d1e5 | |
junos | c05b047c2d | |
junos | 53ec52a954 | |
= | 3c058e4463 | |
junos | 144f0d0dcf | |
junos | ed5314aa98 | |
junos | 11c64cfc1a | |
junos | a6a37c7bd9 | |
junos | 9f5edf1c2b | |
junos | 4ad261fae5 | |
= | 74cf4ada1c | |
junos | 9ab0c8f289 | |
junos | 570d2eb656 | |
junos | f5688f6154 | |
junos | b1f356c3f7 | |
junos | 7ff3dcf5fc | |
junos | 50c0defca7 | |
junos | ac86221662 | |
junos | baa94c4c4e | |
junos | d2fbef5234 | |
= | 1c42347b9b | |
Primoz | c050174ca3 | |
Primoz | f9e40711e7 | |
Primoz | a357138f6e | |
Primoz | 470993eeb0 | |
= | ab0b9227d7 | |
= | a9244a60fc | |
= | 8b76c96e47 | |
= | ca59a54d8f | |
= | 393dab72f5 | |
Primoz | 1902d02a86 | |
Primoz | f389ac9d89 | |
Primoz | 191e53e543 | |
Primoz | d3a3f01f29 | |
Primoz | 2da0911d4c | |
Primoz | bd5a811256 | |
Primoz | d1c59de2e9 | |
Primoz | a80f7c0cc4 | |
Primoz | d63158c199 | |
junos | b18dba366e | |
Primoz | 3f8e1cc252 | |
Primoz | dc2b462145 | |
Primoz | 50358978cc | |
Primoz | 86c6312574 | |
Meng Li | 28e580e597 | |
Meng Li | 463ac0a2aa | |
Sam | 10e896ca1d | |
Sam | e5dbbfce44 | |
Sam | 8ae26fb845 |
|
@ -93,10 +93,17 @@ packrat/*
|
||||||
|
|
||||||
# exclude data from source control by default
|
# exclude data from source control by default
|
||||||
data/external/*
|
data/external/*
|
||||||
|
!/data/external/empatica/empatica1/E4 Data.zip
|
||||||
!/data/external/.gitkeep
|
!/data/external/.gitkeep
|
||||||
!/data/external/stachl_application_genre_catalogue.csv
|
!/data/external/stachl_application_genre_catalogue.csv
|
||||||
!/data/external/timesegments*.csv
|
!/data/external/timesegments*.csv
|
||||||
!/data/external/wiki_tz.csv
|
!/data/external/wiki_tz.csv
|
||||||
|
!/data/external/main_study_usernames.csv
|
||||||
|
!/data/external/timezone.csv
|
||||||
|
!/data/external/play_store_application_genre_catalogue.csv
|
||||||
|
!/data/external/play_store_categories_count.csv
|
||||||
|
|
||||||
|
|
||||||
data/raw/*
|
data/raw/*
|
||||||
!/data/raw/.gitkeep
|
!/data/raw/.gitkeep
|
||||||
data/interim/*
|
data/interim/*
|
||||||
|
@ -114,3 +121,12 @@ settings.dcf
|
||||||
tests/fakedata_generation/
|
tests/fakedata_generation/
|
||||||
site/
|
site/
|
||||||
credentials.yaml
|
credentials.yaml
|
||||||
|
|
||||||
|
# Docker container and other files
|
||||||
|
.devcontainer
|
||||||
|
|
||||||
|
# Calculating features module
|
||||||
|
calculatingfeatures/
|
||||||
|
|
||||||
|
# Temp folder for rapids data/external
|
||||||
|
rapids_temp_data/
|
||||||
|
|
188
README.md
188
README.md
|
@ -11,3 +11,191 @@
|
||||||
For more information refer to our [documentation](http://www.rapids.science)
|
For more information refer to our [documentation](http://www.rapids.science)
|
||||||
|
|
||||||
By [MoSHI](https://www.moshi.pitt.edu/), [University of Pittsburgh](https://www.pitt.edu/)
|
By [MoSHI](https://www.moshi.pitt.edu/), [University of Pittsburgh](https://www.pitt.edu/)
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
For RAPIDS installation, refer to the [documentation](https://www.rapids.science/1.8/setup/installation/).
|
||||||
|
|
||||||
|
### For the installation of the Docker version
|
||||||
|
|
||||||
|
1. Follow the [instructions](https://www.rapids.science/1.8/setup/installation/) to set up RAPIDS via Docker (from scratch).
|
||||||
|
|
||||||
|
2. Delete current contents in /rapids/ folder when in a container session.
|
||||||
|
```
|
||||||
|
cd ..
|
||||||
|
rm -rf rapids/{*,.*}
|
||||||
|
cd rapids
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Clone RAPIDS workspace from Git and checkout a specific branch.
|
||||||
|
```
|
||||||
|
git clone "https://repo.ijs.si/junoslukan/rapids.git" .
|
||||||
|
git checkout <branch_name>
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Install missing “libpq-dev” dependency with bash.
|
||||||
|
```
|
||||||
|
apt-get update -y
|
||||||
|
apt-get install -y libpq-dev
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Restore R venv.
|
||||||
|
Type R to go to the interactive R session and then:
|
||||||
|
```
|
||||||
|
renv::restore()
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Install cr-features module
|
||||||
|
From: https://repo.ijs.si/matjazbostic/calculatingfeatures.git -> branch master.
|
||||||
|
Then follow the "cr-features module" section below.
|
||||||
|
|
||||||
|
7. Install all required packages from environment.yml. The `--prune` option also removes conda packages that are not present in the environment file.
|
||||||
|
```
|
||||||
|
conda env update --file environment.yml --prune
|
||||||
|
```
|
||||||
|
|
||||||
|
8. If you wish to update your R or Python venvs.
|
||||||
|
```
|
||||||
|
R in interactive session:
|
||||||
|
renv::snapshot()
|
||||||
|
Python:
|
||||||
|
conda env export --no-builds | sed 's/^.*libgfortran.*$/ - libgfortran/' | sed 's/^.*mkl=.*$/ - mkl/' > environment.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
### cr-features module
|
||||||
|
|
||||||
|
This RAPIDS extension uses the cr-features library, accessible [here](https://repo.ijs.si/matjazbostic/calculatingfeatures).
|
||||||
|
|
||||||
|
To use cr-features library:
|
||||||
|
|
||||||
|
- Follow the installation instructions in the [README.md](https://repo.ijs.si/matjazbostic/calculatingfeatures/-/blob/master/README.md).
|
||||||
|
|
||||||
|
- Copy built calculatingfeatures folder into the RAPIDS workspace.
|
||||||
|
|
||||||
|
- Install the cr-features package by:
|
||||||
|
```
|
||||||
|
pip install path/to/the/calculatingfeatures/folder
|
||||||
|
e.g. pip install ./calculatingfeatures if the folder is copied to main parent directory
|
||||||
|
The cr-features package has to be built and installed every time to get the newest version.
|
||||||
|
Alternatively, the newest version of the Docker image must be used.
|
||||||
|
```
|
||||||
|
|
||||||
|
## Updating RAPIDS
|
||||||
|
|
||||||
|
To update RAPIDS, first pull and merge [origin](https://github.com/carissalow/rapids), such as with:
|
||||||
|
|
||||||
|
```commandline
|
||||||
|
git fetch --progress "origin" refs/heads/master
|
||||||
|
git merge --no-ff origin/master
|
||||||
|
```
|
||||||
|
|
||||||
|
Next, update the conda and R virtual environment.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
R -e 'renv::restore(repos = c(CRAN = "https://packagemanager.rstudio.com/all/__linux__/focal/latest"))'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Custom configuration
|
||||||
|
### Credentials
|
||||||
|
|
||||||
|
As mentioned under [Database in RAPIDS documentation](https://www.rapids.science/1.6/snippets/database/), a `credentials.yaml` file is needed to connect to a database.
|
||||||
|
It should contain:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
PSQL_STRAW:
|
||||||
|
database: staw
|
||||||
|
host: 212.235.208.113
|
||||||
|
password: password
|
||||||
|
port: 5432
|
||||||
|
user: staw_db
|
||||||
|
```
|
||||||
|
|
||||||
|
where `password` needs to be specified as well.
|
||||||
|
|
||||||
|
## Possible installation issues
|
||||||
|
### Missing dependencies for RPostgres
|
||||||
|
|
||||||
|
When installing the `RPostgres` R package (used to connect to the PostgreSQL database), an error might occur:
|
||||||
|
|
||||||
|
```text
|
||||||
|
------------------------- ANTICONF ERROR ---------------------------
|
||||||
|
Configuration failed because libpq was not found. Try installing:
|
||||||
|
* deb: libpq-dev (Debian, Ubuntu, etc)
|
||||||
|
* rpm: postgresql-devel (Fedora, EPEL)
|
||||||
|
* rpm: postgreql8-devel, psstgresql92-devel, postgresql93-devel, or postgresql94-devel (Amazon Linux)
|
||||||
|
* csw: postgresql_dev (Solaris)
|
||||||
|
* brew: libpq (OSX)
|
||||||
|
If libpq is already installed, check that either:
|
||||||
|
(i) 'pkg-config' is in your PATH AND PKG_CONFIG_PATH contains a libpq.pc file; or
|
||||||
|
(ii) 'pg_config' is in your PATH.
|
||||||
|
If neither can detect , you can set INCLUDE_DIR
|
||||||
|
and LIB_DIR manually via:
|
||||||
|
R CMD INSTALL --configure-vars='INCLUDE_DIR=... LIB_DIR=...'
|
||||||
|
--------------------------[ ERROR MESSAGE ]----------------------------
|
||||||
|
<stdin>:1:10: fatal error: libpq-fe.h: No such file or directory
|
||||||
|
compilation terminated.
|
||||||
|
```
|
||||||
|
|
||||||
|
The library requires `libpq` for compiling from source, so install accordingly.
|
||||||
|
|
||||||
|
### Timezone environment variable for tidyverse (relevant for WSL2)
|
||||||
|
|
||||||
|
One of the R packages, `tidyverse`, might need access to the `TZ` environment variable during the installation.
|
||||||
|
On Ubuntu 20.04 on WSL2 this triggers the following error:
|
||||||
|
|
||||||
|
```text
|
||||||
|
> install.packages('tidyverse')
|
||||||
|
|
||||||
|
ERROR: configuration failed for package ‘xml2’
|
||||||
|
System has not been booted with systemd as init system (PID 1). Can't operate.
|
||||||
|
Failed to create bus connection: Host is down
|
||||||
|
Warning in system("timedatectl", intern = TRUE) :
|
||||||
|
running command 'timedatectl' had status 1
|
||||||
|
Error in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]) :
|
||||||
|
namespace ‘xml2’ 1.3.1 is already loaded, but >= 1.3.2 is required
|
||||||
|
Calls: <Anonymous> ... namespaceImportFrom -> asNamespace -> loadNamespace
|
||||||
|
Execution halted
|
||||||
|
ERROR: lazy loading failed for package ‘tidyverse’
|
||||||
|
```
|
||||||
|
|
||||||
|
This happens because WSL2 does not use the `timedatectl` service, which provides this variable.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
~$ timedatectl
|
||||||
|
System has not been booted with systemd as init system (PID 1). Can't operate.
|
||||||
|
Failed to create bus connection: Host is down
|
||||||
|
```
|
||||||
|
|
||||||
|
and later
|
||||||
|
|
||||||
|
```bash
|
||||||
|
Warning message:
|
||||||
|
In system("timedatectl", intern = TRUE) :
|
||||||
|
running command 'timedatectl' had status 1
|
||||||
|
Execution halted
|
||||||
|
```
|
||||||
|
|
||||||
|
This can be amended by setting the environment variable manually before attempting to install `tidyverse`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export TZ='Europe/Ljubljana'
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: if this is needed to avoid runtime issues, you need to either define this environment variable in each new terminal window or (better) define it in your `~/.bashrc` or `~/.bash_profile`.
|
||||||
|
|
||||||
|
## Possible runtime issues
|
||||||
|
### Unix end of line characters
|
||||||
|
|
||||||
|
Upon running rapids, an error might occur:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
/usr/bin/env: ‘python3\r’: No such file or directory
|
||||||
|
```
|
||||||
|
|
||||||
|
This is due to Windows style end of line characters.
|
||||||
|
To amend this, I added a `.gitattributes` file to force `git` to check out `rapids` using Unix EOL characters.
|
||||||
|
If this still fails, `dos2unix` can be used to change them.
|
||||||
|
|
||||||
|
### System has not been booted with systemd as init system (PID 1)
|
||||||
|
|
||||||
|
See [the installation issue above](#timezone-environment-variable-for-tidyverse-relevant-for-wsl2).
|
||||||
|
|
38
Snakefile
38
Snakefile
|
@ -169,9 +169,19 @@ for provider in config["PHONE_ESM"]["PROVIDERS"].keys():
|
||||||
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_raw.csv",pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_raw.csv",pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_with_datetime.csv",pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_with_datetime.csv",pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_clean.csv",pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_clean.csv",pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_features/phone_esm_clean_{provider_key}.csv",pid=config["PIDS"],provider_key=provider.lower()))
|
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_features/phone_esm_{language}_{provider_key}.csv",pid=config["PIDS"],language=get_script_language(config["PHONE_ESM"]["PROVIDERS"][provider]["SRC_SCRIPT"]),provider_key=provider.lower()))
|
||||||
#files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/processed/features/{pid}/phone_esm.csv", pid=config["PIDS"]))
|
||||||
#files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
# files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
|
||||||
|
# files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||||
|
|
||||||
|
for provider in config["PHONE_SPEECH"]["PROVIDERS"].keys():
|
||||||
|
if config["PHONE_SPEECH"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||||
|
files_to_compute.extend(expand("data/raw/{pid}/phone_speech_raw.csv",pid=config["PIDS"]))
|
||||||
|
files_to_compute.extend(expand("data/raw/{pid}/phone_speech_with_datetime.csv",pid=config["PIDS"]))
|
||||||
|
files_to_compute.extend(expand("data/interim/{pid}/phone_speech_features/phone_speech_{language}_{provider_key}.csv",pid=config["PIDS"],language=get_script_language(config["PHONE_SPEECH"]["PROVIDERS"][provider]["SRC_SCRIPT"]),provider_key=provider.lower()))
|
||||||
|
files_to_compute.extend(expand("data/processed/features/{pid}/phone_speech.csv", pid=config["PIDS"]))
|
||||||
|
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||||
|
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||||
|
|
||||||
# We can delete these if's as soon as we add feature PROVIDERS to any of these sensors
|
# We can delete these if's as soon as we add feature PROVIDERS to any of these sensors
|
||||||
if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):
|
if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):
|
||||||
|
@ -327,7 +337,7 @@ for provider in config["EMPATICA_ACCELEROMETER"]["PROVIDERS"].keys():
|
||||||
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_accelerometer.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_accelerometer.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||||
|
|
||||||
for provider in config["EMPATICA_HEARTRATE"]["PROVIDERS"].keys():
|
for provider in config["EMPATICA_HEARTRATE"]["PROVIDERS"].keys():
|
||||||
if config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["COMPUTE"]:
|
if config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||||
files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_raw.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_raw.csv", pid=config["PIDS"]))
|
||||||
|
@ -373,7 +383,7 @@ for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
|
||||||
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_inter_beat_interval.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_inter_beat_interval.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||||
|
|
||||||
if isinstance(config["EMPATICA_TAGS"]["PROVIDERS"], dict):
|
if isinstance(config["EMPATICA_TAGS"]["PROVIDERS"], dict):
|
||||||
for provider in config["EMPATICA_TAGS"]["PROVIDERS"].keys():
|
for provider in config["EMPATICA_TAGS"]["PROVIDERS"].keys():
|
||||||
if config["EMPATICA_TAGS"]["PROVIDERS"][provider]["COMPUTE"]:
|
if config["EMPATICA_TAGS"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||||
|
@ -407,10 +417,18 @@ if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
|
||||||
# Data Cleaning
|
# Data Cleaning
|
||||||
for provider in config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"].keys():
|
for provider in config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"].keys():
|
||||||
if config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][provider]["COMPUTE"]:
|
if config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() +".csv", pid=config["PIDS"]))
|
if provider == "STRAW":
|
||||||
|
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() + "_py.csv", pid=config["PIDS"]))
|
||||||
|
else:
|
||||||
|
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() + "_R.csv", pid=config["PIDS"]))
|
||||||
|
|
||||||
for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
|
for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
|
||||||
if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
|
if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||||
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
|
if provider == "STRAW":
|
||||||
|
for target in config["PARAMS_FOR_ANALYSIS"]["TARGET"]["ALL_LABELS"]:
|
||||||
|
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +"_py_(" + target + ").csv"))
|
||||||
|
else:
|
||||||
|
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +"_R.csv"))
|
||||||
|
|
||||||
# Baseline features
|
# Baseline features
|
||||||
if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
|
if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
|
||||||
|
@ -419,6 +437,12 @@ if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
|
||||||
files_to_compute.extend(expand("data/interim/{pid}/baseline_questionnaires.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/interim/{pid}/baseline_questionnaires.csv", pid=config["PIDS"]))
|
||||||
files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]))
|
files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]))
|
||||||
|
|
||||||
|
# Targets (labels)
|
||||||
|
if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
|
||||||
|
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
|
||||||
|
for target in config["PARAMS_FOR_ANALYSIS"]["TARGET"]["ALL_LABELS"]:
|
||||||
|
files_to_compute.extend(expand("data/processed/models/population_model/input_" + target + ".csv"))
|
||||||
|
|
||||||
rule all:
|
rule all:
|
||||||
input:
|
input:
|
||||||
files_to_compute
|
files_to_compute
|
||||||
|
|
|
@ -0,0 +1,57 @@
|
||||||
|
from pprint import pprint
|
||||||
|
import sklearn.metrics
|
||||||
|
import autosklearn.regression
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
import importlib
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
|
||||||
|
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, train_test_split
|
||||||
|
from sklearn.metrics import mean_squared_error, r2_score
|
||||||
|
from sklearn.impute import SimpleImputer
|
||||||
|
|
||||||
|
model_input = pd.read_csv("data/processed/models/population_model/input_PANAS_negative_affect_mean.csv") # Standardizirani podatki
|
||||||
|
|
||||||
|
model_input.dropna(axis=1, how="all", inplace=True)
|
||||||
|
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
|
||||||
|
|
||||||
|
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||||
|
categorical_feature_colnames += [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||||
|
categorical_features = model_input[categorical_feature_colnames].copy()
|
||||||
|
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||||
|
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||||
|
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||||
|
if not categorical_features.empty:
|
||||||
|
categorical_features = pd.get_dummies(categorical_features)
|
||||||
|
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
|
||||||
|
model_in = pd.concat([numerical_features, categorical_features], axis=1)
|
||||||
|
|
||||||
|
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||||
|
model_in.set_index(index_columns, inplace=True)
|
||||||
|
|
||||||
|
X_train, X_test, y_train, y_test = train_test_split(model_in.drop(["target", "pid"], axis=1), model_in["target"], test_size=0.30)
|
||||||
|
|
||||||
|
automl = autosklearn.regression.AutoSklearnRegressor(
|
||||||
|
time_left_for_this_task=7200,
|
||||||
|
per_run_time_limit=120
|
||||||
|
)
|
||||||
|
automl.fit(X_train, y_train, dataset_name='straw')
|
||||||
|
|
||||||
|
print(automl.leaderboard())
|
||||||
|
pprint(automl.show_models(), indent=4)
|
||||||
|
|
||||||
|
train_predictions = automl.predict(X_train)
|
||||||
|
print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
|
||||||
|
test_predictions = automl.predict(X_test)
|
||||||
|
print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))
|
||||||
|
|
||||||
|
import sys
|
||||||
|
sys.exit()
|
187
config.yaml
187
config.yaml
|
@ -3,7 +3,7 @@
|
||||||
########################################################################################################################
|
########################################################################################################################
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/setup/configuration/#participant-files
|
# See https://www.rapids.science/latest/setup/configuration/#participant-files
|
||||||
PIDS: ['p031']
|
PIDS: ['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107']
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files
|
# See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files
|
||||||
CREATE_PARTICIPANT_FILES:
|
CREATE_PARTICIPANT_FILES:
|
||||||
|
@ -16,14 +16,19 @@ CREATE_PARTICIPANT_FILES:
|
||||||
ADD: False
|
ADD: False
|
||||||
IGNORED_DEVICE_IDS: []
|
IGNORED_DEVICE_IDS: []
|
||||||
EMPATICA_SECTION:
|
EMPATICA_SECTION:
|
||||||
ADD: False
|
ADD: True
|
||||||
IGNORED_DEVICE_IDS: []
|
IGNORED_DEVICE_IDS: []
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/setup/configuration/#time-segments
|
# See https://www.rapids.science/latest/setup/configuration/#time-segments
|
||||||
TIME_SEGMENTS: &time_segments
|
TIME_SEGMENTS: &time_segments
|
||||||
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
|
TYPE: EVENT # FREQUENCY, PERIODIC, EVENT
|
||||||
FILE: "data/external/timesegments_daily.csv"
|
FILE: "data/external/straw_events.csv"
|
||||||
INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
|
INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
|
||||||
|
TAILORED_EVENTS: # Only relevant if TYPE=EVENT
|
||||||
|
COMPUTE: True
|
||||||
|
SEGMENTING_METHOD: "30_before" # 30_before, 90_before, stress_event
|
||||||
|
INTERVAL_OF_INTEREST: 10 # duration of event of interest [minutes]
|
||||||
|
IOI_ERROR_TOLERANCE: 5 # interval of interest error tolerance (before and after IOI) [minutes]
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
|
# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
|
||||||
TIMEZONE:
|
TIMEZONE:
|
||||||
|
@ -70,7 +75,6 @@ PHONE_ACCELEROMETER:
|
||||||
COMPUTE: False
|
COMPUTE: False
|
||||||
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
|
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
|
||||||
SRC_SCRIPT: src/features/phone_accelerometer/rapids/main.py
|
SRC_SCRIPT: src/features/phone_accelerometer/rapids/main.py
|
||||||
|
|
||||||
PANDA:
|
PANDA:
|
||||||
COMPUTE: False
|
COMPUTE: False
|
||||||
VALID_SENSED_MINUTES: False
|
VALID_SENSED_MINUTES: False
|
||||||
|
@ -100,9 +104,9 @@ PHONE_APPLICATIONS_CRASHES:
|
||||||
CONTAINER: applications_crashes
|
CONTAINER: applications_crashes
|
||||||
APPLICATION_CATEGORIES:
|
APPLICATION_CATEGORIES:
|
||||||
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store)
|
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store)
|
||||||
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
|
CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
|
||||||
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
|
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
|
||||||
SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
|
SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
|
||||||
PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD
|
PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/features/phone-applications-foreground/
|
# See https://www.rapids.science/latest/features/phone-applications-foreground/
|
||||||
|
@ -110,24 +114,32 @@ PHONE_APPLICATIONS_FOREGROUND:
|
||||||
CONTAINER: applications
|
CONTAINER: applications
|
||||||
APPLICATION_CATEGORIES:
|
APPLICATION_CATEGORIES:
|
||||||
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
|
CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
|
||||||
CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
|
CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
|
||||||
PACKAGE_NAMES_HASHED: True
|
# Refer to data/external/play_store_categories_count.csv for a list of categories (genres) and their frequency.
|
||||||
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
|
UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
|
||||||
SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
|
SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
|
||||||
PROVIDERS:
|
PROVIDERS:
|
||||||
RAPIDS:
|
RAPIDS:
|
||||||
COMPUTE: True
|
COMPUTE: True
|
||||||
INCLUDE_EPISODE_FEATURES: True
|
INCLUDE_EPISODE_FEATURES: True
|
||||||
SINGLE_CATEGORIES: ["all", "email"]
|
SINGLE_CATEGORIES: ["Productivity", "Tools", "Communication", "Education", "Social"]
|
||||||
MULTIPLE_CATEGORIES:
|
MULTIPLE_CATEGORIES:
|
||||||
social: ["socialnetworks", "socialmediatools"]
|
games: ["Puzzle", "Card", "Casual", "Board", "Strategy", "Trivia", "Word", "Adventure", "Role Playing", "Simulation", "Board, Brain Games", "Racing"]
|
||||||
entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"]
|
social: ["Communication", "Social", "Dating"]
|
||||||
|
productivity: ["Tools", "Productivity", "Finance", "Education", "News & Magazines", "Business", "Books & Reference"]
|
||||||
|
health: ["Health & Fitness", "Lifestyle", "Food & Drink", "Sports", "Medical", "Parenting"]
|
||||||
|
entertainment: ["Shopping", "Music & Audio", "Entertainment", "Travel & Local", "Photography", "Video Players & Editors", "Personalization", "House & Home", "Art & Design", "Auto & Vehicles", "Entertainment,Music & Video",
|
||||||
|
"Puzzle", "Card", "Casual", "Board", "Strategy", "Trivia", "Word", "Adventure", "Role Playing", "Simulation", "Board, Brain Games", "Racing" # Add all games.
|
||||||
|
]
|
||||||
|
maps_weather: ["Maps & Navigation", "Weather"]
|
||||||
CUSTOM_CATEGORIES:
|
CUSTOM_CATEGORIES:
|
||||||
social_media: ["com.google.android.youtube", "com.snapchat.android", "com.instagram.android", "com.zhiliaoapp.musically", "com.facebook.katana"]
|
SINGLE_APPS: []
|
||||||
dating: ["com.tinder", "com.relance.happycouple", "com.kiwi.joyride"]
|
EXCLUDED_CATEGORIES: ["System", "STRAW"]
|
||||||
SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps
|
# Note: A special option here is "is_system_app".
|
||||||
EXCLUDED_CATEGORIES: []
|
# This excludes applications that have is_system_app = TRUE, which is a separate column in the table.
|
||||||
EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] # TODO list system apps?
|
# However, all of these applications have been assigned System category.
|
||||||
|
# I will therefore filter by that category, which is a superset and is more complete. JL
|
||||||
|
EXCLUDED_APPS: []
|
||||||
FEATURES:
|
FEATURES:
|
||||||
APP_EVENTS: ["countevent", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
|
APP_EVENTS: ["countevent", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
|
||||||
APP_EPISODES: ["countepisode", "minduration", "maxduration", "meanduration", "sumduration"]
|
APP_EPISODES: ["countepisode", "minduration", "maxduration", "meanduration", "sumduration"]
|
||||||
|
@ -160,7 +172,7 @@ PHONE_BLUETOOTH:
|
||||||
CONTAINER: bluetooth
|
CONTAINER: bluetooth
|
||||||
PROVIDERS:
|
PROVIDERS:
|
||||||
RAPIDS:
|
RAPIDS:
|
||||||
COMPUTE: True
|
COMPUTE: False
|
||||||
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
|
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
|
||||||
SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R
|
SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R
|
||||||
|
|
||||||
|
@ -239,7 +251,8 @@ PHONE_ESM:
|
||||||
PROVIDERS:
|
PROVIDERS:
|
||||||
STRAW:
|
STRAW:
|
||||||
COMPUTE: True
|
COMPUTE: True
|
||||||
SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support"]
|
SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
|
||||||
|
"appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
|
||||||
FEATURES: [mean]
|
FEATURES: [mean]
|
||||||
SRC_SCRIPT: src/features/phone_esm/straw/main.py
|
SRC_SCRIPT: src/features/phone_esm/straw/main.py
|
||||||
|
|
||||||
|
@ -324,6 +337,15 @@ PHONE_SCREEN:
|
||||||
EPISODE_TYPES: ["unlock"]
|
EPISODE_TYPES: ["unlock"]
|
||||||
SRC_SCRIPT: src/features/phone_screen/rapids/main.py
|
SRC_SCRIPT: src/features/phone_screen/rapids/main.py
|
||||||
|
|
||||||
|
# Custom added sensor
|
||||||
|
PHONE_SPEECH:
|
||||||
|
CONTAINER: speech
|
||||||
|
PROVIDERS:
|
||||||
|
STRAW:
|
||||||
|
COMPUTE: True
|
||||||
|
FEATURES: ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"]
|
||||||
|
SRC_SCRIPT: src/features/phone_speech/straw/main.py
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/features/phone-wifi-connected/
|
# See https://www.rapids.science/latest/features/phone-wifi-connected/
|
||||||
PHONE_WIFI_CONNECTED:
|
PHONE_WIFI_CONNECTED:
|
||||||
CONTAINER: sensor_wifi
|
CONTAINER: sensor_wifi
|
||||||
|
@ -441,7 +463,6 @@ FITBIT_SLEEP_INTRADAY:
|
||||||
UNIFIED: [awake, asleep]
|
UNIFIED: [awake, asleep]
|
||||||
SLEEP_TYPES: [main, nap, all]
|
SLEEP_TYPES: [main, nap, all]
|
||||||
SRC_SCRIPT: src/features/fitbit_sleep_intraday/rapids/main.py
|
SRC_SCRIPT: src/features/fitbit_sleep_intraday/rapids/main.py
|
||||||
|
|
||||||
PRICE:
|
PRICE:
|
||||||
COMPUTE: False
|
COMPUTE: False
|
||||||
FEATURES: [avgduration, avgratioduration, avgstarttimeofepisodemain, avgendtimeofepisodemain, avgmidpointofepisodemain, stdstarttimeofepisodemain, stdendtimeofepisodemain, stdmidpointofepisodemain, socialjetlag, rmssdmeanstarttimeofepisodemain, rmssdmeanendtimeofepisodemain, rmssdmeanmidpointofepisodemain, rmssdmedianstarttimeofepisodemain, rmssdmedianendtimeofepisodemain, rmssdmedianmidpointofepisodemain]
|
FEATURES: [avgduration, avgratioduration, avgstarttimeofepisodemain, avgendtimeofepisodemain, avgmidpointofepisodemain, stdstarttimeofepisodemain, stdendtimeofepisodemain, stdmidpointofepisodemain, socialjetlag, rmssdmeanstarttimeofepisodemain, rmssdmeanendtimeofepisodemain, rmssdmeanmidpointofepisodemain, rmssdmedianstarttimeofepisodemain, rmssdmedianendtimeofepisodemain, rmssdmedianmidpointofepisodemain]
|
||||||
|
@ -485,6 +506,7 @@ FITBIT_STEPS_INTRADAY:
|
||||||
INCLUDE_ZERO_STEP_ROWS: False
|
INCLUDE_ZERO_STEP_ROWS: False
|
||||||
SRC_SCRIPT: src/features/fitbit_steps_intraday/rapids/main.py
|
SRC_SCRIPT: src/features/fitbit_steps_intraday/rapids/main.py
|
||||||
|
|
||||||
|
|
||||||
########################################################################################################################
|
########################################################################################################################
|
||||||
# EMPATICA #
|
# EMPATICA #
|
||||||
########################################################################################################################
|
########################################################################################################################
|
||||||
|
@ -506,6 +528,15 @@ EMPATICA_ACCELEROMETER:
|
||||||
COMPUTE: False
|
COMPUTE: False
|
||||||
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
|
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
|
||||||
SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
|
SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
|
||||||
|
CR:
|
||||||
|
COMPUTE: True
|
||||||
|
FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features
|
||||||
|
WINDOWS:
|
||||||
|
COMPUTE: True
|
||||||
|
WINDOW_LENGTH: 15 # specify window length in seconds
|
||||||
|
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
|
||||||
|
SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py
|
||||||
|
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/features/empatica-heartrate/
|
# See https://www.rapids.science/latest/features/empatica-heartrate/
|
||||||
EMPATICA_HEARTRATE:
|
EMPATICA_HEARTRATE:
|
||||||
|
@ -524,6 +555,15 @@ EMPATICA_TEMPERATURE:
|
||||||
COMPUTE: False
|
COMPUTE: False
|
||||||
FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
|
FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
|
||||||
SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
|
SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
|
||||||
|
CR:
|
||||||
|
COMPUTE: True
|
||||||
|
FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean",
|
||||||
|
"stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"]
|
||||||
|
WINDOWS:
|
||||||
|
COMPUTE: True
|
||||||
|
WINDOW_LENGTH: 300 # specify window length in seconds
|
||||||
|
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
|
||||||
|
SRC_SCRIPT: src/features/empatica_temperature/cr/main.py
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/features/empatica-electrodermal-activity/
|
# See https://www.rapids.science/latest/features/empatica-electrodermal-activity/
|
||||||
EMPATICA_ELECTRODERMAL_ACTIVITY:
|
EMPATICA_ELECTRODERMAL_ACTIVITY:
|
||||||
|
@ -533,6 +573,19 @@ EMPATICA_ELECTRODERMAL_ACTIVITY:
|
||||||
COMPUTE: False
|
COMPUTE: False
|
||||||
FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
|
FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
|
||||||
SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
|
SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
|
||||||
|
CR:
|
||||||
|
COMPUTE: True
|
||||||
|
FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic',
|
||||||
|
'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore',
|
||||||
|
'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio',
|
||||||
|
'avgPeakIncreaseTime', 'avgPeakDecreaseTime', 'avgPeakDuration', 'signalOverallChange', 'changeDuration', 'changeRate', 'significantIncrease',
|
||||||
|
'significantDecrease']
|
||||||
|
WINDOWS:
|
||||||
|
COMPUTE: True
|
||||||
|
WINDOW_LENGTH: 60 # specify window length in seconds
|
||||||
|
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', count_windows, eda_num_peaks_non_zero]
|
||||||
|
IMPUTE_NANS: True
|
||||||
|
SRC_SCRIPT: src/features/empatica_electrodermal_activity/cr/main.py
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/features/empatica-blood-volume-pulse/
|
# See https://www.rapids.science/latest/features/empatica-blood-volume-pulse/
|
||||||
EMPATICA_BLOOD_VOLUME_PULSE:
|
EMPATICA_BLOOD_VOLUME_PULSE:
|
||||||
|
@ -542,6 +595,15 @@ EMPATICA_BLOOD_VOLUME_PULSE:
|
||||||
COMPUTE: False
|
COMPUTE: False
|
||||||
FEATURES: ["maxbvp", "minbvp", "avgbvp", "medianbvp", "modebvp", "stdbvp", "diffmaxmodebvp", "diffminmodebvp", "entropybvp"]
|
FEATURES: ["maxbvp", "minbvp", "avgbvp", "medianbvp", "modebvp", "stdbvp", "diffmaxmodebvp", "diffminmodebvp", "entropybvp"]
|
||||||
SRC_SCRIPT: src/features/empatica_blood_volume_pulse/dbdp/main.py
|
SRC_SCRIPT: src/features/empatica_blood_volume_pulse/dbdp/main.py
|
||||||
|
CR:
|
||||||
|
COMPUTE: False
|
||||||
|
FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
|
||||||
|
'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
|
||||||
|
WINDOWS:
|
||||||
|
COMPUTE: True
|
||||||
|
WINDOW_LENGTH: 300 # specify window length in seconds
|
||||||
|
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
|
||||||
|
SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/features/empatica-inter-beat-interval/
|
# See https://www.rapids.science/latest/features/empatica-inter-beat-interval/
|
||||||
EMPATICA_INTER_BEAT_INTERVAL:
|
EMPATICA_INTER_BEAT_INTERVAL:
|
||||||
|
@ -551,6 +613,16 @@ EMPATICA_INTER_BEAT_INTERVAL:
|
||||||
COMPUTE: False
|
COMPUTE: False
|
||||||
FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
|
FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
|
||||||
SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
|
SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
|
||||||
|
CR:
|
||||||
|
COMPUTE: True
|
||||||
|
FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
|
||||||
|
'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
|
||||||
|
PATCH_WITH_BVP: True
|
||||||
|
WINDOWS:
|
||||||
|
COMPUTE: True
|
||||||
|
WINDOW_LENGTH: 300 # specify window length in seconds
|
||||||
|
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
|
||||||
|
SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/features/empatica-tags/
|
# See https://www.rapids.science/latest/features/empatica-tags/
|
||||||
EMPATICA_TAGS:
|
EMPATICA_TAGS:
|
||||||
|
@ -566,7 +638,7 @@ EMPATICA_TAGS:
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#1-histograms-of-phone-data-yield
|
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#1-histograms-of-phone-data-yield
|
||||||
HISTOGRAM_PHONE_DATA_YIELD:
|
HISTOGRAM_PHONE_DATA_YIELD:
|
||||||
PLOT: True
|
PLOT: False
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#2-heatmaps-of-overall-data-yield
|
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#2-heatmaps-of-overall-data-yield
|
||||||
HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:
|
HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:
|
||||||
|
@ -575,7 +647,7 @@ HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#3-heatmap-of-recorded-phone-sensors
|
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#3-heatmap-of-recorded-phone-sensors
|
||||||
HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT:
|
HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT:
|
||||||
PLOT: True
|
PLOT: False
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#4-heatmap-of-sensor-row-count
|
# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#4-heatmap-of-sensor-row-count
|
||||||
HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:
|
HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:
|
||||||
|
@ -586,7 +658,7 @@ HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:
|
||||||
|
|
||||||
# See https://www.rapids.science/latest/visualizations/feature-visualizations/#1-heatmap-correlation-matrix
|
# See https://www.rapids.science/latest/visualizations/feature-visualizations/#1-heatmap-correlation-matrix
|
||||||
HEATMAP_FEATURE_CORRELATION_MATRIX:
|
HEATMAP_FEATURE_CORRELATION_MATRIX:
|
||||||
PLOT: True
|
PLOT: False
|
||||||
MIN_ROWS_RATIO: 0.5
|
MIN_ROWS_RATIO: 0.5
|
||||||
CORR_THRESHOLD: 0.1
|
CORR_THRESHOLD: 0.1
|
||||||
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
|
CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}
|
||||||
|
@ -599,55 +671,88 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
|
||||||
ALL_CLEANING_INDIVIDUAL:
|
ALL_CLEANING_INDIVIDUAL:
|
||||||
PROVIDERS:
|
PROVIDERS:
|
||||||
RAPIDS:
|
RAPIDS:
|
||||||
COMPUTE: True
|
COMPUTE: False
|
||||||
IMPUTE_SELECTED_EVENT_FEATURES:
|
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||||
COMPUTE: True
|
COMPUTE: False
|
||||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
||||||
COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
|
COLS_NAN_THRESHOLD: 1 # set to 1 to disable
|
||||||
COLS_VAR_THRESHOLD: True
|
COLS_VAR_THRESHOLD: True
|
||||||
ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
|
ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
|
||||||
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
||||||
DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable
|
DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
|
||||||
DROP_HIGHLY_CORRELATED_FEATURES:
|
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||||
COMPUTE: False
|
COMPUTE: True
|
||||||
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||||
CORR_THRESHOLD: 0.95
|
CORR_THRESHOLD: 0.95
|
||||||
SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
|
SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
|
||||||
|
STRAW:
|
||||||
|
COMPUTE: True
|
||||||
|
PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
||||||
|
PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||||
|
EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||||
|
ROWS_NAN_THRESHOLD: 0.33 # set to 1 to disable
|
||||||
|
COLS_NAN_THRESHOLD: 0.9 # set to 1 to remove only columns that consist entirely (100%) of NaN values
|
||||||
|
COLS_VAR_THRESHOLD: True
|
||||||
|
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||||
|
COMPUTE: True
|
||||||
|
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||||
|
CORR_THRESHOLD: 0.95
|
||||||
|
STANDARDIZATION: True
|
||||||
|
SRC_SCRIPT: src/features/all_cleaning_individual/straw/main.py
|
||||||
|
|
||||||
ALL_CLEANING_OVERALL:
|
ALL_CLEANING_OVERALL:
|
||||||
PROVIDERS:
|
PROVIDERS:
|
||||||
RAPIDS:
|
RAPIDS:
|
||||||
COMPUTE: True
|
COMPUTE: False
|
||||||
IMPUTE_SELECTED_EVENT_FEATURES:
|
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||||
COMPUTE: True
|
COMPUTE: False
|
||||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
||||||
COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
|
COLS_NAN_THRESHOLD: 1 # set to 1 to disable
|
||||||
COLS_VAR_THRESHOLD: True
|
COLS_VAR_THRESHOLD: True
|
||||||
ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
|
ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
|
||||||
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
||||||
DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable
|
DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
|
||||||
DROP_HIGHLY_CORRELATED_FEATURES:
|
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||||
COMPUTE: False
|
COMPUTE: True
|
||||||
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||||
CORR_THRESHOLD: 0.95
|
CORR_THRESHOLD: 0.95
|
||||||
SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
|
SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
|
||||||
|
STRAW:
|
||||||
|
COMPUTE: True
|
||||||
|
PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
||||||
|
PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||||
|
EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||||
|
ROWS_NAN_THRESHOLD: 0.33 # set to 1 to disable
|
||||||
|
COLS_NAN_THRESHOLD: 0.8 # set to 1 to remove only columns that consist entirely (100%) of NaN values
|
||||||
|
COLS_VAR_THRESHOLD: True
|
||||||
|
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||||
|
COMPUTE: True
|
||||||
|
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||||
|
CORR_THRESHOLD: 0.95
|
||||||
|
STANDARDIZATION: True
|
||||||
|
TARGET_STANDARDIZATION: False
|
||||||
|
SRC_SCRIPT: src/features/all_cleaning_overall/straw/main.py
|
||||||
|
|
||||||
|
|
||||||
########################################################################################################################
|
########################################################################################################################
|
||||||
# Analysis Workflow Example #
|
# Baseline #
|
||||||
########################################################################################################################
|
########################################################################################################################
|
||||||
|
|
||||||
PARAMS_FOR_ANALYSIS:
|
PARAMS_FOR_ANALYSIS:
|
||||||
BASELINE:
|
BASELINE:
|
||||||
COMPUTE: False
|
COMPUTE: True
|
||||||
FOLDER: data/external/baseline
|
FOLDER: data/external/baseline
|
||||||
CONTAINER: [results-survey637813_final.csv, # Slovenia
|
CONTAINER: [results-survey637813_final.csv, # Slovenia
|
||||||
results-survey358134_final.csv, # Belgium 1
|
results-survey358134_final.csv, # Belgium 1
|
||||||
results-survey413767_final.csv # Belgium 2
|
results-survey413767_final.csv # Belgium 2
|
||||||
]
|
]
|
||||||
QUESTION_LIST: survey637813+question_text.csv
|
QUESTION_LIST: survey637813+question_text.csv
|
||||||
FEATURES: [age, gender, startlanguage, demand, control, demand_control_ratio]
|
FEATURES: [age, gender, startlanguage, limesurvey_demand, limesurvey_control, limesurvey_demand_control_ratio, limesurvey_demand_control_ratio_quartile]
|
||||||
CATEGORICAL_FEATURES: [gender]
|
CATEGORICAL_FEATURES: [gender]
|
||||||
|
|
||||||
TARGET:
|
TARGET:
|
||||||
SCALE: [positive_affect, negative_affect]
|
COMPUTE: True
|
||||||
|
LABEL: appraisal_stressfulness_event_mean
|
||||||
|
ALL_LABELS: [PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean, JCQ_coworker_support_mean, appraisal_stressfulness_period_mean]
|
||||||
|
# PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean,
|
||||||
|
# JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
"_id","timestamp","device_id","call_type","call_duration","trace"
|
||||||
|
1,1587663260695,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,14,"d5e84f8af01b2728021d4f43f53a163c0c90000c"
|
||||||
|
2,1587739118007,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",3,0,"47c125dc7bd163b8612cdea13724a814917b6e93"
|
||||||
|
5,1587746544891,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,95,"9cc793ffd6e88b1d850ce540b5d7e000ef5650d4"
|
||||||
|
6,1587911379859,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,63,"51fb9344e988049a3fec774c7ca622358bf80264"
|
||||||
|
7,1587992647361,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",3,0,"2a862a7730cfdfaf103a9487afe3e02935fd6e02"
|
||||||
|
8,1588020039448,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",1,11,"a2c53f6a086d98622c06107780980cf1bb4e37bd"
|
||||||
|
11,1588176189024,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,65,"56589df8c830c70e330b644921ed38e08d8fd1f3"
|
||||||
|
12,1588197745079,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",3,0,"cab458018a8ed3b626515e794c70b6f415318adc"
|
|
Binary file not shown.
|
@ -0,0 +1,57 @@
|
||||||
|
label,empatica_id
|
||||||
|
uploader_79170,A0245B
|
||||||
|
uploader_89788,A02731
|
||||||
|
uploader_68294,A02705
|
||||||
|
uploader_92856,A024AF
|
||||||
|
uploader_23726,A0231C
|
||||||
|
uploader_66620,A02305
|
||||||
|
uploader_58435,A026B5
|
||||||
|
uploader_87801,A022A8
|
||||||
|
uploader_96055,A027BA
|
||||||
|
uploader_69549,A0226C
|
||||||
|
uploader_26363,A0263D
|
||||||
|
uploader_72010,A023FA
|
||||||
|
uploader_13997,A024AF
|
||||||
|
uploader_31156,A02305
|
||||||
|
uploader_63187,A027BA
|
||||||
|
uploader_94821,A022A8
|
||||||
|
uploader_65413,A023F1;A023FA
|
||||||
|
uploader_36488,A02713
|
||||||
|
uploader_91087,A0231C
|
||||||
|
uploader_35174,A025D1
|
||||||
|
uploader_73880,A02705
|
||||||
|
uploader_78650,A02731
|
||||||
|
uploader_70578,A0245B
|
||||||
|
uploader_88313,A02736
|
||||||
|
uploader_58482,A0261A
|
||||||
|
uploader_80601,A027BA
|
||||||
|
uploader_93729,A0226C
|
||||||
|
uploader_61663,A0245B
|
||||||
|
uploader_80848,A025D1
|
||||||
|
uploader_57312,A023F9;A02361;A027A0
|
||||||
|
uploader_52087,A02666
|
||||||
|
uploader_98770,A02953
|
||||||
|
uploader_51327,A0245F
|
||||||
|
uploader_11737,A02732
|
||||||
|
uploader_77440,A0264E
|
||||||
|
uploader_57277,A02422
|
||||||
|
uploader_13098,A026E5
|
||||||
|
uploader_80719,A023C8
|
||||||
|
uploader_54698,A02953
|
||||||
|
uploader_95571,A02853
|
||||||
|
uploader_21880,A024DC
|
||||||
|
uploader_92905,A02920
|
||||||
|
uploader_12108,A023F4
|
||||||
|
uploader_17436,A026E5
|
||||||
|
uploader_58440,A0273F
|
||||||
|
uploader_22172,A0245F
|
||||||
|
uploader_39250,A02422
|
||||||
|
uploader_15311,A023F9
|
||||||
|
uploader_45766,A02920
|
||||||
|
uploader_23096,A02361
|
||||||
|
uploader_78243,A02422
|
||||||
|
uploader_58777,A0245F
|
||||||
|
uploader_82941,A02666
|
||||||
|
uploader_89606,A023F4
|
||||||
|
uploader_82969,A023C8
|
||||||
|
uploader_53573,A024DC;A02361
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
PHONE:
|
||||||
|
DEVICE_IDS: [4b62a655-cbf0-4ac0-a448-06726f45b56a]
|
||||||
|
PLATFORMS: [android]
|
||||||
|
LABEL: uploader_53573
|
||||||
|
START_DATE: 2021-05-21 09:21:24
|
||||||
|
END_DATE: 2021-07-12 17:32:07
|
||||||
|
EMPATICA:
|
||||||
|
DEVICE_IDS: [uploader_53573]
|
||||||
|
LABEL: uploader_53573
|
||||||
|
START_DATE: 2021-05-21 09:21:24
|
||||||
|
END_DATE: 2021-07-12 17:32:07
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,45 @@
|
||||||
|
genre,n
|
||||||
|
System,261
|
||||||
|
Tools,96
|
||||||
|
Productivity,71
|
||||||
|
Health & Fitness,60
|
||||||
|
Finance,54
|
||||||
|
Communication,39
|
||||||
|
Music & Audio,39
|
||||||
|
Shopping,38
|
||||||
|
Lifestyle,33
|
||||||
|
Education,28
|
||||||
|
News & Magazines,24
|
||||||
|
Maps & Navigation,23
|
||||||
|
Entertainment,21
|
||||||
|
Business,18
|
||||||
|
Travel & Local,18
|
||||||
|
Books & Reference,16
|
||||||
|
Social,16
|
||||||
|
Weather,16
|
||||||
|
Food & Drink,14
|
||||||
|
Sports,14
|
||||||
|
Other,13
|
||||||
|
Photography,13
|
||||||
|
Puzzle,13
|
||||||
|
Video Players & Editors,12
|
||||||
|
Card,9
|
||||||
|
Casual,9
|
||||||
|
Personalization,8
|
||||||
|
Medical,7
|
||||||
|
Board,5
|
||||||
|
Strategy,4
|
||||||
|
House & Home,3
|
||||||
|
Trivia,3
|
||||||
|
Word,3
|
||||||
|
Adventure,2
|
||||||
|
Art & Design,2
|
||||||
|
Auto & Vehicles,2
|
||||||
|
Dating,2
|
||||||
|
Role Playing,2
|
||||||
|
STRAW,2
|
||||||
|
Simulation,2
|
||||||
|
"Board,Brain Games",1
|
||||||
|
"Entertainment,Music & Video",1
|
||||||
|
Parenting,1
|
||||||
|
Racing,1
|
|
|
@ -1,2 +1,3 @@
|
||||||
label,start_time,length,repeats_on,repeats_value
|
label,start_time,length,repeats_on,repeats_value
|
||||||
daily,04:00:00,23H 59M 59S,every_day,0
|
daily,04:00:00,23H 59M 59S,every_day,0
|
||||||
|
working_day,04:00:00,18H 00M 00S,every_day,0
|
||||||
|
|
|
|
@ -1,2 +1,2 @@
|
||||||
label,length
|
label,length
|
||||||
thirtyminutes,30
|
fiveminutes,5
|
|
|
@ -1,9 +1,2 @@
|
||||||
label,start_time,length,repeats_on,repeats_value
|
label,start_time,length,repeats_on,repeats_value
|
||||||
threeday,00:00:00,2D 23H 59M 59S,every_day,0
|
|
||||||
daily,00:00:00,23H 59M 59S,every_day,0
|
daily,00:00:00,23H 59M 59S,every_day,0
|
||||||
morning,06:00:00,5H 59M 59S,every_day,0
|
|
||||||
afternoon,12:00:00,5H 59M 59S,every_day,0
|
|
||||||
evening,18:00:00,5H 59M 59S,every_day,0
|
|
||||||
night,00:00:00,5H 59M 59S,every_day,0
|
|
||||||
two_weeks_overlapping,00:00:00,13D 23H 59M 59S,every_day,0
|
|
||||||
weekends,00:00:00,2D 23H 59M 59S,wday,5
|
|
||||||
|
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,9 @@
|
||||||
# Change Log
|
# Change Log
|
||||||
|
## v1.8.0
|
||||||
|
- Add data stream for AWARE Micro server
|
||||||
|
- Fix the NA bug in PHONE_LOCATIONS BARNETT provider
|
||||||
|
- Fix the bug of data type for call_duration field
|
||||||
|
- Fix the index bug of heatmap_sensors_per_minute_per_time_segment
|
||||||
## v1.7.1
|
## v1.7.1
|
||||||
- Update docs for Git Flow section
|
- Update docs for Git Flow section
|
||||||
- Update RAPIDS paper information
|
- Update RAPIDS paper information
|
||||||
|
|
|
@ -0,0 +1,15 @@
|
||||||
|
# `aware_micro_mysql`
|
||||||
|
|
||||||
|
This [data stream](../../datastreams/data-streams-introduction) handles iOS and Android sensor data collected with the [AWARE Framework's](https://awareframework.com/) [AWARE Micro](https://github.com/denzilferreira/aware-micro) server and stored in a MySQL database.
|
||||||
|
|
||||||
|
## Container
|
||||||
|
A MySQL database with a table per sensor, each containing the data for all participants. Sensor data is stored in a JSON field called `data` within each table.
|
||||||
|
|
||||||
|
The script to connect and download data from this container is at:
|
||||||
|
```bash
|
||||||
|
src/data/streams/aware_micro_mysql/container.R
|
||||||
|
```
|
||||||
|
|
||||||
|
## Format
|
||||||
|
|
||||||
|
--8<---- "docs/snippets/aware_format.md"
|
|
@ -16,6 +16,7 @@ For reference, these are the data streams we currently support:
|
||||||
| Data Stream | Device | Format | Container | Docs
|
| Data Stream | Device | Format | Container | Docs
|
||||||
|--|--|--|--|--|
|
|--|--|--|--|--|
|
||||||
| `aware_mysql`| Phone | AWARE app | MySQL | [link](../aware-mysql)
|
| `aware_mysql`| Phone | AWARE app | MySQL | [link](../aware-mysql)
|
||||||
|
| `aware_micro_mysql`| Phone | AWARE Micro server | MySQL | [link](../aware-micro-mysql)
|
||||||
| `aware_csv`| Phone | AWARE app | CSV files | [link](../aware-csv)
|
| `aware_csv`| Phone | AWARE app | CSV files | [link](../aware-csv)
|
||||||
| `aware_influxdb` (beta)| Phone | AWARE app | InfluxDB | [link](../aware-influxdb)
|
| `aware_influxdb` (beta)| Phone | AWARE app | InfluxDB | [link](../aware-influxdb)
|
||||||
| `fitbitjson_mysql`| Fitbit | JSON (per [Fitbit's API](https://dev.fitbit.com/build/reference/web-api/)) | MySQL | [link](../fitbitjson-mysql)
|
| `fitbitjson_mysql`| Fitbit | JSON (per [Fitbit's API](https://dev.fitbit.com/build/reference/web-api/)) | MySQL | [link](../fitbitjson-mysql)
|
||||||
|
|
|
@ -0,0 +1,39 @@
|
||||||
|
"""
|
||||||
|
Please do not make any changes, as RAPIDS is running on tmux server ...
|
||||||
|
"""
|
||||||
|
# !
|
||||||
|
# !
|
||||||
|
"""
|
||||||
|
Please do not make any changes, as RAPIDS is running on tmux server ...
|
||||||
|
"""
|
||||||
|
# !
|
||||||
|
# !
|
||||||
|
"""
|
||||||
|
Please do not make any changes, as RAPIDS is running on tmux server ...
|
||||||
|
"""
|
||||||
|
# !
|
||||||
|
# !
|
||||||
|
"""
|
||||||
|
Please do not make any changes, as RAPIDS is running on tmux server ...
|
||||||
|
"""
|
||||||
|
# !
|
||||||
|
# !
|
||||||
|
"""
|
||||||
|
Please do not make any changes, as RAPIDS is running on tmux server ...
|
||||||
|
"""
|
||||||
|
# !
|
||||||
|
# !
|
||||||
|
"""
|
||||||
|
Please do not make any changes, as RAPIDS is running on tmux server ...
|
||||||
|
"""
|
||||||
|
# !
|
||||||
|
# !
|
||||||
|
"""
|
||||||
|
Please do not make any changes, as RAPIDS is running on tmux server ...
|
||||||
|
"""
|
||||||
|
# !
|
||||||
|
# !
|
||||||
|
"""
|
||||||
|
Please do not make any changes, as RAPIDS is running on tmux server ...
|
||||||
|
"""
|
||||||
|
# !
|
138
environment.yml
138
environment.yml
|
@ -1,116 +1,30 @@
|
||||||
name: rapids
|
name: rapids
|
||||||
channels:
|
channels:
|
||||||
- conda-forge
|
- conda-forge
|
||||||
- defaults
|
|
||||||
dependencies:
|
dependencies:
|
||||||
- _py-xgboost-mutex=2.0
|
- auto-sklearn
|
||||||
- appdirs=1.4.*
|
- hmmlearn
|
||||||
- arrow=0.16.0
|
- imbalanced-learn
|
||||||
- asn1crypto=1.4.*
|
- jsonschema
|
||||||
- astropy=4.2.*
|
- lightgbm
|
||||||
- attrs=20.3.*
|
- matplotlib
|
||||||
- binaryornot=0.4.*
|
- numpy
|
||||||
- blas=1.0
|
- pandas
|
||||||
- brotlipy=0.7.*
|
- peakutils
|
||||||
- bzip2=1.0.*
|
- pip
|
||||||
- ca-certificates
|
- plotly
|
||||||
- certifi
|
- python-dateutil
|
||||||
- cffi=1.14.4
|
- pytz
|
||||||
- chardet=3.0.*
|
- pywavelets
|
||||||
- click=7.1.*
|
- pyyaml
|
||||||
- cookiecutter=1.6.*
|
- scikit-learn
|
||||||
- cryptography=3.3.*
|
- scipy
|
||||||
- datrie=0.8.*
|
- seaborn
|
||||||
- docutils=0.16
|
- setuptools
|
||||||
- future=0.18.2
|
- bioconda::snakemake
|
||||||
- gitdb=4.0.*
|
- bioconda::snakemake-minimal
|
||||||
- gitdb2=4.0.*
|
- tqdm
|
||||||
- gitpython=3.1.*
|
- xgboost
|
||||||
- idna=2.10
|
- pip:
|
||||||
- imbalanced-learn=0.6.*
|
- biosppy
|
||||||
- importlib-metadata=2.0.*
|
- cr_features>=0.2
|
||||||
- importlib_metadata=2.0.*
|
|
||||||
- intel-openmp=2019.4
|
|
||||||
- jinja2=2.11.2
|
|
||||||
- jinja2-time=0.2.*
|
|
||||||
- joblib=1.0.*
|
|
||||||
- jsonschema=3.2.*
|
|
||||||
- libblas=3.8.*
|
|
||||||
- libcblas=3.8.*
|
|
||||||
- libcxx=10.0.*
|
|
||||||
- libedit=3.1.*
|
|
||||||
- libffi=3.3
|
|
||||||
- libgfortran
|
|
||||||
- liblapack=3.8.*
|
|
||||||
- libopenblas=0.3.*
|
|
||||||
- libxgboost=0.90
|
|
||||||
- lightgbm=3.1.*
|
|
||||||
- llvm-openmp=10.0.*
|
|
||||||
- markupsafe=1.1.*
|
|
||||||
- mkl
|
|
||||||
- mkl-service=2.3.*
|
|
||||||
- mkl_fft=1.2.*
|
|
||||||
- mkl_random=1.1.*
|
|
||||||
- more-itertools=8.6.*
|
|
||||||
- ncurses=6.2
|
|
||||||
- numpy=1.19.2
|
|
||||||
- numpy-base=1.19.2
|
|
||||||
- openblas=0.3.*
|
|
||||||
- openssl
|
|
||||||
- pandas=1.1.*
|
|
||||||
- pbr=5.5.*
|
|
||||||
- pip=20.3.*
|
|
||||||
- plotly=4.14.1
|
|
||||||
- poyo=0.5.*
|
|
||||||
- psutil=5.7.*
|
|
||||||
- psycopg2
|
|
||||||
- py-xgboost=0.90
|
|
||||||
- pycparser=2.20
|
|
||||||
- pyerfa=1.7.*
|
|
||||||
- pyopenssl=20.0.*
|
|
||||||
- pyprojroot
|
|
||||||
- pysocks=1.7.*
|
|
||||||
- python=3.7.*
|
|
||||||
- python-dateutil=2.8.*
|
|
||||||
- python-dotenv
|
|
||||||
- python_abi=3.7
|
|
||||||
- pytz=2020.4
|
|
||||||
- pyyaml=5.3.*
|
|
||||||
- readline=8.0
|
|
||||||
- requests=2.25.0
|
|
||||||
- retrying=1.3.*
|
|
||||||
- scikit-learn=0.23.2
|
|
||||||
- scipy=1.5.*
|
|
||||||
- setuptools=51.0.*
|
|
||||||
- six=1.15.0
|
|
||||||
- smmap=3.0.*
|
|
||||||
- smmap2=3.0.*
|
|
||||||
- sqlalchemy
|
|
||||||
- sqlite=3.33.0
|
|
||||||
- threadpoolctl=2.1.*
|
|
||||||
- tk=8.6.*
|
|
||||||
- tqdm=4.62.0
|
|
||||||
- urllib3=1.25.11
|
|
||||||
- wheel=0.36.2
|
|
||||||
- whichcraft=0.6.*
|
|
||||||
- wrapt=1.12.1
|
|
||||||
- xgboost=0.90
|
|
||||||
- xz=5.2.*
|
|
||||||
- yaml=0.2.*
|
|
||||||
- zipp=3.4.*
|
|
||||||
- zlib=1.2.*
|
|
||||||
- pip:
|
|
||||||
- amply==0.1.*
|
|
||||||
- configargparse==0.15.1
|
|
||||||
- decorator==4.4.*
|
|
||||||
- ipython-genutils==0.2.*
|
|
||||||
- jupyter-core==4.6.*
|
|
||||||
- nbformat==5.0.*
|
|
||||||
- pulp==2.4
|
|
||||||
- pyparsing==2.4.*
|
|
||||||
- pyrsistent==0.15.5
|
|
||||||
- ratelimiter==1.2.*
|
|
||||||
- snakemake==5.30.2
|
|
||||||
- toposort==1.5
|
|
||||||
- traitlets==4.3.*
|
|
||||||
prefix: /usr/local/Caskroom/miniconda/base/envs/rapids202108
|
|
||||||
|
|
|
@ -85,6 +85,7 @@ nav:
|
||||||
- Introduction: datastreams/data-streams-introduction.md
|
- Introduction: datastreams/data-streams-introduction.md
|
||||||
- Phone:
|
- Phone:
|
||||||
- aware_mysql: datastreams/aware-mysql.md
|
- aware_mysql: datastreams/aware-mysql.md
|
||||||
|
- aware_micro_mysql: datastreams/aware-micro-mysql.md
|
||||||
- aware_csv: datastreams/aware-csv.md
|
- aware_csv: datastreams/aware-csv.md
|
||||||
- aware_influxdb (beta): datastreams/aware-influxdb.md
|
- aware_influxdb (beta): datastreams/aware-influxdb.md
|
||||||
- Mandatory Phone Format: datastreams/mandatory-phone-format.md
|
- Mandatory Phone Format: datastreams/mandatory-phone-format.md
|
||||||
|
|
|
@ -14,9 +14,6 @@ local({
|
||||||
# signal that we're loading renv during R startup
|
# signal that we're loading renv during R startup
|
||||||
Sys.setenv("RENV_R_INITIALIZING" = "true")
|
Sys.setenv("RENV_R_INITIALIZING" = "true")
|
||||||
on.exit(Sys.unsetenv("RENV_R_INITIALIZING"), add = TRUE)
|
on.exit(Sys.unsetenv("RENV_R_INITIALIZING"), add = TRUE)
|
||||||
|
|
||||||
if(grepl("Darwin", Sys.info()["sysname"], fixed = TRUE) & grepl("ARM64", Sys.info()["version"], fixed = TRUE)) # M1 Macs
|
|
||||||
Sys.setenv("TZDIR" = file.path(R.home(), "share", "zoneinfo"))
|
|
||||||
|
|
||||||
# signal that we've consented to use renv
|
# signal that we've consented to use renv
|
||||||
options(renv.consent = TRUE)
|
options(renv.consent = TRUE)
|
||||||
|
|
|
@ -40,6 +40,17 @@ def find_features_files(wildcards):
|
||||||
feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=wildcards.sensor_key.lower(), language=get_script_language(provider["SRC_SCRIPT"]), provider_key=provider_key.lower()))
|
feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=wildcards.sensor_key.lower(), language=get_script_language(provider["SRC_SCRIPT"]), provider_key=provider_key.lower()))
|
||||||
return(feature_files)
|
return(feature_files)
|
||||||
|
|
||||||
|
def find_joint_non_empatica_sensor_files(wildcards):
    """Return the per-sensor feature file patterns for PHONE and FITBIT sensors.

    Walks the global Snakemake ``config`` and, for every key that starts with
    ``PHONE`` or ``FITBIT`` and holds a ``PROVIDERS`` dict, adds
    ``data/processed/features/{pid}/<sensor key in lower case>.csv`` once if at
    least one of its providers has a truthy ``COMPUTE`` entry.

    ``wildcards`` is accepted because Snakemake passes it to input functions;
    it is not read here, and ``{pid}`` is left unexpanded in the returned paths.
    """
    sensor_prefixes = ("PHONE", "FITBIT")
    merged_paths = []
    for sensor_name in config:
        if not sensor_name.startswith(sensor_prefixes):
            continue
        sensor_settings = config[sensor_name]
        if "PROVIDERS" not in sensor_settings:
            continue
        providers = sensor_settings["PROVIDERS"]
        if not isinstance(providers, dict):
            continue
        # One enabled provider is enough: the sensor contributes a single file.
        has_enabled_provider = any(
            "COMPUTE" in provider.keys() and provider["COMPUTE"]
            for provider in providers.values()
        )
        if has_enabled_provider:
            merged_paths.append("data/processed/features/{pid}/" + sensor_name.lower() + ".csv")
    return merged_paths
|
||||||
|
|
||||||
|
|
||||||
def optional_steps_sleep_input(wildcards):
|
def optional_steps_sleep_input(wildcards):
|
||||||
if config["FITBIT_STEPS_INTRADAY"]["EXCLUDE_SLEEP"]["FITBIT_BASED"]["EXCLUDE"]:
|
if config["FITBIT_STEPS_INTRADAY"]["EXCLUDE_SLEEP"]["FITBIT_BASED"]["EXCLUDE"]:
|
||||||
return "data/raw/{pid}/fitbit_sleep_summary_raw.csv"
|
return "data/raw/{pid}/fitbit_sleep_summary_raw.csv"
|
||||||
|
|
|
@ -32,7 +32,7 @@ rule phone_data_yield_r_features:
|
||||||
output:
|
output:
|
||||||
"data/interim/{pid}/phone_data_yield_features/phone_data_yield_r_{provider_key}.csv"
|
"data/interim/{pid}/phone_data_yield_features/phone_data_yield_r_{provider_key}.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.R"
|
"../src/features/entry.R"
|
||||||
|
|
||||||
rule phone_accelerometer_python_features:
|
rule phone_accelerometer_python_features:
|
||||||
input:
|
input:
|
||||||
|
@ -341,7 +341,20 @@ rule esm_features:
|
||||||
provider_key = "{provider_key}",
|
provider_key = "{provider_key}",
|
||||||
sensor_key = "phone_esm",
|
sensor_key = "phone_esm",
|
||||||
scales=lambda wildcards: config["PHONE_ESM"]["PROVIDERS"][wildcards.provider_key.upper()]["SCALES"]
|
scales=lambda wildcards: config["PHONE_ESM"]["PROVIDERS"][wildcards.provider_key.upper()]["SCALES"]
|
||||||
output: "data/interim/{pid}/phone_esm_features/phone_esm_clean_{provider_key}.csv"
|
output: "data/interim/{pid}/phone_esm_features/phone_esm_python_{provider_key}.csv"
|
||||||
|
script:
|
||||||
|
"../src/features/entry.py"
|
||||||
|
|
||||||
|
rule phone_speech_python_features:
|
||||||
|
input:
|
||||||
|
sensor_data = "data/raw/{pid}/phone_speech_with_datetime.csv",
|
||||||
|
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
|
||||||
|
params:
|
||||||
|
provider = lambda wildcards: config["PHONE_SPEECH"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||||
|
provider_key = "{provider_key}",
|
||||||
|
sensor_key = "phone_speech"
|
||||||
|
output:
|
||||||
|
"data/interim/{pid}/phone_speech_features/phone_speech_python_{provider_key}.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.py"
|
"../src/features/entry.py"
|
||||||
|
|
||||||
|
@ -791,7 +804,8 @@ rule empatica_accelerometer_python_features:
|
||||||
provider_key = "{provider_key}",
|
provider_key = "{provider_key}",
|
||||||
sensor_key = "empatica_accelerometer"
|
sensor_key = "empatica_accelerometer"
|
||||||
output:
|
output:
|
||||||
"data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}.csv"
|
"data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}.csv",
|
||||||
|
"data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}_windows.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.py"
|
"../src/features/entry.py"
|
||||||
|
|
||||||
|
@ -817,7 +831,8 @@ rule empatica_heartrate_python_features:
|
||||||
provider_key = "{provider_key}",
|
provider_key = "{provider_key}",
|
||||||
sensor_key = "empatica_heartrate"
|
sensor_key = "empatica_heartrate"
|
||||||
output:
|
output:
|
||||||
"data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}.csv"
|
"data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}.csv",
|
||||||
|
"data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}_windows.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.py"
|
"../src/features/entry.py"
|
||||||
|
|
||||||
|
@ -843,7 +858,8 @@ rule empatica_temperature_python_features:
|
||||||
provider_key = "{provider_key}",
|
provider_key = "{provider_key}",
|
||||||
sensor_key = "empatica_temperature"
|
sensor_key = "empatica_temperature"
|
||||||
output:
|
output:
|
||||||
"data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}.csv"
|
"data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}.csv",
|
||||||
|
"data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}_windows.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.py"
|
"../src/features/entry.py"
|
||||||
|
|
||||||
|
@ -869,7 +885,8 @@ rule empatica_electrodermal_activity_python_features:
|
||||||
provider_key = "{provider_key}",
|
provider_key = "{provider_key}",
|
||||||
sensor_key = "empatica_electrodermal_activity"
|
sensor_key = "empatica_electrodermal_activity"
|
||||||
output:
|
output:
|
||||||
"data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}.csv"
|
"data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}.csv",
|
||||||
|
"data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}_windows.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.py"
|
"../src/features/entry.py"
|
||||||
|
|
||||||
|
@ -895,7 +912,8 @@ rule empatica_blood_volume_pulse_python_features:
|
||||||
provider_key = "{provider_key}",
|
provider_key = "{provider_key}",
|
||||||
sensor_key = "empatica_blood_volume_pulse"
|
sensor_key = "empatica_blood_volume_pulse"
|
||||||
output:
|
output:
|
||||||
"data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}.csv"
|
"data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}.csv",
|
||||||
|
"data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.py"
|
"../src/features/entry.py"
|
||||||
|
|
||||||
|
@ -921,7 +939,8 @@ rule empatica_inter_beat_interval_python_features:
|
||||||
provider_key = "{provider_key}",
|
provider_key = "{provider_key}",
|
||||||
sensor_key = "empatica_inter_beat_interval"
|
sensor_key = "empatica_inter_beat_interval"
|
||||||
output:
|
output:
|
||||||
"data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}.csv"
|
"data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}.csv",
|
||||||
|
"data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}_windows.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.py"
|
"../src/features/entry.py"
|
||||||
|
|
||||||
|
@ -988,11 +1007,12 @@ rule clean_sensor_features_for_individual_participants:
|
||||||
params:
|
params:
|
||||||
provider = lambda wildcards: config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][wildcards.provider_key.upper()],
|
provider = lambda wildcards: config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||||
provider_key = "{provider_key}",
|
provider_key = "{provider_key}",
|
||||||
sensor_key = "all_cleaning_individual"
|
script_extension = "{script_extension}",
|
||||||
|
sensor_key = "all_cleaning_individual"
|
||||||
output:
|
output:
|
||||||
"data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}.csv"
|
"data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}_{script_extension}.csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.R"
|
"../src/features/entry.{params.script_extension}"
|
||||||
|
|
||||||
rule clean_sensor_features_for_all_participants:
|
rule clean_sensor_features_for_all_participants:
|
||||||
input:
|
input:
|
||||||
|
@ -1000,9 +1020,10 @@ rule clean_sensor_features_for_all_participants:
|
||||||
params:
|
params:
|
||||||
provider = lambda wildcards: config["ALL_CLEANING_OVERALL"]["PROVIDERS"][wildcards.provider_key.upper()],
|
provider = lambda wildcards: config["ALL_CLEANING_OVERALL"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||||
provider_key = "{provider_key}",
|
provider_key = "{provider_key}",
|
||||||
sensor_key = "all_cleaning_overall"
|
script_extension = "{script_extension}",
|
||||||
|
sensor_key = "all_cleaning_overall",
|
||||||
|
target = "{target}"
|
||||||
output:
|
output:
|
||||||
"data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}.csv"
|
"data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}_{script_extension}_({target}).csv"
|
||||||
script:
|
script:
|
||||||
"../src/features/entry.R"
|
"../src/features/entry.{params.script_extension}"
|
||||||
|
|
||||||
|
|
|
@ -27,3 +27,26 @@ rule baseline_features:
|
||||||
features="data/processed/features/{pid}/baseline_features.csv"
|
features="data/processed/features/{pid}/baseline_features.csv"
|
||||||
script:
|
script:
|
||||||
"../src/data/baseline_features.py"
|
"../src/data/baseline_features.py"
|
||||||
|
|
||||||
|
# Builds data/processed/models/individual_model/{pid}/input.csv for one participant.
# Input is that participant's all_sensor_features_cleaned_straw_py.csv; the target column
# label is read from config[PARAMS_FOR_ANALYSIS][TARGET][LABEL] and handed to
# src/models/select_targets.py as the `target_variable` param.
rule select_target:
    input:
        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_straw_py.csv"
    params:
        target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
    output:
        "data/processed/models/individual_model/{pid}/input.csv"
    script:
        "../src/models/select_targets.py"
|
||||||
|
|
||||||
|
# Builds data/processed/models/population_model/input_{target}.csv.
# Inputs are the all-participants cleaned sensor features for the `{target}` wildcard and the
# baseline_features.csv of every participant listed in config["PIDS"]; the `{target}` wildcard
# is also passed to the script as the `target_variable` param.
rule merge_features_and_targets_for_population_model:
    input:
        cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned_straw_py_({target}).csv",
        demographic_features = expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]),
    params:
        target_variable="{target}"
    output:
        "data/processed/models/population_model/input_{target}.csv"
    script:
        "../src/models/merge_features_and_targets_for_population_model.py"
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,14 +4,14 @@ rule create_example_participant_files:
|
||||||
shell:
|
shell:
|
||||||
"echo 'PHONE:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n PLATFORMS: [android]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example01.yaml && echo 'PHONE:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n PLATFORMS: [ios]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example02.yaml"
|
"echo 'PHONE:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n PLATFORMS: [android]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example01.yaml && echo 'PHONE:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n PLATFORMS: [ios]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example02.yaml"
|
||||||
|
|
||||||
rule query_usernames_device_empatica_ids:
|
# rule query_usernames_device_empatica_ids:
|
||||||
params:
|
# params:
|
||||||
baseline_folder = "/mnt/e/STRAWbaseline/"
|
# baseline_folder = "/mnt/e/STRAWbaseline/"
|
||||||
output:
|
# output:
|
||||||
usernames_file = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"],
|
# usernames_file = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"],
|
||||||
timezone_file = config["TIMEZONE"]["MULTIPLE"]["TZ_FILE"]
|
# timezone_file = config["TIMEZONE"]["MULTIPLE"]["TZ_FILE"]
|
||||||
script:
|
# script:
|
||||||
"../../participants/prepare_usernames_file.py"
|
# "../../participants/prepare_usernames_file.py"
|
||||||
|
|
||||||
rule prepare_tzcodes_file:
|
rule prepare_tzcodes_file:
|
||||||
input:
|
input:
|
||||||
|
@ -247,5 +247,33 @@ rule empatica_readable_datetime:
|
||||||
include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
|
include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
|
||||||
output:
|
output:
|
||||||
"data/raw/{pid}/empatica_{sensor}_with_datetime.csv"
|
"data/raw/{pid}/empatica_{sensor}_with_datetime.csv"
|
||||||
|
resources:
|
||||||
|
mem_mb=50000
|
||||||
script:
|
script:
|
||||||
"../src/data/datetime/readable_datetime.R"
|
"../src/data/datetime/readable_datetime.R"
|
||||||
|
|
||||||
|
|
||||||
|
# Per-participant step: runs process_user_event_related_segments.py with stage = "extract"
# on the participant's raw phone ESM data and participant YAML file, producing
# data/raw/ers/{pid}_ers.csv and data/raw/ers/{pid}_stress_event_targets.csv.
rule extract_event_information_from_esm:
    input:
        esm_raw_input = "data/raw/{pid}/phone_esm_raw.csv",
        pid_file = "data/external/participant_files/{pid}.yaml"
    params:
        stage = "extract",
        pid = "{pid}"
    output:
        "data/raw/ers/{pid}_ers.csv",
        "data/raw/ers/{pid}_stress_event_targets.csv"
    script:
        "../src/features/phone_esm/straw/process_user_event_related_segments.py"
|
||||||
|
|
||||||
|
# Study-wide step: takes the {pid}_ers.csv and {pid}_stress_event_targets.csv files of every
# participant in config["PIDS"] and runs process_user_event_related_segments.py with
# stage = "merge", producing data/external/straw_events.csv and
# data/external/stress_event_targets.csv.
# NOTE(review): presumably the script branches on the `stage` param - confirm in the script.
rule merge_event_related_segments_files:
    input:
        ers_files = expand("data/raw/ers/{pid}_ers.csv", pid=config["PIDS"]),
        se_files = expand("data/raw/ers/{pid}_stress_event_targets.csv", pid=config["PIDS"])
    params:
        stage = "merge"
    output:
        "data/external/straw_events.csv",
        "data/external/stress_event_targets.csv"
    script:
        "../src/features/phone_esm/straw/process_user_event_related_segments.py"
|
|
@ -29,23 +29,16 @@ get_genre <- function(apps){
|
||||||
apps <- read.csv(snakemake@input[[1]], stringsAsFactors = F)
|
apps <- read.csv(snakemake@input[[1]], stringsAsFactors = F)
|
||||||
genre_catalogue <- data.frame()
|
genre_catalogue <- data.frame()
|
||||||
catalogue_source <- snakemake@params[["catalogue_source"]]
|
catalogue_source <- snakemake@params[["catalogue_source"]]
|
||||||
package_names_hashed <- snakemake@params[["package_names_hashed"]]
|
|
||||||
update_catalogue_file <- snakemake@params[["update_catalogue_file"]]
|
update_catalogue_file <- snakemake@params[["update_catalogue_file"]]
|
||||||
scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]]
|
scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]]
|
||||||
apps_with_genre <- data.frame(matrix(ncol=length(colnames(apps)) + 1,nrow=0, dimnames=list(NULL, c(colnames(apps), "genre"))))
|
apps_with_genre <- data.frame(matrix(ncol=length(colnames(apps)) + 1,nrow=0, dimnames=list(NULL, c(colnames(apps), "genre"))))
|
||||||
|
|
||||||
if (length(package_names_hashed) == 0) {package_names_hashed <- FALSE}
|
|
||||||
|
|
||||||
if(nrow(apps) > 0){
|
if(nrow(apps) > 0){
|
||||||
if(catalogue_source == "GOOGLE"){
|
if(catalogue_source == "GOOGLE"){
|
||||||
apps_with_genre <- apps %>% mutate(genre = NA_character_)
|
apps_with_genre <- apps %>% mutate(genre = NA_character_)
|
||||||
} else if(catalogue_source == "FILE"){
|
} else if(catalogue_source == "FILE"){
|
||||||
genre_catalogue <- read.csv(snakemake@params[["catalogue_file"]], colClasses = c("character", "character"))
|
genre_catalogue <- read.csv(snakemake@params[["catalogue_file"]], colClasses = c("character", "character"))
|
||||||
if (package_names_hashed) {
|
apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
|
||||||
apps_with_genre <- left_join(apps, genre_catalogue, by = "package_hash")
|
|
||||||
} else {
|
|
||||||
apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if(catalogue_source == "GOOGLE" || (catalogue_source == "FILE" && scrape_missing_genres)){
|
if(catalogue_source == "GOOGLE" || (catalogue_source == "FILE" && scrape_missing_genres)){
|
||||||
|
|
|
@ -60,15 +60,15 @@ if not participant_info.empty:
|
||||||
0, "startlanguage"
|
0, "startlanguage"
|
||||||
]
|
]
|
||||||
if (
|
if (
|
||||||
("demand" in requested_features)
|
("limesurvey_demand" in requested_features)
|
||||||
or ("control" in requested_features)
|
or ("limesurvey_control" in requested_features)
|
||||||
or ("demand_control_ratio" in requested_features)
|
or ("limesurvey_demand_control_ratio" in requested_features)
|
||||||
):
|
):
|
||||||
participant_info_t = participant_info.T
|
participant_info_t = participant_info.T
|
||||||
rows_baseline = participant_info_t.index
|
rows_baseline = participant_info_t.index
|
||||||
|
|
||||||
if ("demand" in requested_features) or (
|
if ("limesurvey_demand" in requested_features) or (
|
||||||
"demand_control_ratio" in requested_features
|
"limesurvey_demand_control_ratio" in requested_features
|
||||||
):
|
):
|
||||||
# Find questions about demand, but disregard time (duration of filling in questionnaire)
|
# Find questions about demand, but disregard time (duration of filling in questionnaire)
|
||||||
rows_demand = rows_baseline.str.startswith(
|
rows_demand = rows_baseline.str.startswith(
|
||||||
|
@ -95,13 +95,13 @@ if not participant_info.empty:
|
||||||
- limesurvey_demand.loc[rows_demand_reverse, "score_original"]
|
- limesurvey_demand.loc[rows_demand_reverse, "score_original"]
|
||||||
)
|
)
|
||||||
baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)
|
baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)
|
||||||
if "demand" in requested_features:
|
if "limesurvey_demand" in requested_features:
|
||||||
baseline_features.loc[0, "demand"] = limesurvey_demand[
|
baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
|
||||||
"score"
|
"score"
|
||||||
].sum()
|
].sum()
|
||||||
|
|
||||||
if ("control" in requested_features) or (
|
if ("limesurvey_control" in requested_features) or (
|
||||||
"demand_control_ratio" in requested_features
|
"limesurvey_demand_control_ratio" in requested_features
|
||||||
):
|
):
|
||||||
# Find questions about control, but disregard time (duration of filling in questionnaire)
|
# Find questions about control, but disregard time (duration of filling in questionnaire)
|
||||||
rows_control = rows_baseline.str.startswith(
|
rows_control = rows_baseline.str.startswith(
|
||||||
|
@ -130,15 +130,18 @@ if not participant_info.empty:
|
||||||
|
|
||||||
baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True)
|
baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True)
|
||||||
|
|
||||||
if "control" in requested_features:
|
if "limesurvey_control" in requested_features:
|
||||||
baseline_features.loc[0, "control"] = limesurvey_control[
|
baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
|
||||||
"score"
|
"score"
|
||||||
].sum()
|
].sum()
|
||||||
|
|
||||||
if "demand_control_ratio" in requested_features:
|
if "limesurvey_demand_control_ratio" in requested_features:
|
||||||
limesurvey_demand_control_ratio = (
|
if limesurvey_control["score"].sum():
|
||||||
limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
|
limesurvey_demand_control_ratio = (
|
||||||
)
|
limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
limesurvey_demand_control_ratio = 0
|
||||||
if (
|
if (
|
||||||
JCQ_NORMS[participant_info.loc[0, "gender"]][0]
|
JCQ_NORMS[participant_info.loc[0, "gender"]][0]
|
||||||
<= limesurvey_demand_control_ratio
|
<= limesurvey_demand_control_ratio
|
||||||
|
@ -167,10 +170,10 @@ if not participant_info.empty:
|
||||||
limesurvey_quartile = np.nan
|
limesurvey_quartile = np.nan
|
||||||
|
|
||||||
baseline_features.loc[
|
baseline_features.loc[
|
||||||
0, "demand_control_ratio"
|
0, "limesurvey_demand_control_ratio"
|
||||||
] = limesurvey_demand_control_ratio
|
] = limesurvey_demand_control_ratio
|
||||||
baseline_features.loc[
|
baseline_features.loc[
|
||||||
0, "demand_control_ratio_quartile"
|
0, "limesurvey_demand_control_ratio_quartile"
|
||||||
] = limesurvey_quartile
|
] = limesurvey_quartile
|
||||||
|
|
||||||
if not baseline_interim.empty:
|
if not baseline_interim.empty:
|
||||||
|
|
|
@ -58,7 +58,7 @@ participants %>%
|
||||||
lines <- append(lines, empty_fitbit)
|
lines <- append(lines, empty_fitbit)
|
||||||
|
|
||||||
if(add_empatica_section == TRUE && !is.na(row[empatica_device_id_column])){
|
if(add_empatica_section == TRUE && !is.na(row[empatica_device_id_column])){
|
||||||
lines <- append(lines, c("EMPATICA:", paste0(" DEVICE_IDS: [",row[empatica_device_id_column],"]"),
|
lines <- append(lines, c("EMPATICA:", paste0(" DEVICE_IDS: [",row$label,"]"),
|
||||||
paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date)))
|
paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date)))
|
||||||
} else
|
} else
|
||||||
lines <- append(lines, empty_empatica)
|
lines <- append(lines, empty_empatica)
|
||||||
|
|
|
@ -5,13 +5,16 @@ options(scipen=999)
|
||||||
|
|
||||||
assign_rows_to_segments <- function(data, segments){
|
assign_rows_to_segments <- function(data, segments){
|
||||||
# This function is used by all segment types, we use data.tables because they are fast
|
# This function is used by all segment types, we use data.tables because they are fast
|
||||||
|
|
||||||
data <- data.table::as.data.table(data)
|
data <- data.table::as.data.table(data)
|
||||||
data[, assigned_segments := ""]
|
data[, assigned_segments := ""]
|
||||||
for(i in seq_len(nrow(segments))) {
|
for(i in seq_len(nrow(segments))) {
|
||||||
segment <- segments[i,]
|
segment <- segments[i,]
|
||||||
|
|
||||||
data[segment$segment_start_ts<= timestamp & segment$segment_end_ts >= timestamp,
|
data[segment$segment_start_ts<= timestamp & segment$segment_end_ts >= timestamp,
|
||||||
assigned_segments := stringi::stri_c(assigned_segments, segment$segment_id, sep = "|")]
|
assigned_segments := stringi::stri_c(assigned_segments, segment$segment_id, sep = "|")]
|
||||||
}
|
}
|
||||||
|
|
||||||
data[,assigned_segments:=substring(assigned_segments, 2)]
|
data[,assigned_segments:=substring(assigned_segments, 2)]
|
||||||
data
|
data
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
# if you need a new package, you should add it with renv::install(package) so your renv venv is updated
|
||||||
|
library(RMariaDB)
|
||||||
|
library(yaml)
|
||||||
|
|
||||||
|
#' @description
#' Auxiliary function that reads the connection credentials stored under a specific group
#' of ./credentials.yaml and opens a database connection with them.
#' You can reuse most of this function if you are connecting to a DB or Web API.
#' It's OK to delete this function if you don't need credentials, e.g., you are pulling data from a CSV file.
#' @param group the yaml key containing the credentials to connect to a database
#' @return dbEngine a database engine (connection) ready to perform queries
get_db_engine <- function(group){
  # The working dir is always the RAPIDS root folder, so the credentials file is always ./credentials.yaml
  credentials <- read_yaml("./credentials.yaml")
  available_groups <- names(credentials)

  # Fail early, listing the groups that do exist, when the requested one is missing
  if(!group %in% available_groups) {
    stop(paste("The credentials group",group, "does not exist in ./credentials.yaml. The only groups that exist in that file are:", paste(available_groups, collapse = ","), ". Did you forget to set the group in [PHONE_DATA_STREAMS][aware_mysql][DATABASE_GROUP] in config.yaml?"))
  }

  group_credentials <- credentials[[group]]
  dbConnect(MariaDB(),
            db = group_credentials[["database"]],
            username = group_credentials[["user"]],
            password = group_credentials[["password"]],
            host = group_credentials[["host"]],
            port = group_credentials[["port"]])
}
|
||||||
|
|
||||||
|
# This file gets executed for each PHONE_SENSOR of each participant
|
||||||
|
# If you are connecting to a database the env file containing its credentials is available at "./.env"
|
||||||
|
# If you are reading a CSV file instead of a DB table, the @param sensor_container will contain the file path as set in config.yaml
|
||||||
|
# You are not bound to databases or files, you can query a web API or whatever data source you need.
|
||||||
|
|
||||||
|
#' @description
#' RAPIDS allows users to use the keyword "infer" (previously "multiple") to automatically infer the mobile operating system a device was running.
#' If you have a way to infer the OS of a device ID, implement this function. For example, for AWARE data we use the "aware_device" table.
#'
#' If you don't have a way to infer the OS, call stop("Error Message") so other users know they can't use "infer" or the inference failed,
#' and they have to assign the OS manually in the participant file
#'
#' @param stream_parameters The PHONE_STREAM_PARAMETERS key in config.yaml. If you need specific parameters add them there.
#' @param device A device ID string
#' @return The OS the device ran, "android" or "ios" (one value per row found for the device in aware_device)
infer_device_os <- function(stream_parameters, device){
  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
  # Release the connection even when the query below fails
  on.exit(dbDisconnect(dbEngine), add = TRUE)

  # The device id is bound as a query parameter instead of being pasted into the SQL text,
  # so ids containing quotes cannot break (or alter) the statement
  query <- "SELECT device_id,brand FROM aware_device WHERE device_id = ?"
  message(paste0("Executing the following query to infer phone OS: ", query, " (device_id = '", device, "')"))
  os <- dbGetQuery(dbEngine, query, params = list(device))

  if(nrow(os) == 0)
    stop(paste("We cannot infer the OS of the following device id because it does not exist in the aware_device table:", device))

  # Every brand other than "iPhone" is reported as Android
  ifelse(os$brand == "iPhone", "ios", "android")
}
|
||||||
|
|
||||||
|
#' @description
#' Gets the sensor data for a specific device id from a database table, file or whatever source you want to query
#'
#' @param stream_parameters The PHONE_STREAM_PARAMETERS key in config.yaml. If you need specific parameters add them there.
#' @param device A device ID string
#' @param sensor The sensor being pulled; it is not used by this implementation
#' @param sensor_container database table or file containing the sensor data for all participants. This is the PHONE_SENSOR[CONTAINER] key in config.yaml
#' @param columns the columns needed from this sensor (we recommend to only return these columns instead of every column in sensor_container)
#' @return A dataframe with the sensor data for device
pull_data <- function(stream_parameters, device, sensor, sensor_container, columns){
  dbEngine <- get_db_engine(stream_parameters$DATABASE_GROUP)
  # Release the connection even when the query below fails
  on.exit(dbDisconnect(dbEngine), add = TRUE)

  # Every requested field is read from the `data` column with the ->> path operator
  # and aliased with its own name
  select_items <- unlist(lapply(columns, function(column) paste0("data->>'$.", column, "' ", column)),
                         use.names = FALSE)

  # Table and column names cannot be bound as parameters, but the device id can
  query <- paste0("SELECT ", paste(select_items, collapse = ","), " FROM ", sensor_container,
                  " WHERE ", columns$DEVICE_ID, " = ?")

  # Letting the user know what we are doing
  message(paste0("Executing the following query to download data: ", query, " (", columns$DEVICE_ID, " = '", device, "')"))
  sensor_data <- dbGetQuery(dbEngine, query, params = list(device))

  if(nrow(sensor_data) == 0)
    warning(paste0("The device '", device, "' did not have data in ", sensor_container))

  return(sensor_data)
}
|
||||||
|
|
|
@ -0,0 +1,337 @@
|
||||||
|
PHONE_ACCELEROMETER:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
DOUBLE_VALUES_0: double_values_0
|
||||||
|
DOUBLE_VALUES_1: double_values_1
|
||||||
|
DOUBLE_VALUES_2: double_values_2
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
DOUBLE_VALUES_0: double_values_0
|
||||||
|
DOUBLE_VALUES_1: double_values_1
|
||||||
|
DOUBLE_VALUES_2: double_values_2
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_ACTIVITY_RECOGNITION:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
ACTIVITY_NAME: activity_name
|
||||||
|
ACTIVITY_TYPE: activity_type
|
||||||
|
CONFIDENCE: confidence
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
ACTIVITY_NAME: FLAG_TO_MUTATE
|
||||||
|
ACTIVITY_TYPE: FLAG_TO_MUTATE
|
||||||
|
CONFIDENCE: FLAG_TO_MUTATE
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
ACTIVITIES: activities
|
||||||
|
CONFIDENCE: confidence
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
- "src/data/streams/mutations/phone/aware/activity_recogniton_ios_unification.R"
|
||||||
|
|
||||||
|
PHONE_APPLICATIONS_CRASHES:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
PACKAGE_NAME: package_name
|
||||||
|
APPLICATION_NAME: application_name
|
||||||
|
APPLICATION_VERSION: application_version
|
||||||
|
ERROR_SHORT: error_short
|
||||||
|
ERROR_LONG: error_long
|
||||||
|
ERROR_CONDITION: error_condition
|
||||||
|
IS_SYSTEM_APP: is_system_app
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_APPLICATIONS_FOREGROUND:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
PACKAGE_NAME: package_name
|
||||||
|
APPLICATION_NAME: application_name
|
||||||
|
IS_SYSTEM_APP: is_system_app
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_APPLICATIONS_NOTIFICATIONS:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
PACKAGE_NAME: package_name
|
||||||
|
APPLICATION_NAME: application_name
|
||||||
|
TEXT: text
|
||||||
|
SOUND: sound
|
||||||
|
VIBRATE: vibrate
|
||||||
|
DEFAULTS: defaults
|
||||||
|
FLAGS: flags
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_BATTERY:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
BATTERY_STATUS: battery_status
|
||||||
|
BATTERY_LEVEL: battery_level
|
||||||
|
BATTERY_SCALE: battery_scale
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
BATTERY_STATUS: FLAG_TO_MUTATE
|
||||||
|
BATTERY_LEVEL: battery_level
|
||||||
|
BATTERY_SCALE: battery_scale
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
BATTERY_STATUS: battery_status
|
||||||
|
SCRIPTS:
|
||||||
|
- "src/data/streams/mutations/phone/aware/battery_ios_unification.R"
|
||||||
|
|
||||||
|
PHONE_BLUETOOTH:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
BT_ADDRESS: bt_address
|
||||||
|
BT_NAME: bt_name
|
||||||
|
BT_RSSI: bt_rssi
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
BT_ADDRESS: bt_address
|
||||||
|
BT_NAME: bt_name
|
||||||
|
BT_RSSI: bt_rssi
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_CALLS:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
CALL_TYPE: call_type
|
||||||
|
CALL_DURATION: call_duration
|
||||||
|
TRACE: trace
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
CALL_TYPE: FLAG_TO_MUTATE
|
||||||
|
CALL_DURATION: call_duration
|
||||||
|
TRACE: trace
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
CALL_TYPE: call_type
|
||||||
|
SCRIPTS:
|
||||||
|
- "src/data/streams/mutations/phone/aware/calls_ios_unification.R"
|
||||||
|
|
||||||
|
PHONE_CONVERSATION:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
DOUBLE_ENERGY: double_energy
|
||||||
|
INFERENCE: inference
|
||||||
|
DOUBLE_CONVO_START: double_convo_start
|
||||||
|
DOUBLE_CONVO_END: double_convo_end
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
DOUBLE_ENERGY: double_energy
|
||||||
|
INFERENCE: inference
|
||||||
|
DOUBLE_CONVO_START: double_convo_start
|
||||||
|
DOUBLE_CONVO_END: double_convo_end
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
- "src/data/streams/mutations/phone/aware/conversation_ios_timestamp.R"
|
||||||
|
|
||||||
|
PHONE_KEYBOARD:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
PACKAGE_NAME: package_name
|
||||||
|
BEFORE_TEXT: before_text
|
||||||
|
CURRENT_TEXT: current_text
|
||||||
|
IS_PASSWORD: is_password
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_LIGHT:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
DOUBLE_LIGHT_LUX: double_light_lux
|
||||||
|
ACCURACY: accuracy
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_LOCATIONS:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
DOUBLE_LATITUDE: double_latitude
|
||||||
|
DOUBLE_LONGITUDE: double_longitude
|
||||||
|
DOUBLE_BEARING: double_bearing
|
||||||
|
DOUBLE_SPEED: double_speed
|
||||||
|
DOUBLE_ALTITUDE: double_altitude
|
||||||
|
PROVIDER: provider
|
||||||
|
ACCURACY: accuracy
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
DOUBLE_LATITUDE: double_latitude
|
||||||
|
DOUBLE_LONGITUDE: double_longitude
|
||||||
|
DOUBLE_BEARING: double_bearing
|
||||||
|
DOUBLE_SPEED: double_speed
|
||||||
|
DOUBLE_ALTITUDE: double_altitude
|
||||||
|
PROVIDER: provider
|
||||||
|
ACCURACY: accuracy
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_LOG:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
LOG_MESSAGE: log_message
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
LOG_MESSAGE: log_message
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_MESSAGES:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
MESSAGE_TYPE: message_type
|
||||||
|
TRACE: trace
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_SCREEN:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
SCREEN_STATUS: screen_status
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
SCREEN_STATUS: FLAG_TO_MUTATE
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCREEN_STATUS: screen_status
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
- "src/data/streams/mutations/phone/aware/screen_ios_unification.R"
|
||||||
|
|
||||||
|
PHONE_WIFI_CONNECTED:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
MAC_ADDRESS: mac_address
|
||||||
|
SSID: ssid
|
||||||
|
BSSID: bssid
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
MAC_ADDRESS: mac_address
|
||||||
|
SSID: ssid
|
||||||
|
BSSID: bssid
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_WIFI_VISIBLE:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
SSID: ssid
|
||||||
|
BSSID: bssid
|
||||||
|
SECURITY: security
|
||||||
|
FREQUENCY: frequency
|
||||||
|
RSSI: rssi
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
SSID: ssid
|
||||||
|
BSSID: bssid
|
||||||
|
SECURITY: security
|
||||||
|
FREQUENCY: frequency
|
||||||
|
RSSI: rssi
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
|
@ -349,3 +349,24 @@ PHONE_WIFI_VISIBLE:
|
||||||
COLUMN_MAPPINGS:
|
COLUMN_MAPPINGS:
|
||||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
PHONE_SPEECH:
|
||||||
|
ANDROID:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
SPEECH_PROPORTION: speech_proportion
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
IOS:
|
||||||
|
RAPIDS_COLUMN_MAPPINGS:
|
||||||
|
TIMESTAMP: timestamp
|
||||||
|
DEVICE_ID: device_id
|
||||||
|
SPEECH_PROPORTION: speech_proportion
|
||||||
|
MUTATION:
|
||||||
|
COLUMN_MAPPINGS:
|
||||||
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,11 +2,16 @@ from zipfile import ZipFile
|
||||||
import warnings
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
from pandas.core import indexing
|
from pandas.core import indexing
|
||||||
import yaml
|
import yaml
|
||||||
import csv
|
import csv
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from io import BytesIO, StringIO
|
from io import BytesIO, StringIO
|
||||||
|
import sys, os
|
||||||
|
|
||||||
|
from cr_features.hrv import get_HRV_features, get_patched_ibi_with_bvp
|
||||||
|
from cr_features.helper_functions import empatica1d_to_array, empatica2d_to_array
|
||||||
|
|
||||||
def processAcceleration(x, y, z):
|
def processAcceleration(x, y, z):
|
||||||
x = float(x)
|
x = float(x)
|
||||||
|
@ -52,6 +57,8 @@ def extract_empatica_data(data, sensor):
|
||||||
df = pd.DataFrame.from_dict(ddict, orient='index', columns=[column])
|
df = pd.DataFrame.from_dict(ddict, orient='index', columns=[column])
|
||||||
df[column] = df[column].astype(float)
|
df[column] = df[column].astype(float)
|
||||||
df.index.name = 'timestamp'
|
df.index.name = 'timestamp'
|
||||||
|
if df.empty:
|
||||||
|
return df
|
||||||
|
|
||||||
elif sensor == 'EMPATICA_ACCELEROMETER':
|
elif sensor == 'EMPATICA_ACCELEROMETER':
|
||||||
ddict = readFile(sensor_data_file, sensor)
|
ddict = readFile(sensor_data_file, sensor)
|
||||||
|
@ -60,15 +67,22 @@ def extract_empatica_data(data, sensor):
|
||||||
df['y'] = df['y'].astype(float)
|
df['y'] = df['y'].astype(float)
|
||||||
df['z'] = df['z'].astype(float)
|
df['z'] = df['z'].astype(float)
|
||||||
df.index.name = 'timestamp'
|
df.index.name = 'timestamp'
|
||||||
|
if df.empty:
|
||||||
|
return df
|
||||||
|
|
||||||
elif sensor == 'EMPATICA_INTER_BEAT_INTERVAL':
|
elif sensor == 'EMPATICA_INTER_BEAT_INTERVAL':
|
||||||
df = pd.read_csv(sensor_data_file, names=['timestamp', column], header=None)
|
|
||||||
|
df = pd.read_csv(sensor_data_file, names=['timings', column], header=None)
|
||||||
|
df['timestamp'] = df['timings']
|
||||||
|
if df.empty:
|
||||||
|
df = df.set_index('timestamp')
|
||||||
|
return df
|
||||||
timestampstart = float(df['timestamp'][0])
|
timestampstart = float(df['timestamp'][0])
|
||||||
df['timestamp'] = (df['timestamp'][1:len(df)]).astype(float) + timestampstart
|
df['timestamp'] = (df['timestamp'][1:len(df)]).astype(float) + timestampstart
|
||||||
df = df.drop([0])
|
df = df.drop([0])
|
||||||
df[column] = df[column].astype(float)
|
df[column] = df[column].astype(float)
|
||||||
df = df.set_index('timestamp')
|
df = df.set_index('timestamp')
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"sensor has an invalid name: {}".format(sensor))
|
"sensor has an invalid name: {}".format(sensor))
|
||||||
|
@ -84,6 +98,10 @@ def pull_data(data_configuration, device, sensor, container, columns_to_download
|
||||||
participant_data = pd.DataFrame(columns=columns_to_download.values())
|
participant_data = pd.DataFrame(columns=columns_to_download.values())
|
||||||
participant_data.set_index('timestamp', inplace=True)
|
participant_data.set_index('timestamp', inplace=True)
|
||||||
|
|
||||||
|
with open('config.yaml', 'r') as stream:
|
||||||
|
config = yaml.load(stream, Loader=yaml.FullLoader)
|
||||||
|
cr_ibi_provider = config['EMPATICA_INTER_BEAT_INTERVAL']['PROVIDERS']['CR']
|
||||||
|
|
||||||
available_zipfiles = list((Path(data_configuration["FOLDER"]) / Path(device)).rglob("*.zip"))
|
available_zipfiles = list((Path(data_configuration["FOLDER"]) / Path(device)).rglob("*.zip"))
|
||||||
if len(available_zipfiles) == 0:
|
if len(available_zipfiles) == 0:
|
||||||
warnings.warn("There were no zip files in: {}. If you were expecting data for this participant the [EMPATICA][DEVICE_IDS] key in their participant file is missing the pid".format((Path(data_configuration["FOLDER"]) / Path(device))))
|
warnings.warn("There were no zip files in: {}. If you were expecting data for this participant the [EMPATICA][DEVICE_IDS] key in their participant file is missing the pid".format((Path(data_configuration["FOLDER"]) / Path(device))))
|
||||||
|
@ -94,7 +112,13 @@ def pull_data(data_configuration, device, sensor, container, columns_to_download
|
||||||
listOfFileNames = zipFile.namelist()
|
listOfFileNames = zipFile.namelist()
|
||||||
for fileName in listOfFileNames:
|
for fileName in listOfFileNames:
|
||||||
if fileName == sensor_csv:
|
if fileName == sensor_csv:
|
||||||
participant_data = pd.concat([participant_data, extract_empatica_data(zipFile.read(fileName), sensor)], axis=0)
|
if sensor == "EMPATICA_INTER_BEAT_INTERVAL" and cr_ibi_provider.get('PATCH_WITH_BVP', False):
|
||||||
|
participant_data = \
|
||||||
|
pd.concat([participant_data, patch_ibi_with_bvp(zipFile.read('IBI.csv'), zipFile.read('BVP.csv'))], axis=0)
|
||||||
|
#print("patch with ibi")
|
||||||
|
else:
|
||||||
|
participant_data = pd.concat([participant_data, extract_empatica_data(zipFile.read(fileName), sensor)], axis=0)
|
||||||
|
#print("no patching")
|
||||||
warning = False
|
warning = False
|
||||||
if warning:
|
if warning:
|
||||||
warnings.warn("We could not find a zipped file for {} in {} (we tried to find {})".format(sensor, zipFile, sensor_csv))
|
warnings.warn("We could not find a zipped file for {} in {} (we tried to find {})".format(sensor, zipFile, sensor_csv))
|
||||||
|
@ -105,4 +129,54 @@ def pull_data(data_configuration, device, sensor, container, columns_to_download
|
||||||
participant_data["device_id"] = device
|
participant_data["device_id"] = device
|
||||||
return(participant_data)
|
return(participant_data)
|
||||||
|
|
||||||
|
def patch_ibi_with_bvp(ibi_data, bvp_data):
    """Build the inter-beat-interval table from IBI.csv patched with BVP.csv.

    Args:
        ibi_data: Raw bytes of an Empatica ``IBI.csv`` file.
        bvp_data: Raw bytes of the matching Empatica ``BVP.csv`` file.

    Returns:
        A DataFrame indexed by ``timestamp``. For a non-empty IBI file the
        index holds integer values obtained by adding the IBI start timestamp
        to each timing and multiplying by 1000, and the columns are
        ``inter_beat_interval`` (float) and ``timings``. For an empty IBI
        file an empty DataFrame with the ``timings`` and
        ``inter_beat_interval`` columns is returned.

    Raises:
        IndexError: If ``empatica2d_to_array`` fails with an ``IndexError``
            or ``KeyError`` although the IBI file is not empty.
    """
    ibi_data_file = StringIO(bytes(ibi_data).decode('utf-8'))

    # Begin with the cr-features part
    try:
        ibi_array, ibi_start_timestamp = empatica2d_to_array(ibi_data_file)
    except (IndexError, KeyError) as e:
        # Checks whether IBI.csv is empty; an empty file may raise a KeyError
        # inside empatica2d_to_array (startTimeStamp = df.time[0]).
        # The stream may already have been consumed by the failed call, so
        # rewind it; otherwise the re-read below would look empty even for a
        # file that has data and the real error would be hidden.
        ibi_data_file.seek(0)
        df_test = pd.read_csv(ibi_data_file, names=['timings', 'inter_beat_interval'], header=None)
        if not df_test.empty:
            raise IndexError(
                "Something went wrong with indices. Error that was previously caught:\n" + repr(e)
            ) from e
        df_test['timestamp'] = df_test['timings']
        return df_test.set_index('timestamp')

    bvp_data_file = StringIO(bytes(bvp_data).decode('utf-8'))

    # Only the BVP samples are used; the start timestamp and sample rate are discarded.
    bvp_array, _, _ = empatica1d_to_array(bvp_data_file)

    # NOTE(review): sampling is hard-coded to 64 although empatica1d_to_array
    # also returns a sample rate -- confirm that the two always agree.
    _, _, bvp_rr, bvp_timings, _ = get_HRV_features(
        bvp_array, ma=False,
        detrend=False, m_deternd=False, low_pass=False, winsorize=True,
        winsorize_value=25, hampel_fiter=False, median_filter=False,
        mod_z_score_filter=True, sampling=64, feature_names=['meanHr'])

    ibi_timings, ibi_rr = get_patched_ibi_with_bvp(ibi_array[0], ibi_array[1], bvp_timings, bvp_rr)

    df = pd.DataFrame(np.array([ibi_timings, ibi_rr]).transpose(),
                      columns=['timestamp', 'inter_beat_interval'])
    # Prepend a header-like row (start timestamp, 'IBI') so the frame can be
    # processed with the same steps used for a raw IBI file.
    df.loc[-1] = [ibi_start_timestamp, 'IBI']  # adding a row
    df.index = df.index + 1  # shifting index
    df = df.sort_index()  # sorting by index

    # Repeated as in extract_empatica_data for IBI
    df['timings'] = df['timestamp']
    timestampstart = float(df['timestamp'][0])
    df['timestamp'] = (df['timestamp'][1:len(df)]).astype(float) + timestampstart
    df = df.drop([0])
    df['inter_beat_interval'] = df['inter_beat_interval'].astype(float)
    df = df.set_index('timestamp')

    # format timestamps
    df.index *= 1000
    df.index = df.index.astype(int)
    return df
|
||||||
|
|
||||||
# print(pull_data({'FOLDER': 'data/external/empatica'}, "e01", "EMPATICA_accelerometer", {'TIMESTAMP': 'timestamp', 'DEVICE_ID': 'device_id', 'DOUBLE_VALUES_0': 'x', 'DOUBLE_VALUES_1': 'y', 'DOUBLE_VALUES_2': 'z'}))
|
# print(pull_data({'FOLDER': 'data/external/empatica'}, "e01", "EMPATICA_accelerometer", {'TIMESTAMP': 'timestamp', 'DEVICE_ID': 'device_id', 'DOUBLE_VALUES_0': 'x', 'DOUBLE_VALUES_1': 'y', 'DOUBLE_VALUES_2': 'z'}))
|
|
@ -50,6 +50,7 @@ EMPATICA_INTER_BEAT_INTERVAL:
|
||||||
TIMESTAMP: timestamp
|
TIMESTAMP: timestamp
|
||||||
DEVICE_ID: device_id
|
DEVICE_ID: device_id
|
||||||
INTER_BEAT_INTERVAL: inter_beat_interval
|
INTER_BEAT_INTERVAL: inter_beat_interval
|
||||||
|
TIMINGS: timings
|
||||||
MUTATION:
|
MUTATION:
|
||||||
COLUMN_MAPPINGS:
|
COLUMN_MAPPINGS:
|
||||||
SCRIPTS: # List any python or r scripts that mutate your raw data
|
SCRIPTS: # List any python or r scripts that mutate your raw data
|
||||||
|
|
|
@ -39,7 +39,7 @@ unify_ios_calls <- function(ios_calls){
|
||||||
assigned_segments = first(assigned_segments))
|
assigned_segments = first(assigned_segments))
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), call_duration = sum(call_duration), timestamp = first(timestamp), device_id = first(device_id))
|
ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), call_duration = sum(as.numeric(call_duration)), timestamp = first(timestamp), device_id = first(device_id))
|
||||||
}
|
}
|
||||||
ios_calls <- ios_calls %>% mutate(call_type = case_when(
|
ios_calls <- ios_calls %>% mutate(call_type = case_when(
|
||||||
call_type_sequence == "1,2,4" | call_type_sequence == "2,1,4" ~ 1, # incoming
|
call_type_sequence == "1,2,4" | call_type_sequence == "2,1,4" ~ 1, # incoming
|
||||||
|
|
|
@ -118,6 +118,11 @@ PHONE_SCREEN:
|
||||||
- DEVICE_ID
|
- DEVICE_ID
|
||||||
- SCREEN_STATUS
|
- SCREEN_STATUS
|
||||||
|
|
||||||
|
PHONE_SPEECH:
|
||||||
|
- TIMESTAMP
|
||||||
|
- DEVICE_ID
|
||||||
|
- SPEECH_PROPORTION
|
||||||
|
|
||||||
PHONE_WIFI_CONNECTED:
|
PHONE_WIFI_CONNECTED:
|
||||||
- TIMESTAMP
|
- TIMESTAMP
|
||||||
- DEVICE_ID
|
- DEVICE_ID
|
||||||
|
@ -227,6 +232,7 @@ EMPATICA_INTER_BEAT_INTERVAL:
|
||||||
- TIMESTAMP
|
- TIMESTAMP
|
||||||
- DEVICE_ID
|
- DEVICE_ID
|
||||||
- INTER_BEAT_INTERVAL
|
- INTER_BEAT_INTERVAL
|
||||||
|
- TIMINGS
|
||||||
|
|
||||||
EMPATICA_TAGS:
|
EMPATICA_TAGS:
|
||||||
- TIMESTAMP
|
- TIMESTAMP
|
||||||
|
|
|
@ -39,16 +39,18 @@ rapids_cleaning <- function(sensor_data_files, provider){
|
||||||
if(!data_yield_column %in% colnames(clean_features)){
|
if(!data_yield_column %in% colnames(clean_features)){
|
||||||
stop(paste0("Error: RAPIDS provider needs to clean data based on ", data_yield_column, " column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded", data_yield_unit, "' in [FEATURES]."))
|
stop(paste0("Error: RAPIDS provider needs to clean data based on ", data_yield_column, " column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded", data_yield_unit, "' in [FEATURES]."))
|
||||||
}
|
}
|
||||||
clean_features <- clean_features %>%
|
if (data_yield_ratio_threshold > 0) {
|
||||||
|
clean_features <- clean_features %>%
|
||||||
filter(.[[data_yield_column]] >= data_yield_ratio_threshold)
|
filter(.[[data_yield_column]] >= data_yield_ratio_threshold)
|
||||||
|
}
|
||||||
|
|
||||||
# Drop columns with a percentage of NA values above cols_nan_threshold
|
# Drop columns with a percentage of NA values above cols_nan_threshold
|
||||||
if(nrow(clean_features))
|
if(nrow(clean_features))
|
||||||
clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
|
clean_features <- clean_features %>% select(where(~ sum(is.na(.)) / length(.) <= cols_nan_threshold ), starts_with("phone_esm"))
|
||||||
|
|
||||||
# Drop columns with zero variance
|
# Drop columns with zero variance
|
||||||
if(drop_zero_variance_columns)
|
if(drop_zero_variance_columns)
|
||||||
clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
|
clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime|phone_esm",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
|
||||||
|
|
||||||
# Drop highly correlated features
|
# Drop highly correlated features
|
||||||
if(as.logical(drop_highly_correlated_features$COMPUTE)){
|
if(as.logical(drop_highly_correlated_features$COMPUTE)){
|
||||||
|
|
|
@ -0,0 +1,180 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import math, sys, random
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from sklearn.impute import KNNImputer
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
|
||||||
|
sys.path.append('/rapids/')
|
||||||
|
from src.features import empatica_data_yield as edy
|
||||||
|
|
||||||
|
pd.set_option('display.max_columns', 20)
|
||||||
|
|
||||||
|
def straw_cleaning(sensor_data_files, provider):
    """Clean the STRAW feature table read from a CSV file and impute its gaps.

    The steps run in this order: (1) keep only rows that have the configured
    target, (2) drop rows with a low phone/Empatica data yield or with too
    many NaNs, (3) drop columns with too many NaNs, (4) contextual
    imputation with fixed values, (5) optional standardization, (6) KNN
    imputation, (7) optional removal of zero-variance columns, (8) optional
    removal of highly correlated columns, (9) a final check that no NaN is
    left.  Columns starting with ``phone_esm_straw`` are restored whenever a
    column-dropping step removes them.

    Args:
        sensor_data_files: mapping whose ``"sensor_data"`` entry is a
            sequence; its first element is the CSV path that is read.
        provider: provider configuration mapping. Keys read here:
            ``PHONE_DATA_YIELD_FEATURE``, ``PHONE_DATA_YIELD_RATIO_THRESHOLD``,
            ``EMPATICA_DATA_YIELD_RATIO_THRESHOLD``, ``ROWS_NAN_THRESHOLD``,
            ``COLS_NAN_THRESHOLD``, ``STANDARDIZATION``, ``COLS_VAR_THRESHOLD``
            and ``DROP_HIGHLY_CORRELATED_FEATURES`` (with ``COMPUTE``,
            ``MIN_OVERLAP_FOR_CORR_THRESHOLD`` and ``CORR_THRESHOLD``).

    Returns:
        The cleaned ``pandas.DataFrame``.  The frame is returned early,
        without further cleaning, when the configured target column does not
        exist or when no rows are left after a row filter.

    Raises:
        KeyError: a data yield column needed for the row filter is missing.
        ValueError: NaN values are still present after all steps.
    """
    features = pd.read_csv(sensor_data_files["sensor_data"][0])

    # NOTE(review): yaml.FullLoader is acceptable for the project's own
    # config.yaml; do not point this at an untrusted file.
    with open('config.yaml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']

    # (1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
    if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
        target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL']  # get target label from config
        if 'phone_esm_straw_' + target in features:
            features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
        else:
            return features

    # (2.1) QUALITY CHECK (DATA YIELD COLUMN) deletes the rows where E4 or phone data is low quality
    phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
    phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit

    if features.empty:
        return features

    features = edy.calculate_empatica_data_yield(features)

    phone_threshold = provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]
    empatica_threshold = provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]
    phone_missing = phone_data_yield_column not in features.columns
    empatica_missing = "empatica_data_yield" not in features.columns

    # Fail with the explanatory message whenever a column that a filter below
    # is going to use is absent (and, as before, when both are absent).
    if (phone_missing and empatica_missing) \
            or (phone_threshold and phone_missing) \
            or (empatica_threshold and empatica_missing):
        raise KeyError(f"RAPIDS provider needs to clean the selected event features based on {phone_data_yield_column} and empatica_data_yield columns. For phone data yield, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{phone_data_yield_unit}' in [FEATURES].")

    # Drop rows where phone data yield is less than the given threshold
    if phone_threshold:
        features = features[features[phone_data_yield_column] >= phone_threshold].reset_index(drop=True)

    # Drop rows where empatica data yield is less than the given threshold
    if empatica_threshold:
        features = features[features["empatica_data_yield"] >= empatica_threshold].reset_index(drop=True)

    if features.empty:
        return features

    # (2.2) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1])  # minimal not nan values in row
    features.dropna(axis=0, thresh=min_count, inplace=True)  # thresh => at least this many not-nans

    # (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED
    # (strict "<" also drops columns that are entirely NaN)
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]  # Get target (esm) columns

    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]

    # Preserve esm cols if deleted (has to come after drop cols operations)
    for esm in esm_cols:
        if esm not in features:
            features[esm] = esm_cols[esm]

    # (4) CONTEXTUAL IMPUTATION

    # Impute selected phone features with a high number
    high_number_markers = ("timeoffirstuse", "timeoflastuse", "timefirstcall", "timelastcall",
                           "firstuseafter", "timefirstmessages", "timelastmessages")
    impute_w_hn = [col for col in features.columns if any(marker in col for marker in high_number_markers)]
    features[impute_w_hn] = features[impute_w_hn].fillna(1500)

    # Impute special cases (mostcommonactivity, homelabel, loglocationvariance)
    impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
    features[impute_w_sn] = features[impute_w_sn].fillna(4)  # Special case of imputation - nominal/ordinal value

    impute_w_sn2 = [col for col in features.columns if "homelabel" in col]
    features[impute_w_sn2] = features[impute_w_sn2].fillna(1)  # Special case of imputation - nominal/ordinal value

    # The loglocationvariance columns get their own fill value; the list
    # selected here is the one that must be filled.
    impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
    features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000)

    # Impute selected phone features with 0
    zero_prefixes = ('phone_applications_foreground_rapids_', 'phone_battery_rapids_',
                     'phone_bluetooth_rapids_', 'phone_light_rapids_', 'phone_calls_rapids_',
                     'phone_messages_rapids_', 'phone_screen_rapids_', 'phone_wifi_visible')
    impute_zero = [col for col in features if col.startswith(zero_prefixes)]

    zero_filled = impute_zero + list(esm_cols.columns)
    features[zero_filled] = features[zero_filled].fillna(0)

    # (5) STANDARDIZATION
    if provider["STANDARDIZATION"]:
        to_scale = ~features.columns.isin(excluded_columns)
        features.loc[:, to_scale] = StandardScaler().fit_transform(features.loc[:, to_scale])

    # (6) IMPUTATION: IMPUTE DATA WITH KNN METHOD
    impute_cols = [col for col in features.columns if col not in excluded_columns]
    features.reset_index(drop=True, inplace=True)
    features[impute_cols] = impute(features[impute_cols], method="knn")

    # (7) REMOVE COLS WHERE VARIANCE IS 0
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]

    if provider["COLS_VAR_THRESHOLD"]:
        col_std = features.std(numeric_only=True)
        features.drop(col_std[col_std == 0].index.values, axis=1, inplace=True)

    # (8) DROP HIGHLY CORRELATED FEATURES
    drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
    if drop_corr_features["COMPUTE"] and features.shape[0]:  # skip the correlation check when there are no rows

        numerical_cols = features.select_dtypes(include=np.number).columns.tolist()
        numerical = features[numerical_cols]

        # Remove columns where NaN count threshold is passed
        valid_features = numerical.loc[:, numerical.isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * numerical.shape[0]]

        corr_matrix = valid_features.corr().abs()
        # Keep only the strict upper triangle so each pair is inspected once.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]

        features.drop(to_drop, axis=1, inplace=True)

    # Preserve esm cols if deleted by either of the two drop steps above
    # (has to come after drop cols operations)
    for esm in esm_cols:
        if esm not in features:
            features[esm] = esm_cols[esm]

    # (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
    if features.isna().any().any():
        raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")

    return features
|
||||||
|
|
||||||
|
|
||||||
|
def k_nearest(df):
    """Fill the missing values of ``df`` with a 3-nearest-neighbours imputer.

    Args:
        df: numeric ``pandas.DataFrame`` that may contain NaNs.

    Returns:
        A new ``pandas.DataFrame`` with the same columns and the same index
        as ``df`` holding the imputed values.
    """
    imputer = KNNImputer(n_neighbors=3)
    # NOTE(review): with its default settings KNNImputer presumably drops
    # columns that have no observed value at all, in which case the column
    # count no longer matches ``df.columns`` and pandas raises ValueError;
    # callers are expected to drop such columns first. TODO confirm.
    #
    # The index is carried over so that assigning the result back into a
    # frame aligns row by row even when ``df`` does not use a default index.
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)
|
||||||
|
|
||||||
|
|
||||||
|
def impute(df, method='zero'):
    """Return a copy of ``df`` whose NaNs are filled by the chosen strategy.

    Only the selected strategy is evaluated, so picking a cheap one such as
    ``'zero'`` never triggers the KNN fit or the mean/median computation.

    Args:
        df: ``pandas.DataFrame`` to impute.
        method: one of ``'zero'`` (fill with 0), ``'high_number'`` (fill with
            1500), ``'mean'``, ``'median'`` (column-wise statistics) or
            ``'knn'`` (delegates to ``k_nearest``).

    Returns:
        The imputed ``pandas.DataFrame``.

    Raises:
        KeyError: ``method`` is not one of the supported names.
    """
    strategies = {
        'zero': lambda: df.fillna(0),
        'high_number': lambda: df.fillna(1500),
        'mean': lambda: df.fillna(df.mean()),
        'median': lambda: df.fillna(df.median()),
        'knn': lambda: k_nearest(df),
    }
    return strategies[method]()
|
||||||
|
|
||||||
|
|
||||||
|
def graph_bf_af(features, phase_name, plt_flag=False):
    """Print the size of ``features`` for one cleaning phase and optionally plot its NaNs.

    Args:
        features: ``pandas.DataFrame`` to report on.
        phase_name: label printed in the report header and used in the image
            file name.
        plt_flag: when true, a heatmap of ``features.isna()`` is saved to
            ``features_overall_nans_<phase_name>.png`` in the working directory.
    """
    if plt_flag:
        sns.set(rc={"figure.figsize":(16, 8)})
        # Draw on a dedicated figure and close it afterwards; otherwise every
        # call would paint over the previous heatmap and keep the figure alive.
        fig, ax = plt.subplots()
        sns.heatmap(features.isna(), cbar=False, ax=ax)
        fig.savefig(f'features_overall_nans_{phase_name}.png', bbox_inches='tight')
        plt.close(fig)

    print(f"\n-------------{phase_name}-------------")
    print("Rows number:", features.shape[0])
    print("Columns number:", len(features.columns))
    print("---------------------------------------------\n")
|
|
@ -39,16 +39,18 @@ rapids_cleaning <- function(sensor_data_files, provider){
|
||||||
if(!data_yield_column %in% colnames(clean_features)){
|
if(!data_yield_column %in% colnames(clean_features)){
|
||||||
stop(paste0("Error: RAPIDS provider needs to clean data based on ", data_yield_column, " column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded", data_yield_unit, "' in [FEATURES]."))
|
stop(paste0("Error: RAPIDS provider needs to clean data based on ", data_yield_column, " column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded", data_yield_unit, "' in [FEATURES]."))
|
||||||
}
|
}
|
||||||
clean_features <- clean_features %>%
|
if (data_yield_ratio_threshold > 0) {
|
||||||
|
clean_features <- clean_features %>%
|
||||||
filter(.[[data_yield_column]] >= data_yield_ratio_threshold)
|
filter(.[[data_yield_column]] >= data_yield_ratio_threshold)
|
||||||
|
}
|
||||||
|
|
||||||
# Drop columns with a percentage of NA values above cols_nan_threshold
|
# Drop columns with a percentage of NA values above cols_nan_threshold
|
||||||
if(nrow(clean_features))
|
if(nrow(clean_features))
|
||||||
clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
|
clean_features <- clean_features %>% select(where(~ sum(is.na(.)) / length(.) <= cols_nan_threshold ), starts_with("phone_esm"))
|
||||||
|
|
||||||
# Drop columns with zero variance
|
# Drop columns with zero variance
|
||||||
if(drop_zero_variance_columns)
|
if(drop_zero_variance_columns)
|
||||||
clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
|
clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime|phone_esm",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
|
||||||
|
|
||||||
# Drop highly correlated features
|
# Drop highly correlated features
|
||||||
if(as.logical(drop_highly_correlated_features$COMPUTE)){
|
if(as.logical(drop_highly_correlated_features$COMPUTE)){
|
||||||
|
|
|
@ -0,0 +1,275 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import math, sys, random, warnings, yaml
|
||||||
|
|
||||||
|
from sklearn.impute import KNNImputer
|
||||||
|
from sklearn.preprocessing import StandardScaler, minmax_scale
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
|
||||||
|
sys.path.append('/rapids/')
|
||||||
|
from src.features import empatica_data_yield as edy
|
||||||
|
|
||||||
|
def straw_cleaning(sensor_data_files, provider, target):
    """Clean the STRAW feature table for one target and impute its gaps.

    The steps run in this order: (1.0) optionally replace the appraisal
    targets with the values from ``data/external/stress_event_targets.csv``,
    (1.1) keep only rows that have the target, (2) drop rows with a low
    phone/Empatica data yield, (3) contextual imputation with fixed values,
    (4) drop columns with too many NaNs, (5) optionally drop zero-variance
    columns, (6) drop rows with too many NaNs, (7) optional per-participant
    standardization, (8) KNN imputation, (9) optional removal of highly
    correlated columns, (10) drop window-count columns, (11) a final check
    that no NaN is left.  Columns starting with ``phone_esm_straw`` are
    restored whenever a column-dropping step removes them.

    Side effects: reads ``config.yaml`` from the working directory, writes
    ``phone_E4_histogram.png`` there and prints a short report per phase.

    Args:
        sensor_data_files: mapping whose ``"sensor_data"`` entry is a
            sequence; its first element is the CSV path that is read.
        provider: provider configuration mapping. Keys read here:
            ``PHONE_DATA_YIELD_FEATURE``, ``PHONE_DATA_YIELD_RATIO_THRESHOLD``,
            ``EMPATICA_DATA_YIELD_RATIO_THRESHOLD``, ``COLS_NAN_THRESHOLD``,
            ``COLS_VAR_THRESHOLD``, ``ROWS_NAN_THRESHOLD``, ``STANDARDIZATION``,
            ``TARGET_STANDARDIZATION`` and ``DROP_HIGHLY_CORRELATED_FEATURES``
            (with ``COMPUTE``, ``MIN_OVERLAP_FOR_CORR_THRESHOLD`` and
            ``CORR_THRESHOLD``).
        target: target label without the ``phone_esm_straw_`` prefix.

    Returns:
        The cleaned ``pandas.DataFrame``, or an empty frame that only has the
        segment columns when no rows survive a row filter.

    Raises:
        KeyError: one of the two data yield columns is missing.
        ValueError: NaN values are still present after all steps.
    """
    features = pd.read_csv(sensor_data_files["sensor_data"][0])

    # NOTE(review): yaml.FullLoader is acceptable for the project's own
    # config.yaml; do not point this at an untrusted file.
    with open('config.yaml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']

    graph_bf_af(features, "1target_rows_before")

    # (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS SEGMENTING_METHOD IS "STRESS_EVENT"
    if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"] == "stress_event":

        stress_events_targets = pd.read_csv("data/external/stress_event_targets.csv")
        all_labels = config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']

        # (label in the config, column of the external targets file)
        overrides = (
            ("appraisal_stressfulness_event_mean", "appraisal_stressfulness_event"),
            ("appraisal_threat_mean", "appraisal_threat"),
            ("appraisal_challenge_mean", "appraisal_challenge"),
        )
        for label, source_column in overrides:
            if label not in all_labels:
                continue
            esm_column = 'phone_esm_straw_' + label
            features.drop(columns=[esm_column], inplace=True)
            replacement = stress_events_targets[["label", source_column]] \
                .rename(columns={'label': 'local_segment_label'})
            # The inner join also removes segments without an external target.
            features = features.merge(replacement, on=['local_segment_label'], how='inner') \
                .rename(columns={source_column: esm_column})

    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]  # Get target (esm) columns

    # (1.1) FILTER_OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
    if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
        features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)

    if features.empty:
        return pd.DataFrame(columns=excluded_columns)

    graph_bf_af(features, "2target_rows_after")

    # (2) QUALITY CHECK (DATA YIELD COLUMN) drops the rows where E4 or phone data is low quality
    phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
    phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit

    features = edy.calculate_empatica_data_yield(features)

    # Both columns are used below, so either one missing is an error.
    if phone_data_yield_column not in features.columns or "empatica_data_yield" not in features.columns:
        raise KeyError(f"RAPIDS provider needs to clean the selected event features based on {phone_data_yield_column} and empatica_data_yield columns. For phone data yield, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{phone_data_yield_unit}' in [FEATURES].")

    features[["empatica_data_yield", phone_data_yield_column]].hist()
    plt.savefig('phone_E4_histogram.png', bbox_inches='tight')
    plt.close()  # release the histogram figure once it is on disk

    # Drop rows where phone data yield is less than the given threshold
    if provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]:
        features = features[features[phone_data_yield_column] >= provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)

    # Drop rows where empatica data yield is less than the given threshold
    if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
        features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)

    if features.empty:
        return pd.DataFrame(columns=excluded_columns)

    graph_bf_af(features, "3data_yield_drop_rows")

    # (3) CONTEXTUAL IMPUTATION

    # Impute selected phone features with a high number
    high_number_markers = ("timeoffirstuse", "timeoflastuse", "timefirstcall", "timelastcall",
                           "firstuseafter", "timefirstmessages", "timelastmessages")
    impute_w_hn = [col for col in features.columns if any(marker in col for marker in high_number_markers)]
    features[impute_w_hn] = features[impute_w_hn].fillna(1500)

    # Impute special cases (mostcommonactivity, homelabel, loglocationvariance)
    impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
    features[impute_w_sn] = features[impute_w_sn].fillna(4)  # Special case of imputation - nominal/ordinal value

    impute_w_sn2 = [col for col in features.columns if "homelabel" in col]
    features[impute_w_sn2] = features[impute_w_sn2].fillna(1)  # Special case of imputation - nominal/ordinal value

    impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
    features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000)  # Special case of imputation - loglocation

    # Impute location features
    impute_locations = [col for col in features
                        if col.startswith('phone_locations_doryab_') and 'radiusgyration' not in col]

    # Impute selected phone, location, and esm features with 0
    zero_prefixes = ('phone_applications_foreground_rapids_', 'phone_activity_recognition_',
                     'phone_battery_rapids_', 'phone_bluetooth_rapids_', 'phone_light_rapids_',
                     'phone_calls_rapids_', 'phone_messages_rapids_', 'phone_screen_rapids_',
                     'phone_bluetooth_doryab_', 'phone_wifi_visible')
    impute_zero = [col for col in features if col.startswith(zero_prefixes)]

    zero_filled = impute_zero + impute_locations + list(esm_cols.columns)
    features[zero_filled] = features[zero_filled].fillna(0)

    graph_bf_af(features, "4context_imp")

    # (4) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED
    # (strict "<" also drops columns that are entirely NaN)
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]  # Get target (esm) columns

    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]

    graph_bf_af(features, "5too_much_nans_cols")

    # (5) REMOVE COLS WHERE VARIANCE IS 0
    if provider["COLS_VAR_THRESHOLD"]:
        col_std = features.std(numeric_only=True)
        features.drop(col_std[col_std == 0].index.values, axis=1, inplace=True)

    graph_bf_af(features, "6variance_drop")

    # Preserve esm cols if deleted (has to come after drop cols operations)
    for esm in esm_cols:
        if esm not in features:
            features[esm] = esm_cols[esm]

    # (6) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1])  # minimal not nan values in row
    features.dropna(axis=0, thresh=min_count, inplace=True)  # thresh => at least this many not-nans

    graph_bf_af(features, "7too_much_nans_rows")

    if features.empty:
        return pd.DataFrame(columns=excluded_columns)

    # (7) STANDARDIZATION
    if provider["STANDARDIZATION"]:
        nominal_cols = [col for col in features.columns if "mostcommonactivity" in col or "homelabel" in col]  # Excluded nominal features

        # Columns that are left untouched; the target joins them unless the
        # provider asks for it to be standardized as well.
        not_scaled = excluded_columns + nominal_cols
        if not provider["TARGET_STANDARDIZATION"]:
            not_scaled = not_scaled + ['phone_esm_straw_' + target]

        # Expected warning within this code block
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            # "pid" stays on the right-hand side because it is the grouping
            # key, but it is never overwritten on the left-hand side.
            features.loc[:, ~features.columns.isin(not_scaled + ["pid"])] = \
                features.loc[:, ~features.columns.isin(not_scaled)].groupby('pid').transform(
                    lambda x: StandardScaler().fit_transform(x.values[:, np.newaxis]).ravel())

    graph_bf_af(features, "8standardization")

    # (8) IMPUTATION: IMPUTE DATA WITH KNN METHOD
    features.reset_index(drop=True, inplace=True)
    impute_cols = [col for col in features.columns if col not in excluded_columns and col != "pid"]

    features[impute_cols] = impute(features[impute_cols], method="knn")

    graph_bf_af(features, "9knn_after")

    # (9) DROP HIGHLY CORRELATED FEATURES
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]

    drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
    if drop_corr_features["COMPUTE"] and features.shape[0] > 5:  # If small amount of segments (rows) is present, do not execute correlation check

        numerical_cols = features.select_dtypes(include=np.number).columns.tolist()
        numerical = features[numerical_cols]

        # Remove columns where NaN count threshold is passed
        valid_features = numerical.loc[:, numerical.isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * numerical.shape[0]]

        corr_matrix = valid_features.corr().abs()
        # Keep only the strict upper triangle so each pair is inspected once.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]

        features.drop(to_drop, axis=1, inplace=True)

    # Preserve esm cols if deleted (has to come after drop cols operations)
    for esm in esm_cols:
        if esm not in features:
            features[esm] = esm_cols[esm]

    graph_bf_af(features, "10correlation_drop")

    # Transform categorical columns to category dtype
    cat1 = [col for col in features.columns if "mostcommonactivity" in col]
    if cat1:  # Transform columns to category dtype (mostcommonactivity)
        features[cat1] = features[cat1].astype(int).astype('category')

    cat2 = [col for col in features.columns if "homelabel" in col]
    if cat2:  # Transform columns to category dtype (homelabel)
        features[cat2] = features[cat2].astype(int).astype('category')

    # (10) DROP ALL WINDOW RELATED COLUMNS
    win_count_cols = [col for col in features if "SO_windowsCount" in col]
    if win_count_cols:
        features.drop(columns=win_count_cols, inplace=True)

    # (11) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
    if features.isna().any().any():
        raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")

    return features
|
||||||
|
|
||||||
|
|
||||||
|
def k_nearest(df):
    """Fill the missing values of ``df`` with a 3-nearest-neighbours imputer.

    Args:
        df: numeric ``pandas.DataFrame`` that may contain NaNs.

    Returns:
        A new ``pandas.DataFrame`` with the same columns and the same index
        as ``df`` holding the imputed values.
    """
    imputer = KNNImputer(n_neighbors=3)
    # NOTE(review): with its default settings KNNImputer presumably drops
    # columns that have no observed value at all, in which case the column
    # count no longer matches ``df.columns`` and pandas raises ValueError;
    # callers are expected to drop such columns first. TODO confirm.
    #
    # The index is carried over so that assigning the result back into a
    # frame aligns row by row even when ``df`` does not use a default index.
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)
|
||||||
|
|
||||||
|
|
||||||
|
def impute(df, method='zero'):
    """Return a copy of ``df`` whose NaNs are filled by the chosen strategy.

    Only the selected strategy is evaluated, so picking a cheap one such as
    ``'zero'`` never triggers the KNN fit or the mean/median computation.

    Args:
        df: ``pandas.DataFrame`` to impute.
        method: one of ``'zero'`` (fill with 0), ``'high_number'`` (fill with
            1500), ``'mean'``, ``'median'`` (column-wise statistics) or
            ``'knn'`` (delegates to ``k_nearest``).

    Returns:
        The imputed ``pandas.DataFrame``.

    Raises:
        KeyError: ``method`` is not one of the supported names.
    """
    strategies = {
        'zero': lambda: df.fillna(0),
        'high_number': lambda: df.fillna(1500),
        'mean': lambda: df.fillna(df.mean()),
        'median': lambda: df.fillna(df.median()),
        'knn': lambda: k_nearest(df),
    }
    return strategies[method]()
|
||||||
|
|
||||||
|
|
||||||
|
def graph_bf_af(features, phase_name, plt_flag=False):
    """Print size and NaN count of ``features`` for one cleaning phase and optionally plot its NaNs.

    Args:
        features: ``pandas.DataFrame`` to report on.
        phase_name: label printed in the report header and used in the image
            file name.
        plt_flag: when true, a heatmap of ``features.isna()`` is saved to
            ``features_overall_nans_<phase_name>.png`` in the working directory.
    """
    if plt_flag:
        sns.set(rc={"figure.figsize":(16, 8)})
        # Draw on a dedicated figure and close it afterwards; otherwise every
        # call would paint over the previous heatmap and keep the figure alive.
        fig, ax = plt.subplots()
        sns.heatmap(features.isna(), cbar=False, ax=ax)
        fig.savefig(f'features_overall_nans_{phase_name}.png', bbox_inches='tight')
        plt.close(fig)

    print(f"\n-------------{phase_name}-------------")
    print("Rows number:", features.shape[0])
    print("Columns number:", len(features.columns))
    print("NaN values:", features.isna().sum().sum())
    print("---------------------------------------------\n")
|
|
@ -0,0 +1,59 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import math as m
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def extract_second_order_features(intraday_features, so_features_names, prefix=""):
    """Aggregate per-window features into one row of statistics per segment.

    Rows are grouped by ``local_segment`` (or, when ``prefix`` is given, by
    the four ``local_segment*`` columns) and the requested statistics are
    computed per group.  The window index column ``<prefix>level_1`` is never
    aggregated itself.

    Args:
        intraday_features: ``pandas.DataFrame`` with one row per window.
        so_features_names: collection of requested statistics. Recognised
            names: ``mean``, ``median``, ``sd`` (NaN standard deviations are
            replaced by 0), ``nlargest`` / ``nsmallest`` (mean of the 5
            largest / smallest values), ``count_windows``,
            ``eda_num_peaks_non_zero`` and ``hrv_num_windows_non_nan``.
        prefix: column prefix of the sensor, also selects the grouping columns.

    Returns:
        A ``pandas.DataFrame`` with the grouping columns followed by the
        statistic columns (``<feature>_SO_<statistic>``), or an empty frame
        holding only the grouping columns when the input is empty.
    """
    if prefix:
        groupby_cols = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
    else:
        groupby_cols = ['local_segment']

    if intraday_features.empty:
        return pd.DataFrame(columns=groupby_cols)

    window_col = prefix + "level_1"

    # Build the grouping without the window index once and reuse it for every
    # statistic (and for every column of the nlargest/nsmallest loops).
    needs_values = any(name in so_features_names for name in ("mean", "median", "sd", "nlargest", "nsmallest"))
    grouped = intraday_features.drop(window_col, axis=1).groupby(groupby_cols) if needs_values else None

    aggregated = []
    if "mean" in so_features_names:
        aggregated.append(grouped.mean(numeric_only=True).add_suffix("_SO_mean"))

    if "median" in so_features_names:
        aggregated.append(grouped.median(numeric_only=True).add_suffix("_SO_median"))

    if "sd" in so_features_names:
        aggregated.append(grouped.std(numeric_only=True).fillna(0).add_suffix("_SO_sd"))

    so_features = pd.concat(aggregated, axis=1) if aggregated else pd.DataFrame()

    value_cols = intraday_features.columns[~intraday_features.columns.isin(groupby_cols + [window_col])]

    if "nlargest" in so_features_names:  # mean of the 5 largest values
        for column in value_cols:
            so_features[column + "_SO_nlargest"] = grouped[column].apply(lambda x: x.nlargest(5).mean())

    if "nsmallest" in so_features_names:  # mean of the 5 smallest values
        for column in value_cols:
            so_features[column + "_SO_nsmallest"] = grouped[column].apply(lambda x: x.nsmallest(5).mean())

    if "count_windows" in so_features_names:
        so_features["SO_windowsCount"] = intraday_features.groupby(groupby_cols).count()[window_col]

    # numPeaksNonZero specialized for EDA sensor
    if "eda_num_peaks_non_zero" in so_features_names and prefix + "numPeaks" in intraday_features.columns:
        so_features[prefix + "SO_numPeaksNonZero"] = intraday_features.groupby(groupby_cols)[prefix + "numPeaks"].apply(lambda x: (x != 0).sum())

    # numWindowsNonNaN specialized for BVP and IBI sensors
    if "hrv_num_windows_non_nan" in so_features_names and prefix + "meanHr" in intraday_features.columns:
        so_features[prefix + "SO_numWindowsNonNaN"] = intraday_features.groupby(groupby_cols)[prefix + "meanHr"].apply(lambda x: (~np.isnan(x)).sum())

    # Turn the grouping keys from index levels back into ordinary columns.
    so_features.reset_index(inplace=True)

    return so_features
|
||||||
|
|
||||||
|
def get_sample_rate(data):  # To-Do get the sample rate information from the file's metadata
    """Estimate the sample rate from the mean spacing of ``data['timestamp']``.

    The rate is ``ceil(1000 / mean difference)``, i.e. the timestamps are
    treated as milliseconds (presumably epoch milliseconds - TODO confirm).

    Args:
        data: ``pandas.DataFrame`` with a numeric ``timestamp`` column.

    Returns:
        The estimated sample rate as an ``int``.

    Raises:
        Exception: the mean timestamp difference could not be computed
            (the original error is chained as the cause).
        ValueError: fewer than two timestamps are available or the mean
            difference is not positive, so no rate can be derived.
    """
    try:
        timestamps_diff = data['timestamp'].diff().dropna().mean()
        print("Timestamp diff:", timestamps_diff)
    except Exception as err:
        raise Exception("Error occured while trying to get the mean sample rate from the data.") from err

    # An empty diff yields NaN and identical timestamps yield 0; both would
    # otherwise surface as an obscure error from the division or from ceil().
    if not m.isfinite(timestamps_diff) or timestamps_diff <= 0:
        raise ValueError("Cannot derive a sample rate: the mean timestamp difference is not a positive number.")

    return m.ceil(1000/timestamps_diff)
|
|
@ -0,0 +1,75 @@
|
||||||
|
import pandas as pd
|
||||||
|
from scipy.stats import entropy
|
||||||
|
|
||||||
|
from cr_features.helper_functions import convert_to2d, accelerometer_features, frequency_features
|
||||||
|
from cr_features.calculate_features_old import calculateFeatures
|
||||||
|
from cr_features.calculate_features import calculate_features
|
||||||
|
from cr_features_helper_methods import extract_second_order_features
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def extract_acc_features_from_intraday_data(acc_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    """Compute the requested accelerometer features for every segment.

    Args:
        acc_intraday_data: ``pandas.DataFrame`` with the raw samples; the
            columns ``double_values_0``, ``double_values_1`` and
            ``double_values_2`` are passed to ``calculate_features``.
        features: list of feature names to compute.
        window_length: window length that is multiplied by the sample rate to
            get the row width for ``convert_to2d``; ``None`` uses the whole
            segment as a single window.
        time_segment: time segment handed to ``filter_data_by_segment``.
        filter_data_by_segment: callable ``(data, time_segment)`` returning
            the rows that belong to the segment.

    Returns:
        A ``pandas.DataFrame`` with the computed features, or an empty frame
        with the columns ``["local_segment"] + features`` when there is no
        data before or after filtering.
    """
    no_features = pd.DataFrame(columns=["local_segment"] + features)

    if acc_intraday_data.empty:
        return no_features

    segment_data = filter_data_by_segment(acc_intraday_data, time_segment)
    if segment_data.empty:
        return no_features

    # Hard-coded sampling frequency, passed to calculate_features as ``fs``.
    sample_rate = 32

    def features_of_segment(segment):
        # One window over the whole segment unless a window length was requested.
        if window_length is None:
            width = segment.shape[0]
        else:
            width = window_length*sample_rate
        axis_0 = convert_to2d(segment['double_values_0'], width)
        axis_1 = convert_to2d(segment['double_values_1'], width)
        axis_2 = convert_to2d(segment['double_values_2'], width)
        # apply methods from calculate features module
        return calculate_features(axis_0, axis_1, axis_2,
                                  fs=sample_rate, feature_names=features, show_progress=False)

    computed = segment_data.groupby('local_segment').apply(features_of_segment)
    computed.reset_index(inplace=True)

    return computed
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute the CR provider's accelerometer features for one time segment.

    Parameters
    ----------
    sensor_data_files : dict
        ``sensor_data_files["sensor_data"]`` is handed to ``pd.read_csv``.
    time_segment
        Forwarded to ``extract_acc_features_from_intraday_data``.
    provider : dict
        Provider configuration. ``FEATURES`` is required. ``WINDOWS`` (``COMPUTE``,
        ``WINDOW_LENGTH``, ``SECOND_ORDER_FEATURES``) is only consulted when
        ``calc_windows`` is true.
    filter_data_by_segment : callable
        Forwarded to ``extract_acc_features_from_intraday_data``.
    **kwargs
        ``calc_windows`` (default ``False``): also compute second order features.

    Returns
    -------
    pd.DataFrame or tuple of pd.DataFrame
        The feature frame, or ``(features, second_order_features)`` when
        ``calc_windows`` is true.
    """
    data_types = {
        'local_timezone': 'str', 'device_id': 'str', 'timestamp': 'int64',
        'double_values_0': 'float64', 'double_values_1': 'float64', 'double_values_2': 'float64',
        'local_date_time': 'str', 'local_date': "str", 'local_time': "str",
        'local_hour': "str", 'local_minute': "str", 'assigned_segments': "str",
    }
    acc_intraday_data = pd.read_csv(sensor_data_files["sensor_data"], dtype=data_types)

    requested_intraday_features = provider["FEATURES"]
    calc_windows = kwargs.get('calc_windows', False)

    # Only look at the WINDOWS section when windows were requested, so providers
    # configured without that section still work in the non-window mode.
    windows_config = provider.get("WINDOWS") or {}
    if calc_windows and windows_config.get("COMPUTE", False):
        requested_window_length = windows_config["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # name of the features this function can compute
    base_intraday_features_names = set(accelerometer_features + frequency_features)
    # the subset of requested features this function can compute; the configured order is
    # kept (duplicates dropped) so the result does not depend on string hash ordering
    intraday_features_to_compute = [
        name for name in dict.fromkeys(requested_intraday_features)
        if name in base_intraday_features_names
    ]

    # extract features from intraday data
    acc_intraday_features = extract_acc_features_from_intraday_data(
        acc_intraday_data, intraday_features_to_compute,
        requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        acc_second_order_features = extract_second_order_features(acc_intraday_features, so_features_names)
        return acc_intraday_features, acc_second_order_features

    return acc_intraday_features
|
|
@ -0,0 +1,73 @@
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
|
||||||
|
from cr_features.helper_functions import convert_to2d, hrv_features
|
||||||
|
from cr_features.hrv import extract_hrv_features_2d_wrapper
|
||||||
|
from cr_features_helper_methods import extract_second_order_features
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# pd.set_option('display.max_rows', 1000)
|
||||||
|
pd.set_option('display.max_columns', None)
|
||||||
|
|
||||||
|
def extract_bvp_features_from_intraday_data(bvp_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    """Compute HRV features from blood volume pulse data for every ``local_segment``.

    Parameters
    ----------
    bvp_intraday_data : pd.DataFrame
        BVP rows; the ``blood_volume_pulse`` column is read here.
    features : list
        Feature names, forwarded to ``extract_hrv_features_2d_wrapper`` as ``feature_names``.
    window_length : number or None
        ``None`` treats all rows of a segment as one window; otherwise each window
        holds ``window_length * sample_rate`` samples.
    time_segment
        Passed unchanged to ``filter_data_by_segment``.
    filter_data_by_segment : callable
        Called as ``filter_data_by_segment(data, time_segment)``; the result is grouped
        by its ``local_segment`` column.

    Returns
    -------
    pd.DataFrame
        The grouped wrapper output with the index reset, or an empty frame with the
        columns ``["local_segment"] + features`` when there is nothing to process.
    """
    no_features = pd.DataFrame(columns=["local_segment"] + features)
    if bvp_intraday_data.empty:
        return no_features

    # Sampling rate is fixed here and handed to the HRV wrapper as ``sampling``.
    sample_rate = 64

    segment_data = filter_data_by_segment(bvp_intraday_data, time_segment)
    if segment_data.empty:
        return no_features

    def features_for_segment(group):
        # Without a window length the whole group is a single window.
        if window_length is None:
            samples_per_window = group.shape[0]
        else:
            samples_per_window = window_length * sample_rate
        signal = convert_to2d(group['blood_volume_pulse'], samples_per_window)
        return extract_hrv_features_2d_wrapper(
            signal,
            sampling=sample_rate, hampel_fiter=False, median_filter=False,
            mod_z_score_filter=True, feature_names=features)

    # apply methods from calculate features module
    bvp_intraday_features = segment_data.groupby('local_segment').apply(features_for_segment)
    bvp_intraday_features.reset_index(inplace=True)
    return bvp_intraday_features
|
||||||
|
|
||||||
|
|
||||||
|
def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute the CR provider's blood volume pulse (HRV) features for one time segment.

    Parameters
    ----------
    sensor_data_files : dict
        ``sensor_data_files["sensor_data"]`` is handed to ``pd.read_csv``.
    time_segment
        Forwarded to ``extract_bvp_features_from_intraday_data``.
    provider : dict
        Provider configuration. ``FEATURES`` is required. ``WINDOWS`` (``COMPUTE``,
        ``WINDOW_LENGTH``, ``SECOND_ORDER_FEATURES``) is only consulted when
        ``calc_windows`` is true.
    filter_data_by_segment : callable
        Forwarded to ``extract_bvp_features_from_intraday_data``.
    **kwargs
        ``calc_windows`` (default ``False``): also compute second order features.

    Returns
    -------
    pd.DataFrame or tuple of pd.DataFrame
        The feature frame, or ``(features, second_order_features)`` when
        ``calc_windows`` is true.
    """
    bvp_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_intraday_features = provider["FEATURES"]
    calc_windows = kwargs.get('calc_windows', False)

    # Only look at the WINDOWS section when windows were requested, so providers
    # configured without that section still work in the non-window mode.
    windows_config = provider.get("WINDOWS") or {}
    if calc_windows and windows_config.get("COMPUTE", False):
        requested_window_length = windows_config["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # name of the features this function can compute
    base_intraday_features_names = set(hrv_features)
    # the subset of requested features this function can compute; the configured order is
    # kept (duplicates dropped) so the result does not depend on string hash ordering
    intraday_features_to_compute = [
        name for name in dict.fromkeys(requested_intraday_features)
        if name in base_intraday_features_names
    ]

    # extract features from intraday data
    bvp_intraday_features = extract_bvp_features_from_intraday_data(
        bvp_intraday_data, intraday_features_to_compute,
        requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        bvp_second_order_features = extract_second_order_features(bvp_intraday_features, so_features_names)
        return bvp_intraday_features, bvp_second_order_features

    return bvp_intraday_features
|
|
@ -0,0 +1,32 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import sys, yaml
|
||||||
|
|
||||||
|
def calculate_empatica_data_yield(features):  # TODO
    """Add an ``empatica_data_yield`` column to ``features``.

    For each listed Empatica sensor the yield is
    ``<sensor>_cr_SO_windowsCount * WINDOW_LENGTH / segment duration in seconds``, capped
    at 1. A sensor whose windows-count column is missing contributes 0. The overall yield
    is the row-wise mean of the per-sensor yields, with NaN replaced by 0.

    Parameters
    ----------
    features : pd.DataFrame
        Must contain ``local_segment_start_datetime`` and ``local_segment_end_datetime``
        formatted as ``%Y-%m-%d %H:%M:%S``. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``features`` object with ``empatica_data_yield`` added.

    Notes
    -----
    ``WINDOW_LENGTH`` is read per sensor from ``config.yaml`` in the current working
    directory (``config[sensor]["PROVIDERS"]["CR"]["WINDOWS"]["WINDOW_LENGTH"]``).
    """
    # Get time segment duration in seconds from all segments in features dataframe
    datetime_start = pd.to_datetime(features['local_segment_start_datetime'], format='%Y-%m-%d %H:%M:%S')
    datetime_end = pd.to_datetime(features['local_segment_end_datetime'], format='%Y-%m-%d %H:%M:%S')
    tseg_duration = (datetime_end - datetime_start).dt.total_seconds()

    with open('config.yaml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    sensors = ["EMPATICA_ACCELEROMETER", "EMPATICA_TEMPERATURE", "EMPATICA_ELECTRODERMAL_ACTIVITY", "EMPATICA_INTER_BEAT_INTERVAL"]
    empatica_data_yield_cols = []
    for sensor in sensors:
        sensor_prefix = sensor.lower()
        windows_count_col = f"{sensor_prefix}_cr_SO_windowsCount"
        yield_col = f"{sensor_prefix}_data_yield"
        empatica_data_yield_cols.append(yield_col)

        if windows_count_col in features:
            window_length = config[sensor]["PROVIDERS"]["CR"]["WINDOWS"]["WINDOW_LENGTH"]
            features[yield_col] = (features[windows_count_col] * window_length) / tseg_duration
        else:
            features[yield_col] = 0

    # Assigns 1 to values that are over 1 (in case of windows not being filled fully);
    # clip leaves NaN untouched.
    features[empatica_data_yield_cols] = features[empatica_data_yield_cols].clip(upper=1)

    features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1, numeric_only=True).fillna(0)
    # The per-sensor columns are dropped; keep them if more advanced operations
    # (e.g., a weighted average) are needed later.
    features.drop(empatica_data_yield_cols, axis=1, inplace=True)

    return features
|
|
@ -0,0 +1,82 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from scipy.stats import entropy
|
||||||
|
|
||||||
|
from cr_features.helper_functions import convert_to2d, gsr_features
|
||||||
|
from cr_features.calculate_features import calculate_features
|
||||||
|
from cr_features.gsr import extractGsrFeatures2D
|
||||||
|
from cr_features_helper_methods import extract_second_order_features
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
#pd.set_option('display.max_columns', None)
|
||||||
|
#pd.set_option('display.max_rows', None)
|
||||||
|
#np.seterr(invalid='ignore')
|
||||||
|
|
||||||
|
|
||||||
|
def extract_eda_features_from_intraday_data(eda_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    """Compute electrodermal activity (GSR) features for every ``local_segment``.

    Parameters
    ----------
    eda_intraday_data : pd.DataFrame
        EDA rows; the ``electrodermal_activity`` column is read here.
    features : list
        Feature names, forwarded to ``extractGsrFeatures2D`` as ``featureNames``.
    window_length : number or None
        ``None`` treats all rows of a segment as one window; otherwise each window
        holds ``window_length * sample_rate`` samples.
    time_segment
        Passed unchanged to ``filter_data_by_segment``.
    filter_data_by_segment : callable
        Called as ``filter_data_by_segment(data, time_segment)``; the result is grouped
        by its ``local_segment`` column.

    Returns
    -------
    pd.DataFrame
        The grouped ``extractGsrFeatures2D`` output with the index reset, or an empty frame
        with the columns ``["local_segment"] + features`` when there is nothing to process.
    """
    no_features = pd.DataFrame(columns=["local_segment"] + features)
    if eda_intraday_data.empty:
        return no_features

    # Sampling rate is fixed here and handed to extractGsrFeatures2D as ``sampleRate``.
    sample_rate = 4

    segment_data = filter_data_by_segment(eda_intraday_data, time_segment)
    if segment_data.empty:
        return no_features

    def features_for_segment(group):
        # Without a window length the whole group is a single window.
        if window_length is None:
            samples_per_window = group.shape[0]
        else:
            samples_per_window = window_length * sample_rate
        signal = convert_to2d(group['electrodermal_activity'], samples_per_window)
        return extractGsrFeatures2D(
            signal, sampleRate=sample_rate, featureNames=features,
            threshold=.01, offset=1, riseTime=5, decayTime=15)

    # apply methods from calculate features module
    eda_intraday_features = segment_data.groupby('local_segment').apply(features_for_segment)
    eda_intraday_features.reset_index(inplace=True)
    return eda_intraday_features
|
||||||
|
|
||||||
|
|
||||||
|
def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute the CR provider's electrodermal activity features for one time segment.

    Parameters
    ----------
    sensor_data_files : dict
        ``sensor_data_files["sensor_data"]`` is handed to ``pd.read_csv``.
    time_segment
        Forwarded to ``extract_eda_features_from_intraday_data``.
    provider : dict
        Provider configuration. ``FEATURES`` is required. ``WINDOWS`` (``COMPUTE``,
        ``WINDOW_LENGTH``, ``IMPUTE_NANS``, ``SECOND_ORDER_FEATURES``) is only consulted
        when ``calc_windows`` is true.
    filter_data_by_segment : callable
        Forwarded to ``extract_eda_features_from_intraday_data``.
    **kwargs
        ``calc_windows`` (default ``False``): also compute second order features.

    Returns
    -------
    pd.DataFrame or tuple of pd.DataFrame
        The feature frame, or ``(features, second_order_features)`` when
        ``calc_windows`` is true.
    """
    data_types = {
        'local_timezone': 'str', 'device_id': 'str', 'timestamp': 'int64',
        'electrodermal_activity': 'float64', 'local_date_time': 'str',
        'local_date': "str", 'local_time': "str", 'local_hour': "str",
        'local_minute': "str", 'assigned_segments': "str",
    }
    eda_intraday_data = pd.read_csv(sensor_data_files["sensor_data"], dtype=data_types)

    requested_intraday_features = provider["FEATURES"]
    calc_windows = kwargs.get('calc_windows', False)

    # Only look at the WINDOWS section when windows were requested, so providers
    # configured without that section still work in the non-window mode.
    windows_config = provider.get("WINDOWS") or {}
    if calc_windows and windows_config.get("COMPUTE", False):
        requested_window_length = windows_config["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # name of the features this function can compute
    base_intraday_features_names = set(gsr_features)
    # the subset of requested features this function can compute; the configured order is
    # kept (duplicates dropped) so the result does not depend on string hash ordering
    intraday_features_to_compute = [
        name for name in dict.fromkeys(requested_intraday_features)
        if name in base_intraday_features_names
    ]

    # extract features from intraday data
    eda_intraday_features = extract_eda_features_from_intraday_data(
        eda_intraday_data, intraday_features_to_compute,
        requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        if windows_config.get("IMPUTE_NANS", False):
            # Rows with zero detected peaks get their NaN features replaced by 0;
            # rows that do have peaks keep their NaNs.
            no_peaks = eda_intraday_features["numPeaks"] == 0
            eda_intraday_features[no_peaks] = eda_intraday_features[no_peaks].fillna(0)

        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        eda_second_order_features = extract_second_order_features(eda_intraday_features, so_features_names)

        return eda_intraday_features, eda_second_order_features

    return eda_intraday_features
|
|
@ -0,0 +1,83 @@
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from cr_features.helper_functions import convert_ibi_to2d_time, hrv_features
|
||||||
|
from cr_features.hrv import extract_hrv_features_2d_wrapper, get_HRV_features
|
||||||
|
from cr_features_helper_methods import extract_second_order_features
|
||||||
|
|
||||||
|
import math
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# pd.set_option('display.max_rows', 1000)
|
||||||
|
pd.set_option('display.max_columns', None)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ibi_features_from_intraday_data(ibi_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    """Compute HRV features from inter-beat-interval data for every ``local_segment``.

    Parameters
    ----------
    ibi_intraday_data : pd.DataFrame
        IBI rows; the ``timings`` and ``inter_beat_interval`` columns are read here.
    features : list
        Feature names, forwarded to ``extract_hrv_features_2d_wrapper`` as ``feature_names``.
    window_length : number or None
        Window size handed to ``convert_ibi_to2d_time``. With ``None`` the size is the
        last ``timings`` value of the segment rounded up, i.e. one window per segment.
    time_segment
        Passed unchanged to ``filter_data_by_segment``.
    filter_data_by_segment : callable
        Called as ``filter_data_by_segment(data, time_segment)``; the result is grouped
        by its ``local_segment`` column.

    Returns
    -------
    pd.DataFrame
        The grouped wrapper output with the index reset, or an empty frame with the
        columns ``["local_segment"] + features`` when there is nothing to process.
    """
    no_features = pd.DataFrame(columns=["local_segment"] + features)
    if ibi_intraday_data.empty:
        return no_features

    segment_data = filter_data_by_segment(ibi_intraday_data, time_segment)
    if segment_data.empty:
        return no_features

    def features_for_segment(group):
        if window_length is None:
            window_size = math.ceil(group['timings'].iloc[-1])
        else:
            window_size = window_length
        # Convert once and reuse both parts of the result instead of running the
        # conversion separately for the signal and for the timings.
        converted = convert_ibi_to2d_time(group[['timings', 'inter_beat_interval']], window_size)
        return extract_hrv_features_2d_wrapper(
            signal_2D=converted[0],
            ibi_timings=converted[1],
            sampling=None, hampel_fiter=False, median_filter=False,
            mod_z_score_filter=True, feature_names=features)

    # apply methods from calculate features module
    ibi_intraday_features = segment_data.groupby('local_segment').apply(features_for_segment)
    ibi_intraday_features.reset_index(inplace=True)
    return ibi_intraday_features
|
||||||
|
|
||||||
|
|
||||||
|
def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute the CR provider's inter-beat-interval (HRV) features for one time segment.

    Parameters
    ----------
    sensor_data_files : dict
        ``sensor_data_files["sensor_data"]`` is handed to ``pd.read_csv``.
    time_segment
        Forwarded to ``extract_ibi_features_from_intraday_data``.
    provider : dict
        Provider configuration. ``FEATURES`` is required. ``WINDOWS`` (``COMPUTE``,
        ``WINDOW_LENGTH``, ``SECOND_ORDER_FEATURES``) is only consulted when
        ``calc_windows`` is true.
    filter_data_by_segment : callable
        Forwarded to ``extract_ibi_features_from_intraday_data``.
    **kwargs
        ``calc_windows`` (default ``False``): also compute second order features.

    Returns
    -------
    pd.DataFrame or tuple of pd.DataFrame
        The feature frame, or ``(features, second_order_features)`` when
        ``calc_windows`` is true.
    """
    data_types = {
        'local_timezone': 'str', 'device_id': 'str', 'timestamp': 'int64',
        'inter_beat_interval': 'float64', 'timings': 'float64', 'local_date_time': 'str',
        'local_date': "str", 'local_time': "str", 'local_hour': "str",
        'local_minute': "str", 'assigned_segments': "str",
    }
    ibi_intraday_data = pd.read_csv(sensor_data_files["sensor_data"], dtype=data_types)

    requested_intraday_features = provider["FEATURES"]
    calc_windows = kwargs.get('calc_windows', False)

    # Only look at the WINDOWS section when windows were requested, so providers
    # configured without that section still work in the non-window mode.
    windows_config = provider.get("WINDOWS") or {}
    if calc_windows and windows_config.get("COMPUTE", False):
        requested_window_length = windows_config["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # name of the features this function can compute
    base_intraday_features_names = set(hrv_features)
    # the subset of requested features this function can compute; the configured order is
    # kept (duplicates dropped) so the result does not depend on string hash ordering
    intraday_features_to_compute = [
        name for name in dict.fromkeys(requested_intraday_features)
        if name in base_intraday_features_names
    ]

    # extract features from intraday data
    ibi_intraday_features = extract_ibi_features_from_intraday_data(
        ibi_intraday_data, intraday_features_to_compute,
        requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        ibi_second_order_features = extract_second_order_features(ibi_intraday_features, so_features_names)

        return ibi_intraday_features, ibi_second_order_features

    return ibi_intraday_features
|
|
@ -0,0 +1,68 @@
|
||||||
|
import pandas as pd
|
||||||
|
from scipy.stats import entropy
|
||||||
|
|
||||||
|
from cr_features.helper_functions import convert_to2d, generic_features
|
||||||
|
from cr_features.calculate_features_old import calculateFeatures
|
||||||
|
from cr_features.calculate_features import calculate_features
|
||||||
|
from cr_features_helper_methods import extract_second_order_features
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def extract_temp_features_from_intraday_data(temperature_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    """Compute temperature features for every ``local_segment`` of one time segment.

    Parameters
    ----------
    temperature_intraday_data : pd.DataFrame
        Temperature rows; the ``temperature`` column is read here.
    features : list
        Feature names, forwarded to ``calculate_features`` as ``feature_names``.
    window_length : number or None
        ``None`` treats all rows of a segment as one window; otherwise each window
        holds ``window_length * sample_rate`` samples.
    time_segment
        Passed unchanged to ``filter_data_by_segment``.
    filter_data_by_segment : callable
        Called as ``filter_data_by_segment(data, time_segment)``; the result is grouped
        by its ``local_segment`` column.

    Returns
    -------
    pd.DataFrame
        The grouped ``calculate_features`` output with the index reset, or an empty frame
        with the columns ``["local_segment"] + features`` when there is nothing to process.
    """
    no_features = pd.DataFrame(columns=["local_segment"] + features)
    if temperature_intraday_data.empty:
        return no_features

    # Sampling rate is fixed here and handed to calculate_features as ``fs``.
    sample_rate = 4

    segment_data = filter_data_by_segment(temperature_intraday_data, time_segment)
    if segment_data.empty:
        return no_features

    def features_for_segment(group):
        # Without a window length the whole group is a single window.
        if window_length is None:
            samples_per_window = group.shape[0]
        else:
            samples_per_window = window_length * sample_rate
        signal = convert_to2d(group['temperature'], samples_per_window)
        return calculate_features(signal, fs=sample_rate, feature_names=features, show_progress=False)

    # apply methods from calculate features module
    temperature_intraday_features = segment_data.groupby('local_segment').apply(features_for_segment)
    temperature_intraday_features.reset_index(inplace=True)
    return temperature_intraday_features
|
||||||
|
|
||||||
|
|
||||||
|
def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute the CR provider's temperature features for one time segment.

    Parameters
    ----------
    sensor_data_files : dict
        ``sensor_data_files["sensor_data"]`` is handed to ``pd.read_csv``.
    time_segment
        Forwarded to ``extract_temp_features_from_intraday_data``.
    provider : dict
        Provider configuration. ``FEATURES`` is required. ``WINDOWS`` (``COMPUTE``,
        ``WINDOW_LENGTH``, ``SECOND_ORDER_FEATURES``) is only consulted when
        ``calc_windows`` is true.
    filter_data_by_segment : callable
        Forwarded to ``extract_temp_features_from_intraday_data``.
    **kwargs
        ``calc_windows`` (default ``False``): also compute second order features.

    Returns
    -------
    pd.DataFrame or tuple of pd.DataFrame
        The feature frame, or ``(features, second_order_features)`` when
        ``calc_windows`` is true.
    """
    data_types = {
        'local_timezone': 'str', 'device_id': 'str', 'timestamp': 'int64',
        'temperature': 'float64', 'local_date_time': 'str',
        'local_date': "str", 'local_time': "str", 'local_hour': "str",
        'local_minute': "str", 'assigned_segments': "str",
    }
    temperature_intraday_data = pd.read_csv(sensor_data_files["sensor_data"], dtype=data_types)

    requested_intraday_features = provider["FEATURES"]
    calc_windows = kwargs.get('calc_windows', False)

    # Only look at the WINDOWS section when windows were requested, so providers
    # configured without that section still work in the non-window mode.
    windows_config = provider.get("WINDOWS") or {}
    if calc_windows and windows_config.get("COMPUTE", False):
        requested_window_length = windows_config["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # name of the features this function can compute
    base_intraday_features_names = set(generic_features)
    # the subset of requested features this function can compute; the configured order is
    # kept (duplicates dropped) so the result does not depend on string hash ordering
    intraday_features_to_compute = [
        name for name in dict.fromkeys(requested_intraday_features)
        if name in base_intraday_features_names
    ]

    # extract features from intraday data
    temperature_intraday_features = extract_temp_features_from_intraday_data(
        temperature_intraday_data, intraday_features_to_compute,
        requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        temperature_second_order_features = extract_second_order_features(temperature_intraday_features, so_features_names)
        return temperature_intraday_features, temperature_second_order_features

    return temperature_intraday_features
|
|
@ -1,19 +1,38 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from utils.utils import fetch_provider_features, run_provider_cleaning_script
|
from utils.utils import fetch_provider_features, run_provider_cleaning_script
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
sensor_data_files = dict(snakemake.input)
|
sensor_data_files = dict(snakemake.input)
|
||||||
|
|
||||||
provider = snakemake.params["provider"]
|
provider = snakemake.params["provider"]
|
||||||
provider_key = snakemake.params["provider_key"]
|
provider_key = snakemake.params["provider_key"]
|
||||||
sensor_key = snakemake.params["sensor_key"]
|
sensor_key = snakemake.params["sensor_key"]
|
||||||
|
|
||||||
|
calc_windows = True if (provider.get("WINDOWS", False) and provider["WINDOWS"].get("COMPUTE", False)) else False
|
||||||
|
|
||||||
if sensor_key == "all_cleaning_individual" or sensor_key == "all_cleaning_overall":
|
if sensor_key == "all_cleaning_individual" or sensor_key == "all_cleaning_overall":
|
||||||
# Data cleaning
|
# Data cleaning
|
||||||
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
|
if "overall" in sensor_key:
|
||||||
|
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files, snakemake.params["target"])
|
||||||
|
else:
|
||||||
|
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
|
||||||
else:
|
else:
|
||||||
# Extract sensor features
|
# Extract sensor features
|
||||||
del sensor_data_files["time_segments_labels"]
|
del sensor_data_files["time_segments_labels"]
|
||||||
time_segments_file = snakemake.input["time_segments_labels"]
|
time_segments_file = snakemake.input["time_segments_labels"]
|
||||||
sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)
|
|
||||||
|
|
||||||
sensor_features.to_csv(snakemake.output[0], index=False)
|
if calc_windows:
|
||||||
|
window_features, second_order_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=True)
|
||||||
|
|
||||||
|
window_features.to_csv(snakemake.output[1], index=False)
|
||||||
|
second_order_features.to_csv(snakemake.output[0], index=False)
|
||||||
|
|
||||||
|
elif "empatica" in sensor_key:
|
||||||
|
pd.DataFrame().to_csv(snakemake.output[1], index=False)
|
||||||
|
|
||||||
|
if not calc_windows:
|
||||||
|
sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=False)
|
||||||
|
|
||||||
|
if not calc_windows:
|
||||||
|
sensor_features.to_csv(snakemake.output[0], index=False)
|
||||||
|
|
|
@ -37,6 +37,6 @@ def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_se
|
||||||
ar_features.index.names = ["local_segment"]
|
ar_features.index.names = ["local_segment"]
|
||||||
ar_features = ar_features.reset_index()
|
ar_features = ar_features.reset_index()
|
||||||
|
|
||||||
ar_features.fillna(value={"count": 0, "countuniqueactivities": 0, "durationstationary": 0, "durationmobile": 0, "durationvehicle": 0}, inplace=True)
|
ar_features.fillna(value={"count": 0, "countuniqueactivities": 0, "durationstationary": 0, "durationmobile": 0, "durationvehicle": 0, "mostcommonactivity": 4}, inplace=True)
|
||||||
|
|
||||||
return ar_features
|
return ar_features
|
||||||
|
|
|
@ -9,19 +9,19 @@ def compute_features(filtered_data, apps_type, requested_features, apps_features
|
||||||
if "timeoffirstuse" in requested_features:
|
if "timeoffirstuse" in requested_features:
|
||||||
time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
|
time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
|
||||||
if time_first_event.empty:
|
if time_first_event.empty:
|
||||||
apps_features["timeoffirstuse" + apps_type] = np.nan
|
apps_features["timeoffirstuse" + apps_type] = 1500 # np.nan
|
||||||
else:
|
else:
|
||||||
apps_features["timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
|
apps_features["timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
|
||||||
if "timeoflastuse" in requested_features:
|
if "timeoflastuse" in requested_features:
|
||||||
time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
|
time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
|
||||||
if time_last_event.empty:
|
if time_last_event.empty:
|
||||||
apps_features["timeoflastuse" + apps_type] = np.nan
|
apps_features["timeoflastuse" + apps_type] = 1500 # np.nan
|
||||||
else:
|
else:
|
||||||
apps_features["timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
|
apps_features["timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
|
||||||
if "frequencyentropy" in requested_features:
|
if "frequencyentropy" in requested_features:
|
||||||
apps_with_count = filtered_data.groupby(["local_segment","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
|
apps_with_count = filtered_data.groupby(["local_segment","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
|
||||||
if (len(apps_with_count.index) < 2 ):
|
if (len(apps_with_count.index) < 2 ):
|
||||||
apps_features["frequencyentropy" + apps_type] = np.nan
|
apps_features["frequencyentropy" + apps_type] = 0 # np.nan
|
||||||
else:
|
else:
|
||||||
apps_features["frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy)
|
apps_features["frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy)
|
||||||
if "countevent" in requested_features:
|
if "countevent" in requested_features:
|
||||||
|
@ -43,6 +43,7 @@ def compute_features(filtered_data, apps_type, requested_features, apps_features
|
||||||
apps_features["sumduration" + apps_type] = filtered_data.groupby(by = ["local_segment"])["duration"].sum()
|
apps_features["sumduration" + apps_type] = filtered_data.groupby(by = ["local_segment"])["duration"].sum()
|
||||||
|
|
||||||
apps_features.index.names = ["local_segment"]
|
apps_features.index.names = ["local_segment"]
|
||||||
|
|
||||||
return apps_features
|
return apps_features
|
||||||
|
|
||||||
def process_app_features(data, requested_features, time_segment, provider, filter_data_by_segment):
|
def process_app_features(data, requested_features, time_segment, provider, filter_data_by_segment):
|
||||||
|
|
|
@ -14,8 +14,8 @@ def deviceFeatures(devices, ownership, common_devices, features_to_compute, feat
|
||||||
features = features.join(device_value_counts.groupby("local_segment")["bt_address"].nunique().to_frame("uniquedevices" + ownership), how="outer")
|
features = features.join(device_value_counts.groupby("local_segment")["bt_address"].nunique().to_frame("uniquedevices" + ownership), how="outer")
|
||||||
if "meanscans" in features_to_compute:
|
if "meanscans" in features_to_compute:
|
||||||
features = features.join(device_value_counts.groupby("local_segment")["scans"].mean().to_frame("meanscans" + ownership), how="outer")
|
features = features.join(device_value_counts.groupby("local_segment")["scans"].mean().to_frame("meanscans" + ownership), how="outer")
|
||||||
if "stdscans" in features_to_compute:
|
if "stdscans" in features_to_compute:
|
||||||
features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership), how="outer")
|
features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership).fillna(0), how="outer")
|
||||||
# Most frequent device within segments, across segments, and across dataset
|
# Most frequent device within segments, across segments, and across dataset
|
||||||
if "countscansmostfrequentdevicewithinsegments" in features_to_compute:
|
if "countscansmostfrequentdevicewithinsegments" in features_to_compute:
|
||||||
features = features.join(device_value_counts.groupby("local_segment")["scans"].max().to_frame("countscansmostfrequentdevicewithinsegments" + ownership), how="outer")
|
features = features.join(device_value_counts.groupby("local_segment")["scans"].max().to_frame("countscansmostfrequentdevicewithinsegments" + ownership), how="outer")
|
||||||
|
|
|
@ -88,6 +88,16 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
||||||
features <- call_features_of_type(calls_of_type, features_type, call_type, time_segment, requested_features)
|
features <- call_features_of_type(calls_of_type, features_type, call_type, time_segment, requested_features)
|
||||||
call_features <- merge(call_features, features, all=TRUE)
|
call_features <- merge(call_features, features, all=TRUE)
|
||||||
}
|
}
|
||||||
call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count") | contains("sumduration") | contains("minduration") | contains("maxduration") | contains("meanduration") | contains("modeduration")), list( ~ replace_na(., 0)))
|
|
||||||
|
# Fill selected columns with a high number
|
||||||
|
time_cols <- select(call_features, contains("timefirstcall") | contains("timelastcall")) %>%
|
||||||
|
colnames(.)
|
||||||
|
|
||||||
|
call_features <- call_features %>%
|
||||||
|
mutate_at(., time_cols, ~replace(., is.na(.), 1500))
|
||||||
|
|
||||||
|
# Fill NA values with 0
|
||||||
|
call_features <- call_features %>% mutate_all(~replace(., is.na(.), 0))
|
||||||
|
|
||||||
return(call_features)
|
return(call_features)
|
||||||
}
|
}
|
|
@ -3,9 +3,11 @@ library(tidyr)
|
||||||
library(readr)
|
library(readr)
|
||||||
|
|
||||||
compute_data_yield_features <- function(data, feature_name, time_segment, provider){
|
compute_data_yield_features <- function(data, feature_name, time_segment, provider){
|
||||||
|
|
||||||
data <- data %>% filter_data_by_segment(time_segment)
|
data <- data %>% filter_data_by_segment(time_segment)
|
||||||
if(nrow(data) == 0)
|
if(nrow(data) == 0){
|
||||||
return(tibble(local_segment = character(), ratiovalidyieldedminutes = numeric(), ratiovalidyieldedhours = numeric()))
|
return(tibble(local_segment = character(), ratiovalidyieldedminutes = numeric(), ratiovalidyieldedhours = numeric()))
|
||||||
|
}
|
||||||
features <- data %>%
|
features <- data %>%
|
||||||
separate(timestamps_segment, into = c("start_timestamp", "end_timestamp"), convert = T, sep = ",") %>%
|
separate(timestamps_segment, into = c("start_timestamp", "end_timestamp"), convert = T, sep = ",") %>%
|
||||||
mutate(duration_minutes = (end_timestamp - start_timestamp) / 60000,
|
mutate(duration_minutes = (end_timestamp - start_timestamp) / 60000,
|
||||||
|
|
|
@ -0,0 +1,274 @@
|
||||||
|
from collections.abc import Collection
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from pytz import timezone
|
||||||
|
import datetime, json
|
||||||
|
|
||||||
|
# from config.models import ESM, Participant
|
||||||
|
# from features import helper
|
||||||
|
|
||||||
|
ESM_STATUS_ANSWERED = 2
|
||||||
|
|
||||||
|
GROUP_SESSIONS_BY = ["device_id", "esm_session"] # 'participant_id
|
||||||
|
|
||||||
|
SESSION_STATUS_UNANSWERED = "ema_unanswered"
|
||||||
|
SESSION_STATUS_DAY_FINISHED = "day_finished"
|
||||||
|
SESSION_STATUS_COMPLETE = "ema_completed"
|
||||||
|
|
||||||
|
ANSWER_DAY_FINISHED = "DayFinished3421"
|
||||||
|
ANSWER_DAY_OFF = "DayOff3421"
|
||||||
|
ANSWER_SET_EVENING = "DayFinishedSetEvening"
|
||||||
|
|
||||||
|
MAX_MORNING_LENGTH = 3
|
||||||
|
# When the participant was not yet at work at the time of the first (morning) EMA,
|
||||||
|
# only three items were answered.
|
||||||
|
# Two sleep related items and one indicating NOT starting work yet.
|
||||||
|
# Daytime EMAs are all longer, in fact they always consist of at least 6 items.
|
||||||
|
|
||||||
|
|
||||||
|
TZ_LJ = timezone("Europe/Ljubljana")
|
||||||
|
COLUMN_TIMESTAMP = "timestamp"
|
||||||
|
COLUMN_TIMESTAMP_ESM = "double_esm_user_answer_timestamp"
|
||||||
|
|
||||||
|
|
||||||
|
def get_date_from_timestamp(df_aware) -> pd.DataFrame:
    """
    Transform a UNIX timestamp into a datetime (with Ljubljana timezone).
    Additionally, extract only the date part, where anything until 4 AM is considered the same day.

    Parameters
    ----------
    df_aware: pd.DataFrame
        Any AWARE-type data as defined in models.py.
        The timestamp is read from COLUMN_TIMESTAMP_ESM when that column is present
        and from COLUMN_TIMESTAMP otherwise; it is interpreted as milliseconds since the epoch.

    Returns
    -------
    df_aware: pd.DataFrame
        A new dataframe with datetime_lj and date_lj columns added.
        The dataframe passed in is not modified.
    """
    if COLUMN_TIMESTAMP_ESM in df_aware:
        column_timestamp = COLUMN_TIMESTAMP_ESM
    else:
        column_timestamp = COLUMN_TIMESTAMP

    # Vectorised conversion: it also yields a proper datetime column for an empty
    # frame, where a row-wise apply would break the .dt accessor used below.
    datetime_lj = pd.to_datetime(
        df_aware[column_timestamp], unit="ms", utc=True
    ).dt.tz_convert(TZ_LJ)

    # Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
    # the datetime is first translated to 4 h earlier.
    return df_aware.assign(
        datetime_lj=datetime_lj,
        date_lj=(datetime_lj - datetime.timedelta(hours=4)).dt.date,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Convert timestamps into human-readable datetimes and dates
    and expand the JSON column into several Pandas DF columns.

    Parameters
    ----------
    df_esm: pd.DataFrame
        A dataframe of esm data. It must contain an esm_json column holding JSON strings.

    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
        The index of the input dataframe is preserved.
    """
    df_esm = get_date_from_timestamp(df_esm)

    esm_json_records = df_esm["esm_json"].apply(json.loads).tolist()
    df_esm_json = pd.json_normalize(esm_json_records).drop(
        columns=["esm_trigger"], errors="ignore"
    )  # The esm_trigger column is already present in the main df.

    # json_normalize numbers its rows 0..n-1, so the two frames are paired by position.
    # Joining on the original index would mis-pair rows whenever df_esm does not
    # carry a default RangeIndex (e.g. after filtering or concatenation).
    original_index = df_esm.index
    df_esm_preprocessed = df_esm.reset_index(drop=True).join(df_esm_json)
    df_esm_preprocessed.index = original_index
    return df_esm_preprocessed
|
||||||
|
|
||||||
|
|
||||||
|
def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.
    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE

    This is done in three steps.

    First, the esm_status is considered.
    If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.

    Second, the sessions which do not represent full questionnaires are identified.
    These are sessions where participants only marked they are finished with the day or have not yet started working.
    This step overrides the status assigned in the first step.

    Third, the sessions with only one item are marked with their trigger.
    We never offered questionnaires with single items, so we can be sure these are unfinished.

    Finally, all sessions that remain are marked as completed.
    By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data, which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts: pd.DataFrame
        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
    """
    # Grouping keys as Series, so that derived boolean Series can be grouped the same way.
    session_keys = [df_esm_preprocessed[key] for key in GROUP_SESSIONS_BY]

    # 0. Count the items of every session and start with all session statuses missing.
    df_session_counts = (
        df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)["timestamp"]
        .count()
        .to_frame("esm_session_count")
    )
    # An object column from the start: the statuses are strings, and writing strings
    # into a float (NaN) column is deprecated in pandas.
    df_session_counts["session_response"] = pd.Series(
        np.nan, index=df_session_counts.index, dtype=object
    )

    # 1. Identify all sessions containing an ESM with status other than answered.
    esm_not_answered = (
        df_esm_preprocessed["esm_status"]
        .ne(ESM_STATUS_ANSWERED)
        .groupby(session_keys)
        .any()
    )
    df_session_counts.loc[
        esm_not_answered, "session_response"
    ] = SESSION_STATUS_UNANSWERED

    # 2. Identify non-sessions, i.e. answers about the end of the day.
    non_session = (
        df_esm_preprocessed["esm_user_answer"]
        .isin(
            [
                ANSWER_DAY_FINISHED,  # I finished working for today.
                ANSWER_DAY_OFF,  # I am not going to work today.
                ANSWER_SET_EVENING,  # When would you like to answer the evening EMA?
            ]
        )
        .groupby(session_keys)
        .any()
    )
    df_session_counts.loc[non_session, "session_response"] = SESSION_STATUS_DAY_FINISHED

    # 3. Identify sessions appearing only once, as those were not true EMAs for sure.
    #    They are labelled with the trigger of their single item.
    singleton_sessions = (df_session_counts.esm_session_count == 1) & (
        df_session_counts.session_response.isna()
    )
    trigger_by_session = df_esm_preprocessed.set_index(GROUP_SESSIONS_BY)["esm_trigger"]
    singleton_triggers = df_session_counts.loc[
        singleton_sessions, ["esm_session_count"]
    ].join(trigger_by_session, how="left")["esm_trigger"]
    if not singleton_triggers.empty:
        df_session_counts.loc[
            singleton_triggers.index, "session_response"
        ] = singleton_triggers

    # 4. Mark the remaining sessions as completed.
    df_session_counts.loc[
        df_session_counts.session_response.isna(), "session_response"
    ] = SESSION_STATUS_COMPLETE

    return df_session_counts
|
||||||
|
|
||||||
|
|
||||||
|
def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    Report, for every EMA session, its time type (the "time" column) and the datetime of its earliest entry.

    Rows are ordered by datetime_lj before grouping, and the first entry of each group is taken.
    Note that pandas' groupby first() skips missing values column by column.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data, which must include the session ID (esm_session)
        as well as the time and datetime_lj columns.

    Returns
    -------
    df_session_time: pd.DataFrame
        One row per session (grouped by GROUP_SESSIONS_BY) with the columns time and datetime_lj.
    """
    chronological = df_esm_preprocessed.sort_values(by=["datetime_lj"])  # "participant_id"
    sessions = chronological.groupby(GROUP_SESSIONS_BY)
    df_session_time = sessions[["time", "datetime_lj"]].first()
    return df_session_time
|
||||||
|
|
||||||
|
|
||||||
|
def classify_sessions_by_completion_time(
    df_esm_preprocessed: pd.DataFrame,
) -> pd.DataFrame:
    """
    Classify sessions by completion and by time, and "correct" the time type of some EMA sessions.

    A morning questionnaire could seamlessly transition into a daytime questionnaire,
    if the participant was already at work.
    In this case, the "time" label changed mid-session.
    Because of the way classify_sessions_by_time works, such a questionnaire is classified as "morning",
    although for all intents and purposes it can be treated as a "daytime" EMA.

    This scenario is told apart from a true "morning" questionnaire,
    where the participant was NOT yet at work, by the session length:
    a "morning" session with more than MAX_MORNING_LENGTH items is relabelled as "daytime".

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data, which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts_time: pd.DataFrame
        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
        their time type (with some morning EMAs reclassified) and timestamp of first answer.
    """
    completion = classify_sessions_by_completion(df_esm_preprocessed)
    timing = classify_sessions_by_time(df_esm_preprocessed)

    df_session_counts_time = timing.join(completion)

    labelled_morning = df_session_counts_time["time"] == "morning"
    longer_than_morning = (
        df_session_counts_time["esm_session_count"] > MAX_MORNING_LENGTH
    )
    df_session_counts_time.loc[
        labelled_morning & longer_than_morning, "time"
    ] = "daytime"

    return df_session_counts_time
|
||||||
|
|
||||||
|
|
||||||
|
# def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
# """
|
||||||
|
# This function eliminates invalid ESM responses.
|
||||||
|
# It removes unanswered ESMs and those that indicate end of work and similar.
|
||||||
|
# It also extracts a numeric answer from strings such as "4 - I strongly agree".
|
||||||
|
|
||||||
|
# Parameters
|
||||||
|
# ----------
|
||||||
|
# df_esm_preprocessed: pd.DataFrame
|
||||||
|
# A preprocessed dataframe of esm data.
|
||||||
|
|
||||||
|
# Returns
|
||||||
|
# -------
|
||||||
|
# df_esm_clean: pd.DataFrame
|
||||||
|
# A subset of the original dataframe.
|
||||||
|
|
||||||
|
# """
|
||||||
|
# df_esm_clean = df_esm_preprocessed[
|
||||||
|
# df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED
|
||||||
|
# ]
|
||||||
|
# df_esm_clean = df_esm_clean[
|
||||||
|
# ~df_esm_clean["esm_user_answer"].isin(
|
||||||
|
# [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING]
|
||||||
|
# )
|
||||||
|
# ]
|
||||||
|
# df_esm_clean["esm_user_answer_numeric"] = np.nan
|
||||||
|
# esm_type_numeric = [
|
||||||
|
# ESM.ESM_TYPE.get("radio"),
|
||||||
|
# ESM.ESM_TYPE.get("scale"),
|
||||||
|
# ESM.ESM_TYPE.get("number"),
|
||||||
|
# ]
|
||||||
|
# df_esm_clean.loc[
|
||||||
|
# df_esm_clean["esm_type"].isin(esm_type_numeric)
|
||||||
|
# ] = df_esm_clean.loc[df_esm_clean["esm_type"].isin(esm_type_numeric)].assign(
|
||||||
|
# esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype(
|
||||||
|
# int
|
||||||
|
# )
|
||||||
|
# )
|
||||||
|
# return df_esm_clean
|
|
@ -42,7 +42,8 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
|
||||||
requested_features = provider["FEATURES"]
|
requested_features = provider["FEATURES"]
|
||||||
# name of the features this function can compute
|
# name of the features this function can compute
|
||||||
requested_scales = provider["SCALES"]
|
requested_scales = provider["SCALES"]
|
||||||
base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support"]
|
base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
|
||||||
|
"appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
|
||||||
#TODO Check valid questionnaire and feature names.
|
#TODO Check valid questionnaire and feature names.
|
||||||
# the subset of requested features this function can compute
|
# the subset of requested features this function can compute
|
||||||
features_to_compute = list(set(requested_features) & set(base_features_names))
|
features_to_compute = list(set(requested_features) & set(base_features_names))
|
||||||
|
@ -52,7 +53,6 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
|
||||||
|
|
||||||
if not esm_data.empty:
|
if not esm_data.empty:
|
||||||
esm_features = pd.DataFrame()
|
esm_features = pd.DataFrame()
|
||||||
|
|
||||||
for scale in requested_scales:
|
for scale in requested_scales:
|
||||||
questionnaire_id = QUESTIONNAIRE_IDS[scale]
|
questionnaire_id = QUESTIONNAIRE_IDS[scale]
|
||||||
mask = esm_data["questionnaire_id"] == questionnaire_id
|
mask = esm_data["questionnaire_id"] == questionnaire_id
|
||||||
|
@ -60,4 +60,7 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
|
||||||
#TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
|
#TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
|
||||||
|
|
||||||
esm_features = esm_features.reset_index()
|
esm_features = esm_features.reset_index()
|
||||||
|
if 'index' in esm_features: # In calse of empty esm_features df
|
||||||
|
esm_features.rename(columns={'index': 'local_segment'}, inplace=True)
|
||||||
|
|
||||||
return esm_features
|
return esm_features
|
||||||
|
|
|
@ -0,0 +1,260 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
import math, sys, yaml
|
||||||
|
|
||||||
|
from esm_preprocess import clean_up_esm
|
||||||
|
from esm import classify_sessions_by_completion_time, preprocess_esm
|
||||||
|
|
||||||
|
input_data_files = dict(snakemake.input)
|
||||||
|
|
||||||
|
def format_timestamp(x):
    """Format a duration given in seconds as "<h>H <m>M <s>S".

    Components equal to zero are left out, e.g. 3605 -> "1H 5S" and 45 -> "45S".
    A duration of zero is rendered as "0S" rather than as an empty string, so the
    result is never blank.

    Args:
        x (int): duration in seconds. Non-integer numbers are truncated with int().
            Negative values are not validated and do not produce a meaningful result.

    Returns:
        str: the duration written with the "H", "M" and "S" suffixes, separated by single spaces.
    """
    hours, remainder = divmod(int(x), 3600)
    minutes, seconds = divmod(remainder, 60)

    parts = [
        f"{value}{unit}"
        for value, unit in ((hours, "H"), (minutes, "M"), (seconds, "S"))
        if value > 0
    ]
    return " ".join(parts) if parts else "0S"
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ers(esm_df):
    """This method has two major functionalities:
        (1) It prepares STRAW event-related segments file with the use of esm file. The execution protocol depends on
            the segmenting method specified in the config.yaml file.
        (2) It prepares and writes csv with targets and corresponding time segments labels. This is later used
            in the overall cleaning script (straw).

    Details about each segmenting method are listed below by each corresponding condition. Refer to the RAPIDS documentation for the
    ERS file format: https://www.rapids.science/1.9/setup/configuration/#time-segments -> event segments

    Side effects: reads config.yaml from the working directory, changes two pandas display options and
    writes the targets file to snakemake.output[1] (an empty one first, overwritten for the "stress_event" method).

    Args:
        esm_df (DataFrame): read esm file that is dependent on the current participant.

    Returns:
        extracted_ers (DataFrame): dataframe with all necessary information to write event-related segments file
                                    in the correct format.

    Raises:
        ValueError: if the configured SEGMENTING_METHOD is none of "30_before", "90_before" or "stress_event".
    """

    pd.set_option("display.max_rows", 100)
    pd.set_option("display.max_columns", None)

    with open('config.yaml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    pd.DataFrame(columns=["label"]).to_csv(snakemake.output[1]) # Create an empty stress_events_targets file

    esm_preprocessed = clean_up_esm(preprocess_esm(esm_df))

    # Take only ema_completed sessions responses.
    # A session is identified by the (device_id, esm_session) PAIR, so the pair is matched as a whole;
    # testing the two columns independently would also keep rows whose device and session ids
    # each occur in some completed session, but not together.
    classified = classify_sessions_by_completion_time(esm_preprocessed)
    esm_filtered_sessions = classified[classified["session_response"] == 'ema_completed'].reset_index()[['device_id', 'esm_session']]
    is_completed_session = pd.MultiIndex.from_frame(esm_preprocessed[['device_id', 'esm_session']]).isin(pd.MultiIndex.from_frame(esm_filtered_sessions))
    esm_df = esm_preprocessed.loc[is_completed_session]

    segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]

    if segmenting_method in ["30_before", "90_before"]: # takes 30-minute peroid before the questionnaire + the duration of the questionnaire
        # '30-minutes and 90-minutes before' have the same fundamental logic with a couple of deviations that are explained below.
        # Both take an x-minute period before the questionnaire that is summed with the questionnaire duration.
        # All questionnaire durations over 15 minutes are excluded from the querying.

        # Extract time-relevant information
        session_timestamps = esm_df.groupby(["device_id", "esm_session"])['timestamp']
        extracted_ers = session_timestamps.apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index() # questionnaire length
        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
        extracted_ers[['event_timestamp', 'device_id']] = session_timestamps.min().reset_index()[['timestamp', 'device_id']]
        # The end timestamps are attached here, while the rows still line up one-to-one with the grouped sessions.
        # Attaching them after the filter below would pair the re-indexed rows with the wrong sessions.
        extracted_ers['end_event_timestamp'] = session_timestamps.max().reset_index()['timestamp']
        extracted_ers = extracted_ers[extracted_ers["timestamp"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire anwsering is 15 min
        extracted_ers["shift_direction"] = -1

        if segmenting_method == "30_before":
            # The method 30-minutes before simply takes 30 minutes before the questionnaire and sums it with the questionnaire duration.
            # The timestamps are formatted with the help of format_timestamp() method.
            time_before_questionnaire = 30 * 60 # in seconds (30 minutes)

            extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(lambda x: format_timestamp(x))
            extracted_ers["shift"] = time_before_questionnaire
            extracted_ers["shift"] = extracted_ers["shift"].apply(lambda x: format_timestamp(x))

        elif segmenting_method == "90_before":
            # The method 90-minutes before has an important condition. If the time between the current and the previous questionnaire is
            # longer than 90 minutes it takes 90 minutes, otherwise it takes the original time difference between the questionnaires.
            time_before_questionnaire = 90 * 60 # in seconds (90 minutes)

            # Time (ms) elapsed since the end of the previous retained questionnaire; the first row is compared against 0.
            # NOTE(review): rows follow the (device_id, esm_session) grouping order, so the "previous" row may belong
            # to another device - confirm that this is intended.
            extracted_ers['diffs'] = extracted_ers['event_timestamp'].astype('int64') - extracted_ers['end_event_timestamp'].shift(1, fill_value=0).astype('int64')
            extracted_ers.loc[extracted_ers['diffs'] > time_before_questionnaire * 1000, 'diffs'] = time_before_questionnaire * 1000

            extracted_ers["diffs"] = (extracted_ers["diffs"] / 1000).apply(lambda x: math.ceil(x))

            extracted_ers["length"] = (extracted_ers["timestamp"] + extracted_ers["diffs"]).apply(lambda x: format_timestamp(x))
            extracted_ers["shift"] = extracted_ers["diffs"].apply(lambda x: format_timestamp(x))

    elif segmenting_method == "stress_event":
        # TODO: update documentation for this condition
        # This is a special case of the method as it consists of two important parts:
        #     (1) Generating of the ERS file (same as the methods above) and
        #     (2) Generating targets file alongside with the correct time segment labels.
        #
        # The segments are anchored at the event time specified by the participant in the questionnaire.
        # If the participant marked that no stressful event happened, the start of the questionnaire is used instead;
        # in this case, appraisal_threat and appraisal_challenge are NaN.
        #
        # Every segment gets the same length: the interval of interest plus twice its error tolerance
        # (INTERVAL_OF_INTEREST and IOI_ERROR_TOLERANCE in config.yaml, both multiplied by 60 below).

        ioi = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["INTERVAL_OF_INTEREST"] * 60 # interval of interest in seconds
        ioi_error_tolerance = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["IOI_ERROR_TOLERANCE"] * 60 # interval of interest error tolerance in seconds

        # Get and join required data
        extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
        extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
        session_start_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].min().to_frame().rename(columns={'timestamp': 'session_start_timestamp'}) # questionnaire start timestamp
        session_end_timestamp = esm_df.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp

        # Users' answers for the stressfulness event (se) start times and durations
        se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
        se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})

        # Extracted 3 targets that will be transfered in the csv file to the cleaning script.
        se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
        se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean(numeric_only=True)['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
        se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean(numeric_only=True)['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})

        # Session timestamps and the stressfulness target are inner-joined, so sessions lacking them are dropped
        # (e.g., stressfulness event target has larger count); the remaining answers are optional (left joins).
        extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
            .join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
            .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
            .join(se_time, on=['device_id', 'esm_session'], how='left') \
            .join(se_duration, on=['device_id', 'esm_session'], how='left') \
            .join(se_threat_tg, on=['device_id', 'esm_session'], how='left') \
            .join(se_challenge_tg, on=['device_id', 'esm_session'], how='left')

        # Filter-out the sessions that are not useful. Because of the ambiguity this excludes:
        # (1) straw event times that are marked as "0 - I don't remember"
        extracted_ers = extracted_ers[~extracted_ers.se_time.astype(str).str.startswith("0 - ")]
        extracted_ers.reset_index(drop=True, inplace=True)

        extracted_ers.loc[extracted_ers.se_duration.astype(str).str.startswith("0 - "), 'se_duration'] = 0

        # Add default duration (in milliseconds) in case the participant answered that no stressful event occured
        extracted_ers["se_duration"] = extracted_ers["se_duration"].fillna(int((ioi + 2*ioi_error_tolerance) * 1000))

        # Prepare data to fit the data structure in the CSV file ...
        # Use the start of the questionnaire as the event time if no stress event occured
        extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])
        # Type could be an int (timestamp [ms]) which stays the same, and datetime str which is converted to timestamp in miliseconds
        # NOTE(review): a datetime string without an offset is treated as UTC by Timestamp.timestamp() - confirm the answers' format.
        extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
        extracted_ers['shift_direction'] = -1

        # >>>>> begin section (could be optimized) <<<<<

        # Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire
        # is taken as end time of the segment. Else the user input duration is taken.
        extracted_ers['se_duration'] = \
            np.where(
                extracted_ers['se_duration'].astype(str).str.startswith("1 - "),
                extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'],
                extracted_ers['se_duration']
            )

        # This converts the rows of timestamps in miliseconds and the rows with datetime... to timestamp in seconds.
        extracted_ers['se_duration'] = \
            extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60)

        # Check explicitley whether min duration is at least 0. This will eliminate rows that would be investigated after the end of the questionnaire.
        extracted_ers = extracted_ers[extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'] >= 0]
        # Double check whether min se_duration is at least 0. Filter-out the rest. Negative values are considered invalid.
        extracted_ers = extracted_ers[extracted_ers["se_duration"] >= 0].reset_index(drop=True)

        # >>>>> end section <<<<<

        # Simply override all durations to be of an equal amount
        extracted_ers['se_duration'] = ioi + 2*ioi_error_tolerance

        # If target is 0 then shift by the total stress event duration, otherwise shift it by ioi_tolerance
        extracted_ers['shift'] = \
            np.where(
                extracted_ers['appraisal_stressfulness_event'] == 0,
                extracted_ers['se_duration'],
                ioi_error_tolerance
            )

        extracted_ers['shift'] = extracted_ers['shift'].apply(lambda x: format_timestamp(int(x)))
        extracted_ers['length'] = extracted_ers['se_duration'].apply(lambda x: format_timestamp(int(x)))

        # Drop event_timestamp duplicates in case in the user is referencing the same event over multiple questionnaires
        extracted_ers.drop_duplicates(subset=["event_timestamp"], keep='first', inplace=True)
        extracted_ers.reset_index(drop=True, inplace=True)

        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)

        # Write the csv of extracted ERS labels with targets related to stressfulness event
        extracted_ers[["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]].to_csv(snakemake.output[1], index=False)

    else:
        raise ValueError("Please select correct target method for the event-related segments.")

    return extracted_ers[["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]]
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Here the code is executed - this .py file is used both for extraction of the STRAW time_segments file for the individual
|
||||||
|
participant, and also for merging all participant's files into one combined file which is later used for the time segments
|
||||||
|
to all sensors assignment.
|
||||||
|
|
||||||
|
There are two files involved (see rules extract_event_information_from_esm and merge_event_related_segments_files in preprocessing.smk)
|
||||||
|
(1) ERS file which contains all the information about the time segment timings and
|
||||||
|
(2) targets file which has corresponding target value for the segment label which is later used to merge with other features in the cleaning script.
|
||||||
|
For more information, see the comment in the method above.
|
||||||
|
"""
|
||||||
|
# Stage dispatch: "extract" builds the ERS file of a single participant,
# "merge" stacks the per-participant ERS and targets files into combined files.
if snakemake.params["stage"] == "extract":
    esm_df = pd.read_csv(input_data_files['esm_raw_input'])

    extracted_ers = extract_ers(esm_df)

    extracted_ers.to_csv(snakemake.output[0], index=False)

elif snakemake.params["stage"] == "merge":

    input_data_files = dict(snakemake.input)

    # Each output starts from an empty frame that fixes the column order (and keeps the header
    # when there are no input files); all inputs are then concatenated in one call
    # instead of growing the frame file by file.
    straw_events = pd.concat(
        [pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"])]
        + [pd.read_csv(input_file) for input_file in input_data_files["ers_files"]],
        axis=0,
        ignore_index=True,
    )
    straw_events.to_csv(snakemake.output[0], index=False)

    stress_events_targets = pd.concat(
        [pd.DataFrame(columns=["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"])]
        + [pd.read_csv(input_file) for input_file in input_data_files["se_files"]],
        axis=0,
        ignore_index=True,
    )
    stress_events_targets.to_csv(snakemake.output[1], index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,7 @@ def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_se
|
||||||
if "medianlux" in features_to_compute:
|
if "medianlux" in features_to_compute:
|
||||||
light_features["medianlux"] = light_data.groupby(["local_segment"])["double_light_lux"].median()
|
light_features["medianlux"] = light_data.groupby(["local_segment"])["double_light_lux"].median()
|
||||||
if "stdlux" in features_to_compute:
|
if "stdlux" in features_to_compute:
|
||||||
light_features["stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std()
|
light_features["stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std().fillna(0)
|
||||||
|
|
||||||
light_features = light_features.reset_index()
|
light_features = light_features.reset_index()
|
||||||
|
|
||||||
|
|
|
@ -25,9 +25,11 @@ barnett_daily_features <- function(snakemake){
|
||||||
datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59"
|
datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59"
|
||||||
location <- location %>%
|
location <- location %>%
|
||||||
mutate(is_daily = str_detect(assigned_segments, paste0(".*#", datetime_start_regex, ",", datetime_end_regex, ".*")))
|
mutate(is_daily = str_detect(assigned_segments, paste0(".*#", datetime_start_regex, ",", datetime_end_regex, ".*")))
|
||||||
|
|
||||||
if(nrow(segment_labels) == 0 || nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)){
|
does_not_span = nrow(segment_labels) == 0 || nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)
|
||||||
warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:",
|
|
||||||
|
if(is.na(does_not_span) || does_not_span){
|
||||||
|
warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:",
|
||||||
"\nLocation data rows within a daily time segment: ", nrow(filter(location, is_daily)),
|
"\nLocation data rows within a daily time segment: ", nrow(filter(location, is_daily)),
|
||||||
"\nLocation data time span in days: ", round((max(location$timestamp) - min(location$timestamp)) / 86400000, 2)
|
"\nLocation data time span in days: ", round((max(location$timestamp) - min(location$timestamp)) / 86400000, 2)
|
||||||
)
|
)
|
||||||
|
|
|
@ -115,7 +115,7 @@ cluster_on = provider["CLUSTER_ON"]
|
||||||
strategy = provider["INFER_HOME_LOCATION_STRATEGY"]
|
strategy = provider["INFER_HOME_LOCATION_STRATEGY"]
|
||||||
days_threshold = provider["MINIMUM_DAYS_TO_DETECT_HOME_CHANGES"]
|
days_threshold = provider["MINIMUM_DAYS_TO_DETECT_HOME_CHANGES"]
|
||||||
|
|
||||||
if not location_data.timestamp.is_monotonic:
|
if not location_data.timestamp.is_monotonic_increasing:
|
||||||
location_data.sort_values(by=["timestamp"], inplace=True)
|
location_data.sort_values(by=["timestamp"], inplace=True)
|
||||||
|
|
||||||
location_data["duration_in_seconds"] = -1 * location_data.timestamp.diff(-1) / 1000
|
location_data["duration_in_seconds"] = -1 * location_data.timestamp.diff(-1) / 1000
|
||||||
|
|
|
@ -37,7 +37,8 @@ def variance_and_logvariance_features(location_data, location_features):
|
||||||
location_data["longitude_for_wvar"] = (location_data["double_longitude"] - location_data["longitude_wavg"]) ** 2 * location_data["duration"] * 60
|
location_data["longitude_for_wvar"] = (location_data["double_longitude"] - location_data["longitude_wavg"]) ** 2 * location_data["duration"] * 60
|
||||||
|
|
||||||
location_features["locationvariance"] = ((location_data_grouped["latitude_for_wvar"].sum() + location_data_grouped["longitude_for_wvar"].sum()) / (location_data_grouped["duration"].sum() * 60 - 1)).fillna(0)
|
location_features["locationvariance"] = ((location_data_grouped["latitude_for_wvar"].sum() + location_data_grouped["longitude_for_wvar"].sum()) / (location_data_grouped["duration"].sum() * 60 - 1)).fillna(0)
|
||||||
location_features["loglocationvariance"] = np.log10(location_features["locationvariance"]).replace(-np.inf, np.nan)
|
|
||||||
|
location_features["loglocationvariance"] = np.log10(location_features["locationvariance"]).replace(-np.inf, -1000000)
|
||||||
|
|
||||||
return location_features
|
return location_features
|
||||||
|
|
||||||
|
|
|
@ -65,6 +65,15 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
||||||
features <- message_features_of_type(messages_of_type, message_type, time_segment, requested_features)
|
features <- message_features_of_type(messages_of_type, message_type, time_segment, requested_features)
|
||||||
messages_features <- merge(messages_features, features, all=TRUE)
|
messages_features <- merge(messages_features, features, all=TRUE)
|
||||||
}
|
}
|
||||||
messages_features <- messages_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0)))
|
# Fill selected columns with a high number
|
||||||
|
time_cols <- select(messages_features, contains("timefirstmessages") | contains("timelastmessages")) %>%
|
||||||
|
colnames(.)
|
||||||
|
|
||||||
|
messages_features <- messages_features %>%
|
||||||
|
mutate_at(., time_cols, ~replace(., is.na(.), 1500))
|
||||||
|
|
||||||
|
# Fill NA values with 0
|
||||||
|
messages_features <- messages_features %>% mutate_all(~replace(., is.na(.), 0))
|
||||||
|
|
||||||
return(messages_features)
|
return(messages_features)
|
||||||
}
|
}
|
|
@ -15,7 +15,7 @@ def getEpisodeDurationFeatures(screen_data, time_segment, episode, features, ref
|
||||||
if "avgduration" in features:
|
if "avgduration" in features:
|
||||||
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].mean().rename(columns = {"duration":"avgduration" + episode})], axis = 1)
|
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].mean().rename(columns = {"duration":"avgduration" + episode})], axis = 1)
|
||||||
if "stdduration" in features:
|
if "stdduration" in features:
|
||||||
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].std().rename(columns = {"duration":"stdduration" + episode})], axis = 1)
|
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].std().fillna(0).rename(columns = {"duration":"stdduration" + episode})], axis = 1)
|
||||||
if "firstuseafter" + "{0:0=2d}".format(reference_hour_first_use) in features:
|
if "firstuseafter" + "{0:0=2d}".format(reference_hour_first_use) in features:
|
||||||
screen_data_episode_after_hour = screen_data_episode.copy()
|
screen_data_episode_after_hour = screen_data_episode.copy()
|
||||||
screen_data_episode_after_hour["hour"] = pd.to_datetime(screen_data_episode["local_start_date_time"]).dt.hour
|
screen_data_episode_after_hour["hour"] = pd.to_datetime(screen_data_episode["local_start_date_time"]).dt.hour
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute per-segment summary statistics of the ``speech_proportion`` column.

    Args:
        sensor_data_files: mapping whose ``"sensor_data"`` entry is passed to
            ``pd.read_csv``.
        time_segment: value forwarded unchanged to ``filter_data_by_segment``.
        provider: provider configuration; ``provider["FEATURES"]`` holds the
            requested feature names.
        filter_data_by_segment: callable ``(data, time_segment) -> DataFrame``.
            Its result is grouped by the ``local_segment`` column.
        *args, **kwargs: accepted for interface compatibility and ignored.

    Returns:
        A DataFrame with a ``local_segment`` column followed by one column per
        requested feature that this function supports. It has no rows when
        there is no data before or after filtering.
    """
    # One aggregation per supported feature; insertion order fixes the column order.
    aggregations = {
        "meanspeech": lambda values: values.mean(),
        "stdspeech": lambda values: values.std(),
        # Mean of the (up to) five largest / smallest values in each segment.
        "nlargest": lambda values: values.apply(lambda x: x.nlargest(5).mean()),
        "nsmallest": lambda values: values.apply(lambda x: x.nsmallest(5).mean()),
        "medianspeech": lambda values: values.median(),
    }

    speech_data = pd.read_csv(sensor_data_files["sensor_data"])
    requested_features = provider["FEATURES"]
    # Iterate over the supported names (not a set) so the output columns are
    # always in the same order.
    features_to_compute = [name for name in aggregations if name in requested_features]
    speech_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)

    if speech_data.empty:
        return speech_features

    speech_data = filter_data_by_segment(speech_data, time_segment)
    if speech_data.empty:
        return speech_features

    # Group once and reuse the grouping for every feature.
    grouped = speech_data.groupby(["local_segment"])["speech_proportion"]
    # Seeding the frame with the group index guarantees that reset_index()
    # yields a "local_segment" column even when no feature is computed.
    speech_features = pd.DataFrame(index=grouped.size().index)
    for name in features_to_compute:
        speech_features[name] = aggregations[name](grouped)

    return speech_features.reset_index()
|
|
@ -9,21 +9,26 @@ compute_wifi_feature <- function(data, feature, time_segment){
|
||||||
"countscans" = data %>% summarise(!!feature := n()),
|
"countscans" = data %>% summarise(!!feature := n()),
|
||||||
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
|
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
|
||||||
return(data)
|
return(data)
|
||||||
|
|
||||||
} else if(feature == "countscansmostuniquedevice"){
|
} else if(feature == "countscansmostuniquedevice"){
|
||||||
# Get the most scanned device
|
# Get the most scanned device
|
||||||
mostuniquedevice <- data %>%
|
mostuniquedevice <- data %>%
|
||||||
|
filter(bssid != "") %>%
|
||||||
group_by(bssid) %>%
|
group_by(bssid) %>%
|
||||||
mutate(N=n()) %>%
|
mutate(N=n()) %>%
|
||||||
ungroup() %>%
|
ungroup() %>%
|
||||||
filter(N == max(N)) %>%
|
filter(N == max(N)) %>%
|
||||||
head(1) %>% # if there are multiple device with the same amount of scans pick the first one only
|
head(1) %>% # if there are multiple device with the same amount of scans pick the first one only
|
||||||
pull(bssid)
|
pull(bssid)
|
||||||
|
|
||||||
data <- data %>% filter_data_by_segment(time_segment)
|
data <- data %>% filter_data_by_segment(time_segment)
|
||||||
|
|
||||||
return(data %>%
|
return(data %>%
|
||||||
filter(bssid == mostuniquedevice) %>%
|
filter(bssid == mostuniquedevice) %>%
|
||||||
group_by(local_segment) %>%
|
group_by(local_segment) %>%
|
||||||
summarise(!!feature := n()) %>%
|
summarise(!!feature := n())
|
||||||
replace(is.na(.), 0))
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -43,6 +48,6 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
||||||
feature <- compute_wifi_feature(wifi_data, feature_name, time_segment)
|
feature <- compute_wifi_feature(wifi_data, feature_name, time_segment)
|
||||||
features <- merge(features, feature, by="local_segment", all = TRUE)
|
features <- merge(features, feature, by="local_segment", all = TRUE)
|
||||||
}
|
}
|
||||||
|
features <- features %>% mutate_all(~replace(., is.na(.), 0))
|
||||||
return(features)
|
return(features)
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
source("renv/activate.R")
|
||||||
|
|
||||||
|
library(tidyr)
|
||||||
|
library(purrr)
|
||||||
|
library("dplyr", warn.conflicts = F)
|
||||||
|
library(stringr)
|
||||||
|
|
||||||
|
# Per-participant feature files handed over by Snakemake.
feature_files <- snakemake@input[["feature_files"]]

# The segment columns are always read as text.
segment_col_classes <- c(local_segment = "character", local_segment_label = "character", local_segment_start_datetime="character", local_segment_end_datetime="character")

# Read one feature file with the segment columns kept as character.
read_participant_features <- function(path) {
  read.csv(path, stringsAsFactors = FALSE, colClasses = segment_col_classes)
}

# One row per input file, holding the parsed contents as a list column.
features_of_all_participants <- tibble(filename = feature_files)
features_of_all_participants <- mutate(features_of_all_participants,
                                       file_contents = map(filename, read_participant_features),
                                       # The participant id is the folder that contains z_all_sensor_features.csv
                                       pid = str_match(filename, ".*/(.*)/z_all_sensor_features.csv")[,2])

# Expand the list column into regular rows and drop the helper path column.
features_of_all_participants <- unnest(features_of_all_participants, cols = c(file_contents))
features_of_all_participants <- select(features_of_all_participants, -filename)

write.csv(features_of_all_participants, snakemake@output[[1]], row.names = FALSE)
|
|
@ -88,11 +88,13 @@ def chunk_episodes(sensor_episodes):
|
||||||
|
|
||||||
return merged_sensor_episodes
|
return merged_sensor_episodes
|
||||||
|
|
||||||
def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file):
|
def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=False):
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from importlib import import_module, util
|
from importlib import import_module, util
|
||||||
|
|
||||||
sensor_features = pd.DataFrame(columns=["local_segment"])
|
sensor_features = pd.DataFrame(columns=["local_segment"])
|
||||||
|
sensor_fo_features = pd.DataFrame(columns=["local_segment"])
|
||||||
|
sensor_so_features = pd.DataFrame(columns=["local_segment"])
|
||||||
time_segments_labels = pd.read_csv(time_segments_file, header=0)
|
time_segments_labels = pd.read_csv(time_segments_file, header=0)
|
||||||
if "FEATURES" not in provider:
|
if "FEATURES" not in provider:
|
||||||
raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key.upper()))
|
raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key.upper()))
|
||||||
|
@ -106,30 +108,68 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
|
||||||
time_segments_labels["label"] = [""]
|
time_segments_labels["label"] = [""]
|
||||||
for time_segment in time_segments_labels["label"]:
|
for time_segment in time_segments_labels["label"]:
|
||||||
print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment))
|
print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment))
|
||||||
features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes)
|
|
||||||
if not "local_segment" in features.columns:
|
features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes, calc_windows=calc_windows)
|
||||||
raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
|
|
||||||
features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns]
|
# In case of calc_window = True
|
||||||
sensor_features = pd.concat([sensor_features, features], axis=0, sort=False)
|
if isinstance(features, tuple):
|
||||||
|
if not "local_segment" in features[0].columns or not "local_segment" in features[1].columns:
|
||||||
|
raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
|
||||||
|
features[0].columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features[0].columns]
|
||||||
|
features[1].columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features[1].columns]
|
||||||
|
if not features[0].empty:
|
||||||
|
sensor_fo_features = pd.concat([sensor_fo_features, features[0]], axis=0, sort=False)
|
||||||
|
if not features[1].empty:
|
||||||
|
sensor_so_features = pd.concat([sensor_so_features, features[1]], axis=0, sort=False)
|
||||||
|
else:
|
||||||
|
if not "local_segment" in features.columns:
|
||||||
|
raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
|
||||||
|
features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns]
|
||||||
|
sensor_features = pd.concat([sensor_features, features], axis=0, sort=False)
|
||||||
else:
|
else:
|
||||||
for feature in provider["FEATURES"]:
|
for feature in provider["FEATURES"]:
|
||||||
sensor_features[feature] = None
|
sensor_features[feature] = None
|
||||||
segment_colums = pd.DataFrame()
|
|
||||||
sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '')
|
if calc_windows:
|
||||||
split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
|
segment_colums = pd.DataFrame()
|
||||||
new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
|
sensor_fo_features['local_segment'] = sensor_fo_features['local_segment'].str.replace(r'_RR\d+SS', '')
|
||||||
segment_colums[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
|
split_segemnt_columns = sensor_fo_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
|
||||||
for i in range(segment_colums.shape[1]):
|
new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
|
||||||
sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
|
segment_colums[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
|
||||||
|
for i in range(segment_colums.shape[1]):
|
||||||
|
sensor_fo_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
|
||||||
|
|
||||||
|
segment_colums = pd.DataFrame()
|
||||||
|
sensor_so_features['local_segment'] = sensor_so_features['local_segment'].str.replace(r'_RR\d+SS', '')
|
||||||
|
split_segemnt_columns = sensor_so_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
|
||||||
|
new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
|
||||||
|
segment_colums[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
|
||||||
|
for i in range(segment_colums.shape[1]):
|
||||||
|
sensor_so_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
|
||||||
|
|
||||||
return sensor_features
|
return sensor_fo_features, sensor_so_features
|
||||||
|
|
||||||
def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files):
|
else:
|
||||||
|
segment_colums = pd.DataFrame()
|
||||||
|
sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '')
|
||||||
|
split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
|
||||||
|
new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
|
||||||
|
segment_colums[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
|
||||||
|
for i in range(segment_colums.shape[1]):
|
||||||
|
sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]])
|
||||||
|
|
||||||
|
return sensor_features
|
||||||
|
|
||||||
|
def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files, target=False):
|
||||||
from importlib import import_module, util
|
from importlib import import_module, util
|
||||||
print("{} Processing {} {}".format(rapids_log_tag, sensor_key, provider_key))
|
print("{} Processing {} {}".format(rapids_log_tag, sensor_key, provider_key))
|
||||||
|
|
||||||
cleaning_module = import_path(provider["SRC_SCRIPT"])
|
cleaning_module = import_path(provider["SRC_SCRIPT"])
|
||||||
cleaning_function = getattr(cleaning_module, provider_key.lower() + "_cleaning")
|
cleaning_function = getattr(cleaning_module, provider_key.lower() + "_cleaning")
|
||||||
sensor_features = cleaning_function(sensor_data_files, provider)
|
|
||||||
|
|
||||||
return sensor_features
|
if target:
|
||||||
|
sensor_features = cleaning_function(sensor_data_files, provider, target)
|
||||||
|
else:
|
||||||
|
sensor_features = cleaning_function(sensor_data_files, provider)
|
||||||
|
|
||||||
|
return sensor_features
|
|
@ -0,0 +1,19 @@
|
||||||
|
import pandas as pd
|
||||||
|
import sys
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
    """Replace all ``phone_esm_straw*`` columns with a single ``target`` column.

    Args:
        df_input: data frame holding sensor features and ESM columns.
        target_variable_name: pattern matched (via ``str.contains``) against
            the names of the ``phone_esm_straw*`` columns to pick the target.

    Returns:
        A copy of ``df_input`` without any ``phone_esm_straw*`` column and with
        the selected column appended at the very end under the name
        ``target``. Returns ``None`` (after issuing a warning) when no ESM
        column matches.

    Raises:
        ValueError: if more than one ESM column matches, because a single
            target column cannot be chosen unambiguously.
    """
    column_names = df_input.columns
    # Find all columns coming from phone_esm, since these are not features for
    # our purposes and we will drop them.
    esm_names = column_names[column_names.str.startswith("phone_esm_straw")]
    target_columns = esm_names[esm_names.str.contains(target_variable_name)]

    if len(target_columns) == 0:
        warnings.warn(f"The requested target (, {target_variable_name} ,)cannot be found in the dataset. Please check the names of phone_esm_ columns in cleaned python file")
        return None

    if len(target_columns) > 1:
        # Assigning several columns to the single "target" column fails inside
        # pandas with an opaque message; fail early and name the candidates.
        raise ValueError(
            f"The requested target '{target_variable_name}' matches more than one column: "
            f"{list(target_columns)}"
        )

    # We will only keep one column related to phone_esm and that will be our
    # target variable. Add it back to the very end of the data frame and
    # rename it to target.
    sensor_features_plus_target = df_input.drop(esm_names, axis=1)
    sensor_features_plus_target["target"] = df_input[target_columns[0]]
    return sensor_features_plus_target
|
|
@ -0,0 +1,24 @@
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from helper import retain_target_column
|
||||||
|
|
||||||
|
# Cleaned sensor features provided by Snakemake.
sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])

# Read every baseline (demographic) file and tag its rows with the participant id.
baseline_frames = []
for baseline_features_path in snakemake.input["demographic_features"]:
    # NOTE(review): assumes the participant id is the fourth "/"-separated
    # path component -- confirm against the rule that produces these paths.
    pid = baseline_features_path.split("/")[3]
    baseline_frames.append(pd.read_csv(baseline_features_path).assign(pid=pid))

# Concatenate once instead of growing a frame inside the loop; pd.concat
# rejects an empty list, so fall back to an empty frame in that case.
all_baseline_features = pd.concat(baseline_frames, axis=0) if baseline_frames else pd.DataFrame()

# merge sensor features and baseline features
if not sensor_features.empty:
    features = sensor_features.merge(all_baseline_features, on="pid", how="left")

    target_variable_name = snakemake.params["target_variable"]
    model_input = retain_target_column(features, target_variable_name)

    if model_input is None:
        # retain_target_column found no target column; still write an (empty)
        # output file instead of failing on None.to_csv.
        pd.DataFrame().to_csv(snakemake.output[0])
    else:
        model_input.to_csv(snakemake.output[0], index=False)

else:
    sensor_features.to_csv(snakemake.output[0], index=False)
|
|
@ -0,0 +1,13 @@
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from helper import retain_target_column
|
||||||
|
|
||||||
|
# Load the cleaned features and the name of the requested target from Snakemake.
features_table = pd.read_csv(snakemake.input["cleaned_sensor_features"])
requested_target = snakemake.params["target_variable"]

model_input = retain_target_column(features_table, requested_target)

output_path = snakemake.output[0]
if model_input is not None:
    model_input.to_csv(output_path, index=False)
else:
    # No model input is available: still produce the output file, but empty.
    pd.DataFrame().to_csv(output_path)
|
|
@ -24,12 +24,12 @@ def colors2colorscale(colors):
|
||||||
def getDataForPlot(phone_data_yield_per_segment):
|
def getDataForPlot(phone_data_yield_per_segment):
|
||||||
# calculate the length (in minute) of per segment instance
|
# calculate the length (in minute) of per segment instance
|
||||||
phone_data_yield_per_segment["length"] = phone_data_yield_per_segment["timestamps_segment"].str.split(",").apply(lambda x: int((int(x[1])-int(x[0])) / (1000 * 60)))
|
phone_data_yield_per_segment["length"] = phone_data_yield_per_segment["timestamps_segment"].str.split(",").apply(lambda x: int((int(x[1])-int(x[0])) / (1000 * 60)))
|
||||||
# calculate the number of sensors logged at least one row of data per minute.
|
|
||||||
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(["local_segment", "length", "local_date", "local_hour", "local_minute"])[["sensor", "local_date_time"]].max().reset_index()
|
|
||||||
# extract local start datetime of the segment from "local_segment" column
|
# extract local start datetime of the segment from "local_segment" column
|
||||||
phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(phone_data_yield_per_segment["local_segment"].apply(lambda x: x.split("#")[1].split(",")[0]))
|
phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(phone_data_yield_per_segment["local_segment"].apply(lambda x: x.split("#")[1].split(",")[0]))
|
||||||
# calculate the number of minutes after local start datetime of the segment
|
# calculate the number of minutes after local start datetime of the segment
|
||||||
phone_data_yield_per_segment["minutes_after_segment_start"] = ((phone_data_yield_per_segment["local_date_time"] - phone_data_yield_per_segment["local_segment_start_datetimes"]) / pd.Timedelta(minutes=1)).astype("int")
|
phone_data_yield_per_segment["minutes_after_segment_start"] = ((phone_data_yield_per_segment["local_date_time"] - phone_data_yield_per_segment["local_segment_start_datetimes"]) / pd.Timedelta(minutes=1)).astype("int")
|
||||||
|
# calculate the number of sensors logged at least one row of data per minute.
|
||||||
|
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(["local_segment", "length", "local_segment_start_datetimes", "minutes_after_segment_start"])[["sensor"]].max().reset_index()
|
||||||
|
|
||||||
# impute missing rows with 0
|
# impute missing rows with 0
|
||||||
columns_for_full_index = phone_data_yield_per_segment[["local_segment_start_datetimes", "length"]].drop_duplicates(keep="first")
|
columns_for_full_index = phone_data_yield_per_segment[["local_segment_start_datetimes", "length"]].drop_duplicates(keep="first")
|
||||||
|
|
Binary file not shown.
After Width: | Height: | Size: 12 KiB |
|
@ -0,0 +1,39 @@
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
participant = "p01"
all_sensors = ["eda", "ibi", "temp", "acc"]

# Folder/file prefix of the windowed feature file of each known sensor.
# "bvp" is kept available although it is not listed in all_sensors.
sensor_prefixes = {
    "eda": "empatica_electrodermal_activity",
    "bvp": "empatica_blood_volume_pulse",
    "ibi": "empatica_inter_beat_interval",
    "acc": "empatica_accelerometer",
    "temp": "empatica_temperature",
}

for sensor in all_sensors:

    prefix = sensor_prefixes.get(sensor)
    if prefix is not None:
        path = f"/rapids/data/interim/{participant}/{prefix}_features/{prefix}_python_cr_windows.csv"
    else:
        path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv" # all features all participants

    df = pd.read_csv(path)
    print(df)
    is_NaN = df.isnull()

    print("All rows:", len(df.index))
    # Count the missing cells themselves, not every cell of a row that has one.
    print("\nCount NaN vals:", int(is_NaN.sum().sum()))
    print("\nDf mean:")
    # Restrict to numeric columns so text columns cannot make mean() fail.
    print(df.mean(numeric_only=True))

    sns.heatmap(is_NaN, cbar=False)
    plt.savefig(f'{sensor}_{participant}_windows_NaN.png', bbox_inches='tight')
    # Start the next sensor on a fresh figure instead of drawing over this one.
    plt.close()
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,285 @@
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv"

# Substrings that select the columns of one feature group. The same substring
# is used to name the two images produced for the group.
# NOTE: "locations_barnett" is left out; the earlier version of this script
# marked it as "Not working".
FEATURE_GROUPS = [
    "activity_recognition",
    "applications_foreground",
    "phone_battery",
    "bluetooth_doryab",
    "bluetooth_rapids",
    "phone_calls",
    "data_yield",
    "phone_esm",
    "phone_light",
    "locations_doryab",
    "phone_messages",
    "phone_screen",
    "wifi_visible",
]

# Left-closed bins: negative | zero (up to 1e-12) | positive.
# The outer edges are infinite so that no finite value falls outside the bins
# (the earlier edges -1 and 1000 turned such values into NaN).
SIGN_BINS = [float("-inf"), 0, 0.000000000001, float("inf")]
SIGN_LABELS = [-1, 0, 1]


def sign_categories(frame):
    """Map every value of ``frame`` to -1 (negative), 0 (zero) or 1 (positive).

    NaN stays NaN. The result has the same index and columns as ``frame`` and
    float dtype.
    """
    return pd.DataFrame(
        {
            col: pd.cut(frame[col], bins=SIGN_BINS, labels=SIGN_LABELS, right=False).astype(float)
            for col in frame
        },
        index=frame.index,
    )


def save_heatmap(data, filename, **heatmap_kwargs):
    """Draw ``data`` as a seaborn heatmap on its own figure and save it.

    Using a dedicated figure that is closed afterwards keeps consecutive
    heatmaps from being drawn on top of each other.
    """
    plt.figure()
    sns.heatmap(data, **heatmap_kwargs)
    plt.savefig(filename, bbox_inches='tight')
    plt.close()


def plot_feature_group(df, group):
    """Save the NaN-mask and sign heatmaps of the columns containing ``group``.

    Writes ``{group}_values`` (NaN mask) and ``cut_{group}_values`` (sign of
    the values). Rows in which every selected column is NaN are dropped first.
    Nothing is drawn when no column matches or no row is left.
    """
    cols = [col for col in df.columns if group in col]
    df_x = df[cols]

    print(len(cols))
    print(df_x)

    df_x = df_x.dropna(axis=0, how="all")
    if df_x.empty:
        # seaborn cannot draw an empty frame; skip instead of crashing.
        print(f"No data for feature group '{group}', skipping heatmaps")
        return

    save_heatmap(df_x.isna(), f'{group}_values', xticklabels=1)
    save_heatmap(sign_categories(df_x), f'cut_{group}_values', cbar=False, xticklabels=1)


if __name__ == "__main__":
    df = pd.read_csv(path)

    for group in FEATURE_GROUPS:
        plot_feature_group(df, group)

    # All features. All-NaN rows and columns are not dropped here, so they
    # remain visible in the heatmap.
    print(len(df))
    print(df)

    save_heatmap(df.isna(), 'all_features')

    print(df.columns[df.isna().all()].tolist())
    print("All NaNs:", df.isna().sum().sum())
    print("Df shape NaNs:", df.shape)
|
|
@ -0,0 +1,23 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import os, sys
|
||||||
|
|
||||||
|
# Participant whose processed feature files are visualised.
participant = "p032"

folder = f"/rapids/data/processed/features/{participant}/"

# For every file in the folder whose name starts with "phone_", save two
# heatmaps of its "phone_" columns: the values and their missing-value mask.
for filename in os.listdir(folder):
    if not filename.startswith("phone_"):
        continue

    df = pd.read_csv(f"{folder}{filename}")
    phone_columns = [col for col in df if col.startswith('phone_')]
    phone_values = df[phone_columns]

    # (output file prefix, data to draw) - values first, NaN mask second.
    for prefix, data in (("", phone_values), ("is_na_", phone_values.isna())):
        plt.figure()
        sns.heatmap(data, cbar=True)
        plt.savefig(f'{prefix}{participant}_{filename}.png', bbox_inches='tight')
        plt.close()
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.append('/rapids/')
|
||||||
|
from src.features import cr_features_helper_methods as crhm
|
||||||
|
|
||||||
|
pd.set_option("display.max_columns", None)

# Only the first six columns of the temperature window features are read.
features_win = pd.read_csv(
    "data/interim/p031/empatica_temperature_features/empatica_temperature_python_cr_windows.csv",
    usecols=[0, 1, 2, 3, 4, 5],
)

# Second-order (SO) feature names requested from the project helper. A new
# list is passed on every call, as the helper's handling of it is not visible here.
so_feature_names = ('mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows')
so_prefix = "empatica_temperature_cr_"

# First standardization method: standardize the windows, then extract SO features.
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime', "empatica_temperature_cr_level_1"]
z1_windows = features_win.copy()
scaled_columns = ~z1_windows.columns.isin(excluded_columns)
z1_windows.loc[:, scaled_columns] = StandardScaler().fit_transform(z1_windows.loc[:, scaled_columns])
z1 = crhm.extract_second_order_features(z1_windows, list(so_feature_names), prefix=so_prefix)
# Drop the first four columns (presumably segment identifiers - TODO confirm).
z1 = z1.iloc[:, 4:]

# Second standardization method: extract SO features from the raw windows,
# then standardize the extracted features.
so_features_reg = crhm.extract_second_order_features(features_win, list(so_feature_names), prefix=so_prefix)
so_features_reg = so_features_reg.iloc[:, 4:]
z2 = pd.DataFrame(StandardScaler().fit_transform(so_features_reg), columns=so_features_reg.columns)

# Standardization of the values produced by the first method.
z1_z = pd.DataFrame(StandardScaler().fit_transform(z1), columns=z1.columns)

methods = (z1, z2, z1_z)

# Comparison for the SO standard deviation feature.
sd_column = 'empatica_temperature_cr_squareSumOfComponent_X_SO_sd'
sd_titles = (
    "Z1 - standardizirana okna, nato ekstrahiranje značilk SO",
    "Z2 - ekstrahirane značilke SO 'normalnih' vrednosti, nato standardizacija",
    "Standardiziran Z1",
)

fig, axs = plt.subplots(3, figsize=(8, 10))
for ax, frame, title in zip(axs, methods, sd_titles):
    ax.plot(frame[sd_column])
    ax.set_title(title)

fig.suptitle('Z-Score methods for temperature_squareSumOfComponent_SO_sd')
plt.savefig('z_score_comparison_temperature_squareSumOfComponent_X_SO_sd', bbox_inches='tight')

# Columns are assigned one at a time so that the table keeps the index of the
# first assigned series.
showcase = pd.DataFrame()
for label, frame in zip(('Z1__SD', 'Z2__SD', 'Z1__SD_STANDARDIZED'), methods):
    showcase[label] = frame[sd_column]
print(showcase)

# Comparison for the SO nlargest feature.
nlargest_column = 'empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest'
nlargest_titles = (
    "Z1 - standardizirana okna, nato ekstrahiranje značilk SO",
    "Z2",
    "Standardized Z1",
)

fig, axs = plt.subplots(3, figsize=(8, 10))
for ax, frame, title in zip(axs, methods, nlargest_titles):
    ax.plot(frame[nlargest_column])
    ax.set_title(title)

fig.suptitle('Z-Score methods for temperature_squareSumOfComponent_SO_nlargest')
plt.savefig('z_score_comparison_temperature_squareSumOfComponent_X_SO_nlargest', bbox_inches='tight')

showcase2 = pd.DataFrame()
for label, frame in zip(('Z1__nlargest', 'Z2__nlargest', 'Z1__nlargest_STANDARDIZED'), methods):
    showcase2[label] = frame[nlargest_column]
print(showcase2)
|
||||||
|
|
|
@ -0,0 +1,38 @@
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
ACC_PATH = "/rapids/data/raw/p03/empatica_accelerometer_raw.csv"
ESM_PATH = "/rapids/data/raw/p03/phone_esm_raw.csv"

# Time window that is plotted (both bounds are exclusive).
st = '2021-05-21 12:28:27'
en = '2021-05-21 12:59:12'


def index_by_timestamp(frame):
    """Return a copy of ``frame`` indexed by a ``date`` datetime index.

    The index is built from the ``timestamp`` column, read as milliseconds
    since the Unix epoch. ``frame`` itself is not modified.
    """
    dated = frame.assign(date=pd.to_datetime(frame['timestamp'], unit='ms'))
    return dated.set_index('date')


def clip_to_window(data, start, end):
    """Return the rows of ``data`` whose index lies strictly between ``start`` and ``end``."""
    return data.loc[(data.index > start) & (data.index < end)]


def plot_window(start, end, include_esm=False, output='NaN.png'):
    """Plot the resampled accelerometer signal inside a time window.

    Args:
        start: Lower bound of the window (exclusive), e.g. '2021-05-21 12:28:27'.
        end: Upper bound of the window (exclusive).
        include_esm: When true, also draw the resampled ``esm_session`` values
            on the same axes. Off by default because the earlier version of
            this script exited before reaching its ESM part.
            NOTE(review): that ESM part filtered rows with ``esm[esm['date']]``,
            which is not a valid mask; restricting the rows to the same time
            window is presumably what was intended - confirm.
        output: Path of the image that is written.
    """
    df = index_by_timestamp(pd.read_csv(ACC_PATH))
    print(df)

    # Mean of the first acceleration component in 31 ms bins.
    acc = df['double_values_0'].resample("31ms").mean()
    print(acc)

    plt.figure()
    plt.plot(clip_to_window(acc, start, end))

    if include_esm:
        esm = clip_to_window(index_by_timestamp(pd.read_csv(ESM_PATH)), start, end)
        print(esm)
        plt.plot(esm['esm_session'].resample("2900ms").mean())

    plt.savefig(output)
    plt.close()


if __name__ == "__main__":
    plot_window(st, en)
|
|
@ -0,0 +1,48 @@
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from itertools import compress
|
||||||
|
|
||||||
|
|
||||||
|
participant = "p031"
sensor = "eda"

# Column that splits the windows into "no peaks" and "some peaks". It only
# exists in the EDA window features.
NUM_PEAKS_COLUMN = "empatica_electrodermal_activity_cr_numPeaks"

# Sensor key -> stem shared by the feature folder name and the CSV file name.
SENSOR_FILE_STEMS = {
    "eda": "empatica_electrodermal_activity",
    "bvp": "empatica_blood_volume_pulse",
    "ibi": "empatica_inter_beat_interval",
    "acc": "empatica_accelerometer",
    "temp": "empatica_temperature",
}

# Used for any other sensor key: all features, all participants.
ALL_FEATURES_PATH = "/rapids/data/processed/features/all_participants/all_sensor_features.csv"

# Left-closed bins: negative | zero (up to 1e-12) | positive. The outer edges
# are infinite so that no finite value falls outside the bins (the earlier
# edges -1 and 1000 turned such values into NaN).
SIGN_BINS = [float("-inf"), 0, 0.000000000001, float("inf")]
SIGN_LABELS = [-1, 0, 1]


def windows_csv_path(participant, sensor):
    """Return the CSV path of the window-level features for ``sensor``.

    Unknown sensor keys select the combined all-participants feature file.
    """
    stem = SENSOR_FILE_STEMS.get(sensor)
    if stem is None:
        return ALL_FEATURES_PATH
    return f"/rapids/data/interim/{participant}/{stem}_features/{stem}_python_cr_windows.csv"


def sign_categories(frame):
    """Map every value of ``frame`` to -1 (negative), 0 (zero) or 1 (positive).

    NaN stays NaN. The result has the same index and columns as ``frame``.
    """
    return pd.DataFrame(
        {
            col: pd.cut(frame[col], bins=SIGN_BINS, labels=SIGN_LABELS, right=False).astype(float)
            for col in frame
        },
        index=frame.index,
    )


def split_columns_by_zero_presence(frame):
    """Split the columns of ``frame`` by whether they contain a zero.

    Returns:
        ``(non_zero_cols, zero_cols)`` - two lists of column names, both in the
        column order of ``frame``. NaN does not count as zero.
    """
    has_no_zero = frame.all().tolist()
    non_zero_cols = [col for col, flag in zip(frame.columns, has_no_zero) if flag]
    zero_cols = [col for col, flag in zip(frame.columns, has_no_zero) if not flag]
    return non_zero_cols, zero_cols


def main():
    """Inspect, for windows with peaks, the features that are NaN when there are none."""
    df = pd.read_csv(windows_csv_path(participant, sensor))
    if NUM_PEAKS_COLUMN not in df.columns:
        raise KeyError(
            f"'{NUM_PEAKS_COLUMN}' is missing; this analysis needs the EDA window "
            f"features (sensor is set to '{sensor}')"
        )

    # Columns that contain at least one NaN in the windows without peaks.
    df_num_peaks_zero = df[df[NUM_PEAKS_COLUMN] == 0]
    columns_num_peaks_zero = df_num_peaks_zero.columns[df_num_peaks_zero.isna().any()].tolist()

    # The same columns in the remaining windows. Rows whose peak count is NaN
    # also end up here, because NaN != 0 is true.
    df_num_peaks_non_zero = df.loc[df[NUM_PEAKS_COLUMN] != 0, columns_num_peaks_zero]

    pd.set_option('display.max_columns', None)

    if df_num_peaks_non_zero.empty:
        # seaborn cannot draw an empty frame; skip instead of crashing.
        print("Nothing to plot: no affected columns or no windows with peaks")
    else:
        plt.figure()
        sns.heatmap(sign_categories(df_num_peaks_non_zero))
        plt.savefig(f'eda_{participant}_window_non_zero_peak_other_vals.png', bbox_inches='tight')
        plt.close()

    # Filter columns that do not contain 0
    non_zero_cols, zero_cols = split_columns_by_zero_presence(df_num_peaks_non_zero)

    print(non_zero_cols, "\n")
    print(zero_cols)


if __name__ == "__main__":
    main()
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue