Compare commits
74 commits: ce04394679 ... f78aa3e7b3
Author | SHA1 | Date |
---|---|---|
Primoz | f78aa3e7b3 | |
Primoz | a620def209 | |
Primoz | c498ecb742 | |
Primoz | f088e9586f | |
Primoz | 0aa0e82673 | |
Primoz | 4cfe5a3a98 | |
Primoz | 607da820f2 | |
Primoz | fb577bc9ad | |
Primoz | 6ba4a66deb | |
Primoz | 788ac31190 | |
Primoz | 21eb2665d7 | |
Primoz | a65a85cce9 | |
Primoz | fa961fe2f5 | |
Primoz | 6c8014ba8e | |
Primoz | 5a777ac79f | |
Primoz | 0425403951 | |
Primoz | 887fd7dc72 | |
Primoz | 5a4696c548 | |
Primoz | d2758eef46 | |
Primoz | 2d5d23b615 | |
Primoz | a5480f1369 | |
Primoz | 505c3a86b9 | |
Primoz | c851ab0763 | |
Primoz | a8cd16f88c | |
Primoz | dda4554d46 | |
Primoz | 212cf300f8 | |
Primoz | 9ea39dc557 | |
Primoz | 402059871f | |
Primoz | 094743244d | |
primoz | e1d7607de4 | |
primoz | f371249b99 | |
primoz | 64e41cfa35 | |
primoz | 2c7ac21465 | |
primoz | 2acf6ff9fb | |
primoz | d300f0f8f0 | |
Primoz | fbf6a77dfc | |
Primoz | 5532043b1f | |
Primoz | bb62497ba6 | |
Primoz | 2a8f58f5c8 | |
Primoz | 1471c86c62 | |
Primoz | 6864cfe775 | |
Primoz | c1564f0cae | |
Primoz | 31e36e7400 | |
Primoz | 9cf9e1fe14 | |
Primoz | f62a1302dd | |
Primoz | 5638367999 | |
Primoz | 66451160e9 | |
= | 8c8fe1fec7 | |
= | 075c64d1e5 | |
= | 3c058e4463 | |
= | 74cf4ada1c | |
= | 1c42347b9b | |
Primoz | c050174ca3 | |
Primoz | f9e40711e7 | |
Primoz | a357138f6e | |
Primoz | 470993eeb0 | |
= | ab0b9227d7 | |
= | a9244a60fc | |
= | 8b76c96e47 | |
= | ca59a54d8f | |
= | 393dab72f5 | |
Primoz | 1902d02a86 | |
Primoz | f389ac9d89 | |
Primoz | 191e53e543 | |
Primoz | d3a3f01f29 | |
Primoz | 2da0911d4c | |
Primoz | bd5a811256 | |
Primoz | d1c59de2e9 | |
Primoz | a80f7c0cc4 | |
Primoz | d63158c199 | |
Primoz | 3f8e1cc252 | |
Primoz | dc2b462145 | |
Primoz | 50358978cc | |
Primoz | 86c6312574 |
.gitignore

@@ -93,10 +93,14 @@ packrat/*

# exclude data from source control by default
data/external/*
!/data/external/empatica/empatica1/E4 Data.zip
!/data/external/.gitkeep
!/data/external/stachl_application_genre_catalogue.csv
!/data/external/timesegments*.csv
!/data/external/wiki_tz.csv
!/data/external/main_study_usernames.csv
!/data/external/timezone.csv

data/raw/*
!/data/raw/.gitkeep
data/interim/*

@@ -114,3 +118,12 @@ settings.dcf

tests/fakedata_generation/
site/
credentials.yaml

# Docker container and other files
.devcontainer

# Calculating features module
calculatingfeatures/

# Temp folder for rapids data/external
rapids_temp_data/
README.md (68 changed lines)
@@ -11,3 +11,71 @@

For more information refer to our [documentation](http://www.rapids.science)

By [MoSHI](https://www.moshi.pitt.edu/), [University of Pittsburgh](https://www.pitt.edu/)

## Installation

For RAPIDS installation, refer to the [documentation](https://www.rapids.science/1.8/setup/installation/)

## For the installation of the Docker version

1. Follow the [instructions](https://www.rapids.science/1.8/setup/installation/) to set up RAPIDS via Docker (from scratch).

2. Delete the current contents of the /rapids/ folder from within a container session.
```
cd ..
rm -rf rapids/{*,.*}
cd rapids
```

3. Clone the RAPIDS workspace from Git and check out a specific branch.
```
git clone "https://repo.ijs.si/junoslukan/rapids.git" .
git checkout <branch_name>
```

4. Install the missing "libpq-dev" dependency with bash.
```
apt-get update -y
apt-get install -y libpq-dev
```

5. Restore the R venv.
Type `R` to start an interactive R session and then:
```
renv::restore()
```

6. Install the cr-features module from https://repo.ijs.si/matjazbostic/calculatingfeatures.git, branch `modifications_for_rapids`.
Then follow the "cr-features module" section below.

7. Install all required packages from environment.yml; `--prune` also deletes conda packages not present in the environment file.
```
conda env update --file environment.yml --prune
```

8. If you wish to update your R or Python venvs:

In an interactive R session:
```
renv::snapshot()
```
For Python (conda):
```
conda env export --no-builds | sed 's/^.*libgfortran.*$/ - libgfortran/' | sed 's/^.*mkl=.*$/ - mkl/' > environment.yml
```

## cr-features module

This RAPIDS extension uses the cr-features library, accessible [here](https://repo.ijs.si/matjazbostic/calculatingfeatures).

To use the cr-features library:

- Follow the installation instructions in the [README.md](https://repo.ijs.si/matjazbostic/calculatingfeatures/-/blob/master/README.md).

- Copy the built calculatingfeatures folder into the RAPIDS workspace.

- Install the cr-features package:
```
pip install path/to/the/calculatingfeatures/folder
```
e.g., `pip install ./calculatingfeatures` if the folder is copied to the main parent directory.
The cr-features package has to be built and installed every time to get the newest version, or the newest version of the Docker image must be used.
Snakefile (141 changed lines)
@@ -33,6 +33,12 @@ for provider in config["PHONE_DATA_YIELD"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_data_yield.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_data_yield.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
    if config["PHONE_MESSAGES"]["PROVIDERS"][provider]["COMPUTE"]:
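The six added lines above reappear verbatim for each phone sensor in the hunks below, with only the sensor key changing. A minimal sketch of how the repetition could be factored out (illustrative only, not part of this diff; `add_standardized_targets` is a hypothetical name, and Snakemake's `expand` is assumed to be in scope, as it is in a Snakefile):

```python
# Hypothetical helper mirroring the block repeated per sensor below;
# sensor_key would be e.g. "PHONE_DATA_YIELD" or "PHONE_MESSAGES".
def add_standardized_targets(files_to_compute, config, sensor_key, provider):
    other = config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]
    if provider in other["LIST"] and other["COMPUTE"] \
            and config[sensor_key]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
        # Per-sensor standardized features, e.g. z_phone_data_yield.csv
        files_to_compute.extend(expand("data/processed/features/{pid}/z_" + sensor_key.lower() + ".csv", pid=config["PIDS"]))
        if config["STANDARDIZATION"]["MERGE_ALL"]:
            # Joint standardized features per participant and for all participants
            files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
            files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
```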
@@ -42,6 +48,12 @@ for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_messages.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_MESSAGES"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_messages.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
    if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -56,6 +68,12 @@ for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_CALLS"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_calls.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["PHONE_BLUETOOTH"]["PROVIDERS"].keys():
    if config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -65,6 +83,12 @@ for provider in config["PHONE_BLUETOOTH"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_bluetooth.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_bluetooth.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"].keys():
    if config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -77,6 +101,12 @@ for provider in config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_activity_recognition.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_activity_recognition.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys():
    if config["PHONE_BATTERY"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -88,6 +118,12 @@ for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_battery.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_BATTERY"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_battery.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
    if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -104,6 +140,12 @@ for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_screen.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_SCREEN"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_screen.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys():
    if config["PHONE_LIGHT"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -113,6 +155,12 @@ for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_light.csv", pid=config["PIDS"],))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_LIGHT"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_light.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["PHONE_ACCELEROMETER"]["PROVIDERS"].keys():
    if config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -136,6 +184,12 @@ for provider in config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_foreground.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_applications_foreground.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["PHONE_WIFI_VISIBLE"]["PROVIDERS"].keys():
    if config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -145,6 +199,12 @@ for provider in config["PHONE_WIFI_VISIBLE"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_visible.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_wifi_visible.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["PHONE_WIFI_CONNECTED"]["PROVIDERS"].keys():
    if config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -171,8 +231,14 @@ for provider in config["PHONE_ESM"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/interim/{pid}/phone_esm_clean.csv",pid=config["PIDS"]))
        files_to_compute.extend(expand("data/interim/{pid}/phone_esm_features/phone_esm_{language}_{provider_key}.csv",pid=config["PIDS"],language=get_script_language(config["PHONE_ESM"]["PROVIDERS"][provider]["SRC_SCRIPT"]),provider_key=provider.lower()))
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_esm.csv", pid=config["PIDS"]))
        #files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
        #files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        # files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
        # files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_ESM"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_esm.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

# We can delete these if's as soon as we add feature PROVIDERS to any of these sensors
if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):

@@ -238,6 +304,12 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
            and config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_locations.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["FITBIT_CALORIES_INTRADAY"]["PROVIDERS"].keys():
    if config["FITBIT_CALORIES_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -328,7 +400,14 @@ for provider in config["EMPATICA_ACCELEROMETER"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/empatica_accelerometer.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")

        if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"] \
            and config["EMPATICA_ACCELEROMETER"]["PROVIDERS"][provider]["WINDOWS"]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/interim/{pid}/empatica_accelerometer_features/z_empatica_accelerometer_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
            files_to_compute.extend(expand("data/processed/features/{pid}/z_empatica_accelerometer.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["EMPATICA_HEARTRATE"]["PROVIDERS"].keys():
    if config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["COMPUTE"]:
        files_to_compute.extend(expand("data/raw/{pid}/empatica_heartrate_raw.csv", pid=config["PIDS"]))

@@ -347,6 +426,13 @@ for provider in config["EMPATICA_TEMPERATURE"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/empatica_temperature.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"] \
            and config["EMPATICA_TEMPERATURE"]["PROVIDERS"][provider]["WINDOWS"]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/interim/{pid}/empatica_temperature_features/z_empatica_temperature_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
            files_to_compute.extend(expand("data/processed/features/{pid}/z_empatica_temperature.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"].keys():
    if config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -356,6 +442,13 @@ for provider in config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/empatica_electrodermal_activity.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"] \
            and config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["WINDOWS"]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/interim/{pid}/empatica_electrodermal_activity_features/z_empatica_electrodermal_activity_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
            files_to_compute.extend(expand("data/processed/features/{pid}/z_empatica_electrodermal_activity.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys():
    if config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -365,6 +458,13 @@ for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/empatica_blood_volume_pulse.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"] \
            and config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["WINDOWS"]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/interim/{pid}/empatica_blood_volume_pulse_features/z_empatica_blood_volume_pulse_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
            files_to_compute.extend(expand("data/processed/features/{pid}/z_empatica_blood_volume_pulse.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
    if config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -374,7 +474,14 @@ for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/empatica_inter_beat_interval.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")

        if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"] \
            and config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["WINDOWS"]["STANDARDIZE_FEATURES"]:
            files_to_compute.extend(expand("data/interim/{pid}/empatica_inter_beat_interval_features/z_empatica_inter_beat_interval_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
            files_to_compute.extend(expand("data/processed/features/{pid}/z_empatica_inter_beat_interval.csv", pid=config["PIDS"]))
            if config["STANDARDIZATION"]["MERGE_ALL"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
                files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")

if isinstance(config["EMPATICA_TAGS"]["PROVIDERS"], dict):
    for provider in config["EMPATICA_TAGS"]["PROVIDERS"].keys():
        if config["EMPATICA_TAGS"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -408,10 +515,26 @@ if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]:
# Data Cleaning
for provider in config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"].keys():
    if config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][provider]["COMPUTE"]:
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() +".csv", pid=config["PIDS"]))
        if provider == "STRAW":
            files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() + "_py.csv", pid=config["PIDS"]))
            if config["ALL_CLEANING_INDIVIDUAL"]["CLEAN_STANDARDIZED"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features_cleaned_" + provider.lower() + "_py.csv", pid=config["PIDS"]))
        else:
            files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() + "_R.csv", pid=config["PIDS"]))
            if config["ALL_CLEANING_INDIVIDUAL"]["CLEAN_STANDARDIZED"]:
                files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features_cleaned_" + provider.lower() + "_R.csv", pid=config["PIDS"]))

for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
    if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
        files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +".csv"))
        if provider == "STRAW":
            files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +"_py.csv"))
            if config["ALL_CLEANING_OVERALL"]["CLEAN_STANDARDIZED"]:
                files_to_compute.extend(expand("data/processed/features/all_participants/z_all_sensor_features_cleaned_" + provider.lower() +"_py.csv"))
        else:
            files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +"_R.csv"))
            if config["ALL_CLEANING_OVERALL"]["CLEAN_STANDARDIZED"]:
                files_to_compute.extend(expand("data/processed/features/all_participants/z_all_sensor_features_cleaned_" + provider.lower() +"_R.csv"))

# Baseline features
if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:

@@ -422,8 +545,10 @@ if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:

# Targets (labels)
if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
    files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
    files_to_compute.extend(expand("data/processed/models/population_model/input.csv"))
    # files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
    # files_to_compute.extend(expand("data/processed/models/population_model/input.csv"))
    files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/z_input.csv", pid=config["PIDS"]))
    files_to_compute.extend(expand("data/processed/models/population_model/z_input.csv"))

    #files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
config.yaml (147 changed lines)
@@ -16,7 +16,7 @@ CREATE_PARTICIPANT_FILES:
    ADD: False
    IGNORED_DEVICE_IDS: []
  EMPATICA_SECTION:
    ADD: False
    ADD: True
    IGNORED_DEVICE_IDS: []

# See https://www.rapids.science/latest/setup/configuration/#time-segments

@@ -93,6 +93,7 @@ PHONE_ACTIVITY_RECOGNITION:
        STATIONARY: ["still", "tilting"]
        MOBILE: ["on_foot", "walking", "running", "on_bicycle"]
        VEHICLE: ["in_vehicle"]
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_activity_recognition/rapids/main.py

# See https://www.rapids.science/latest/features/phone-applications-crashes/

@@ -133,6 +134,7 @@ PHONE_APPLICATIONS_FOREGROUND:
        APP_EPISODES: ["countepisode", "minduration", "maxduration", "meanduration", "sumduration"]
      IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable
      IGNORE_EPISODES_LONGER_THAN: 300 # in minutes, set to 0 to disable
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_applications_foreground/rapids/main.py

# See https://www.rapids.science/latest/features/phone-applications-notifications/

@@ -153,6 +155,7 @@ PHONE_BATTERY:
    RAPIDS:
      COMPUTE: True
      FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_battery/rapids/main.py

# See https://www.rapids.science/latest/features/phone-bluetooth/

@@ -162,6 +165,7 @@ PHONE_BLUETOOTH:
    RAPIDS:
      COMPUTE: True
      FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R

    DORYAB:

@@ -179,6 +183,7 @@ PHONE_BLUETOOTH:
        DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
        SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
        SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_bluetooth/doryab/main.py

# See https://www.rapids.science/latest/features/phone-calls/

@@ -193,6 +198,7 @@ PHONE_CALLS:
        missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
        incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
        outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_calls/rapids/main.R

# See https://www.rapids.science/latest/features/phone-conversation/

@@ -232,6 +238,7 @@ PHONE_DATA_YIELD:
      COMPUTE: True
      FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
      MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R

PHONE_ESM:

@@ -241,6 +248,7 @@ PHONE_ESM:
      COMPUTE: True
      SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support"]
      FEATURES: [mean]
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_esm/straw/main.py

# See https://www.rapids.science/latest/features/phone-keyboard/

@@ -259,6 +267,7 @@ PHONE_LIGHT:
    RAPIDS:
      COMPUTE: True
      FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_light/rapids/main.py

# See https://www.rapids.science/latest/features/phone-locations/

@@ -283,6 +292,7 @@ PHONE_LOCATIONS:
      MINIMUM_DAYS_TO_DETECT_HOME_CHANGES: 3
      CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS
      RADIUS_FOR_HOME: 100
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_locations/doryab/main.py

    BARNETT:

@@ -290,6 +300,7 @@ PHONE_LOCATIONS:
      FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
      IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
      MINUTES_DATA_USED: False # Use this for quality control purposes; how many minutes of data (location coordinates grouped by minute) were used to compute features
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_locations/barnett/main.R

# See https://www.rapids.science/latest/features/phone-log/

@@ -309,6 +320,7 @@ PHONE_MESSAGES:
      FEATURES:
        received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
        sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_messages/rapids/main.R

# See https://www.rapids.science/latest/features/phone-screen/

@@ -322,6 +334,7 @@ PHONE_SCREEN:
      IGNORE_EPISODES_LONGER_THAN: 360 # in minutes, set to 0 to disable
      FEATURES: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration", "firstuseafter"] # "episodepersensedminutes" needs to be added later
      EPISODE_TYPES: ["unlock"]
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_screen/rapids/main.py

# See https://www.rapids.science/latest/features/phone-wifi-connected/

@@ -340,6 +353,7 @@ PHONE_WIFI_VISIBLE:
    RAPIDS:
      COMPUTE: True
      FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R


@@ -485,6 +499,7 @@ FITBIT_STEPS_INTRADAY:
      INCLUDE_ZERO_STEP_ROWS: False
      SRC_SCRIPT: src/features/fitbit_steps_intraday/rapids/main.py


########################################################################################################################
# EMPATICA                                                                                                             #
########################################################################################################################

@@ -506,6 +521,16 @@ EMPATICA_ACCELEROMETER:
      COMPUTE: False
      FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
      SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
    CR:
      COMPUTE: True
      FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features
      WINDOWS:
        COMPUTE: True
        WINDOW_LENGTH: 15 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
        STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py


# See https://www.rapids.science/latest/features/empatica-heartrate/
EMPATICA_HEARTRATE:

@@ -524,6 +549,16 @@ EMPATICA_TEMPERATURE:
      COMPUTE: False
      FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
      SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
    CR:
      COMPUTE: True
      FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean",
                 "stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"]
      WINDOWS:
        COMPUTE: True
        WINDOW_LENGTH: 300 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
        STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/empatica_temperature/cr/main.py

# See https://www.rapids.science/latest/features/empatica-electrodermal-activity/
EMPATICA_ELECTRODERMAL_ACTIVITY:

@@ -533,6 +568,20 @@ EMPATICA_ELECTRODERMAL_ACTIVITY:
      COMPUTE: False
      FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
      SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
    CR:
      COMPUTE: True
      FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic',
                 'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore',
                 'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio',
                 'avgPeakIncreaseTime', 'avgPeakDecreaseTime', 'avgPeakDuration', 'signalOverallChange', 'changeDuration', 'changeRate', 'significantIncrease',
                 'significantDecrease']
      WINDOWS:
        COMPUTE: True
        WINDOW_LENGTH: 60 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', count_windows, eda_num_peaks_non_zero]
        STANDARDIZE_FEATURES: True
      IMPUTE_NANS: True
      SRC_SCRIPT: src/features/empatica_electrodermal_activity/cr/main.py

# See https://www.rapids.science/latest/features/empatica-blood-volume-pulse/
EMPATICA_BLOOD_VOLUME_PULSE:

@@ -542,6 +591,16 @@ EMPATICA_BLOOD_VOLUME_PULSE:
      COMPUTE: False
      FEATURES: ["maxbvp", "minbvp", "avgbvp", "medianbvp", "modebvp", "stdbvp", "diffmaxmodebvp", "diffminmodebvp", "entropybvp"]
      SRC_SCRIPT: src/features/empatica_blood_volume_pulse/dbdp/main.py
    CR:
      COMPUTE: False
      FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
                 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
      WINDOWS:
        COMPUTE: True
        WINDOW_LENGTH: 300 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
        STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py

# See https://www.rapids.science/latest/features/empatica-inter-beat-interval/
EMPATICA_INTER_BEAT_INTERVAL:

@@ -551,6 +610,17 @@ EMPATICA_INTER_BEAT_INTERVAL:
      COMPUTE: False
      FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
      SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
    CR:
      COMPUTE: True
      FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
                 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
      PATCH_WITH_BVP: True
      WINDOWS:
        COMPUTE: True
        WINDOW_LENGTH: 300 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
        STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py

# See https://www.rapids.science/latest/features/empatica-tags/
EMPATICA_TAGS:

@@ -566,7 +636,7 @@ EMPATICA_TAGS:

# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#1-histograms-of-phone-data-yield
HISTOGRAM_PHONE_DATA_YIELD:
  PLOT: True
  PLOT: False

# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#2-heatmaps-of-overall-data-yield
HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:

@@ -575,7 +645,7 @@ HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:

# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#3-heatmap-of-recorded-phone-sensors
HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT:
  PLOT: True
  PLOT: False

# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#4-heatmap-of-sensor-row-count
HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:

@@ -586,7 +656,7 @@ HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:

# See https://www.rapids.science/latest/visualizations/feature-visualizations/#1-heatmap-correlation-matrix
HEATMAP_FEATURE_CORRELATION_MATRIX:
  PLOT: True
  PLOT: False
  MIN_ROWS_RATIO: 0.5
  CORR_THRESHOLD: 0.1
  CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}

@@ -597,43 +667,94 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
########################################################################################################################

ALL_CLEANING_INDIVIDUAL:
  CLEAN_STANDARDIZED: True
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      IMPUTE_SELECTED_EVENT_FEATURES:
        COMPUTE: True
        COMPUTE: False
        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
      COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
      COLS_NAN_THRESHOLD: 1 # set to 1 to disable
      COLS_VAR_THRESHOLD: True
      ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
      DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
      DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable
      DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
      DROP_HIGHLY_CORRELATED_FEATURES:
        COMPUTE: False
        COMPUTE: True
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
    STRAW: # currently the same as the RAPIDS provider, with a change in selecting the imputation type
      COMPUTE: True
      IMPUTE_PHONE_SELECTED_EVENT_FEATURES:
        COMPUTE: False
        TYPE: median # options: zero, mean, median or k-nearest
        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
      COLS_NAN_THRESHOLD: 1 # set to 1 to disable
      COLS_VAR_THRESHOLD: True
      ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
      DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
      DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
      DROP_HIGHLY_CORRELATED_FEATURES:
        COMPUTE: True
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      SRC_SCRIPT: src/features/all_cleaning_individual/straw/main.py

ALL_CLEANING_OVERALL:
  CLEAN_STANDARDIZED: True
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      IMPUTE_SELECTED_EVENT_FEATURES:
        COMPUTE: True
        COMPUTE: False
        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
      COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
      COLS_NAN_THRESHOLD: 1 # set to 1 to disable
      COLS_VAR_THRESHOLD: True
      ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
      DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
      DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable
      DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
      DROP_HIGHLY_CORRELATED_FEATURES:
        COMPUTE: False
        COMPUTE: True
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
    STRAW: # currently the same as the RAPIDS provider, with a change in selecting the imputation type
      COMPUTE: True
      IMPUTE_PHONE_SELECTED_EVENT_FEATURES:
        COMPUTE: False
        TYPE: median # options: zero, mean, median or k-nearest
        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
      COLS_NAN_THRESHOLD: 1 # set to 1 to disable
      COLS_VAR_THRESHOLD: True
      ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
      DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
      DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
      DROP_HIGHLY_CORRELATED_FEATURES:
        COMPUTE: True
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      SRC_SCRIPT: src/features/all_cleaning_overall/straw/main.py


########################################################################################################################
# Analysis Workflow Example                                                                                            #
# Z-score standardization                                                                                              #
########################################################################################################################

STANDARDIZATION: # Standardization for both providers is executed even if only one of the two providers is marked COMPUTE: True
  MERGE_ALL: True # Creates the joint standardized file for each participant and all participants. Similar to the merge_sensor_features_for_all_participants rule
  PROVIDERS:
    CR:
      COMPUTE: True
      SRC_SCRIPT: src/features/standardization/main.py
    OTHER:
      COMPUTE: True
      LIST: [RAPIDS, DORYAB, BARNETT, STRAW]
      SRC_SCRIPT: src/features/standardization/main.py


########################################################################################################################
# Baseline                                                                                                             #
########################################################################################################################

PARAMS_FOR_ANALYSIS:
@@ -0,0 +1,9 @@
"_id","timestamp","device_id","call_type","call_duration","trace"
1,1587663260695,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,14,"d5e84f8af01b2728021d4f43f53a163c0c90000c"
2,1587739118007,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",3,0,"47c125dc7bd163b8612cdea13724a814917b6e93"
5,1587746544891,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,95,"9cc793ffd6e88b1d850ce540b5d7e000ef5650d4"
6,1587911379859,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,63,"51fb9344e988049a3fec774c7ca622358bf80264"
7,1587992647361,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",3,0,"2a862a7730cfdfaf103a9487afe3e02935fd6e02"
8,1588020039448,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",1,11,"a2c53f6a086d98622c06107780980cf1bb4e37bd"
11,1588176189024,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,65,"56589df8c830c70e330b644921ed38e08d8fd1f3"
12,1588197745079,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",3,0,"cab458018a8ed3b626515e794c70b6f415318adc"
Binary file not shown.
@@ -0,0 +1,57 @@
label,empatica_id
uploader_79170,A0245B
uploader_89788,A02731
uploader_68294,A02705
uploader_92856,A024AF
uploader_23726,A0231C
uploader_66620,A02305
uploader_58435,A026B5
uploader_87801,A022A8
uploader_96055,A027BA
uploader_69549,A0226C
uploader_26363,A0263D
uploader_72010,A023FA
uploader_13997,A024AF
uploader_31156,A02305
uploader_63187,A027BA
uploader_94821,A022A8
uploader_65413,A023F1;A023FA
uploader_36488,A02713
uploader_91087,A0231C
uploader_35174,A025D1
uploader_73880,A02705
uploader_78650,A02731
uploader_70578,A0245B
uploader_88313,A02736
uploader_58482,A0261A
uploader_80601,A027BA
uploader_93729,A0226C
uploader_61663,A0245B
uploader_80848,A025D1
uploader_57312,A023F9;A02361;A027A0
uploader_52087,A02666
uploader_98770,A02953
uploader_51327,A0245F
uploader_11737,A02732
uploader_77440,A0264E
uploader_57277,A02422
uploader_13098,A026E5
uploader_80719,A023C8
uploader_54698,A02953
uploader_95571,A02853
uploader_21880,A024DC
uploader_92905,A02920
uploader_12108,A023F4
uploader_17436,A026E5
uploader_58440,A0273F
uploader_22172,A0245F
uploader_39250,A02422
uploader_15311,A023F9
uploader_45766,A02920
uploader_23096,A02361
uploader_78243,A02422
uploader_58777,A0245F
uploader_82941,A02666
uploader_89606,A023F4
uploader_82969,A023C8
uploader_53573,A024DC;A02361
@@ -0,0 +1,11 @@
PHONE:
  DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524] # the participant's AWARE device id
  PLATFORMS: [android] # or ios
  LABEL: MyTestP01 # any string
  START_DATE: 2020-01-01 # this can also be empty
  END_DATE: 2021-01-01 # this can also be empty
EMPATICA:
  DEVICE_IDS: [empatica1]
  LABEL: test01
  START_DATE:
  END_DATE:
@@ -1,2 +1,2 @@
label,length
thirtyminutes,30
fiveminutes,5
@@ -1,9 +1,2 @@
label,start_time,length,repeats_on,repeats_value
threeday,00:00:00,2D 23H 59M 59S,every_day,0
daily,00:00:00,23H 59M 59S,every_day,0
morning,06:00:00,5H 59M 59S,every_day,0
afternoon,12:00:00,5H 59M 59S,every_day,0
evening,18:00:00,5H 59M 59S,every_day,0
night,00:00:00,5H 59M 59S,every_day,0
two_weeks_overlapping,00:00:00,13D 23H 59M 59S,every_day,0
weekends,00:00:00,2D 23H 59M 59S,wday,5
File diff suppressed because it is too large
environment.yml (172 changed lines)
|
@ -3,114 +3,138 @@ channels:
|
 - conda-forge
 - defaults
 dependencies:
 - _libgcc_mutex=0.1
 - _openmp_mutex=4.5
 - _py-xgboost-mutex=2.0
-- appdirs=1.4.*
+- appdirs=1.4.4
 - arrow=0.16.0
-- asn1crypto=1.4.*
-- astropy=4.2.*
-- attrs=20.3.*
-- binaryornot=0.4.*
+- asn1crypto=1.4.0
+- astropy=4.2.1
+- attrs=20.3.0
+- binaryornot=0.4.4
 - blas=1.0
-- brotlipy=0.7.*
-- bzip2=1.0.*
-- ca-certificates
-- certifi
+- brotlipy=0.7.0
+- bzip2=1.0.8
+- ca-certificates=2021.7.5
+- certifi=2021.5.30
 - cffi=1.14.4
-- chardet=3.0.*
-- click=7.1.*
-- cookiecutter=1.6.*
-- cryptography=3.3.*
-- datrie=0.8.*
+- chardet=3.0.4
+- click=7.1.2
+- colorama=0.4.4
+- cookiecutter=1.6.0
+- cryptography=3.3.1
+- datrie=0.8.2
 - docutils=0.16
 - future=0.18.2
-- gitdb=4.0.*
-- gitdb2=4.0.*
-- gitpython=3.1.*
+- gitdb=4.0.5
+- gitdb2=4.0.2
+- gitpython=3.1.11
 - idna=2.10
-- imbalanced-learn=0.6.*
-- importlib-metadata=2.0.*
-- importlib_metadata=2.0.*
+- imbalanced-learn=0.6.2
+- importlib-metadata=2.0.0
+- importlib_metadata=2.0.0
 - intel-openmp=2019.4
 - jinja2=2.11.2
-- jinja2-time=0.2.*
-- joblib=1.0.*
-- jsonschema=3.2.*
-- libblas=3.8.*
-- libcblas=3.8.*
-- libcxx=10.0.*
-- libedit=3.1.*
+- jinja2-time=0.2.0
+- joblib=1.0.0
+- jsonschema=3.2.0
+- ld_impl_linux-64=2.36.1
+- libblas=3.8.0
+- libcblas=3.8.0
+- libcxx=10.0.0
+- libcxxabi=10.0.0
+- libedit=3.1.20191231
 - libffi=3.3
 - libgcc-ng=11.2.0
-- libgfortran
-- liblapack=3.8.*
-- libopenblas=0.3.*
+- libgfortran
+- libgfortran
+- liblapack=3.8.0
+- libopenblas=0.3.10
+- libstdcxx-ng=11.2.0
 - libxgboost=0.90
-- lightgbm=3.1.*
-- llvm-openmp=10.0.*
-- markupsafe=1.1.*
+- libzlib=1.2.11
+- lightgbm=3.1.1
+- llvm-openmp=10.0.0
+- markupsafe=1.1.1
 - mkl
-- mkl-service=2.3.*
-- mkl_fft=1.2.*
-- mkl_random=1.1.*
-- more-itertools=8.6.*
+- mkl-service=2.3.0
+- mkl_fft=1.2.0
+- mkl_random=1.1.1
+- more-itertools=8.6.0
 - ncurses=6.2
 - numpy=1.19.2
 - numpy-base=1.19.2
-- openblas=0.3.*
-- openssl
-- pandas=1.1.*
-- pbr=5.5.*
-- pip=20.3.*
+- openblas=0.3.4
+- openssl=1.1.1k
+- pandas=1.1.5
+- pbr=5.5.1
+- pip=20.3.3
 - plotly=4.14.1
-- poyo=0.5.*
-- psutil=5.7.*
-- psycopg2
+- poyo=0.5.0
+- psutil=5.7.2
 - py-xgboost=0.90
 - pycparser=2.20
-- pyerfa=1.7.*
-- pyopenssl=20.0.*
-- pyprojroot
-- pysocks=1.7.*
-- python=3.7.*
-- python-dateutil=2.8.*
-- python-dotenv
+- pyerfa=1.7.1.1
+- pyopenssl=20.0.1
+- pysocks=1.7.1
+- python=3.7.9
+- python-dateutil=2.8.1
 - python_abi=3.7
 - pytz=2020.4
-- pyyaml=5.3.*
+- pyyaml=5.3.1
 - readline=8.0
 - requests=2.25.0
-- retrying=1.3.*
+- retrying=1.3.3
 - scikit-learn=0.23.2
-- scipy=1.5.*
-- setuptools=51.0.*
+- scipy=1.5.2
+- setuptools=51.0.0
 - six=1.15.0
-- smmap=3.0.*
-- smmap2=3.0.*
-- sqlalchemy
+- smmap=3.0.4
+- smmap2=3.0.1
 - sqlite=3.33.0
-- threadpoolctl=2.1.*
-- tk=8.6.*
+- threadpoolctl=2.1.0
+- tk=8.6.10
 - tqdm=4.62.0
 - urllib3=1.25.11
 - wheel=0.36.2
-- whichcraft=0.6.*
+- whichcraft=0.6.1
 - wrapt=1.12.1
 - xgboost=0.90
-- xz=5.2.*
-- yaml=0.2.*
-- zipp=3.4.*
-- zlib=1.2.*
+- xz=5.2.5
+- yaml=0.2.5
+- zipp=3.4.0
+- zlib=1.2.11
 - pip:
-  - amply==0.1.*
+  - amply==0.1.4
+  - bidict==0.22.0
+  - biosppy==0.8.0
+  - cached-property==1.5.2
   - configargparse==0.15.1
-  - decorator==4.4.*
-  - ipython-genutils==0.2.*
-  - jupyter-core==4.6.*
-  - nbformat==5.0.*
+  - cr-features==0.2.1
+  - cycler==0.11.0
+  - decorator==4.4.2
+  - fonttools==4.33.2
+  - h5py==3.6.0
+  - hmmlearn==0.2.7
+  - ipython-genutils==0.2.0
+  - jupyter-core==4.6.3
+  - kiwisolver==1.4.2
+  - matplotlib==3.5.1
+  - nbformat==5.0.7
+  - opencv-python==4.5.5.64
+  - packaging==21.3
+  - peakutils==1.3.3
+  - pillow==9.1.0
   - pulp==2.4
-  - pyparsing==2.4.*
+  - pyparsing==2.4.7
   - pyrsistent==0.15.5
-  - ratelimiter==1.2.*
+  - pywavelets==1.3.0
+  - ratelimiter==1.2.0.post0
+  - seaborn==0.11.2
+  - shortuuid==1.0.8
   - snakemake==5.30.2
   - toposort==1.5
-  - traitlets==4.3.*
-prefix: /usr/local/Caskroom/miniconda/base/envs/rapids202108
+  - traitlets==4.3.3
+  - typing-extensions==4.2.0
+prefix: /opt/conda/envs/rapids
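The hunk above swaps the loosely pinned macOS environment (wildcard versions, `/usr/local/Caskroom/...` prefix) for an exactly pinned Linux one under `/opt/conda/envs/rapids`, and adds the `cr-features` stack on the pip side. A minimal sketch for cross-checking an activated environment against these pins; the `environment.yml` path is an assumption, and it uses the `importlib-metadata` backport pinned above (Python here is 3.7, which lacks `importlib.metadata`):

```python
# Sketch only: compare installed package versions against the exact conda pins.
import yaml  # pyyaml is pinned above
from importlib_metadata import version, PackageNotFoundError  # backport pinned above

with open("environment.yml") as f:  # path is an assumption
    env = yaml.safe_load(f)

for dep in env.get("dependencies", []):
    if isinstance(dep, str) and "=" in dep:
        name, _, pinned = dep.partition("=")
        try:
            installed = version(name)
        except PackageNotFoundError:
            continue  # conda-only packages (tk, xz, ...) are not importable
        if installed != pinned.lstrip("="):
            print(f"{name}: pinned {pinned}, installed {installed}")
```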
@@ -14,9 +14,6 @@ local({
   # signal that we're loading renv during R startup
   Sys.setenv("RENV_R_INITIALIZING" = "true")
   on.exit(Sys.unsetenv("RENV_R_INITIALIZING"), add = TRUE)
 
-  if(grepl("Darwin", Sys.info()["sysname"], fixed = TRUE) & grepl("ARM64", Sys.info()["version"], fixed = TRUE)) # M1 Macs
-    Sys.setenv("TZDIR" = file.path(R.home(), "share", "zoneinfo"))
-
   # signal that we've consented to use renv
   options(renv.consent = TRUE)
@@ -40,6 +40,26 @@ def find_features_files(wildcards):
             feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=wildcards.sensor_key.lower(), language=get_script_language(provider["SRC_SCRIPT"]), provider_key=provider_key.lower()))
     return(feature_files)
 
+def find_empaticas_standardized_features_files(wildcards):
+    feature_files = []
+    if "empatica" in wildcards.sensor_key:
+        for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():
+            if provider["COMPUTE"] and provider.get("WINDOWS", False) and provider["WINDOWS"]["COMPUTE"]:
+                if "empatica" in wildcards.sensor_key:
+                    feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/z_{sensor_key}_{language}_{provider_key}.csv", sensor_key=wildcards.sensor_key.lower(), language=get_script_language(provider["SRC_SCRIPT"]), provider_key=provider_key.lower()))
+    return(feature_files)
+
+def find_joint_non_empatica_sensor_files(wildcards):
+    joined_files = []
+    for config_key in config.keys():
+        if config_key.startswith(("PHONE", "FITBIT")) and "PROVIDERS" in config[config_key] and isinstance(config[config_key]["PROVIDERS"], dict):
+            for provider_key, provider in config[config_key]["PROVIDERS"].items():
+                if "COMPUTE" in provider.keys() and provider["COMPUTE"]:
+                    joined_files.append("data/processed/features/{pid}/" + config_key.lower() + ".csv")
+                    break
+    return joined_files
+
+
 def optional_steps_sleep_input(wildcards):
     if config["FITBIT_STEPS_INTRADAY"]["EXCLUDE_SLEEP"]["FITBIT_BASED"]["EXCLUDE"]:
         return "data/raw/{pid}/fitbit_sleep_summary_raw.csv"
@@ -62,6 +82,18 @@ def input_merge_sensor_features_for_individual_participants(wildcards):
                 break
     return feature_files
 
+def input_merge_standardized_sensor_features_for_individual_participants(wildcards):
+    feature_files = []
+    for config_key in config.keys():
+        if config_key.startswith(("PHONE", "FITBIT", "EMPATICA")) and "PROVIDERS" in config[config_key] and isinstance(config[config_key]["PROVIDERS"], dict):
+            for provider_key, provider in config[config_key]["PROVIDERS"].items():
+                if "COMPUTE" in provider.keys() and provider["COMPUTE"] and ("STANDARDIZE_FEATURES" in provider.keys() and provider["STANDARDIZE_FEATURES"] or
+                        "WINDOWS" in provider.keys() and "STANDARDIZE_FEATURES" in provider["WINDOWS"].keys() and provider["WINDOWS"]["STANDARDIZE_FEATURES"]):
+                    feature_files.append("data/processed/features/{pid}/z_" + config_key.lower() + ".csv")
+                    break
+
+    return feature_files
+
 def get_phone_sensor_names():
     phone_sensor_names = []
     for config_key in config.keys():
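These new input functions all follow one pattern: scan the `config.yaml` sections by prefix, keep providers whose `COMPUTE` (and, for the standardized variant, `STANDARDIZE_FEATURES`) flag is on, and emit one expected feature-file path per matching sensor, which Snakemake then resolves into rule dependencies. A standalone sketch of that traversal with a made-up config dict (the real config is injected by Snakemake):

```python
# Made-up config dict; in RAPIDS this comes from config.yaml via Snakemake.
config = {
    "PHONE_BATTERY": {"PROVIDERS": {"RAPIDS": {"COMPUTE": True}}},
    "FITBIT_STEPS_INTRADAY": {"PROVIDERS": {"RAPIDS": {"COMPUTE": False}}},
    "EMPATICA_TEMPERATURE": {"PROVIDERS": {"CR": {"COMPUTE": True}}},  # skipped: not PHONE/FITBIT
}

joined_files = []
for config_key, section in config.items():
    if config_key.startswith(("PHONE", "FITBIT")) and isinstance(section.get("PROVIDERS"), dict):
        for provider in section["PROVIDERS"].values():
            if provider.get("COMPUTE"):
                joined_files.append("data/processed/features/{pid}/" + config_key.lower() + ".csv")
                break  # one joint file per sensor, however many providers it has

print(joined_files)  # ['data/processed/features/{pid}/phone_battery.csv']
```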
@@ -32,7 +32,7 @@ rule phone_data_yield_r_features:
     output:
         "data/interim/{pid}/phone_data_yield_features/phone_data_yield_r_{provider_key}.csv"
     script:
-        "../src/features/entry.R"
+        "../src/features/entry.R"
 
 rule phone_accelerometer_python_features:
     input:
@@ -791,10 +791,25 @@ rule empatica_accelerometer_python_features:
         provider_key = "{provider_key}",
         sensor_key = "empatica_accelerometer"
     output:
-        "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}.csv"
+        "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}_windows.csv"
     script:
         "../src/features/entry.py"
 
+rule empatica_accelerometer_python_features_standardization:
+    input:
+        windows_features_data = "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}_windows.csv"
+    params:
+        provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
+        provider_key = "{provider_key}",
+        sensor_key = "empatica_accelerometer",
+        provider_main = config["EMPATICA_ACCELEROMETER"]["PROVIDERS"]["CR"]
+    output:
+        "data/interim/{pid}/empatica_accelerometer_features/z_empatica_accelerometer_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_accelerometer_features/z_empatica_accelerometer_python_{provider_key}_windows.csv"
+    script:
+        "../src/features/standardization/main.py"
+
 rule empatica_accelerometer_r_features:
     input:
         sensor_data = "data/raw/{pid}/empatica_accelerometer_with_datetime.csv",
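Each `*_features_standardization` rule takes the `_windows.csv` produced above and writes `z_`-prefixed copies via `../src/features/standardization/main.py`, which is not part of this diff. A hedged sketch of what a z-score pass over the feature columns could look like (the id-column names follow the RAPIDS segment convention; the exact behaviour of `main.py` is an assumption):

```python
# Hedged sketch, not the repository's standardization/main.py.
import pandas as pd

ID_COLS = ["local_segment", "local_segment_label",
           "local_segment_start_datetime", "local_segment_end_datetime"]

def standardize(features: pd.DataFrame) -> pd.DataFrame:
    out = features.copy()
    numeric = out.select_dtypes("number").columns.difference(ID_COLS)
    out[numeric] = (out[numeric] - out[numeric].mean()) / out[numeric].std()
    return out
```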
@@ -817,7 +832,8 @@ rule empatica_heartrate_python_features:
         provider_key = "{provider_key}",
         sensor_key = "empatica_heartrate"
     output:
-        "data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}.csv"
+        "data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}_windows.csv"
     script:
         "../src/features/entry.py"
@@ -843,10 +859,25 @@ rule empatica_temperature_python_features:
         provider_key = "{provider_key}",
         sensor_key = "empatica_temperature"
     output:
-        "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}.csv"
+        "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}_windows.csv"
     script:
         "../src/features/entry.py"
 
+rule empatica_temperature_python_features_standardization:
+    input:
+        windows_features_data = "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}_windows.csv"
+    params:
+        provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
+        provider_key = "{provider_key}",
+        sensor_key = "empatica_temperature",
+        provider_main = config["EMPATICA_TEMPERATURE"]["PROVIDERS"]["CR"]
+    output:
+        "data/interim/{pid}/empatica_temperature_features/z_empatica_temperature_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_temperature_features/z_empatica_temperature_python_{provider_key}_windows.csv"
+    script:
+        "../src/features/standardization/main.py"
+
 rule empatica_temperature_r_features:
     input:
         sensor_data = "data/raw/{pid}/empatica_temperature_with_datetime.csv",
@@ -869,10 +900,25 @@ rule empatica_electrodermal_activity_python_features:
         provider_key = "{provider_key}",
         sensor_key = "empatica_electrodermal_activity"
     output:
-        "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}.csv"
+        "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}_windows.csv"
     script:
         "../src/features/entry.py"
 
+rule empatica_electrodermal_activity_python_features_standardization:
+    input:
+        windows_features_data = "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}_windows.csv"
+    params:
+        provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
+        provider_key = "{provider_key}",
+        sensor_key = "empatica_electrodermal_activity",
+        provider_main = config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"]["CR"]
+    output:
+        "data/interim/{pid}/empatica_electrodermal_activity_features/z_empatica_electrodermal_activity_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_electrodermal_activity_features/z_empatica_electrodermal_activity_python_{provider_key}_windows.csv"
+    script:
+        "../src/features/standardization/main.py"
+
 rule empatica_electrodermal_activity_r_features:
     input:
         sensor_data = "data/raw/{pid}/empatica_electrodermal_activity_with_datetime.csv",
@@ -895,10 +941,25 @@ rule empatica_blood_volume_pulse_python_features:
         provider_key = "{provider_key}",
         sensor_key = "empatica_blood_volume_pulse"
     output:
-        "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}.csv"
+        "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
     script:
         "../src/features/entry.py"
 
+rule empatica_blood_volume_pulse_python_cr_features_standardization:
+    input:
+        windows_features_data = "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
+    params:
+        provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
+        provider_key = "{provider_key}",
+        sensor_key = "empatica_blood_volume_pulse",
+        provider_main = config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"]["CR"]
+    output:
+        "data/interim/{pid}/empatica_blood_volume_pulse_features/z_empatica_blood_volume_pulse_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_blood_volume_pulse_features/z_empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
+    script:
+        "../src/features/standardization/main.py"
+
 rule empatica_blood_volume_pulse_r_features:
     input:
         sensor_data = "data/raw/{pid}/empatica_blood_volume_pulse_with_datetime.csv",
@@ -921,10 +982,25 @@ rule empatica_inter_beat_interval_python_features:
         provider_key = "{provider_key}",
         sensor_key = "empatica_inter_beat_interval"
     output:
-        "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}.csv"
+        "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}_windows.csv"
     script:
         "../src/features/entry.py"
 
+rule empatica_inter_beat_interval_python_features_standardization:
+    input:
+        windows_features_data = "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}_windows.csv"
+    params:
+        provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
+        provider_key = "{provider_key}",
+        sensor_key = "empatica_inter_beat_interval",
+        provider_main = config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"]["CR"]
+    output:
+        "data/interim/{pid}/empatica_inter_beat_interval_features/z_empatica_inter_beat_interval_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_inter_beat_interval_features/z_empatica_inter_beat_interval_python_{provider_key}_windows.csv"
+    script:
+        "../src/features/standardization/main.py"
+
 rule empatica_inter_beat_interval_r_features:
     input:
         sensor_data = "data/raw/{pid}/empatica_inter_beat_interval_with_datetime.csv",
@@ -972,6 +1048,38 @@ rule merge_sensor_features_for_individual_participants:
     script:
         "../src/features/utils/merge_sensor_features_for_individual_participants.R"
 
+rule join_standardized_features_from_empatica:
+    input:
+        sensor_features = find_empaticas_standardized_features_files
+    wildcard_constraints:
+        sensor_key = '(empatica).*'
+    output:
+        "data/processed/features/{pid}/z_{sensor_key}.csv"
+    script:
+        "../src/features/utils/join_features_from_providers.R"
+
+rule standardize_features_from_providers_no_empatica:
+    input:
+        sensor_features = find_joint_non_empatica_sensor_files
+    wildcard_constraints:
+        sensor_key = '(phone|fitbit).*'
+    params:
+        provider = config["STANDARDIZATION"]["PROVIDERS"]["OTHER"],
+        provider_key = "OTHER",
+        sensor_key = "{sensor_key}"
+    output:
+        "data/processed/features/{pid}/z_{sensor_key}.csv"
+    script:
+        "../src/features/standardization/main.py"
+
+rule merge_standardized_sensor_features_for_individual_participants:
+    input:
+        feature_files = input_merge_standardized_sensor_features_for_individual_participants
+    output:
+        "data/processed/features/{pid}/z_all_sensor_features.csv"
+    script:
+        "../src/features/utils/merge_sensor_features_for_individual_participants.R"
+
 rule merge_sensor_features_for_all_participants:
     input:
         feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])
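`join_standardized_features_from_empatica` concatenates the per-provider `z_` files column-wise into one file per sensor; the actual work happens in `../src/features/utils/join_features_from_providers.R`, which this diff does not show. A hedged Python sketch of the equivalent outer merge on the segment key columns:

```python
# Hedged sketch of joining per-provider feature files on the segment keys;
# the repository implements this step in R.
from functools import reduce
import pandas as pd

KEYS = ["local_segment", "local_segment_label",
        "local_segment_start_datetime", "local_segment_end_datetime"]

def join_providers(paths):
    frames = [pd.read_csv(p) for p in paths]
    return reduce(lambda a, b: a.merge(b, on=KEYS, how="outer"), frames)
```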
@@ -980,6 +1088,14 @@ rule merge_sensor_features_for_all_participants:
     script:
         "../src/features/utils/merge_sensor_features_for_all_participants.R"
 
+rule merge_standardized_sensor_features_for_all_participants:
+    input:
+        feature_files = expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"])
+    output:
+        "data/processed/features/all_participants/z_all_sensor_features.csv"
+    script:
+        "../src/features/utils/merge_standardized_sensor_features_for_all_participants.R"
+
 rule clean_sensor_features_for_individual_participants:
     input:
         sensor_data = rules.merge_sensor_features_for_individual_participants.output
@@ -988,11 +1104,12 @@ rule clean_sensor_features_for_individual_participants:
     params:
         provider = lambda wildcards: config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][wildcards.provider_key.upper()],
         provider_key = "{provider_key}",
-        sensor_key = "all_cleaning_individual"
+        script_extension = "{script_extension}",
+        sensor_key = "all_cleaning_individual"
     output:
-        "data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}.csv"
+        "data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}_{script_extension}.csv" # this will cause problems down the line (how to find the files + standardization etc.)
     script:
-        "../src/features/entry.R"
+        "../src/features/entry.{params.script_extension}"
 
 rule clean_sensor_features_for_all_participants:
     input:
@@ -1000,9 +1117,38 @@ rule clean_sensor_features_for_all_participants:
     params:
         provider = lambda wildcards: config["ALL_CLEANING_OVERALL"]["PROVIDERS"][wildcards.provider_key.upper()],
         provider_key = "{provider_key}",
+        script_extension = "{script_extension}",
         sensor_key = "all_cleaning_overall"
     output:
-        "data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}.csv"
+        "data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}_{script_extension}.csv"
     script:
-        "../src/features/entry.R"
+        "../src/features/entry.{params.script_extension}"
+
+rule clean_standardized_sensor_features_for_individual_participants:
+    input:
+        sensor_data = rules.merge_standardized_sensor_features_for_individual_participants.output
+    wildcard_constraints:
+        pid = "("+"|".join(config["PIDS"])+")"
+    params:
+        provider = lambda wildcards: config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider_key = "{provider_key}",
+        script_extension = "{script_extension}",
+        sensor_key = "all_cleaning_individual"
+    output:
+        "data/processed/features/{pid}/z_all_sensor_features_cleaned_{provider_key}_{script_extension}.csv"
+    script:
+        "../src/features/entry.{params.script_extension}"
+
+rule clean_standardized_sensor_features_for_all_participants:
+    input:
+        sensor_data = rules.merge_standardized_sensor_features_for_all_participants.output
+    params:
+        provider = lambda wildcards: config["ALL_CLEANING_OVERALL"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider_key = "{provider_key}",
+        script_extension = "{script_extension}",
+        sensor_key = "all_cleaning_overall"
+    output:
+        "data/processed/features/all_participants/z_all_sensor_features_cleaned_{provider_key}_{script_extension}.csv"
+    script:
+        "../src/features/entry.{params.script_extension}"
@@ -30,22 +30,43 @@ rule baseline_features:
 
 rule select_target:
     input:
-        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_rapids.csv"
+        cleaned_sensor_features = "data/processed/features/{pid}/z_all_sensor_features_cleaned_straw_py.csv"
     params:
         target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
     output:
-        "data/processed/models/individual_model/{pid}/input.csv"
+        "data/processed/models/individual_model/{pid}/z_input.csv"
     script:
         "../src/models/select_targets.py"
 
 rule merge_features_and_targets_for_population_model:
     input:
-        cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv",
+        cleaned_sensor_features = "data/processed/features/all_participants/z_all_sensor_features_cleaned_straw_py.csv",
         demographic_features = expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]),
     params:
         target_variable=config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
     output:
-        "data/processed/models/population_model/input.csv"
+        "data/processed/models/population_model/z_input.csv"
     script:
         "../src/models/merge_features_and_targets_for_population_model.py"
 
+# rule select_target:
+#     input:
+#         cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_straw_py.csv"
+#     params:
+#         target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
+#     output:
+#         "data/processed/models/individual_model/{pid}/input.csv"
+#     script:
+#         "../src/models/select_targets.py"
+
+# rule merge_features_and_targets_for_population_model:
+#     input:
+#         cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned_straw_py.csv",
+#         demographic_features = expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]),
+#     params:
+#         target_variable=config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
+#     output:
+#         "data/processed/models/population_model/input.csv"
+#     script:
+#         "../src/models/merge_features_and_targets_for_population_model.py"
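`select_target` now reads the standardized, straw-cleaned features and writes `z_input.csv`; `../src/models/select_targets.py` itself is not in this diff. A heavily hedged sketch of the step it performs, assuming the target simply appears as a feature column named after the configured label:

```python
# Hedged sketch only; the column naming is an assumption, not the repository's code.
import pandas as pd

def select_target(features_csv: str, label: str) -> pd.DataFrame:
    # label = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
    df = pd.read_csv(features_csv)
    return df.rename(columns={label: "target"})
```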
@@ -4,14 +4,14 @@ rule create_example_participant_files:
     shell:
         "echo 'PHONE:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n PLATFORMS: [android]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example01.yaml && echo 'PHONE:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n PLATFORMS: [ios]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example02.yaml"
 
-rule query_usernames_device_empatica_ids:
-    params:
-        baseline_folder = "/mnt/e/STRAWbaseline/"
-    output:
-        usernames_file = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"],
-        timezone_file = config["TIMEZONE"]["MULTIPLE"]["TZ_FILE"]
-    script:
-        "../../participants/prepare_usernames_file.py"
+# rule query_usernames_device_empatica_ids:
+#     params:
+#         baseline_folder = "/mnt/e/STRAWbaseline/"
+#     output:
+#         usernames_file = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"],
+#         timezone_file = config["TIMEZONE"]["MULTIPLE"]["TZ_FILE"]
+#     script:
+#         "../../participants/prepare_usernames_file.py"
 
 rule prepare_tzcodes_file:
     input:
@@ -58,7 +58,7 @@ participants %>%
       lines <- append(lines, empty_fitbit)
 
     if(add_empatica_section == TRUE && !is.na(row[empatica_device_id_column])){
-      lines <- append(lines, c("EMPATICA:", paste0(" DEVICE_IDS: [",row[empatica_device_id_column],"]"),
+      lines <- append(lines, c("EMPATICA:", paste0(" DEVICE_IDS: [",row$label,"]"),
                           paste(" LABEL:",row$label), paste(" START_DATE:", start_date), paste(" END_DATE:", end_date)))
     } else
       lines <- append(lines, empty_empatica)
@@ -2,11 +2,16 @@ from zipfile import ZipFile
 import warnings
 from pathlib import Path
 import pandas as pd
+import numpy as np
 from pandas.core import indexing
 import yaml
 import csv
 from collections import OrderedDict
+from io import BytesIO, StringIO
+import sys, os
 
+from cr_features.hrv import get_HRV_features, get_patched_ibi_with_bvp
+from cr_features.helper_functions import empatica1d_to_array, empatica2d_to_array
 
 def processAcceleration(x, y, z):
     x = float(x)
@@ -52,6 +57,8 @@ def extract_empatica_data(data, sensor):
         df = pd.DataFrame.from_dict(ddict, orient='index', columns=[column])
         df[column] = df[column].astype(float)
         df.index.name = 'timestamp'
+        if df.empty:
+            return df
 
     elif sensor == 'EMPATICA_ACCELEROMETER':
         ddict = readFile(sensor_data_file, sensor)
@@ -60,15 +67,22 @@ def extract_empatica_data(data, sensor):
         df['y'] = df['y'].astype(float)
         df['z'] = df['z'].astype(float)
         df.index.name = 'timestamp'
+        if df.empty:
+            return df
 
     elif sensor == 'EMPATICA_INTER_BEAT_INTERVAL':
-        df = pd.read_csv(sensor_data_file, names=['timestamp', column], header=None)
-
+        df = pd.read_csv(sensor_data_file, names=['timings', column], header=None)
+        df['timestamp'] = df['timings']
+        if df.empty:
+            df = df.set_index('timestamp')
+            return df
         timestampstart = float(df['timestamp'][0])
         df['timestamp'] = (df['timestamp'][1:len(df)]).astype(float) + timestampstart
         df = df.drop([0])
         df[column] = df[column].astype(float)
         df = df.set_index('timestamp')
 
     else:
         raise ValueError(
             "sensor has an invalid name: {}".format(sensor))
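The IBI branch above relies on Empatica's IBI.csv layout: the first row carries the session start as a Unix timestamp, and every following row an offset in seconds from that start, so adding the first value to the rest yields absolute timestamps. A toy illustration with invented numbers:

```python
# Toy illustration of the offset-to-epoch conversion above (numbers invented).
import pandas as pd

df = pd.DataFrame({"timings": [1587726600.0, 0.8, 1.6, 2.5],
                   "inter_beat_interval": [None, 0.8, 0.8, 0.9]})
df["timestamp"] = df["timings"]

timestampstart = float(df["timestamp"][0])  # session start, epoch seconds
df["timestamp"] = (df["timestamp"][1:len(df)]).astype(float) + timestampstart
df = df.drop([0]).set_index("timestamp")
print(df.index.tolist())  # [1587726600.8, 1587726601.6, 1587726602.5]
```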
@@ -84,6 +98,10 @@ def pull_data(data_configuration, device, sensor, container, columns_to_download):
     participant_data = pd.DataFrame(columns=columns_to_download.values())
     participant_data.set_index('timestamp', inplace=True)
 
+    with open('config.yaml', 'r') as stream:
+        config = yaml.load(stream, Loader=yaml.FullLoader)
+    cr_ibi_provider = config['EMPATICA_INTER_BEAT_INTERVAL']['PROVIDERS']['CR']
+
     available_zipfiles = list((Path(data_configuration["FOLDER"]) / Path(device)).rglob("*.zip"))
     if len(available_zipfiles) == 0:
         warnings.warn("There were no zip files in: {}. If you were expecting data for this participant the [EMPATICA][DEVICE_IDS] key in their participant file is missing the pid".format((Path(data_configuration["FOLDER"]) / Path(device))))
@@ -94,7 +112,13 @@ def pull_data(data_configuration, device, sensor, container, columns_to_download):
             listOfFileNames = zipFile.namelist()
             for fileName in listOfFileNames:
                 if fileName == sensor_csv:
-                    participant_data = pd.concat([participant_data, extract_empatica_data(zipFile.read(fileName), sensor)], axis=0)
+                    if sensor == "EMPATICA_INTER_BEAT_INTERVAL" and cr_ibi_provider.get('PATCH_WITH_BVP', False):
+                        participant_data = \
+                            pd.concat([participant_data, patch_ibi_with_bvp(zipFile.read('IBI.csv'), zipFile.read('BVP.csv'))], axis=0)
+                        #print("patch with ibi")
+                    else:
+                        participant_data = pd.concat([participant_data, extract_empatica_data(zipFile.read(fileName), sensor)], axis=0)
+                        #print("no patching")
                     warning = False
             if warning:
                 warnings.warn("We could not find a zipped file for {} in {} (we tried to find {})".format(sensor, zipFile, sensor_csv))
@@ -105,4 +129,53 @@ def pull_data(data_configuration, device, sensor, container, columns_to_download):
     participant_data["device_id"] = device
     return(participant_data)
 
+def patch_ibi_with_bvp(ibi_data, bvp_data):
+    ibi_data_file = BytesIO(ibi_data).getvalue().decode('utf-8')
+    ibi_data_file = StringIO(ibi_data_file)
+
+    # Begin with the cr-features part
+    try:
+        ibi_data, ibi_start_timestamp = empatica2d_to_array(ibi_data_file)
+    except IndexError as e:
+        # Checks whether IBI.csv is empty
+        df_test = pd.read_csv(ibi_data_file, names=['timings', 'inter_beat_interval'], header=None)
+        if df_test.empty:
+            df_test['timestamp'] = df_test['timings']
+            df_test = df_test.set_index('timestamp')
+            return df_test
+        else:
+            raise IndexError("Something went wrong with indices. Error that was previously caught:\n", repr(e))
+
+    bvp_data_file = BytesIO(bvp_data).getvalue().decode('utf-8')
+    bvp_data_file = StringIO(bvp_data_file)
+
+    bvp_data, bvp_start_timestamp, sample_rate = empatica1d_to_array(bvp_data_file)
+
+    hrv_time_and_freq_features, sample, bvp_rr, bvp_timings, peak_indx = \
+        get_HRV_features(bvp_data, ma=False,
+            detrend=False, m_deternd=False, low_pass=False, winsorize=True,
+            winsorize_value=25, hampel_fiter=False, median_filter=False,
+            mod_z_score_filter=True, sampling=64, feature_names=['meanHr'])
+
+    ibi_timings, ibi_rr = get_patched_ibi_with_bvp(ibi_data[0], ibi_data[1], bvp_timings, bvp_rr)
+
+    df = \
+        pd.DataFrame(np.array([ibi_timings, ibi_rr]).transpose(), columns=['timestamp', 'inter_beat_interval'])
+    df.loc[-1] = [ibi_start_timestamp, 'IBI']  # adding a row
+    df.index = df.index + 1  # shifting index
+    df = df.sort_index()  # sorting by index
+
+    # Repeated as in extract_empatica_data for IBI
+    df['timings'] = df['timestamp']
+    timestampstart = float(df['timestamp'][0])
+    df['timestamp'] = (df['timestamp'][1:len(df)]).astype(float) + timestampstart
+    df = df.drop([0])
+    df['inter_beat_interval'] = df['inter_beat_interval'].astype(float)
+    df = df.set_index('timestamp')
+
+    # format timestamps
+    df.index *= 1000
+    df.index = df.index.astype(int)
+    return(df)
+
+# print(pull_data({'FOLDER': 'data/external/empatica'}, "e01", "EMPATICA_accelerometer", {'TIMESTAMP': 'timestamp', 'DEVICE_ID': 'device_id', 'DOUBLE_VALUES_0': 'x', 'DOUBLE_VALUES_1': 'y', 'DOUBLE_VALUES_2': 'z'}))
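Usage matching the `PATCH_WITH_BVP` branch of `pull_data` above, with both raw byte payloads read straight from a session zip (the path is illustrative):

```python
from zipfile import ZipFile

with ZipFile("data/external/empatica/e01/session.zip") as zip_file:  # illustrative path
    patched_ibi = patch_ibi_with_bvp(zip_file.read("IBI.csv"), zip_file.read("BVP.csv"))
```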
@@ -50,6 +50,7 @@ EMPATICA_INTER_BEAT_INTERVAL:
         TIMESTAMP: timestamp
         DEVICE_ID: device_id
         INTER_BEAT_INTERVAL: inter_beat_interval
+        TIMINGS: timings
     MUTATION:
       COLUMN_MAPPINGS:
       SCRIPTS: # List any python or r scripts that mutate your raw data
@@ -227,6 +227,7 @@ EMPATICA_INTER_BEAT_INTERVAL:
     - TIMESTAMP
     - DEVICE_ID
     - INTER_BEAT_INTERVAL
+    - TIMINGS
 
 EMPATICA_TAGS:
     - TIMESTAMP
@@ -39,8 +39,10 @@ rapids_cleaning <- function(sensor_data_files, provider){
     if(!data_yield_column %in% colnames(clean_features)){
         stop(paste0("Error: RAPIDS provider needs to clean data based on ", data_yield_column, " column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded", data_yield_unit, "' in [FEATURES]."))
     }
-    clean_features <- clean_features %>%
-        filter(.[[data_yield_column]] >= data_yield_ratio_threshold)
+    if (data_yield_ratio_threshold > 0) {
+        clean_features <- clean_features %>%
+            filter(.[[data_yield_column]] >= data_yield_ratio_threshold)
+    }
 
     # Drop columns with a percentage of NA values above cols_nan_threshold
     if(nrow(clean_features))
@@ -0,0 +1,88 @@
import pandas as pd
import numpy as np
import math, sys

def straw_cleaning(sensor_data_files, provider):

    features = pd.read_csv(sensor_data_files["sensor_data"][0])

    # TODO: reorder the cleaning steps so it makes sense for the analysis
    # TODO: add conditions that differentiate cleaning steps for standardized and nonstandardized features; for this
    # the snakemake rules will also have to come with an additional parameter (in rules/features.smk)

    # Impute selected event features
    impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]
    if impute_phone_features["COMPUTE"]:
        if not 'phone_data_yield_rapids_ratiovalidyieldedminutes' in features.columns:
            raise KeyError("RAPIDS provider needs to impute the selected event features based on the phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")

        # TODO: if the type of the imputation will vary for different groups of features, make conditional imputations here
        phone_cols = [col for col in features if \
                        col.startswith('phone_applications_foreground_rapids_') or
                        col.startswith('phone_battery_rapids_') or
                        col.startswith('phone_calls_rapids_') or
                        col.startswith('phone_keyboard_rapids_') or
                        col.startswith('phone_messages_rapids_') or
                        col.startswith('phone_screen_rapids_') or
                        col.startswith('phone_wifi_')]

        mask = features['phone_data_yield_rapids_ratiovalidyieldedminutes'] > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE']
        features.loc[mask, phone_cols] = impute(features[mask][phone_cols], method=impute_phone_features["TYPE"].lower())

    # Drop rows where the value of data_yield_column is less than data_yield_ratio_threshold
    data_yield_unit = provider["DATA_YIELD_FEATURE"].split("_")[3].lower()
    data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + data_yield_unit

    if not data_yield_column in features.columns:
        raise KeyError(f"RAPIDS provider needs to impute the selected event features based on the {data_yield_column} column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{data_yield_unit}' in [FEATURES].")

    if provider["DATA_YIELD_RATIO_THRESHOLD"]:
        features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]]

    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm')]  # for later preservation of esm_cols

    # Remove cols if the threshold of NaN values is passed
    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]

    # Remove cols where the variance is 0
    if provider["COLS_VAR_THRESHOLD"]:
        features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)

    # Preserve esm cols if deleted (has to come after the drop cols operations)
    for esm in esm_cols:
        if esm not in features:
            features[esm] = esm_cols[esm]

    # Drop highly correlated features - To-Do: one more threshold var, which is in the config + how are NaNs treated?
    drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
    if drop_corr_features["COMPUTE"]:

        numerical_cols = features.select_dtypes(include=np.number).columns.tolist()

        # Remove columns where the NaN count threshold is passed
        valid_features = features[numerical_cols].loc[:, features[numerical_cols].isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * features[numerical_cols].shape[0]]

        cor_matrix = valid_features.corr(method='spearman').abs()
        upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool))
        to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > drop_corr_features["CORR_THRESHOLD"])]

        features.drop(to_drop, axis=1, inplace=True)

    # Remove rows if the threshold of NaN values is passed
    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1])  # minimal number of not-NaN values in a row
    features.dropna(axis=0, thresh=min_count, inplace=True)

    return features

def impute(df, method='zero'):

    def k_nearest(df):  # TODO: if needed, implement k-nearest imputation / interpolation
        pass

    return {  # the rest of the columns should be imputed with the selected method
        'zero': df.fillna(0),
        'mean': df.fillna(df.mean()),
        'median': df.fillna(df.median()),
        'k-nearest': k_nearest(df)
    }[method]
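One caveat in `impute` above (the same file is added again further down): building the dict evaluates every entry before `[method]` indexes it, so the mean and median fills run even when `'zero'` is requested, and `k_nearest(df)` silently contributes `None`. A sketch of a lazy dispatch that keeps the implemented methods' behaviour:

```python
# Sketch: dispatch lazily so only the requested imputation actually runs.
def impute(df, method='zero'):
    if method == 'zero':
        return df.fillna(0)
    if method == 'mean':
        return df.fillna(df.mean())
    if method == 'median':
        return df.fillna(df.median())
    if method == 'k-nearest':
        raise NotImplementedError("k-nearest imputation is still a TODO")
    raise ValueError(f"unknown imputation method: {method}")
```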
@@ -39,16 +39,18 @@ rapids_cleaning <- function(sensor_data_files, provider){
     if(!data_yield_column %in% colnames(clean_features)){
         stop(paste0("Error: RAPIDS provider needs to clean data based on ", data_yield_column, " column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded", data_yield_unit, "' in [FEATURES]."))
     }
-    clean_features <- clean_features %>%
-        filter(.[[data_yield_column]] >= data_yield_ratio_threshold)
+    if (data_yield_ratio_threshold > 0) {
+        clean_features <- clean_features %>%
+            filter(.[[data_yield_column]] >= data_yield_ratio_threshold)
+    }
 
     # Drop columns with a percentage of NA values above cols_nan_threshold
     if(nrow(clean_features))
-        clean_features <- clean_features %>% select_if(~ sum(is.na(.)) / length(.) <= cols_nan_threshold )
+        clean_features <- clean_features %>% select(where(~ sum(is.na(.)) / length(.) <= cols_nan_threshold ), starts_with("phone_esm"))
 
     # Drop columns with zero variance
     if(drop_zero_variance_columns)
-        clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
+        clean_features <- clean_features %>% select_if(grepl("pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime|phone_esm",names(.)) | sapply(., n_distinct, na.rm = T) > 1)
 
     # Drop highly correlated features
     if(as.logical(drop_highly_correlated_features$COMPUTE)){
@@ -0,0 +1,88 @@
import pandas as pd
import numpy as np
import math, sys

def straw_cleaning(sensor_data_files, provider):

    features = pd.read_csv(sensor_data_files["sensor_data"][0])

    # TODO: reorder the cleaning steps so it makes sense for the analysis
    # TODO: add conditions that differentiate cleaning steps for standardized and nonstandardized features; for this
    # the snakemake rules will also have to come with an additional parameter (in rules/features.smk)

    # Impute selected event features
    impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]
    if impute_phone_features["COMPUTE"]:
        if not 'phone_data_yield_rapids_ratiovalidyieldedminutes' in features.columns:
            raise KeyError("RAPIDS provider needs to impute the selected event features based on the phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")

        # TODO: if the type of the imputation will vary for different groups of features, make conditional imputations here
        phone_cols = [col for col in features if \
                        col.startswith('phone_applications_foreground_rapids_') or
                        col.startswith('phone_battery_rapids_') or
                        col.startswith('phone_calls_rapids_') or
                        col.startswith('phone_keyboard_rapids_') or
                        col.startswith('phone_messages_rapids_') or
                        col.startswith('phone_screen_rapids_') or
                        col.startswith('phone_wifi_')]

        mask = features['phone_data_yield_rapids_ratiovalidyieldedminutes'] > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE']
        features.loc[mask, phone_cols] = impute(features[mask][phone_cols], method=impute_phone_features["TYPE"].lower())

    # Drop rows where the value of data_yield_column is less than data_yield_ratio_threshold
    data_yield_unit = provider["DATA_YIELD_FEATURE"].split("_")[3].lower()
    data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + data_yield_unit

    if not data_yield_column in features.columns:
        raise KeyError(f"RAPIDS provider needs to impute the selected event features based on the {data_yield_column} column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{data_yield_unit}' in [FEATURES].")

    if provider["DATA_YIELD_RATIO_THRESHOLD"]:
        features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]]

    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm')]  # for later preservation of esm_cols

    # Remove cols if the threshold of NaN values is passed
    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]

    # Remove cols where the variance is 0
    if provider["COLS_VAR_THRESHOLD"]:
        features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)

    # Preserve esm cols if deleted (has to come after the drop cols operations)
    for esm in esm_cols:
        if esm not in features:
            features[esm] = esm_cols[esm]

    # Drop highly correlated features - To-Do: one more threshold var, which is in the config + how are NaNs treated?
    drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
    if drop_corr_features["COMPUTE"]:

        numerical_cols = features.select_dtypes(include=np.number).columns.tolist()

        # Remove columns where the NaN count threshold is passed
        valid_features = features[numerical_cols].loc[:, features[numerical_cols].isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * features[numerical_cols].shape[0]]

        cor_matrix = valid_features.corr(method='spearman').abs()
        upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool))
        to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > drop_corr_features["CORR_THRESHOLD"])]

        features.drop(to_drop, axis=1, inplace=True)

    # Remove rows if the threshold of NaN values is passed
    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1])  # minimal number of not-NaN values in a row
    features.dropna(axis=0, thresh=min_count, inplace=True)

    return features

def impute(df, method='zero'):

    def k_nearest(df):  # TODO: if needed, implement k-nearest imputation / interpolation
        pass

    return {  # the rest of the columns should be imputed with the selected method
        'zero': df.fillna(0),
        'mean': df.fillna(df.mean()),
        'median': df.fillna(df.median()),
        'k-nearest': k_nearest(df)
    }[method]
@@ -0,0 +1,59 @@
import pandas as pd
import numpy as np
import math as m

import sys

def extract_second_order_features(intraday_features, so_features_names, prefix=""):

    if prefix:
        groupby_cols = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
    else:
        groupby_cols = ['local_segment']

    if not intraday_features.empty:
        so_features = pd.DataFrame()
        #print(intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).nsmallest())
        if "mean" in so_features_names:
            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean().add_suffix("_SO_mean")], axis=1)

        if "median" in so_features_names:
            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median().add_suffix("_SO_median")], axis=1)

        if "sd" in so_features_names:
            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std().add_suffix("_SO_sd")], axis=1)

        if "nlargest" in so_features_names:  # largest 5 -- maybe there is a faster groupby solution?
            for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
                so_features[column+"_SO_nlargest"] = intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols)[column].apply(lambda x: x.nlargest(5).mean())

        if "nsmallest" in so_features_names:  # smallest 5 -- maybe there is a faster groupby solution?
            for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
                so_features[column+"_SO_nsmallest"] = intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols)[column].apply(lambda x: x.nsmallest(5).mean())

        if "count_windows" in so_features_names:
            so_features["SO_windowsCount"] = intraday_features.groupby(groupby_cols).count()[prefix+"level_1"]

        # numPeaksNonZero specialized for the EDA sensor
        if "eda_num_peaks_non_zero" in so_features_names and prefix+"numPeaks" in intraday_features.columns:
            so_features[prefix+"SO_numPeaksNonZero"] = intraday_features.groupby(groupby_cols)[prefix+"numPeaks"].apply(lambda x: (x!=0).sum())

        # numWindowsNonZero specialized for the BVP and IBI sensors
        if "hrv_num_windows_non_nan" in so_features_names and prefix+"meanHr" in intraday_features.columns:
            so_features[prefix+"SO_numWindowsNonNaN"] = intraday_features.groupby(groupby_cols)[prefix+"meanHr"].apply(lambda x: (~np.isnan(x)).sum())

        so_features.reset_index(inplace=True)

    else:
        so_features = pd.DataFrame(columns=groupby_cols)

    return so_features

def get_sample_rate(data):  # To-Do: get the sample rate information from the file's metadata
    try:
        timestamps_diff = data['timestamp'].diff().dropna().mean()
        print("Timestamp diff:", timestamps_diff)
    except:
        raise Exception("Error occurred while trying to get the mean sample rate from the data.")

    return m.ceil(1000/timestamps_diff)
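A toy call of `extract_second_order_features` above: first-order rows are windows sharing a `local_segment`, and each requested aggregate collapses them to one row per segment, using the `_SO_` suffix scheme from the function:

```python
import pandas as pd

intraday = pd.DataFrame({
    "local_segment": ["seg1"] * 3 + ["seg2"] * 2,
    "level_1": [0, 1, 2, 0, 1],          # window index within its segment
    "meanHr": [61.0, 64.0, 59.0, 72.0, 70.0],
})
so = extract_second_order_features(intraday, ["mean", "sd", "count_windows"])
print(so.columns.tolist())
# ['local_segment', 'meanHr_SO_mean', 'meanHr_SO_sd', 'SO_windowsCount']
```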
@@ -0,0 +1,71 @@
import pandas as pd
from scipy.stats import entropy

from cr_features.helper_functions import convert_to2d, accelerometer_features, frequency_features
from cr_features.calculate_features_old import calculateFeatures
from cr_features.calculate_features import calculate_features
from cr_features_helper_methods import extract_second_order_features

import sys

def extract_acc_features_from_intraday_data(acc_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    acc_intraday_features = pd.DataFrame(columns=["local_segment"] + features)

    if not acc_intraday_data.empty:
        sample_rate = 32

        acc_intraday_data = filter_data_by_segment(acc_intraday_data, time_segment)

        if not acc_intraday_data.empty:

            acc_intraday_features = pd.DataFrame()

            # apply methods from calculate features module
            if window_length is None:
                acc_intraday_features = \
                    acc_intraday_data.groupby('local_segment').apply(lambda x: calculate_features( \
                        convert_to2d(x['double_values_0'], x.shape[0]), \
                        convert_to2d(x['double_values_1'], x.shape[0]), \
                        convert_to2d(x['double_values_2'], x.shape[0]), \
                        fs=sample_rate, feature_names=features, show_progress=False))
            else:
                acc_intraday_features = \
                    acc_intraday_data.groupby('local_segment').apply(lambda x: calculate_features( \
                        convert_to2d(x['double_values_0'], window_length*sample_rate), \
                        convert_to2d(x['double_values_1'], window_length*sample_rate), \
                        convert_to2d(x['double_values_2'], window_length*sample_rate), \
                        fs=sample_rate, feature_names=features, show_progress=False))

            acc_intraday_features.reset_index(inplace=True)

    return acc_intraday_features


def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    acc_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_intraday_features = provider["FEATURES"]

    calc_windows = kwargs.get('calc_windows', False)

    if provider["WINDOWS"]["COMPUTE"] and calc_windows:
        requested_window_length = provider["WINDOWS"]["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # name of the features this function can compute
    base_intraday_features_names = accelerometer_features + frequency_features
    # the subset of requested features this function can compute
    intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names))

    # extract features from intraday data
    acc_intraday_features = extract_acc_features_from_intraday_data(acc_intraday_data, intraday_features_to_compute,
                                                                    requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        acc_second_order_features = extract_second_order_features(acc_intraday_features, so_features_names)
        return acc_intraday_features, acc_second_order_features

    return acc_intraday_features
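`convert_to2d` (from `cr_features.helper_functions`, used above) is what turns the window logic on: with `window_length*sample_rate` samples per row, each row becomes one window for `calculate_features`, while `x.shape[0]` collapses the whole segment into a single window. A NumPy-only sketch of that reshaping idea; the NaN padding of the last partial window is an assumption about the real helper:

```python
import numpy as np

def to_windows(signal, samples_per_window):
    # Reshape a 1-D signal into rows of samples_per_window samples,
    # NaN-padding the tail so the reshape fits (padding is an assumption).
    signal = np.asarray(signal, dtype=float)
    n_windows = int(np.ceil(len(signal) / samples_per_window))
    padded = np.full(n_windows * samples_per_window, np.nan)
    padded[:len(signal)] = signal
    return padded.reshape(n_windows, samples_per_window)

acc_x = np.arange(100)                # ~3 s of 32 Hz accelerometer samples
windows = to_windows(acc_x, 15 * 32)  # 15 s windows at fs=32
```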
@@ -0,0 +1,73 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler

from cr_features.helper_functions import convert_to2d, hrv_features
from cr_features.hrv import extract_hrv_features_2d_wrapper
from cr_features_helper_methods import extract_second_order_features

import sys

# pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)

def extract_bvp_features_from_intraday_data(bvp_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    bvp_intraday_features = pd.DataFrame(columns=["local_segment"] + features)

    if not bvp_intraday_data.empty:
        sample_rate = 64

        bvp_intraday_data = filter_data_by_segment(bvp_intraday_data, time_segment)

        if not bvp_intraday_data.empty:

            bvp_intraday_features = pd.DataFrame()

            # apply methods from calculate features module
            if window_length is None:
                bvp_intraday_features = \
                    bvp_intraday_data.groupby('local_segment').apply(\
                        lambda x:
                            extract_hrv_features_2d_wrapper(
                                convert_to2d(x['blood_volume_pulse'], x.shape[0]),
                                sampling=sample_rate, hampel_fiter=False, median_filter=False, mod_z_score_filter=True, feature_names=features))
            else:
                bvp_intraday_features = \
                    bvp_intraday_data.groupby('local_segment').apply(\
                        lambda x:
                            extract_hrv_features_2d_wrapper(
                                convert_to2d(x['blood_volume_pulse'], window_length*sample_rate),
                                sampling=sample_rate, hampel_fiter=False, median_filter=False, mod_z_score_filter=True, feature_names=features))

            bvp_intraday_features.reset_index(inplace=True)

    return bvp_intraday_features


def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    bvp_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_intraday_features = provider["FEATURES"]

    calc_windows = kwargs.get('calc_windows', False)

    if provider["WINDOWS"]["COMPUTE"] and calc_windows:
        requested_window_length = provider["WINDOWS"]["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # name of the features this function can compute
    base_intraday_features_names = hrv_features
    # the subset of requested features this function can compute
    intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names))

    # extract features from intraday data
    bvp_intraday_features = extract_bvp_features_from_intraday_data(bvp_intraday_data, intraday_features_to_compute,
                                                                    requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        bvp_second_order_features = extract_second_order_features(bvp_intraday_features, so_features_names)
        return bvp_intraday_features, bvp_second_order_features

    return bvp_intraday_features
@@ -0,0 +1,78 @@
import pandas as pd
import numpy as np
from scipy.stats import entropy

from cr_features.helper_functions import convert_to2d, gsr_features
from cr_features.calculate_features import calculate_features
from cr_features.gsr import extractGsrFeatures2D
from cr_features_helper_methods import extract_second_order_features

import sys

#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
#np.seterr(invalid='ignore')


def extract_eda_features_from_intraday_data(eda_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    eda_intraday_features = pd.DataFrame(columns=["local_segment"] + features)

    if not eda_intraday_data.empty:
        sample_rate = 4

        eda_intraday_data = filter_data_by_segment(eda_intraday_data, time_segment)

        if not eda_intraday_data.empty:

            eda_intraday_features = pd.DataFrame()

            # apply methods from calculate features module
            if window_length is None:
                eda_intraday_features = \
                    eda_intraday_data.groupby('local_segment').apply(\
                        lambda x: extractGsrFeatures2D(convert_to2d(x['electrodermal_activity'], x.shape[0]), sampleRate=sample_rate, featureNames=features,
                                                       threshold=.01, offset=1, riseTime=5, decayTime=15))
            else:
                eda_intraday_features = \
                    eda_intraday_data.groupby('local_segment').apply(\
                        lambda x: extractGsrFeatures2D(convert_to2d(x['electrodermal_activity'], window_length*sample_rate), sampleRate=sample_rate, featureNames=features,
                                                       threshold=.01, offset=1, riseTime=5, decayTime=15))

            eda_intraday_features.reset_index(inplace=True)

    return eda_intraday_features


def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    eda_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_intraday_features = provider["FEATURES"]

    calc_windows = kwargs.get('calc_windows', False)

    if provider["WINDOWS"]["COMPUTE"] and calc_windows:
        requested_window_length = provider["WINDOWS"]["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # name of the features this function can compute
    base_intraday_features_names = gsr_features
    # the subset of requested features this function can compute
    intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names))

    # extract features from intraday data
    eda_intraday_features = extract_eda_features_from_intraday_data(eda_intraday_data, intraday_features_to_compute,
                                                                    requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        if provider["WINDOWS"]["IMPUTE_NANS"]:
            eda_intraday_features[eda_intraday_features["numPeaks"] == 0] = \
                eda_intraday_features[eda_intraday_features["numPeaks"] == 0].fillna(0)
        pd.set_option('display.max_columns', None)

        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        eda_second_order_features = extract_second_order_features(eda_intraday_features, so_features_names)

        return eda_intraday_features, eda_second_order_features

    return eda_intraday_features
@ -0,0 +1,79 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

from cr_features.helper_functions import convert_ibi_to2d_time, hrv_features
from cr_features.hrv import extract_hrv_features_2d_wrapper, get_HRV_features
from cr_features_helper_methods import extract_second_order_features

import math
import sys

# pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)


def extract_ibi_features_from_intraday_data(ibi_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    ibi_intraday_features = pd.DataFrame(columns=["local_segment"] + features)

    if not ibi_intraday_data.empty:
        ibi_intraday_data = filter_data_by_segment(ibi_intraday_data, time_segment)

        if not ibi_intraday_data.empty:
            ibi_intraday_features = pd.DataFrame()

            # apply methods from the calculate features module
            if window_length is None:
                ibi_intraday_features = \
                    ibi_intraday_data.groupby('local_segment').apply(\
                        lambda x:
                            extract_hrv_features_2d_wrapper(
                                signal_2D = \
                                    convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], math.ceil(x['timings'].iloc[-1]))[0],
                                ibi_timings = \
                                    convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], math.ceil(x['timings'].iloc[-1]))[1],
                                sampling=None, hampel_fiter=False, median_filter=False, mod_z_score_filter=True, feature_names=features))
            else:
                ibi_intraday_features = \
                    ibi_intraday_data.groupby('local_segment').apply(\
                        lambda x:
                            extract_hrv_features_2d_wrapper(
                                signal_2D = convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], window_length)[0],
                                ibi_timings = convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], window_length)[1],
                                sampling=None, hampel_fiter=False, median_filter=False, mod_z_score_filter=True, feature_names=features))

        ibi_intraday_features.reset_index(inplace=True)

    return ibi_intraday_features


def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    ibi_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_intraday_features = provider["FEATURES"]

    calc_windows = kwargs.get('calc_windows', False)

    if provider["WINDOWS"]["COMPUTE"] and calc_windows:
        requested_window_length = provider["WINDOWS"]["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # names of the features this function can compute
    base_intraday_features_names = hrv_features
    # the subset of requested features this function can compute
    intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names))

    # extract features from intraday data
    ibi_intraday_features = extract_ibi_features_from_intraday_data(ibi_intraday_data, intraday_features_to_compute,
                                                                    requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        ibi_second_order_features = extract_second_order_features(ibi_intraday_features, so_features_names)

        return ibi_intraday_features, ibi_second_order_features

    return ibi_intraday_features
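One possible tidy-up, sketched here as a hypothetical refactor rather than part of the diff: the lambdas above call `convert_ibi_to2d_time` twice per segment, once for each element of the returned pair, and a named helper could convert once. The cr_features signatures are taken on faith from the imports above.

```
import math

def hrv_features_for_segment(x, features, window_length=None):
    # hypothetical helper: perform the 2-D conversion once per segment
    duration = window_length if window_length is not None else math.ceil(x['timings'].iloc[-1])
    converted = convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], duration)
    return extract_hrv_features_2d_wrapper(
        signal_2D=converted[0], ibi_timings=converted[1],
        sampling=None, hampel_fiter=False, median_filter=False,
        mod_z_score_filter=True, feature_names=features)
```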
@ -0,0 +1,65 @@
import pandas as pd
from scipy.stats import entropy

from cr_features.helper_functions import convert_to2d, generic_features
from cr_features.calculate_features_old import calculateFeatures
from cr_features.calculate_features import calculate_features
from cr_features_helper_methods import extract_second_order_features

import sys

def extract_temp_features_from_intraday_data(temperature_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    temperature_intraday_features = pd.DataFrame(columns=["local_segment"] + features)

    if not temperature_intraday_data.empty:
        sample_rate = 4

        temperature_intraday_data = filter_data_by_segment(temperature_intraday_data, time_segment)

        if not temperature_intraday_data.empty:
            temperature_intraday_features = pd.DataFrame()

            # apply methods from the calculate features module
            if window_length is None:
                temperature_intraday_features = \
                    temperature_intraday_data.groupby('local_segment').apply(\
                        lambda x: calculate_features(convert_to2d(x['temperature'], x.shape[0]), fs=sample_rate, feature_names=features, show_progress=False))
            else:
                temperature_intraday_features = \
                    temperature_intraday_data.groupby('local_segment').apply(\
                        lambda x: calculate_features(convert_to2d(x['temperature'], window_length*sample_rate), fs=sample_rate, feature_names=features, show_progress=False))

        temperature_intraday_features.reset_index(inplace=True)

    return temperature_intraday_features


def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    temperature_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_intraday_features = provider["FEATURES"]

    calc_windows = kwargs.get('calc_windows', False)

    if provider["WINDOWS"]["COMPUTE"] and calc_windows:
        requested_window_length = provider["WINDOWS"]["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # names of the features this function can compute
    base_intraday_features_names = generic_features
    # the subset of requested features this function can compute
    intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names))

    # extract features from intraday data
    temperature_intraday_features = extract_temp_features_from_intraday_data(temperature_intraday_data, intraday_features_to_compute,
                                                                             requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        temperature_second_order_features = extract_second_order_features(temperature_intraday_features, so_features_names)
        return temperature_intraday_features, temperature_second_order_features

    return temperature_intraday_features
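Note that `convert_to2d` takes its window size in samples, not seconds: the Empatica temperature stream is sampled at 4 Hz, so a `WINDOW_LENGTH` of, say, 300 s (an illustrative value, not one taken from this diff) maps to rows of 1200 samples.

```
sample_rate = 4      # Hz, Empatica temperature stream
window_length = 300  # seconds, hypothetical WINDOW_LENGTH from config
print(window_length * sample_rate)  # 1200 samples per window row
```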
@ -1,12 +1,16 @@
import pandas as pd
from utils.utils import fetch_provider_features, run_provider_cleaning_script

import sys

sensor_data_files = dict(snakemake.input)

provider = snakemake.params["provider"]
provider_key = snakemake.params["provider_key"]
sensor_key = snakemake.params["sensor_key"]

calc_windows = True if (provider.get("WINDOWS", False) and provider["WINDOWS"].get("COMPUTE", False)) else False

if sensor_key == "all_cleaning_individual" or sensor_key == "all_cleaning_overall":
    # Data cleaning
    sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
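A small readability note: the `True if (...) else False` pattern above is redundant, since the guarded expression is already a truth test. An equivalent, shorter sketch:

```
# equivalent form of the calc_windows flag
calc_windows = bool(provider.get("WINDOWS", {}) and provider["WINDOWS"].get("COMPUTE", False))
```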
@ -14,6 +18,18 @@ else:
    # Extract sensor features
    del sensor_data_files["time_segments_labels"]
    time_segments_file = snakemake.input["time_segments_labels"]
    sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)

sensor_features.to_csv(snakemake.output[0], index=False)

    if calc_windows:
        window_features, second_order_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=True)

        window_features.to_csv(snakemake.output[1], index=False)
        second_order_features.to_csv(snakemake.output[0], index=False)

    elif "empatica" in sensor_key:
        pd.DataFrame().to_csv(snakemake.output[1], index=False)

    if not calc_windows:
        sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=False)

if not calc_windows:
    sensor_features.to_csv(snakemake.output[0], index=False)
@ -3,9 +3,11 @@ library(tidyr)
library(readr)

compute_data_yield_features <- function(data, feature_name, time_segment, provider){

    data <- data %>% filter_data_by_segment(time_segment)
    if(nrow(data) == 0)
    if(nrow(data) == 0){
        return(tibble(local_segment = character(), ratiovalidyieldedminutes = numeric(), ratiovalidyieldedhours = numeric()))
    }
    features <- data %>%
        separate(timestamps_segment, into = c("start_timestamp", "end_timestamp"), convert = T, sep = ",") %>%
        mutate(duration_minutes = (end_timestamp - start_timestamp) / 60000,
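For context on the arithmetic in the `mutate` above: segment timestamps are epoch milliseconds, so the difference divided by 60000 gives the segment duration in minutes. A quick check of the conversion (in Python for consistency with the other examples; the timestamps are made up):

```
# 900 000 ms between the two timestamps -> 15 minutes
start_timestamp, end_timestamp = 1_600_000_000_000, 1_600_000_900_000
duration_minutes = (end_timestamp - start_timestamp) / 60000
print(duration_minutes)  # 15.0
```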
@ -0,0 +1,50 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

import sys

sensor_data_files = dict(snakemake.input)

provider = snakemake.params["provider"]
provider_key = snakemake.params["provider_key"]
sensor_key = snakemake.params["sensor_key"]

pd.set_option('display.max_columns', None)

if provider_key == "cr":
    sys.path.append('/rapids/src/features/')
    from cr_features_helper_methods import extract_second_order_features

    provider_main = snakemake.params["provider_main"]
    prefix = sensor_key + "_" + provider_key + "_"

    windows_features_data = pd.read_csv(sensor_data_files["windows_features_data"])
    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime', prefix + "level_1"]

    if windows_features_data.empty:
        windows_features_data.to_csv(snakemake.output[1], index=False)
        windows_features_data.to_csv(snakemake.output[0], index=False)
    else:
        windows_features_data.loc[:, ~windows_features_data.columns.isin(excluded_columns)] = StandardScaler().fit_transform(windows_features_data.loc[:, ~windows_features_data.columns.isin(excluded_columns)])

        windows_features_data.to_csv(snakemake.output[1], index=False)

        if provider_main["WINDOWS"]["COMPUTE"] and "SECOND_ORDER_FEATURES" in provider_main["WINDOWS"]:
            so_features_names = provider_main["WINDOWS"]["SECOND_ORDER_FEATURES"]
            windows_so_features_data = extract_second_order_features(windows_features_data, so_features_names, prefix)
            windows_so_features_data.to_csv(snakemake.output[0], index=False)
        else:
            pd.DataFrame().to_csv(snakemake.output[0], index=False)

else:
    for sensor_features in sensor_data_files["sensor_features"]:
        if "/" + sensor_key + ".csv" in sensor_features:
            sensor_data = pd.read_csv(sensor_features)
            excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']

            if not sensor_data.empty:
                sensor_data.loc[:, ~sensor_data.columns.isin(excluded_columns)] = StandardScaler().fit_transform(sensor_data.loc[:, ~sensor_data.columns.isin(excluded_columns)])

            sensor_data.to_csv(snakemake.output[0], index=False)
            break
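What the standardization step does: `StandardScaler` z-scores every non-excluded column (subtract the column mean, divide by the column standard deviation), while the segment bookkeeping columns are written back untouched. A self-contained illustration with toy data, not taken from RAPIDS:

```
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"local_segment": ["a", "b", "c"], "f1": [1.0, 2.0, 3.0]})
excluded_columns = ["local_segment"]
mask = ~df.columns.isin(excluded_columns)
df.loc[:, mask] = StandardScaler().fit_transform(df.loc[:, mask])
print(df)  # f1 becomes roughly [-1.22, 0.00, 1.22]; local_segment is unchanged
```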
@ -0,0 +1,17 @@
source("renv/activate.R")

library(tidyr)
library(purrr)
library("dplyr", warn.conflicts = F)
library(stringr)

feature_files <- snakemake@input[["feature_files"]]

features_of_all_participants <- tibble(filename = feature_files) %>% # create a data frame
    mutate(file_contents = map(filename, ~ read.csv(., stringsAsFactors = F, colClasses = c(local_segment = "character", local_segment_label = "character", local_segment_start_datetime="character", local_segment_end_datetime="character"))),
           pid = str_match(filename, ".*/(.*)/z_all_sensor_features.csv")[,2]) %>%
    unnest(cols = c(file_contents)) %>%
    select(-filename)

write.csv(features_of_all_participants, snakemake@output[[1]], row.names = FALSE)
@ -88,11 +88,13 @@ def chunk_episodes(sensor_episodes):
    return merged_sensor_episodes

def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file):
def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=False):
    import pandas as pd
    from importlib import import_module, util

    sensor_features = pd.DataFrame(columns=["local_segment"])
    sensor_fo_features = pd.DataFrame(columns=["local_segment"])
    sensor_so_features = pd.DataFrame(columns=["local_segment"])
    time_segments_labels = pd.read_csv(time_segments_file, header=0)
    if "FEATURES" not in provider:
        raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key.upper()))
@ -106,23 +108,57 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
        time_segments_labels["label"] = [""]
    for time_segment in time_segments_labels["label"]:
        print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment))
        features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes)
        if not "local_segment" in features.columns:
            raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check that the provider script uses that function and does not drop 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ").\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance, e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
        features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_" + provider_key + "_"), col) for col in features.columns]
        sensor_features = pd.concat([sensor_features, features], axis=0, sort=False)

        features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes, calc_windows=calc_windows)

        # when calc_windows is True, the provider returns a (first-order, second-order) tuple of feature dataframes
        if isinstance(features, tuple):
            if not "local_segment" in features[0].columns or not "local_segment" in features[1].columns:
                raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check that the provider script uses that function and does not drop 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ").\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance, e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
            features[0].columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_" + provider_key + "_"), col) for col in features[0].columns]
            features[1].columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_" + provider_key + "_"), col) for col in features[1].columns]
            if not features[0].empty:
                sensor_fo_features = pd.concat([sensor_fo_features, features[0]], axis=0, sort=False)
            if not features[1].empty:
                sensor_so_features = pd.concat([sensor_so_features, features[1]], axis=0, sort=False)
        else:
            if not "local_segment" in features.columns:
                raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check that the provider script uses that function and does not drop 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ").\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance, e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
            features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_" + provider_key + "_"), col) for col in features.columns]
            sensor_features = pd.concat([sensor_features, features], axis=0, sort=False)
    else:
        for feature in provider["FEATURES"]:
            sensor_features[feature] = None
    segment_columns = pd.DataFrame()
    sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '')
    split_segment_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
    new_segment_columns = split_segment_columns.iloc[:,1:4] if split_segment_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
    segment_columns[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
    for i in range(segment_columns.shape[1]):
        sensor_features.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]])

    if calc_windows:
        segment_columns = pd.DataFrame()
        sensor_fo_features['local_segment'] = sensor_fo_features['local_segment'].str.replace(r'_RR\d+SS', '')
        split_segment_columns = sensor_fo_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
        new_segment_columns = split_segment_columns.iloc[:,1:4] if split_segment_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
        segment_columns[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
        for i in range(segment_columns.shape[1]):
            sensor_fo_features.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]])

        segment_columns = pd.DataFrame()
        sensor_so_features['local_segment'] = sensor_so_features['local_segment'].str.replace(r'_RR\d+SS', '')
        split_segment_columns = sensor_so_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
        new_segment_columns = split_segment_columns.iloc[:,1:4] if split_segment_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
        segment_columns[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
        for i in range(segment_columns.shape[1]):
            sensor_so_features.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]])

    return sensor_features
        return sensor_fo_features, sensor_so_features

    else:
        segment_columns = pd.DataFrame()
        sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '')
        split_segment_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
        new_segment_columns = split_segment_columns.iloc[:,1:4] if split_segment_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
        segment_columns[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
        for i in range(segment_columns.shape[1]):
            sensor_features.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]])

        return sensor_features

def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files):
    from importlib import import_module, util
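The segment-parsing blocks above lean on a behaviour of `Series.str.split` worth spelling out: when the pattern contains capture groups it acts like `re.split`, returning the captured groups as columns, with empty strings before and after a full match. That is why the code checks for exactly 5 columns and keeps columns 1 to 3. A small demonstration with a made-up segment label:

```
import pandas as pd

s = pd.Series(["daily#2020-01-01 00:00:00,2020-01-01 23:59:59"])
print(s.str.split(pat="(.*)#(.*),(.*)", expand=True))
# columns 0 and 4 are empty; columns 1-3 hold the label,
# start datetime and end datetime used above
```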
@ -132,4 +168,4 @@ def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data
    cleaning_function = getattr(cleaning_module, provider_key.lower() + "_cleaning")
    sensor_features = cleaning_function(sensor_data_files, provider)

    return sensor_features
    return sensor_features
@ -10,7 +10,7 @@ def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
    if all(~target_variable_index):
        raise ValueError("The requested target (", target_variable_name,
                         ") cannot be found in the dataset.",
                         "Please check the names of phone_esm_ columns in all_sensor_features_cleaned_rapids.csv")
                         "Please check the names of phone_esm_ columns in z_all_sensor_features_cleaned_straw_py.csv")
    sensor_features_plus_target = df_input.drop(esm_names, axis=1)
    sensor_features_plus_target["target"] = df_input[esm_names[target_variable_index]]
    # We will only keep one column related to phone_esm, and that will be our target variable.
@ -0,0 +1,39 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


participant = "p031"
all_sensors = ["eda", "bvp", "ibi", "temp", "acc"]

for sensor in all_sensors:

    if sensor == "eda":
        path = f"/rapids/data/interim/{participant}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_cr_windows.csv"
    elif sensor == "bvp":
        path = f"/rapids/data/interim/{participant}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_cr_windows.csv"
    elif sensor == "ibi":
        path = f"/rapids/data/interim/{participant}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_cr_windows.csv"
    elif sensor == "acc":
        path = f"/rapids/data/interim/{participant}/empatica_accelerometer_features/empatica_accelerometer_python_cr_windows.csv"
    elif sensor == "temp":
        path = f"/rapids/data/interim/{participant}/empatica_temperature_features/empatica_temperature_python_cr_windows.csv"
    else:
        path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv"  # all features, all participants

    df = pd.read_csv(path)
    print(df)
    is_NaN = df.isnull()
    row_has_NaN = is_NaN.any(axis=1)
    rows_with_NaN = df[row_has_NaN]

    print("All rows:", len(df.index))
    print("\nRows with NaN:", len(rows_with_NaN.index))
    print("\nDf mean:")
    print(df.mean())

    # start a fresh figure per sensor so heatmaps do not draw on top of each other
    plt.figure()
    sns.heatmap(df.isna(), cbar=False)
    plt.savefig(f'{sensor}_{participant}_windows_NaN.png', bbox_inches='tight')
    plt.close()
@ -0,0 +1,23 @@
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os, sys

participant = "p032"

folder = f"/rapids/data/processed/features/{participant}/"
for filename in os.listdir(folder):
    if filename.startswith("phone_"):
        df = pd.read_csv(f"{folder}{filename}")
        plt.figure()
        sns.heatmap(df[[col for col in df if col.startswith('phone_')]], cbar=True)
        plt.savefig(f'{participant}_{filename}.png', bbox_inches='tight')
        plt.close()

        plt.figure()
        sns.heatmap(df[[col for col in df if col.startswith('phone_')]].isna(), cbar=True)
        plt.savefig(f'is_na_{participant}_{filename}.png', bbox_inches='tight')
        plt.close()
@ -0,0 +1,48 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import compress


participant = "p031"
sensor = "eda"

if sensor == "eda":
    path = f"/rapids/data/interim/{participant}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_cr_windows.csv"
elif sensor == "bvp":
    path = f"/rapids/data/interim/{participant}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_cr_windows.csv"
elif sensor == "ibi":
    path = f"/rapids/data/interim/{participant}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_cr_windows.csv"
elif sensor == "acc":
    path = f"/rapids/data/interim/{participant}/empatica_accelerometer_features/empatica_accelerometer_python_cr_windows.csv"
elif sensor == "temp":
    path = f"/rapids/data/interim/{participant}/empatica_temperature_features/empatica_temperature_python_cr_windows.csv"
else:
    path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv"  # all features, all participants

df = pd.read_csv(path)
# windows with no detected EDA peaks, and the columns that are NaN in them
df_num_peaks_zero = df[df["empatica_electrodermal_activity_cr_numPeaks"] == 0]
columns_num_peaks_zero = df_num_peaks_zero.columns[df_num_peaks_zero.isna().any()].tolist()

df_num_peaks_non_zero = df[df["empatica_electrodermal_activity_cr_numPeaks"] != 0]
df_num_peaks_non_zero = df_num_peaks_non_zero[columns_num_peaks_zero]

pd.set_option('display.max_columns', None)

# code each value as -1 (negative), 0 (zero) or 1 (positive) for the heatmap
df_q = pd.DataFrame()
for col in df_num_peaks_non_zero:
    df_q[col] = pd.to_numeric(pd.cut(df_num_peaks_non_zero[col], bins=[-1,0,0.000000000001,1000], labels=[-1,0,1], right=False))

sns.heatmap(df_q)
plt.savefig(f'eda_{participant}_window_non_zero_peak_other_vals.png', bbox_inches='tight')
plt.close()

# filter columns that do not contain 0
non_zero_cols = list(compress(columns_num_peaks_zero, df_num_peaks_non_zero.all().tolist()))
zero_cols = list(set(columns_num_peaks_zero) - set(non_zero_cols))

print(non_zero_cols, "\n")
print(zero_cols)
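For clarity on the ternary coding used above: `pd.cut` with `right=False` assigns each value to one of the half-open bins [-1, 0), [0, 1e-12) and [1e-12, 1000), labelled -1, 0 and 1 respectively, so the heatmap distinguishes negative, zero and positive feature values (anything outside [-1, 1000) becomes NaN). A minimal check:

```
import pandas as pd

vals = pd.Series([-0.5, 0.0, 3.2])
coded = pd.to_numeric(pd.cut(vals, bins=[-1, 0, 0.000000000001, 1000],
                             labels=[-1, 0, 1], right=False))
print(coded.tolist())  # [-1, 0, 1]
```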