Merge branch 'empatica_calculating_features'
commit 887fd7dc72
.gitignore
@@ -93,6 +93,7 @@ packrat/*

# exclude data from source control by default
data/external/*
!/data/external/empatica/empatica1/E4 Data.zip
!/data/external/.gitkeep
!/data/external/stachl_application_genre_catalogue.csv
!/data/external/timesegments*.csv
@@ -113,4 +114,10 @@ sn_profile_*/
settings.dcf
tests/fakedata_generation/
site/
credentials.yaml
!credentials.yaml

# Docker container and other files
.devcontainer

# Calculating features module
calculatingfeatures/

README.md (19 changes)
@@ -11,3 +11,22 @@
For more information refer to our [documentation](http://www.rapids.science)

By [MoSHI](https://www.moshi.pitt.edu/), [University of Pittsburgh](https://www.pitt.edu/)

## Installation

For RAPIDS installation, refer to the [documentation](https://www.rapids.science/1.8/setup/installation/)

## CalculatingFeatures

This RAPIDS extension uses the CalculatingFeatures library, accessible [here](https://repo.ijs.si/matjazbostic/calculatingfeatures).

To use the CalculatingFeatures library:

- Follow the installation instructions in the [README.md](https://repo.ijs.si/matjazbostic/calculatingfeatures/-/blob/master/README.md).

- Copy the built calculatingfeatures folder into the RAPIDS workspace.

- Install the CalculatingFeatures package by:

```
pip install "path/to/the/calculatingfeatures/folder"
```

The CalculatingFeatures package has to be rebuilt and reinstalled every time to get the newest version.

Snakefile (15 changes)
@@ -5,7 +5,6 @@ include: "rules/common.smk"
include: "rules/renv.smk"
include: "rules/preprocessing.smk"
include: "rules/features.smk"
include: "rules/models.smk"
include: "rules/reports.smk"

import itertools

@@ -328,6 +327,8 @@ for provider in config["EMPATICA_ACCELEROMETER"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/empatica_accelerometer.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"]:
            files_to_compute.extend(expand("data/interim/{pid}/empatica_accelerometer_features/z_empatica_accelerometer_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))

for provider in config["EMPATICA_HEARTRATE"]["PROVIDERS"].keys():
    if config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -347,6 +348,8 @@ for provider in config["EMPATICA_TEMPERATURE"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/empatica_temperature.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"]:
            files_to_compute.extend(expand("data/interim/{pid}/empatica_temperature_features/z_empatica_temperature_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))

for provider in config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"].keys():
    if config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -356,6 +359,8 @@ for provider in config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/empatica_electrodermal_activity.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"]:
            files_to_compute.extend(expand("data/interim/{pid}/empatica_electrodermal_activity_features/z_empatica_electrodermal_activity_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))

for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys():
    if config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -365,6 +370,9 @@ for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/empatica_blood_volume_pulse.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"]:
            files_to_compute.extend(expand("data/interim/{pid}/empatica_blood_volume_pulse_features/z_empatica_blood_volume_pulse_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))


for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
    if config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["COMPUTE"]:

@@ -374,6 +382,8 @@ for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/empatica_inter_beat_interval.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
        if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"]:
            files_to_compute.extend(expand("data/interim/{pid}/empatica_inter_beat_interval_features/z_empatica_inter_beat_interval_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))

if isinstance(config["EMPATICA_TAGS"]["PROVIDERS"], dict):
    for provider in config["EMPATICA_TAGS"]["PROVIDERS"].keys():

@@ -426,6 +436,9 @@ if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
    files_to_compute.extend(expand("data/processed/models/population_model/input.csv"))

    #files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
# Put the for loop over STANDARDIZATION providers if all are COMPUTE == True
# then merge all that are set to True in z_all_sensors for all and each participant
# See the logic behind: in each sensor the "data/processed/features/all_participants/all_sensor_features.csv" is listed
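
The note above is still an open TODO; a minimal sketch of what that merge loop could look like (hypothetical, not part of this commit; the z_all_sensor_features paths and the all-providers check are assumptions):

```
if all(p["COMPUTE"] for p in config["STANDARDIZATION"]["PROVIDERS"].values()):
    # request merged z-scored features per participant and for the whole study
    files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
    files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
```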

rule all:
    input:

config.yaml (136 changes)
@@ -3,36 +3,34 @@
########################################################################################################################

# See https://www.rapids.science/latest/setup/configuration/#participant-files
PIDS: ['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107']
PIDS: [p031] #p01, p02, p03]

# See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files
CREATE_PARTICIPANT_FILES:
  USERNAMES_CSV: "data/external/main_study_usernames.csv"
  CSV_FILE_PATH: "data/external/main_study_participants.csv" # see docs for required format
  CSV_FILE_PATH: "data/external/example_participants.csv" # see docs for required format
  PHONE_SECTION:
    ADD: True
    IGNORED_DEVICE_IDS: []
  FITBIT_SECTION:
    ADD: False
    ADD: True
    IGNORED_DEVICE_IDS: []
  EMPATICA_SECTION:
    ADD: False
    ADD: True
    IGNORED_DEVICE_IDS: []

# See https://www.rapids.science/latest/setup/configuration/#time-segments
TIME_SEGMENTS: &time_segments
  TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
  FILE: "data/external/timesegments_daily.csv"
  INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
  FILE: "data/external/timesegments_periodic.csv"
  INCLUDE_PAST_PERIODIC_SEGMENTS: FALSE # Only relevant if TYPE=PERIODIC, see docs

# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
TIMEZONE:
  TYPE: MULTIPLE
  TYPE: SINGLE
  SINGLE:
    TZCODE: Europe/Ljubljana
  MULTIPLE:
    TZ_FILE: data/external/timezone.csv
    TZCODES_FILE: data/external/multiple_timezones.csv
    TZCODES_FILE: data/external/multiple_timezones_example.csv
    IF_MISSING_TZCODE: USE_DEFAULT
    DEFAULT_TZCODE: Europe/Ljubljana
  FITBIT:

@@ -87,7 +85,7 @@ PHONE_ACTIVITY_RECOGNITION:
  EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same AR episode.
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"]
      ACTIVITY_CLASSES:
        STATIONARY: ["still", "tilting"]

@@ -116,7 +114,7 @@ PHONE_APPLICATIONS_FOREGROUND:
    SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      INCLUDE_EPISODE_FEATURES: True
      SINGLE_CATEGORIES: ["all", "email"]
      MULTIPLE_CATEGORIES:

@@ -151,7 +149,7 @@ PHONE_BATTERY:
  EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
      SRC_SCRIPT: src/features/phone_battery/rapids/main.py

@@ -160,12 +158,12 @@ PHONE_BLUETOOTH:
  CONTAINER: bluetooth
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
      SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R

    DORYAB:
      COMPUTE: True
      COMPUTE: False
      FEATURES:
        ALL:
          DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]

@@ -186,7 +184,7 @@ PHONE_CALLS:
  CONTAINER: call
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      FEATURES_TYPE: EPISODES # EVENTS or EPISODES
      CALL_TYPES: [missed, incoming, outgoing]
      FEATURES:

@@ -229,7 +227,7 @@ PHONE_DATA_YIELD:
             PHONE_WIFI_VISIBLE]
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
      MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum percentage of valid minutes in an hour to be considered valid.
      SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R

@@ -257,7 +255,7 @@ PHONE_LIGHT:
  CONTAINER: light_sensor
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
      SRC_SCRIPT: src/features/phone_light/rapids/main.py

@@ -271,7 +269,7 @@ PHONE_LOCATIONS:

  PROVIDERS:
    DORYAB:
      COMPUTE: True
      COMPUTE: False
      FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome", "homelabel"]
      DBSCAN_EPS: 100 # meters
      DBSCAN_MINSAMPLES: 5

@@ -286,7 +284,7 @@ PHONE_LOCATIONS:
      SRC_SCRIPT: src/features/phone_locations/doryab/main.py

    BARNETT:
      COMPUTE: True
      COMPUTE: False
      FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
      IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
      MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features

@@ -304,7 +302,7 @@ PHONE_MESSAGES:
  CONTAINER: sms
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      MESSAGES_TYPES : [received, sent]
      FEATURES:
        received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]

@@ -316,7 +314,7 @@ PHONE_SCREEN:
  CONTAINER: screen
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      REFERENCE_HOUR_FIRST_USE: 0
      IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable
      IGNORE_EPISODES_LONGER_THAN: 360 # in minutes, set to 0 to disable

@@ -338,12 +336,13 @@ PHONE_WIFI_VISIBLE:
  CONTAINER: wifi
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
      SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R



########################################################################################################################
# FITBIT                                                                                                               #
########################################################################################################################

@@ -506,6 +505,16 @@ EMPATICA_ACCELEROMETER:
      COMPUTE: False
      FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
      SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
    CR:
      COMPUTE: False
      FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features
      WINDOWS:
        COMPUTE: True
        WINDOW_LENGTH: 15 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py


# See https://www.rapids.science/latest/features/empatica-heartrate/
EMPATICA_HEARTRATE:

@@ -524,6 +533,16 @@ EMPATICA_TEMPERATURE:
      COMPUTE: False
      FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
      SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
    CR:
      COMPUTE: False
      FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean",
                 "stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"]
      WINDOWS:
        COMPUTE: True
        WINDOW_LENGTH: 300 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/empatica_temperature/cr/main.py

# See https://www.rapids.science/latest/features/empatica-electrodermal-activity/
EMPATICA_ELECTRODERMAL_ACTIVITY:

@@ -533,6 +552,20 @@ EMPATICA_ELECTRODERMAL_ACTIVITY:
      COMPUTE: False
      FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
      SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
    CR:
      COMPUTE: True
      FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic',
                 'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore',
                 'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio',
                 'avgPeakIncreaseTime', 'avgPeakDecreaseTime', 'avgPeakDuration', 'signalOverallChange', 'changeDuration', 'changeRate', 'significantIncrease',
                 'significantDecrease']
      WINDOWS:
        COMPUTE: True
        WINDOW_LENGTH: 60 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', count_windows, eda_num_peaks_non_zero]
      STANDARDIZE_FEATURES: True
      IMPUTE_NANS: True
      SRC_SCRIPT: src/features/empatica_electrodermal_activity/cr/main.py

# See https://www.rapids.science/latest/features/empatica-blood-volume-pulse/
EMPATICA_BLOOD_VOLUME_PULSE:

@@ -542,6 +575,16 @@ EMPATICA_BLOOD_VOLUME_PULSE:
      COMPUTE: False
      FEATURES: ["maxbvp", "minbvp", "avgbvp", "medianbvp", "modebvp", "stdbvp", "diffmaxmodebvp", "diffminmodebvp", "entropybvp"]
      SRC_SCRIPT: src/features/empatica_blood_volume_pulse/dbdp/main.py
    CR:
      COMPUTE: False
      FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
                 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
      WINDOWS:
        COMPUTE: True
        WINDOW_LENGTH: 300 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py

# See https://www.rapids.science/latest/features/empatica-inter-beat-interval/
EMPATICA_INTER_BEAT_INTERVAL:

@@ -551,6 +594,17 @@ EMPATICA_INTER_BEAT_INTERVAL:
      COMPUTE: False
      FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
      SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
    CR:
      COMPUTE: False
      FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
                 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
      PATCH_WITH_BVP: True
      WINDOWS:
        COMPUTE: True
        WINDOW_LENGTH: 300 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
      STANDARDIZE_FEATURES: True
      SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py

# See https://www.rapids.science/latest/features/empatica-tags/
EMPATICA_TAGS:

@@ -558,6 +612,7 @@ EMPATICA_TAGS:
  PROVIDERS: # None implemented yet


########################################################################################################################
# PLOTS                                                                                                                #
########################################################################################################################

@@ -566,7 +621,7 @@ EMPATICA_TAGS:

# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#1-histograms-of-phone-data-yield
HISTOGRAM_PHONE_DATA_YIELD:
  PLOT: True
  PLOT: False

# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#2-heatmaps-of-overall-data-yield
HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:

@@ -575,7 +630,7 @@ HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT:

# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#3-heatmap-of-recorded-phone-sensors
HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT:
  PLOT: True
  PLOT: False

# See https://www.rapids.science/latest/visualizations/data-quality-visualizations/#4-heatmap-of-sensor-row-count
HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:

@@ -586,7 +641,7 @@ HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT:

# See https://www.rapids.science/latest/visualizations/feature-visualizations/#1-heatmap-correlation-matrix
HEATMAP_FEATURE_CORRELATION_MATRIX:
  PLOT: True
  PLOT: False
  MIN_ROWS_RATIO: 0.5
  CORR_THRESHOLD: 0.1
  CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"}

@@ -599,17 +654,17 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
ALL_CLEANING_INDIVIDUAL:
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      IMPUTE_SELECTED_EVENT_FEATURES:
        COMPUTE: True
        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
      COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
      COLS_VAR_THRESHOLD: True
      ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
      ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
      DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
      DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable
      DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
      DROP_HIGHLY_CORRELATED_FEATURES:
        COMPUTE: False
        COMPUTE: True
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R

@@ -617,23 +672,33 @@
ALL_CLEANING_OVERALL:
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
      COMPUTE: False
      IMPUTE_SELECTED_EVENT_FEATURES:
        COMPUTE: True
        MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
      COLS_NAN_THRESHOLD: 0.3 # set to 1 to disable
      COLS_VAR_THRESHOLD: True
      ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
      ROWS_NAN_THRESHOLD: 0.3 # set to 1 to disable
      DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
      DATA_YIELD_RATIO_THRESHOLD: 0.3 # set to 0 to disable
      DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
      DROP_HIGHLY_CORRELATED_FEATURES:
        COMPUTE: False
        COMPUTE: True
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R


########################################################################################################################
# Analysis Workflow Example                                                                                            #
# Z-score standardization                                                                                              #
########################################################################################################################
STANDARDIZATION:
  PROVIDERS:
    CR:
      COMPUTE: True
      SRC_SCRIPT: src/features/standardization/main.py

########################################################################################################################
# Baseline                                                                                                             #
########################################################################################################################

PARAMS_FOR_ANALYSIS:

@@ -651,3 +716,4 @@ PARAMS_FOR_ANALYSIS:
  TARGET:
    COMPUTE: True
    LABEL: PANAS_negative_affect_mean

credentials.yaml (new file)
@@ -0,0 +1,6 @@
PSQL_STRAW:
  database: staw
  user: staw_db
  password: kizi-x2yf-mate
  host: 212.235.208.113
  port: 5432

@@ -0,0 +1,9 @@
"_id","timestamp","device_id","call_type","call_duration","trace"
1,1587663260695,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,14,"d5e84f8af01b2728021d4f43f53a163c0c90000c"
2,1587739118007,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",3,0,"47c125dc7bd163b8612cdea13724a814917b6e93"
5,1587746544891,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,95,"9cc793ffd6e88b1d850ce540b5d7e000ef5650d4"
6,1587911379859,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,63,"51fb9344e988049a3fec774c7ca622358bf80264"
7,1587992647361,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",3,0,"2a862a7730cfdfaf103a9487afe3e02935fd6e02"
8,1588020039448,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",1,11,"a2c53f6a086d98622c06107780980cf1bb4e37bd"
11,1588176189024,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",2,65,"56589df8c830c70e330b644921ed38e08d8fd1f3"
12,1588197745079,"a748ee1a-1d0b-4ae9-9074-279a2b6ba524",3,0,"cab458018a8ed3b626515e794c70b6f415318adc"

Binary file not shown.

@@ -0,0 +1,11 @@
PHONE:
  DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524] # the participant's AWARE device id
  PLATFORMS: [android] # or ios
  LABEL: MyTestP01 # any string
  START_DATE: 2020-01-01 # this can also be empty
  END_DATE: 2021-01-01 # this can also be empty
EMPATICA:
  DEVICE_IDS: [empatica1]
  LABEL: test01
  START_DATE:
  END_DATE:

@@ -1,2 +1,2 @@
label,length
thirtyminutes,30
fiveminutes,5

@@ -1,9 +1,2 @@
label,start_time,length,repeats_on,repeats_value
threeday,00:00:00,2D 23H 59M 59S,every_day,0
daily,00:00:00,23H 59M 59S,every_day,0
morning,06:00:00,5H 59M 59S,every_day,0
afternoon,12:00:00,5H 59M 59S,every_day,0
evening,18:00:00,5H 59M 59S,every_day,0
night,00:00:00,5H 59M 59S,every_day,0
two_weeks_overlapping,00:00:00,13D 23H 59M 59S,every_day,0
weekends,00:00:00,2D 23H 59M 59S,wday,5

environment.yml (172 changes)
@@ -3,114 +3,138 @@ channels:
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1
- _openmp_mutex=4.5
- _py-xgboost-mutex=2.0
- appdirs=1.4.*
- appdirs=1.4.4
- arrow=0.16.0
- asn1crypto=1.4.*
- astropy=4.2.*
- attrs=20.3.*
- binaryornot=0.4.*
- asn1crypto=1.4.0
- astropy=4.2.1
- attrs=20.3.0
- binaryornot=0.4.4
- blas=1.0
- brotlipy=0.7.*
- bzip2=1.0.*
- ca-certificates
- certifi
- brotlipy=0.7.0
- bzip2=1.0.8
- ca-certificates=2021.7.5
- certifi=2021.5.30
- cffi=1.14.4
- chardet=3.0.*
- click=7.1.*
- cookiecutter=1.6.*
- cryptography=3.3.*
- datrie=0.8.*
- chardet=3.0.4
- click=7.1.2
- colorama=0.4.4
- cookiecutter=1.6.0
- cryptography=3.3.1
- datrie=0.8.2
- docutils=0.16
- future=0.18.2
- gitdb=4.0.*
- gitdb2=4.0.*
- gitpython=3.1.*
- gitdb=4.0.5
- gitdb2=4.0.2
- gitpython=3.1.11
- idna=2.10
- imbalanced-learn=0.6.*
- importlib-metadata=2.0.*
- importlib_metadata=2.0.*
- imbalanced-learn=0.6.2
- importlib-metadata=2.0.0
- importlib_metadata=2.0.0
- intel-openmp=2019.4
- jinja2=2.11.2
- jinja2-time=0.2.*
- joblib=1.0.*
- jsonschema=3.2.*
- libblas=3.8.*
- libcblas=3.8.*
- libcxx=10.0.*
- libedit=3.1.*
- jinja2-time=0.2.0
- joblib=1.0.0
- jsonschema=3.2.0
- ld_impl_linux-64=2.36.1
- libblas=3.8.0
- libcblas=3.8.0
- libcxx=10.0.0
- libcxxabi=10.0.0
- libedit=3.1.20191231
- libffi=3.3
- libgcc-ng=11.2.0
- libgfortran
- liblapack=3.8.*
- libopenblas=0.3.*
- libgfortran
- libgfortran
- liblapack=3.8.0
- libopenblas=0.3.10
- libstdcxx-ng=11.2.0
- libxgboost=0.90
- lightgbm=3.1.*
- llvm-openmp=10.0.*
- markupsafe=1.1.*
- libzlib=1.2.11
- lightgbm=3.1.1
- llvm-openmp=10.0.0
- markupsafe=1.1.1
- mkl
- mkl-service=2.3.*
- mkl_fft=1.2.*
- mkl_random=1.1.*
- more-itertools=8.6.*
- mkl-service=2.3.0
- mkl_fft=1.2.0
- mkl_random=1.1.1
- more-itertools=8.6.0
- ncurses=6.2
- numpy=1.19.2
- numpy-base=1.19.2
- openblas=0.3.*
- openssl
- pandas=1.1.*
- pbr=5.5.*
- pip=20.3.*
- openblas=0.3.4
- openssl=1.1.1k
- pandas=1.1.5
- pbr=5.5.1
- pip=20.3.3
- plotly=4.14.1
- poyo=0.5.*
- psutil=5.7.*
- psycopg2
- poyo=0.5.0
- psutil=5.7.2
- py-xgboost=0.90
- pycparser=2.20
- pyerfa=1.7.*
- pyopenssl=20.0.*
- pyprojroot
- pysocks=1.7.*
- python=3.7.*
- python-dateutil=2.8.*
- python-dotenv
- pyerfa=1.7.1.1
- pyopenssl=20.0.1
- pysocks=1.7.1
- python=3.7.9
- python-dateutil=2.8.1
- python_abi=3.7
- pytz=2020.4
- pyyaml=5.3.*
- pyyaml=5.3.1
- readline=8.0
- requests=2.25.0
- retrying=1.3.*
- retrying=1.3.3
- scikit-learn=0.23.2
- scipy=1.5.*
- setuptools=51.0.*
- scipy=1.5.2
- setuptools=51.0.0
- six=1.15.0
- smmap=3.0.*
- smmap2=3.0.*
- sqlalchemy
- smmap=3.0.4
- smmap2=3.0.1
- sqlite=3.33.0
- threadpoolctl=2.1.*
- tk=8.6.*
- threadpoolctl=2.1.0
- tk=8.6.10
- tqdm=4.62.0
- urllib3=1.25.11
- wheel=0.36.2
- whichcraft=0.6.*
- whichcraft=0.6.1
- wrapt=1.12.1
- xgboost=0.90
- xz=5.2.*
- yaml=0.2.*
- zipp=3.4.*
- zlib=1.2.*
- xz=5.2.5
- yaml=0.2.5
- zipp=3.4.0
- zlib=1.2.11
- pip:
  - amply==0.1.*
  - amply==0.1.4
  - bidict==0.22.0
  - biosppy==0.8.0
  - cached-property==1.5.2
  - configargparse==0.15.1
  - decorator==4.4.*
  - ipython-genutils==0.2.*
  - jupyter-core==4.6.*
  - nbformat==5.0.*
  - cr-features==0.1.15
  - cycler==0.11.0
  - decorator==4.4.2
  - fonttools==4.33.2
  - h5py==3.6.0
  - hmmlearn==0.2.7
  - ipython-genutils==0.2.0
  - jupyter-core==4.6.3
  - kiwisolver==1.4.2
  - matplotlib==3.5.1
  - nbformat==5.0.7
  - opencv-python==4.5.5.64
  - packaging==21.3
  - peakutils==1.3.3
  - pillow==9.1.0
  - pulp==2.4
  - pyparsing==2.4.*
  - pyparsing==2.4.7
  - pyrsistent==0.15.5
  - ratelimiter==1.2.*
  - pywavelets==1.3.0
  - ratelimiter==1.2.0.post0
  - seaborn==0.11.2
  - shortuuid==1.0.8
  - snakemake==5.30.2
  - toposort==1.5
  - traitlets==4.3.*
prefix: /usr/local/Caskroom/miniconda/base/envs/rapids202108
  - traitlets==4.3.3
  - typing-extensions==4.2.0
prefix: /opt/conda/envs/rapids


@@ -15,9 +15,6 @@ local({
  Sys.setenv("RENV_R_INITIALIZING" = "true")
  on.exit(Sys.unsetenv("RENV_R_INITIALIZING"), add = TRUE)

  if(grepl("Darwin", Sys.info()["sysname"], fixed = TRUE) & grepl("ARM64", Sys.info()["version"], fixed = TRUE)) # M1 Macs
    Sys.setenv("TZDIR" = file.path(R.home(), "share", "zoneinfo"))

  # signal that we've consented to use renv
  options(renv.consent = TRUE)

rules/features.smk
@@ -791,10 +791,25 @@ rule empatica_accelerometer_python_features:
        provider_key = "{provider_key}",
        sensor_key = "empatica_accelerometer"
    output:
        "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}.csv"
        "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}.csv",
        "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}_windows.csv"
    script:
        "../src/features/entry.py"

rule empatica_accelerometer_python_features_standardization:
    input:
        windows_features_data = "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}_windows.csv"
    params:
        provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
        provider_key = "{provider_key}",
        sensor_key = "empatica_accelerometer",
        provider_main = config["EMPATICA_ACCELEROMETER"]["PROVIDERS"]["CR"]
    output:
        "data/interim/{pid}/empatica_accelerometer_features/z_empatica_accelerometer_python_{provider_key}.csv",
        "data/interim/{pid}/empatica_accelerometer_features/z_empatica_accelerometer_python_{provider_key}_windows.csv"
    script:
        "../src/features/standardization/main.py"

rule empatica_accelerometer_r_features:
    input:
        sensor_data = "data/raw/{pid}/empatica_accelerometer_with_datetime.csv",

@@ -817,7 +832,8 @@ rule empatica_heartrate_python_features:
        provider_key = "{provider_key}",
        sensor_key = "empatica_heartrate"
    output:
        "data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}.csv"
        "data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}.csv",
        "data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}_windows.csv"
    script:
        "../src/features/entry.py"

@@ -843,10 +859,25 @@ rule empatica_temperature_python_features:
        provider_key = "{provider_key}",
        sensor_key = "empatica_temperature"
    output:
        "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}.csv"
        "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}.csv",
        "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}_windows.csv"
    script:
        "../src/features/entry.py"

rule empatica_temperature_python_features_standardization:
    input:
        windows_features_data = "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}_windows.csv"
    params:
        provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
        provider_key = "{provider_key}",
        sensor_key = "empatica_temperature",
        provider_main = config["EMPATICA_TEMPERATURE"]["PROVIDERS"]["CR"]
    output:
        "data/interim/{pid}/empatica_temperature_features/z_empatica_temperature_python_{provider_key}.csv",
        "data/interim/{pid}/empatica_temperature_features/z_empatica_temperature_python_{provider_key}_windows.csv"
    script:
        "../src/features/standardization/main.py"

rule empatica_temperature_r_features:
    input:
        sensor_data = "data/raw/{pid}/empatica_temperature_with_datetime.csv",

@@ -869,10 +900,25 @@ rule empatica_electrodermal_activity_python_features:
        provider_key = "{provider_key}",
        sensor_key = "empatica_electrodermal_activity"
    output:
        "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}.csv"
        "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}.csv",
        "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}_windows.csv"
    script:
        "../src/features/entry.py"

rule empatica_electrodermal_activity_python_features_standardization:
    input:
        windows_features_data = "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}_windows.csv"
    params:
        provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
        provider_key = "{provider_key}",
        sensor_key = "empatica_electrodermal_activity",
        provider_main = config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"]["CR"]
    output:
        "data/interim/{pid}/empatica_electrodermal_activity_features/z_empatica_electrodermal_activity_python_{provider_key}.csv",
        "data/interim/{pid}/empatica_electrodermal_activity_features/z_empatica_electrodermal_activity_python_{provider_key}_windows.csv"
    script:
        "../src/features/standardization/main.py"

rule empatica_electrodermal_activity_r_features:
    input:
        sensor_data = "data/raw/{pid}/empatica_electrodermal_activity_with_datetime.csv",

@@ -895,10 +941,25 @@ rule empatica_blood_volume_pulse_python_features:
        provider_key = "{provider_key}",
        sensor_key = "empatica_blood_volume_pulse"
    output:
        "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}.csv"
        "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}.csv",
        "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
    script:
        "../src/features/entry.py"

rule empatica_blood_volume_pulse_python_cr_features_standardization:
    input:
        windows_features_data = "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
    params:
        provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
        provider_key = "{provider_key}",
        sensor_key = "empatica_blood_volume_pulse",
        provider_main = config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"]["CR"]
    output:
        "data/interim/{pid}/empatica_blood_volume_pulse_features/z_empatica_blood_volume_pulse_python_{provider_key}.csv",
        "data/interim/{pid}/empatica_blood_volume_pulse_features/z_empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
    script:
        "../src/features/standardization/main.py"

rule empatica_blood_volume_pulse_r_features:
    input:
        sensor_data = "data/raw/{pid}/empatica_blood_volume_pulse_with_datetime.csv",

@@ -921,10 +982,25 @@ rule empatica_inter_beat_interval_python_features:
        provider_key = "{provider_key}",
        sensor_key = "empatica_inter_beat_interval"
    output:
        "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}.csv"
        "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}.csv",
        "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}_windows.csv"
    script:
        "../src/features/entry.py"

rule empatica_inter_beat_interval_python_features_standardization:
    input:
        windows_features_data = "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}_windows.csv"
    params:
        provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
        provider_key = "{provider_key}",
        sensor_key = "empatica_inter_beat_interval",
        provider_main = config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"]["CR"]
    output:
        "data/interim/{pid}/empatica_inter_beat_interval_features/z_empatica_inter_beat_interval_python_{provider_key}.csv",
        "data/interim/{pid}/empatica_inter_beat_interval_features/z_empatica_inter_beat_interval_python_{provider_key}_windows.csv"
    script:
        "../src/features/standardization/main.py"

rule empatica_inter_beat_interval_r_features:
    input:
        sensor_data = "data/raw/{pid}/empatica_inter_beat_interval_with_datetime.csv",

@@ -4,36 +4,6 @@ rule create_example_participant_files:
    shell:
        "echo 'PHONE:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n PLATFORMS: [android]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524]\n LABEL: test-01\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example01.yaml && echo 'PHONE:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n PLATFORMS: [ios]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\nFITBIT:\n DEVICE_IDS: [13dbc8a3-dae3-4834-823a-4bc96a7d459d]\n LABEL: test-02\n START_DATE: 2020-04-23 00:00:00\n END_DATE: 2020-05-04 23:59:59\n' >> ./data/external/participant_files/example02.yaml"

rule query_usernames_device_empatica_ids:
    params:
        baseline_folder = "/mnt/e/STRAWbaseline/"
    output:
        usernames_file = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"],
        timezone_file = config["TIMEZONE"]["MULTIPLE"]["TZ_FILE"]
    script:
        "../../participants/prepare_usernames_file.py"

rule prepare_tzcodes_file:
    input:
        timezone_file = config["TIMEZONE"]["MULTIPLE"]["TZ_FILE"]
    output:
        tzcodes_file = config["TIMEZONE"]["MULTIPLE"]["TZCODES_FILE"]
    script:
        "../tools/create_multi_timezones_file.py"

rule prepare_participants_csv:
    input:
        username_list = config["CREATE_PARTICIPANT_FILES"]["USERNAMES_CSV"]
    params:
        data_configuration = config["PHONE_DATA_STREAMS"][config["PHONE_DATA_STREAMS"]["USE"]],
        participants_table = "participants",
        device_id_table = "esm",
        start_end_date_table = "esm"
    output:
        participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"]
    script:
        "../src/data/translate_usernames_into_participants_data.R"

rule create_participants_files:
    input:
        participants_file = config["CREATE_PARTICIPANT_FILES"]["CSV_FILE_PATH"]

@@ -2,11 +2,16 @@ from zipfile import ZipFile
import warnings
from pathlib import Path
import pandas as pd
import numpy as np
from pandas.core import indexing
import yaml
import csv
from collections import OrderedDict
from io import BytesIO, StringIO
import sys, os

from cr_features.hrv import get_HRV_features, get_patched_ibi_with_bvp
from cr_features.helper_functions import empatica1d_to_array, empatica2d_to_array

def processAcceleration(x, y, z):
    x = float(x)

@@ -62,7 +67,12 @@ def extract_empatica_data(data, sensor):
        df.index.name = 'timestamp'

    elif sensor == 'EMPATICA_INTER_BEAT_INTERVAL':
        df = pd.read_csv(sensor_data_file, names=['timestamp', column], header=None)

        df = pd.read_csv(sensor_data_file, names=['timings', column], header=None)
        df['timestamp'] = df['timings']
        if df.empty:
            df = df.set_index('timestamp')
            return df
        timestampstart = float(df['timestamp'][0])
        df['timestamp'] = (df['timestamp'][1:len(df)]).astype(float) + timestampstart
        df = df.drop([0])

@@ -84,6 +94,10 @@ def pull_data(data_configuration, device, sensor, container, columns_to_download):
    participant_data = pd.DataFrame(columns=columns_to_download.values())
    participant_data.set_index('timestamp', inplace=True)

    with open('config.yaml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)
    cr_ibi_provider = config['EMPATICA_INTER_BEAT_INTERVAL']['PROVIDERS']['CR']

    available_zipfiles = list((Path(data_configuration["FOLDER"]) / Path(device)).rglob("*.zip"))
    if len(available_zipfiles) == 0:
        warnings.warn("There were no zip files in: {}. If you were expecting data for this participant the [EMPATICA][DEVICE_IDS] key in their participant file is missing the pid".format((Path(data_configuration["FOLDER"]) / Path(device))))

@@ -94,7 +108,13 @@ def pull_data(data_configuration, device, sensor, container, columns_to_download):
        listOfFileNames = zipFile.namelist()
        for fileName in listOfFileNames:
            if fileName == sensor_csv:
                participant_data = pd.concat([participant_data, extract_empatica_data(zipFile.read(fileName), sensor)], axis=0)
                if sensor == "EMPATICA_INTER_BEAT_INTERVAL" and cr_ibi_provider.get('PATCH_WITH_BVP', False):
                    participant_data = \
                        pd.concat([participant_data, patch_ibi_with_bvp(zipFile.read('IBI.csv'), zipFile.read('BVP.csv'))], axis=0)
                    #print("patch with ibi")
                else:
                    participant_data = pd.concat([participant_data, extract_empatica_data(zipFile.read(fileName), sensor)], axis=0)
                    #print("no patching")
                warning = False
        if warning:
            warnings.warn("We could not find a zipped file for {} in {} (we tried to find {})".format(sensor, zipFile, sensor_csv))

@@ -105,4 +125,53 @@ def pull_data(data_configuration, device, sensor, container, columns_to_download):
    participant_data["device_id"] = device
    return(participant_data)

def patch_ibi_with_bvp(ibi_data, bvp_data):
    ibi_data_file = BytesIO(ibi_data).getvalue().decode('utf-8')
    ibi_data_file = StringIO(ibi_data_file)

    # Begin with the cr-features part
    try:
        ibi_data, ibi_start_timestamp = empatica2d_to_array(ibi_data_file)
    except IndexError as e:
        # Checks whether IBI.csv is empty
        df_test = pd.read_csv(ibi_data_file, names=['timings', 'inter_beat_interval'], header=None)
        if df_test.empty:
            df_test['timestamp'] = df_test['timings']
            df_test = df_test.set_index('timestamp')
            return df_test
        else:
            raise IndexError("Something went wrong with indices. Error that was previously caught:\n", repr(e))

    bvp_data_file = BytesIO(bvp_data).getvalue().decode('utf-8')
    bvp_data_file = StringIO(bvp_data_file)

    bvp_data, bvp_start_timestamp, sample_rate = empatica1d_to_array(bvp_data_file)

    hrv_time_and_freq_features, sample, bvp_rr, bvp_timings, peak_indx = \
        get_HRV_features(bvp_data, ma=False,
            detrend=False, m_deternd=False, low_pass=False, winsorize=True,
            winsorize_value=25, hampel_fiter=False, median_filter=False,
            mod_z_score_filter=True, sampling=64, feature_names=['meanHr'])

    ibi_timings, ibi_rr = get_patched_ibi_with_bvp(ibi_data[0], ibi_data[1], bvp_timings, bvp_rr)

    df = \
        pd.DataFrame(np.array([ibi_timings, ibi_rr]).transpose(), columns=['timestamp', 'inter_beat_interval'])
    df.loc[-1] = [ibi_start_timestamp, 'IBI']  # adding a row
    df.index = df.index + 1  # shifting index
    df = df.sort_index()  # sorting by index

    # Repeated as in extract_empatica_data for IBI
    df['timings'] = df['timestamp']
    timestampstart = float(df['timestamp'][0])
    df['timestamp'] = (df['timestamp'][1:len(df)]).astype(float) + timestampstart
    df = df.drop([0])
    df['inter_beat_interval'] = df['inter_beat_interval'].astype(float)
    df = df.set_index('timestamp')

    # format timestamps
    df.index *= 1000
    df.index = df.index.astype(int)
    return(df)

# print(pull_data({'FOLDER': 'data/external/empatica'}, "e01", "EMPATICA_accelerometer", {'TIMESTAMP': 'timestamp', 'DEVICE_ID': 'device_id', 'DOUBLE_VALUES_0': 'x', 'DOUBLE_VALUES_1': 'y', 'DOUBLE_VALUES_2': 'z'}))
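
A minimal smoke-test sketch for patch_ibi_with_bvp above, assuming the usual E4 zip layout with IBI.csv and BVP.csv side by side (the sample zip path comes from this repo's data/external folder):

```
from zipfile import ZipFile

with ZipFile("data/external/empatica/empatica1/E4 Data.zip") as z:
    patched = patch_ibi_with_bvp(z.read("IBI.csv"), z.read("BVP.csv"))
print(patched.head())  # RR intervals indexed by integer millisecond timestamps
```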

@@ -50,6 +50,7 @@ EMPATICA_INTER_BEAT_INTERVAL:
    TIMESTAMP: timestamp
    DEVICE_ID: device_id
    INTER_BEAT_INTERVAL: inter_beat_interval
    TIMINGS: timings
  MUTATION:
    COLUMN_MAPPINGS:
    SCRIPTS: # List any python or r scripts that mutate your raw data

@@ -227,6 +227,7 @@ EMPATICA_INTER_BEAT_INTERVAL:
  - TIMESTAMP
  - DEVICE_ID
  - INTER_BEAT_INTERVAL
  - TIMINGS

EMPATICA_TAGS:
  - TIMESTAMP

src/features/cr_features_helper_methods.py (new file)
@@ -0,0 +1,59 @@
import pandas as pd
import numpy as np
import math as m

import sys

def extract_second_order_features(intraday_features, so_features_names, prefix=""):

    if prefix:
        groupby_cols = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
    else:
        groupby_cols = ['local_segment']

    if not intraday_features.empty:
        so_features = pd.DataFrame()
        #print(intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).nsmallest())
        if "mean" in so_features_names:
            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean().add_suffix("_SO_mean")], axis=1)

        if "median" in so_features_names:
            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median().add_suffix("_SO_median")], axis=1)

        if "sd" in so_features_names:
            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std().add_suffix("_SO_sd")], axis=1)

        if "nlargest" in so_features_names:  # largest 5 -- maybe there is a faster groupby solution?
            for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
                so_features[column+"_SO_nlargest"] = intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols)[column].apply(lambda x: x.nlargest(5).mean())

        if "nsmallest" in so_features_names:  # smallest 5 -- maybe there is a faster groupby solution?
            for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
                so_features[column+"_SO_nsmallest"] = intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols)[column].apply(lambda x: x.nsmallest(5).mean())

        if "count_windows" in so_features_names:
            so_features["SO_windowsCount"] = intraday_features.groupby(groupby_cols).count()[prefix+"level_1"]

        # numPeaksNonZero specialized for EDA sensor
        if "eda_num_peaks_non_zero" in so_features_names and prefix+"numPeaks" in intraday_features.columns:
            so_features[prefix+"SO_numPeaksNonZero"] = intraday_features.groupby(groupby_cols)[prefix+"numPeaks"].apply(lambda x: (x!=0).sum())

        # numWindowsNonZero specialized for BVP and IBI sensors
        if "hrv_num_windows_non_nan" in so_features_names and prefix+"meanHr" in intraday_features.columns:
            so_features[prefix+"SO_numWindowsNonNaN"] = intraday_features.groupby(groupby_cols)[prefix+"meanHr"].apply(lambda x: (~np.isnan(x)).sum())

        so_features.reset_index(inplace=True)

    else:
        so_features = pd.DataFrame(columns=groupby_cols)

    return so_features

def get_sample_rate(data):  # To-Do get the sample rate information from the file's metadata
    try:
        timestamps_diff = data['timestamp'].diff().dropna().mean()
        print("Timestamp diff:", timestamps_diff)
    except:
        raise Exception("Error occurred while trying to get the mean sample rate from the data.")

    return m.ceil(1000/timestamps_diff)
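
A quick usage sketch for extract_second_order_features above; the toy DataFrame and its values are invented for illustration:

```
import pandas as pd

# one row per (segment, window); "level_1" is the window index created by the
# groupby/apply in the first-order feature extraction
intraday = pd.DataFrame({
    "local_segment": ["seg1", "seg1", "seg2"],
    "level_1": [0, 1, 0],
    "meanHr": [61.0, 63.0, 70.0],
})
so = extract_second_order_features(intraday, ["mean", "count_windows"])
print(so)  # one row per segment with meanHr_SO_mean and SO_windowsCount
```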

src/features/empatica_accelerometer/cr/main.py (new file)
@@ -0,0 +1,71 @@
import pandas as pd
from scipy.stats import entropy

from cr_features.helper_functions import convert_to2d, accelerometer_features, frequency_features
from cr_features.calculate_features_old import calculateFeatures
from cr_features.calculate_features import calculate_features
from cr_features_helper_methods import extract_second_order_features

import sys

def extract_acc_features_from_intraday_data(acc_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    acc_intraday_features = pd.DataFrame(columns=["local_segment"] + features)

    if not acc_intraday_data.empty:
        sample_rate = 32

        acc_intraday_data = filter_data_by_segment(acc_intraday_data, time_segment)

        if not acc_intraday_data.empty:

            acc_intraday_features = pd.DataFrame()

            # apply methods from calculate features module
            if window_length is None:
                acc_intraday_features = \
                    acc_intraday_data.groupby('local_segment').apply(lambda x: calculate_features( \
                        convert_to2d(x['double_values_0'], x.shape[0]), \
                        convert_to2d(x['double_values_1'], x.shape[0]), \
                        convert_to2d(x['double_values_2'], x.shape[0]), \
                        fs=sample_rate, feature_names=features, show_progress=False))
            else:
                acc_intraday_features = \
                    acc_intraday_data.groupby('local_segment').apply(lambda x: calculate_features( \
                        convert_to2d(x['double_values_0'], window_length*sample_rate), \
                        convert_to2d(x['double_values_1'], window_length*sample_rate), \
                        convert_to2d(x['double_values_2'], window_length*sample_rate), \
                        fs=sample_rate, feature_names=features, show_progress=False))

            acc_intraday_features.reset_index(inplace=True)

    return acc_intraday_features


def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    acc_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_intraday_features = provider["FEATURES"]

    calc_windows = kwargs.get('calc_windows', False)

    if provider["WINDOWS"]["COMPUTE"] and calc_windows:
        requested_window_length = provider["WINDOWS"]["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # name of the features this function can compute
    base_intraday_features_names = accelerometer_features + frequency_features
    # the subset of requested features this function can compute
    intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names))

    # extract features from intraday data
    acc_intraday_features = extract_acc_features_from_intraday_data(acc_intraday_data, intraday_features_to_compute,
        requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        acc_second_order_features = extract_second_order_features(acc_intraday_features, so_features_names)
        return acc_intraday_features, acc_second_order_features

    return acc_intraday_features
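
The windowing above relies on convert_to2d reshaping a 1-D signal into rows of window_length*sample_rate samples. A hypothetical re-implementation to illustrate the assumed behavior (the real function lives in cr_features.helper_functions and may handle padding differently):

```
import numpy as np

def convert_to2d_sketch(signal, n_cols):
    # pad the tail with NaN so the length is a multiple of n_cols,
    # then reshape so each row is one window of n_cols samples
    arr = np.asarray(signal, dtype=float)
    n_rows = int(np.ceil(arr.size / n_cols))
    padded = np.full(n_rows * n_cols, np.nan)
    padded[:arr.size] = arr
    return padded.reshape(n_rows, n_cols)
```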
|
|
@ -0,0 +1,73 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler

from cr_features.helper_functions import convert_to2d, hrv_features
from cr_features.hrv import extract_hrv_features_2d_wrapper
from cr_features_helper_methods import extract_second_order_features

import sys

# pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)


def extract_bvp_features_from_intraday_data(bvp_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    bvp_intraday_features = pd.DataFrame(columns=["local_segment"] + features)

    if not bvp_intraday_data.empty:
        sample_rate = 64

        bvp_intraday_data = filter_data_by_segment(bvp_intraday_data, time_segment)

        if not bvp_intraday_data.empty:
            bvp_intraday_features = pd.DataFrame()

            # apply methods from calculate features module
            if window_length is None:
                bvp_intraday_features = \
                    bvp_intraday_data.groupby('local_segment').apply(
                        lambda x: extract_hrv_features_2d_wrapper(
                            convert_to2d(x['blood_volume_pulse'], x.shape[0]),
                            sampling=sample_rate, hampel_fiter=False, median_filter=False, mod_z_score_filter=True, feature_names=features))
            else:
                bvp_intraday_features = \
                    bvp_intraday_data.groupby('local_segment').apply(
                        lambda x: extract_hrv_features_2d_wrapper(
                            convert_to2d(x['blood_volume_pulse'], window_length*sample_rate),
                            sampling=sample_rate, hampel_fiter=False, median_filter=False, mod_z_score_filter=True, feature_names=features))

            bvp_intraday_features.reset_index(inplace=True)

    return bvp_intraday_features


def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    bvp_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_intraday_features = provider["FEATURES"]

    calc_windows = kwargs.get('calc_windows', False)

    if provider["WINDOWS"]["COMPUTE"] and calc_windows:
        requested_window_length = provider["WINDOWS"]["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # names of the features this function can compute
    base_intraday_features_names = hrv_features
    # the subset of requested features this function can compute
    intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names))

    # extract features from intraday data
    bvp_intraday_features = extract_bvp_features_from_intraday_data(bvp_intraday_data, intraday_features_to_compute,
                                                                    requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        bvp_second_order_features = extract_second_order_features(bvp_intraday_features, so_features_names)
        return bvp_intraday_features, bvp_second_order_features

    return bvp_intraday_features
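Note that `hampel_fiter` is the keyword spelling used consistently for this wrapper throughout the commit. To illustrate the `groupby('local_segment').apply(...)` pattern without pulling in the HRV stack, here is a minimal, runnable sketch with a hypothetical reducer standing in for `extract_hrv_features_2d_wrapper`:

```python
import numpy as np
import pandas as pd

# two fake segments of synthetic BVP samples
df = pd.DataFrame({
    "local_segment": ["a"] * 128 + ["b"] * 128,
    "blood_volume_pulse": np.random.randn(256),
})

def fake_features(x):
    # hypothetical reducer: one feature row per segment (or per window)
    return pd.DataFrame({"bvp_mean": [x["blood_volume_pulse"].mean()],
                         "bvp_std": [x["blood_volume_pulse"].std()]})

features = df.groupby("local_segment").apply(fake_features)
# reset_index restores 'local_segment' as a column and adds a 'level_1' column,
# which is why the standardization script below excludes prefix + "level_1"
features.reset_index(inplace=True)
print(features)
```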
@ -0,0 +1,78 @@
import pandas as pd
import numpy as np
from scipy.stats import entropy

from cr_features.helper_functions import convert_to2d, gsr_features
from cr_features.calculate_features import calculate_features
from cr_features.gsr import extractGsrFeatures2D
from cr_features_helper_methods import extract_second_order_features

import sys

# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# np.seterr(invalid='ignore')


def extract_eda_features_from_intraday_data(eda_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    eda_intraday_features = pd.DataFrame(columns=["local_segment"] + features)

    if not eda_intraday_data.empty:
        sample_rate = 4

        eda_intraday_data = filter_data_by_segment(eda_intraday_data, time_segment)

        if not eda_intraday_data.empty:
            eda_intraday_features = pd.DataFrame()

            # apply methods from calculate features module
            if window_length is None:
                eda_intraday_features = \
                    eda_intraday_data.groupby('local_segment').apply(
                        lambda x: extractGsrFeatures2D(convert_to2d(x['electrodermal_activity'], x.shape[0]), sampleRate=sample_rate, featureNames=features,
                                                       threshold=.01, offset=1, riseTime=5, decayTime=15))
            else:
                eda_intraday_features = \
                    eda_intraday_data.groupby('local_segment').apply(
                        lambda x: extractGsrFeatures2D(convert_to2d(x['electrodermal_activity'], window_length*sample_rate), sampleRate=sample_rate, featureNames=features,
                                                       threshold=.01, offset=1, riseTime=5, decayTime=15))

            eda_intraday_features.reset_index(inplace=True)

    return eda_intraday_features


def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    eda_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_intraday_features = provider["FEATURES"]

    calc_windows = kwargs.get('calc_windows', False)

    if provider["WINDOWS"]["COMPUTE"] and calc_windows:
        requested_window_length = provider["WINDOWS"]["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # names of the features this function can compute
    base_intraday_features_names = gsr_features
    # the subset of requested features this function can compute
    intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names))

    # extract features from intraday data
    eda_intraday_features = extract_eda_features_from_intraday_data(eda_intraday_data, intraday_features_to_compute,
                                                                    requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        if provider["WINDOWS"]["IMPUTE_NANS"]:
            # windows with no detected SCR peaks leave NaNs in peak-derived columns; set those to 0
            eda_intraday_features[eda_intraday_features["numPeaks"] == 0] = \
                eda_intraday_features[eda_intraday_features["numPeaks"] == 0].fillna(0)
        pd.set_option('display.max_columns', None)

        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        eda_second_order_features = extract_second_order_features(eda_intraday_features, so_features_names)

        return eda_intraday_features, eda_second_order_features

    return eda_intraday_features
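The `IMPUTE_NANS` step above uses a masked row assignment. A minimal sketch of that pandas pattern, with a hypothetical peak-derived column name:

```python
import numpy as np
import pandas as pd

# windows where no SCR peak was detected (numPeaks == 0) keep NaNs in the
# peak-derived columns; the imputation sets exactly those NaNs to 0
eda = pd.DataFrame({"numPeaks": [0, 3, 0],
                    "avgPeakAmplitude": [np.nan, 0.8, np.nan]})  # hypothetical column name
mask = eda["numPeaks"] == 0
eda[mask] = eda[mask].fillna(0)   # rows with detected peaks are left untouched
print(eda)
```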
@ -0,0 +1,79 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

from cr_features.helper_functions import convert_ibi_to2d_time, hrv_features
from cr_features.hrv import extract_hrv_features_2d_wrapper, get_HRV_features
from cr_features_helper_methods import extract_second_order_features

import math
import sys

# pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)


def extract_ibi_features_from_intraday_data(ibi_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    ibi_intraday_features = pd.DataFrame(columns=["local_segment"] + features)

    if not ibi_intraday_data.empty:

        ibi_intraday_data = filter_data_by_segment(ibi_intraday_data, time_segment)

        if not ibi_intraday_data.empty:
            ibi_intraday_features = pd.DataFrame()

            # apply methods from calculate features module
            if window_length is None:
                ibi_intraday_features = \
                    ibi_intraday_data.groupby('local_segment').apply(
                        lambda x: extract_hrv_features_2d_wrapper(
                            signal_2D=convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], math.ceil(x['timings'].iloc[-1]))[0],
                            ibi_timings=convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], math.ceil(x['timings'].iloc[-1]))[1],
                            sampling=None, hampel_fiter=False, median_filter=False, mod_z_score_filter=True, feature_names=features))
            else:
                ibi_intraday_features = \
                    ibi_intraday_data.groupby('local_segment').apply(
                        lambda x: extract_hrv_features_2d_wrapper(
                            signal_2D=convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], window_length)[0],
                            ibi_timings=convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], window_length)[1],
                            sampling=None, hampel_fiter=False, median_filter=False, mod_z_score_filter=True, feature_names=features))

            ibi_intraday_features.reset_index(inplace=True)

    return ibi_intraday_features


def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    ibi_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_intraday_features = provider["FEATURES"]

    calc_windows = kwargs.get('calc_windows', False)

    if provider["WINDOWS"]["COMPUTE"] and calc_windows:
        requested_window_length = provider["WINDOWS"]["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # names of the features this function can compute
    base_intraday_features_names = hrv_features
    # the subset of requested features this function can compute
    intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names))

    # extract features from intraday data
    ibi_intraday_features = extract_ibi_features_from_intraday_data(ibi_intraday_data, intraday_features_to_compute,
                                                                    requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        ibi_second_order_features = extract_second_order_features(ibi_intraday_features, so_features_names)

        return ibi_intraday_features, ibi_second_order_features

    return ibi_intraday_features
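The lambdas above call `convert_ibi_to2d_time` twice per segment, once for the signal and once for the timings. A hypothetical refactor, assuming the conversion is deterministic and side-effect free, computes the pair once and unpacks it:

```python
import math
from cr_features.helper_functions import convert_ibi_to2d_time
from cr_features.hrv import extract_hrv_features_2d_wrapper

def hrv_for_segment(x, features, window_length=None):
    # sketch of an equivalent per-segment call; halves the conversion work
    # compared with indexing [0] and [1] on two separate calls
    length = window_length if window_length is not None else math.ceil(x['timings'].iloc[-1])
    signal_2d, ibi_timings = convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], length)
    return extract_hrv_features_2d_wrapper(
        signal_2D=signal_2d, ibi_timings=ibi_timings, sampling=None,
        hampel_fiter=False, median_filter=False, mod_z_score_filter=True,
        feature_names=features)
```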
@ -0,0 +1,65 @@
import pandas as pd
from scipy.stats import entropy

from cr_features.helper_functions import convert_to2d, generic_features
from cr_features.calculate_features_old import calculateFeatures
from cr_features.calculate_features import calculate_features
from cr_features_helper_methods import extract_second_order_features

import sys


def extract_temp_features_from_intraday_data(temperature_intraday_data, features, window_length, time_segment, filter_data_by_segment):
    temperature_intraday_features = pd.DataFrame(columns=["local_segment"] + features)

    if not temperature_intraday_data.empty:
        sample_rate = 4

        temperature_intraday_data = filter_data_by_segment(temperature_intraday_data, time_segment)

        if not temperature_intraday_data.empty:
            temperature_intraday_features = pd.DataFrame()

            # apply methods from calculate features module
            if window_length is None:
                temperature_intraday_features = \
                    temperature_intraday_data.groupby('local_segment').apply(
                        lambda x: calculate_features(convert_to2d(x['temperature'], x.shape[0]), fs=sample_rate, feature_names=features, show_progress=False))
            else:
                temperature_intraday_features = \
                    temperature_intraday_data.groupby('local_segment').apply(
                        lambda x: calculate_features(convert_to2d(x['temperature'], window_length*sample_rate), fs=sample_rate, feature_names=features, show_progress=False))

            temperature_intraday_features.reset_index(inplace=True)

    return temperature_intraday_features


def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    temperature_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_intraday_features = provider["FEATURES"]

    calc_windows = kwargs.get('calc_windows', False)

    if provider["WINDOWS"]["COMPUTE"] and calc_windows:
        requested_window_length = provider["WINDOWS"]["WINDOW_LENGTH"]
    else:
        requested_window_length = None

    # names of the features this function can compute
    base_intraday_features_names = generic_features
    # the subset of requested features this function can compute
    intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names))

    # extract features from intraday data
    temperature_intraday_features = extract_temp_features_from_intraday_data(temperature_intraday_data, intraday_features_to_compute,
                                                                             requested_window_length, time_segment, filter_data_by_segment)

    if calc_windows:
        so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"]
        temperature_second_order_features = extract_second_order_features(temperature_intraday_features, so_features_names)
        return temperature_intraday_features, temperature_second_order_features

    return temperature_intraday_features
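All five providers read the same configuration keys. A hypothetical provider entry mirroring those lookups might look like this; the values are illustrative, not the shipped RAPIDS defaults:

```python
# hypothetical config for the temperature provider above
provider = {
    "FEATURES": ["mean", "std"],           # assumed feature names; only those in generic_features are computed
    "WINDOWS": {
        "COMPUTE": True,
        "WINDOW_LENGTH": 300,              # seconds; 300 s * 4 Hz = 1200 temperature samples per window
        "SECOND_ORDER_FEATURES": ["mean"], # aggregated over windows by extract_second_order_features
    },
    "SRC_SCRIPT": "src/features/empatica_temperature/cr/main.py",  # hypothetical path
}
```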
@ -1,12 +1,16 @@
import pandas as pd
from utils.utils import fetch_provider_features, run_provider_cleaning_script

import sys

sensor_data_files = dict(snakemake.input)

provider = snakemake.params["provider"]
provider_key = snakemake.params["provider_key"]
sensor_key = snakemake.params["sensor_key"]

calc_windows = True if (provider.get("WINDOWS", False) and provider["WINDOWS"].get("COMPUTE", False)) else False

if sensor_key == "all_cleaning_individual" or sensor_key == "all_cleaning_overall":
    # Data cleaning
    sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
@ -14,6 +18,18 @@ else:
    # Extract sensor features
    del sensor_data_files["time_segments_labels"]
    time_segments_file = snakemake.input["time_segments_labels"]
    sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file)

    sensor_features.to_csv(snakemake.output[0], index=False)
    if calc_windows:
        window_features, second_order_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=True)

        window_features.to_csv(snakemake.output[1], index=False)
        second_order_features.to_csv(snakemake.output[0], index=False)

    elif "empatica" in sensor_key:
        pd.DataFrame().to_csv(snakemake.output[1], index=False)

    if not calc_windows:
        sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=False)

    if not calc_windows:
        sensor_features.to_csv(snakemake.output[0], index=False)

@ -0,0 +1,37 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

import sys

sensor_data_files = dict(snakemake.input)

provider = snakemake.params["provider"]
provider_key = snakemake.params["provider_key"]
sensor_key = snakemake.params["sensor_key"]

pd.set_option('display.max_columns', None)

if provider_key == "cr":
    sys.path.append('/rapids/src/features/')
    from cr_features_helper_methods import extract_second_order_features

    provider_main = snakemake.params["provider_main"]
    prefix = sensor_key + "_" + provider_key + "_"

    windows_features_data = pd.read_csv(sensor_data_files["windows_features_data"])
    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime', prefix + "level_1"]
    windows_features_data.loc[:, ~windows_features_data.columns.isin(excluded_columns)] = \
        StandardScaler().fit_transform(windows_features_data.loc[:, ~windows_features_data.columns.isin(excluded_columns)])

    windows_features_data.to_csv(snakemake.output[1], index=False)

    if provider_main["WINDOWS"]["COMPUTE"] and "SECOND_ORDER_FEATURES" in provider_main["WINDOWS"]:
        so_features_names = provider_main["WINDOWS"]["SECOND_ORDER_FEATURES"]
        windows_so_features_data = extract_second_order_features(windows_features_data, so_features_names, prefix)
        windows_so_features_data.to_csv(snakemake.output[0], index=False)
    else:
        pd.DataFrame().to_csv(snakemake.output[0], index=False)

else:
    pass  # TODO for the rest of the sensors.

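The standardization above z-scores every feature column while leaving the segment bookkeeping columns untouched. A minimal sketch of the same `isin`/`loc` pattern, with a hypothetical feature column name:

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"local_segment": ["a", "b", "c"],
                   "empatica_temperature_cr_mean": [33.0, 34.0, 35.0]})  # hypothetical column
excluded = ["local_segment"]
numeric = ~df.columns.isin(excluded)          # boolean mask over columns
df.loc[:, numeric] = StandardScaler().fit_transform(df.loc[:, numeric])
print(df)                                     # feature column now has mean 0, unit variance
```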
@ -88,11 +88,13 @@ def chunk_episodes(sensor_episodes):
    return merged_sensor_episodes

def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file):
def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=False):
    import pandas as pd
    from importlib import import_module, util

    sensor_features = pd.DataFrame(columns=["local_segment"])
    sensor_fo_features = pd.DataFrame(columns=["local_segment"])
    sensor_so_features = pd.DataFrame(columns=["local_segment"])
    time_segments_labels = pd.read_csv(time_segments_file, header=0)
    if "FEATURES" not in provider:
        raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key.upper()))
@ -106,23 +108,57 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
        time_segments_labels["label"] = [""]
    for time_segment in time_segments_labels["label"]:
        print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment))
        features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes)
        if not "local_segment" in features.columns:
            raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
        features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_" + provider_key + "_"), col) for col in features.columns]
        sensor_features = pd.concat([sensor_features, features], axis=0, sort=False)

        features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes, calc_windows=calc_windows)

        # In case of calc_windows = True
        if isinstance(features, tuple):
            if not "local_segment" in features[0].columns or not "local_segment" in features[1].columns:
                raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
            features[0].columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_" + provider_key + "_"), col) for col in features[0].columns]
            features[1].columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_" + provider_key + "_"), col) for col in features[1].columns]
            if not features[0].empty:
                sensor_fo_features = pd.concat([sensor_fo_features, features[0]], axis=0, sort=False)
            if not features[1].empty:
                sensor_so_features = pd.concat([sensor_so_features, features[1]], axis=0, sort=False)
        else:
            if not "local_segment" in features.columns:
                raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
            features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_" + provider_key + "_"), col) for col in features.columns]
            sensor_features = pd.concat([sensor_features, features], axis=0, sort=False)
    else:
        for feature in provider["FEATURES"]:
            sensor_features[feature] = None
    segment_columns = pd.DataFrame()
    sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '')
    split_segment_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
    new_segment_columns = split_segment_columns.iloc[:, 1:4] if split_segment_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"])
    segment_columns[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
    for i in range(segment_columns.shape[1]):
        sensor_features.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]])

    return sensor_features
    if calc_windows:
        segment_columns = pd.DataFrame()
        sensor_fo_features['local_segment'] = sensor_fo_features['local_segment'].str.replace(r'_RR\d+SS', '')
        split_segment_columns = sensor_fo_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
        new_segment_columns = split_segment_columns.iloc[:, 1:4] if split_segment_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"])
        segment_columns[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
        for i in range(segment_columns.shape[1]):
            sensor_fo_features.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]])

        segment_columns = pd.DataFrame()
        sensor_so_features['local_segment'] = sensor_so_features['local_segment'].str.replace(r'_RR\d+SS', '')
        split_segment_columns = sensor_so_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
        new_segment_columns = split_segment_columns.iloc[:, 1:4] if split_segment_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"])
        segment_columns[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
        for i in range(segment_columns.shape[1]):
            sensor_so_features.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]])

        return sensor_fo_features, sensor_so_features

    else:
        segment_columns = pd.DataFrame()
        sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '')
        split_segment_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
        new_segment_columns = split_segment_columns.iloc[:, 1:4] if split_segment_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"])
        segment_columns[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns
        for i in range(segment_columns.shape[1]):
            sensor_features.insert(1 + i, segment_columns.columns[i], segment_columns[segment_columns.columns[i]])

        return sensor_features

def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files):
    from importlib import import_module, util

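The segment-splitting block is repeated three times in the new `fetch_provider_features`. A hypothetical helper factoring it out, assuming the `str.extract` capture groups behave the same as the `str.split(pat="(.*)#(.*),(.*)")` trick for well-formed segments (`regex=True` is spelled out since newer pandas requires it):

```python
import pandas as pd

def split_local_segment(features):
    # hypothetical refactor of the repeated block above: strip the _RR..SS
    # suffix, then split 'label#start,end' into three bookkeeping columns
    features = features.copy()
    features['local_segment'] = features['local_segment'].str.replace(r'_RR\d+SS', '', regex=True)
    parts = features['local_segment'].str.extract(r'(.*)#(.*),(.*)')
    parts.columns = ["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
    for i, col in enumerate(parts.columns):
        features.insert(1 + i, col, parts[col])
    return features
```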
@ -0,0 +1,39 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


participant = "p031"
all_sensors = ["eda", "bvp", "ibi", "temp", "acc"]

for sensor in all_sensors:

    if sensor == "eda":
        path = f"/rapids/data/interim/{participant}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_cr_windows.csv"
    elif sensor == "bvp":
        path = f"/rapids/data/interim/{participant}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_cr_windows.csv"
    elif sensor == "ibi":
        path = f"/rapids/data/interim/{participant}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_cr_windows.csv"
    elif sensor == "acc":
        path = f"/rapids/data/interim/{participant}/empatica_accelerometer_features/empatica_accelerometer_python_cr_windows.csv"
    elif sensor == "temp":
        path = f"/rapids/data/interim/{participant}/empatica_temperature_features/empatica_temperature_python_cr_windows.csv"
    else:
        path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv"  # all features, all participants

    df = pd.read_csv(path)
    print(df)
    is_NaN = df.isnull()
    row_has_NaN = is_NaN.any(axis=1)
    rows_with_NaN = df[row_has_NaN]

    print("All rows:", len(df.index))
    print("\nCount NaN vals:", is_NaN.sum().sum())
    print("\nDf mean:")
    print(df.mean())

    sns.heatmap(df.isna(), cbar=False)
    plt.savefig(f'{sensor}_{participant}_windows_NaN.png', bbox_inches='tight')
    plt.close()  # start a fresh figure so heatmaps do not overlay across sensors

@ -0,0 +1,48 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import compress


participant = "p031"
sensor = "eda"

if sensor == "eda":
    path = f"/rapids/data/interim/{participant}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_cr_windows.csv"
elif sensor == "bvp":
    path = f"/rapids/data/interim/{participant}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_cr_windows.csv"
elif sensor == "ibi":
    path = f"/rapids/data/interim/{participant}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_cr_windows.csv"
elif sensor == "acc":
    path = f"/rapids/data/interim/{participant}/empatica_accelerometer_features/empatica_accelerometer_python_cr_windows.csv"
elif sensor == "temp":
    path = f"/rapids/data/interim/{participant}/empatica_temperature_features/empatica_temperature_python_cr_windows.csv"
else:
    path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv"  # all features, all participants

df = pd.read_csv(path)
df_num_peaks_zero = df[df["empatica_electrodermal_activity_cr_numPeaks"] == 0]
columns_num_peaks_zero = df_num_peaks_zero.columns[df_num_peaks_zero.isna().any()].tolist()

df_num_peaks_non_zero = df[df["empatica_electrodermal_activity_cr_numPeaks"] != 0]
df_num_peaks_non_zero = df_num_peaks_non_zero[columns_num_peaks_zero]

pd.set_option('display.max_columns', None)

df_q = pd.DataFrame()
for col in df_num_peaks_non_zero:
    df_q[col] = pd.to_numeric(pd.cut(df_num_peaks_non_zero[col], bins=[-1, 0, 0.000000000001, 1000], labels=[-1, 0, 1], right=False))

sns.heatmap(df_q)
plt.savefig(f'eda_{participant}_window_non_zero_peak_other_vals.png', bbox_inches='tight')
plt.close()

# Filter columns that do not contain 0
non_zero_cols = list(compress(columns_num_peaks_zero, df_num_peaks_non_zero.all().tolist()))
zero_cols = list(set(columns_num_peaks_zero) - set(non_zero_cols))

print(non_zero_cols, "\n")
print(zero_cols)

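The `pd.cut` call above quantizes each feature into three levels for the heatmap: negatives map to -1, (near-)zero values to 0, and positives to 1. A minimal sketch mirroring the same bins and labels:

```python
import pandas as pd

s = pd.Series([-0.5, 0.0, 0.3, 2.0])
# right=False makes the bins [-1, 0), [0, ~0), [~0, 1000), so exact zeros land on 0
q = pd.to_numeric(pd.cut(s, bins=[-1, 0, 0.000000000001, 1000], labels=[-1, 0, 1], right=False))
print(q.tolist())  # expect [-1, 0, 1, 1]
```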