Make standardization a rule. WIP: done only for BVP (blood volume pulse).
parent 094743244d
commit 402059871f
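In short: standardization of Empatica features now runs as its own Snakemake rule (wired up for blood volume pulse only so far), driven by a new EMPATICA_STANDARDIZATION provider config and a new script under src/features/standardization/empatica_standardization/. The standardization itself is column-wise z-scoring with scikit-learn's StandardScaler; a minimal, self-contained sketch of that transform (toy numbers, not project data):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    # StandardScaler z-scores each column: z = (x - mean) / std (population std).
    x = np.array([[60.0], [70.0], [80.0]])  # e.g. meanHr across three windows
    print(StandardScaler().fit_transform(x).ravel())
    # [-1.22474487  0.          1.22474487]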
@@ -354,6 +354,8 @@ for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys():
         files_to_compute.extend(expand("data/processed/features/{pid}/empatica_blood_volume_pulse.csv", pid=config["PIDS"]))
         files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
         files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
+        files_to_compute.extend(expand("data/interim/{pid}/empatica_blood_volume_pulse_features/standardization_empatica_blood_volume_pulse_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
+
 
 for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
     if config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["COMPUTE"]:
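Note: the {language} wildcard above is produced by the Snakefile helper get_script_language applied to the provider's SRC_SCRIPT; its body is not part of this diff. Assuming it simply keys off the script's file extension (the config below mixes .py and .R scripts), a plausible sketch:

    def get_script_language(script_path):
        # Hypothetical reconstruction: map a script's extension to the
        # {language} wildcard value used in RAPIDS output paths.
        script_path = script_path.lower()
        if script_path.endswith(".py"):
            return "python"
        if script_path.endswith(".r"):
            return "r"
        raise ValueError("Unknown script language: " + script_path)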
config.yaml
@@ -484,7 +484,7 @@ EMPATICA_ACCELEROMETER:
       FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
       SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
     CR:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features
       WINDOWS:
         COMPUTE: True
@@ -512,7 +512,7 @@ EMPATICA_TEMPERATURE:
       FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
       SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
     CR:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean",
                  "stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"]
       WINDOWS:
@@ -531,7 +531,7 @@ EMPATICA_ELECTRODERMAL_ACTIVITY:
       FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
       SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
     CR:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic',
                  'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore',
                  'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio',
@@ -572,10 +572,10 @@ EMPATICA_INTER_BEAT_INTERVAL:
       FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
       SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
     CR:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
                  'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
-      PATCH_WITH_BVP: False
+      PATCH_WITH_BVP: True
       WINDOWS:
         COMPUTE: True
         WINDOW_LENGTH: 300 # specify window length in seconds
@@ -674,6 +674,6 @@ STANDARDIZATION:
 EMPATICA_STANDARDIZATION:
   PROVIDERS:
     CR:
-      COMPUTE: False
+      STANDARDIZE: True
       TYPE: FROM_FIRST_ORDER # FROM_FIRST_ORDER or FROM_SECOND_ORDER(not implemented)
-      SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
+      SRC_SCRIPT: src/features/standardization/empatica_standardization/main.py
@@ -896,6 +896,20 @@ rule empatica_blood_volume_pulse_r_features:
     script:
         "../src/features/entry.R"
 
+rule empatica_blood_volume_pulse_python_cr_features_standardization:
+    input:
+        windows_features_data = "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
+    params:
+        provider = config["EMPATICA_STANDARDIZATION"]["PROVIDERS"]["CR"],
+        provider_key = "{provider_key}",
+        sensor_key = "empatica_blood_volume_pulse",
+        provider_main = config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"]["CR"]
+    output:
+        "data/interim/{pid}/empatica_blood_volume_pulse_features/standardization_empatica_blood_volume_pulse_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_blood_volume_pulse_features/standardization_empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
+    script:
+        "../src/features/standardization/empatica_standardization/main.py"
+
 rule empatica_inter_beat_interval_python_features:
     input:
         sensor_data = "data/raw/{pid}/empatica_inter_beat_interval_with_datetime.csv",
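For orientation: this hunk lives in the Snakemake rules file (rules/features.smk in RAPIDS' layout). When Snakemake executes a script: directive it injects a snakemake object into the script's globals, which is how the rule's input/params/output reach the new main.py. Roughly, for a hypothetical participant p01 with provider_key cr:

    # dict(snakemake.input)["windows_features_data"]
    #   -> data/interim/p01/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_cr_windows.csv
    # snakemake.params["provider_key"] -> "cr"
    # snakemake.params["sensor_key"]   -> "empatica_blood_volume_pulse"
    # snakemake.output[0] -> .../standardization_empatica_blood_volume_pulse_python_cr.csv          (second-order features)
    # snakemake.output[1] -> .../standardization_empatica_blood_volume_pulse_python_cr_windows.csv  (standardized windows)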
@@ -3,37 +3,48 @@ import math as m
 
 import sys
 
-def extract_second_order_features(intraday_features, so_features_names):
+def extract_second_order_features(intraday_features, so_features_names, prefix=""):
+
+    if prefix:
+        groupby_cols = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
+    else:
+        groupby_cols = ['local_segment']
 
     if not intraday_features.empty:
         so_features = pd.DataFrame()
         #print(intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).nsmallest())
         if "mean" in so_features_names:
-            so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).mean().add_suffix("_SO_mean")], axis=1)
+            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean().add_suffix("_SO_mean")], axis=1)
 
         if "median" in so_features_names:
-            so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).median().add_suffix("_SO_median")], axis=1)
+            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median().add_suffix("_SO_median")], axis=1)
 
         if "sd" in so_features_names:
-            so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).std().add_suffix("_SO_sd")], axis=1)
+            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std().add_suffix("_SO_sd")], axis=1)
 
         if "nlargest_mean" in so_features_names: # largest 5 -- maybe there is a faster groupby solution?
-            for column in intraday_features.columns[2:]:
-                so_features[column+"_SO_nlargest_mean"] = intraday_features.drop("level_1", axis=1).groupby("local_segment")[column].apply(lambda x: x.nlargest(5).mean())
+            for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
+                so_features[column+"_SO_nlargest_mean"] = intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols)[column].apply(lambda x: x.nlargest(5).mean())
 
         if "nsmallest_mean" in so_features_names: # smallest 5 -- maybe there is a faster groupby solution?
-            for column in intraday_features.columns[2:]:
-                so_features[column+"_SO_nsmallest_mean"] = intraday_features.drop("level_1", axis=1).groupby("local_segment")[column].apply(lambda x: x.nsmallest(5).mean())
+            for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
+                so_features[column+"_SO_nsmallest_mean"] = intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols)[column].apply(lambda x: x.nsmallest(5).mean())
 
         if "count_windows" in so_features_names:
-            so_features["SO_windowsCount"] = intraday_features.groupby(["local_segment"]).count()["level_1"]
+            so_features["SO_windowsCount"] = intraday_features.groupby(groupby_cols).count()[prefix+"level_1"]
 
         # numPeaksNonZero specialized for EDA sensor
-        if "eda_num_peaks_non_zero" in so_features_names and "numPeaks" in intraday_features.columns:
-            so_features["SO_numPeaksNonZero"] = intraday_features.groupby("local_segment")["numPeaks"].apply(lambda x: (x!=0).sum())
+        if "eda_num_peaks_non_zero" in so_features_names and prefix+"numPeaks" in intraday_features.columns:
+            so_features[prefix+"SO_numPeaksNonZero"] = intraday_features.groupby(groupby_cols)[prefix+"numPeaks"].apply(lambda x: (x!=0).sum())
 
         # numWindowsNonZero specialized for BVP and IBI sensors
-        if "hrv_num_windows_non_zero" in so_features_names and "meanHr" in intraday_features.columns:
-            so_features["SO_numWindowsNonZero"] = intraday_features.groupby("local_segment")["meanHr"].apply(lambda x: (x!=0).sum())
+        if "hrv_num_windows_non_zero" in so_features_names and prefix+"meanHr" in intraday_features.columns:
+            so_features[prefix+"SO_numWindowsNonZero"] = intraday_features.groupby(groupby_cols)[prefix+"meanHr"].apply(lambda x: (x!=0).sum())
 
         so_features.reset_index(inplace=True)
 
     else:
-        so_features = pd.DataFrame(columns=["local_segment"])
+        so_features = pd.DataFrame(columns=groupby_cols)
 
     return so_features
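A quick sanity check of the refactored helper with the default prefix="" (the old single-column grouping path, assuming src/features/ is on sys.path as the new script below arranges); toy values, not project data:

    import pandas as pd
    from cr_features_helper_methods import extract_second_order_features

    windows = pd.DataFrame({
        "local_segment": ["seg0", "seg0", "seg1"],
        "level_1": [0, 1, 0],          # per-segment window index
        "meanHr": [61.0, 65.0, 70.0],
    })
    so = extract_second_order_features(windows, ["mean", "count_windows"])
    print(so)
    #   local_segment  meanHr_SO_mean  SO_windowsCount
    # 0          seg0            63.0                2
    # 1          seg1            70.0                1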
@@ -1,6 +1,5 @@
 import pandas as pd
 from utils.utils import fetch_provider_features, run_provider_cleaning_script
-from sklearn.preprocessing import StandardScaler
 
 import sys
 
@@ -0,0 +1,33 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+
+import sys
+
+sys.path.append('/rapids/src/features/')
+from cr_features_helper_methods import extract_second_order_features
+
+sensor_data_files = dict(snakemake.input)
+
+provider = snakemake.params["provider"]
+provider_key = snakemake.params["provider_key"]
+sensor_key = snakemake.params["sensor_key"]
+
+pd.set_option('display.max_columns', None)
+
+if provider_key == "cr":
+    provider_main = snakemake.params["provider_main"]
+    prefix = sensor_key + "_" + provider_key + "_"
+
+    windows_features_data = pd.read_csv(sensor_data_files["windows_features_data"])
+    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime',
+                        prefix + "level_1"]
+    windows_features_data.loc[:, ~windows_features_data.columns.isin(excluded_columns)] = \
+        StandardScaler().fit_transform(windows_features_data.loc[:, ~windows_features_data.columns.isin(excluded_columns)])
+
+    windows_features_data.to_csv(snakemake.output[1], index=False)
+
+    if provider_main["WINDOWS"]["COMPUTE"] and "SECOND_ORDER_FEATURES" in provider_main["WINDOWS"]:
+        so_features_names = provider_main["WINDOWS"]["SECOND_ORDER_FEATURES"]
+        windows_so_features_data = extract_second_order_features(windows_features_data, so_features_names, prefix)
+        windows_so_features_data.to_csv(snakemake.output[0], index=False)
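One wiring detail worth spelling out: with sensor_key "empatica_blood_volume_pulse" and provider_key "cr", the script computes

    prefix = "empatica_blood_volume_pulse" + "_" + "cr" + "_"
    # -> "empatica_blood_volume_pulse_cr_"
    # So the window-index column excluded from scaling (and dropped inside the
    # second-order helpers) is "empatica_blood_volume_pulse_cr_level_1".
    # Output ordering matters here: snakemake.output[1] gets the standardized
    # windows (the *_windows.csv registered in files_to_compute at the top of
    # this commit), while snakemake.output[0] gets the second-order features.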