Making standardization a rule. WIP: done only for BVP.

sociality-task
Primoz 2022-06-13 14:12:03 +00:00
parent 094743244d
commit 402059871f
6 changed files with 81 additions and 22 deletions

View File

@@ -354,6 +354,8 @@ for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_blood_volume_pulse.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
files_to_compute.extend(expand("data/interim/{pid}/empatica_blood_volume_pulse_features/standardization_empatica_blood_volume_pulse_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
if config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["COMPUTE"]:

View File

@@ -484,7 +484,7 @@ EMPATICA_ACCELEROMETER:
      FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
      SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
    CR:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features
      WINDOWS:
        COMPUTE: True
@@ -512,7 +512,7 @@ EMPATICA_TEMPERATURE:
      FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
      SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
    CR:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean",
                 "stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"]
      WINDOWS:
@@ -531,7 +531,7 @@ EMPATICA_ELECTRODERMAL_ACTIVITY:
      FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
      SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
    CR:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic',
                 'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore',
                 'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio',
@@ -572,10 +572,10 @@ EMPATICA_INTER_BEAT_INTERVAL:
      FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
      SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
    CR:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
                 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
-      PATCH_WITH_BVP: False
+      PATCH_WITH_BVP: True
      WINDOWS:
        COMPUTE: True
        WINDOW_LENGTH: 300 # specify window length in seconds
@@ -674,6 +674,6 @@ STANDARDIZATION:
EMPATICA_STANDARDIZATION:
  PROVIDERS:
    CR:
-      COMPUTE: False
+      STANDARDIZE: True
      TYPE: FROM_FIRST_ORDER # FROM_FIRST_ORDER or FROM_SECOND_ORDER(not implemented)
-      SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
+      SRC_SCRIPT: src/features/standardization/empatica_standardization/main.py
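Read together, the renamed key makes the toggle self-describing: STANDARDIZE turns the step on, TYPE says which stage it standardizes (only first-order, per-window features for now), and SRC_SCRIPT now points at the dedicated standardization script instead of the cleaning template. A sketch, not the repo's code, of how this block could be consumed; the inline dict mirrors the YAML above:

```python
# Hedged sketch of consuming the EMPATICA_STANDARDIZATION block; the inline
# dict stands in for the parsed config.yaml.
config = {"EMPATICA_STANDARDIZATION": {"PROVIDERS": {"CR": {
    "STANDARDIZE": True,
    "TYPE": "FROM_FIRST_ORDER",
    "SRC_SCRIPT": "src/features/standardization/empatica_standardization/main.py",
}}}}

cr = config["EMPATICA_STANDARDIZATION"]["PROVIDERS"]["CR"]
if cr["STANDARDIZE"]:
    if cr["TYPE"] == "FROM_FIRST_ORDER":
        print("would run", cr["SRC_SCRIPT"])  # standardize per-window features
    else:  # FROM_SECOND_ORDER is marked as not implemented in the comment above
        raise NotImplementedError(cr["TYPE"])
```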

View File

@@ -896,6 +896,20 @@ rule empatica_blood_volume_pulse_r_features:
    script:
        "../src/features/entry.R"

+rule empatica_blood_volume_pulse_python_cr_features_standardization:
+    input:
+        windows_features_data = "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
+    params:
+        provider = config["EMPATICA_STANDARDIZATION"]["PROVIDERS"]["CR"],
+        provider_key = "{provider_key}",
+        sensor_key = "empatica_blood_volume_pulse",
+        provider_main = config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"]["CR"]
+    output:
+        "data/interim/{pid}/empatica_blood_volume_pulse_features/standardization_empatica_blood_volume_pulse_python_{provider_key}.csv",
+        "data/interim/{pid}/empatica_blood_volume_pulse_features/standardization_empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
+    script:
+        "../src/features/standardization/empatica_standardization/main.py"

rule empatica_inter_beat_interval_python_features:
    input:
        sensor_data = "data/raw/{pid}/empatica_inter_beat_interval_with_datetime.csv",

View File

@@ -3,37 +3,48 @@ import math as m
import sys
-def extract_second_order_features(intraday_features, so_features_names):
+def extract_second_order_features(intraday_features, so_features_names, prefix=""):
+    if prefix:
+        groupby_cols = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
+    else:
+        groupby_cols = ['local_segment']
    if not intraday_features.empty:
        so_features = pd.DataFrame()
        #print(intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).nsmallest())
        if "mean" in so_features_names:
-            so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).mean().add_suffix("_SO_mean")], axis=1)
+            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean().add_suffix("_SO_mean")], axis=1)
        if "median" in so_features_names:
-            so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).median().add_suffix("_SO_median")], axis=1)
+            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median().add_suffix("_SO_median")], axis=1)
        if "sd" in so_features_names:
-            so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).std().add_suffix("_SO_sd")], axis=1)
+            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std().add_suffix("_SO_sd")], axis=1)
        if "nlargest_mean" in so_features_names: # largest 5 -- maybe there is a faster groupby solution?
-            for column in intraday_features.columns[2:]:
-                so_features[column+"_SO_nlargest_mean"] = intraday_features.drop("level_1", axis=1).groupby("local_segment")[column].apply(lambda x: x.nlargest(5).mean())
+            for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
+                so_features[column+"_SO_nlargest_mean"] = intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols)[column].apply(lambda x: x.nlargest(5).mean())
        if "nsmallest_mean" in so_features_names: # smallest 5 -- maybe there is a faster groupby solution?
-            for column in intraday_features.columns[2:]:
-                so_features[column+"_SO_nsmallest_mean"] = intraday_features.drop("level_1", axis=1).groupby("local_segment")[column].apply(lambda x: x.nsmallest(5).mean())
+            for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
+                so_features[column+"_SO_nsmallest_mean"] = intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols)[column].apply(lambda x: x.nsmallest(5).mean())
        if "count_windows" in so_features_names:
-            so_features["SO_windowsCount"] = intraday_features.groupby(["local_segment"]).count()["level_1"]
+            so_features["SO_windowsCount"] = intraday_features.groupby(groupby_cols).count()[prefix+"level_1"]
        # numPeaksNonZero specialized for EDA sensor
-        if "eda_num_peaks_non_zero" in so_features_names and "numPeaks" in intraday_features.columns:
-            so_features["SO_numPeaksNonZero"] = intraday_features.groupby("local_segment")["numPeaks"].apply(lambda x: (x!=0).sum())
+        if "eda_num_peaks_non_zero" in so_features_names and prefix+"numPeaks" in intraday_features.columns:
+            so_features[prefix+"SO_numPeaksNonZero"] = intraday_features.groupby(groupby_cols)[prefix+"numPeaks"].apply(lambda x: (x!=0).sum())
        # numWindowsNonZero specialized for BVP and IBI sensors
-        if "hrv_num_windows_non_zero" in so_features_names and "meanHr" in intraday_features.columns:
-            so_features["SO_numWindowsNonZero"] = intraday_features.groupby("local_segment")["meanHr"].apply(lambda x: (x!=0).sum())
+        if "hrv_num_windows_non_zero" in so_features_names and prefix+"meanHr" in intraday_features.columns:
+            so_features[prefix+"SO_numWindowsNonZero"] = intraday_features.groupby(groupby_cols)[prefix+"meanHr"].apply(lambda x: (x!=0).sum())
        so_features.reset_index(inplace=True)
    else:
-        so_features = pd.DataFrame(columns=["local_segment"])
+        so_features = pd.DataFrame(columns=groupby_cols)
    return so_features
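A hypothetical call showing what the new prefix argument changes: with a prefix, grouping runs over all four segment columns and the per-window index column is expected as <prefix>level_1. The toy frame below (two windows in one segment) is made up for illustration:

```python
# Toy input: two windows of one daily segment, with CR-style prefixed columns;
# assumes extract_second_order_features from the module above is in scope.
import pandas as pd

intraday = pd.DataFrame({
    "local_segment": ["seg0"] * 2,
    "local_segment_label": ["daily"] * 2,
    "local_segment_start_datetime": ["2020-01-01 00:00:00"] * 2,
    "local_segment_end_datetime": ["2020-01-01 23:59:59"] * 2,
    "empatica_blood_volume_pulse_cr_level_1": [0, 1],
    "empatica_blood_volume_pulse_cr_meanHr": [61.0, 64.0],
})
so = extract_second_order_features(
    intraday, ["mean", "count_windows"], prefix="empatica_blood_volume_pulse_cr_"
)
# so: one row for the segment, with ..._meanHr_SO_mean and SO_windowsCount.
```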

View File

@@ -1,6 +1,5 @@
import pandas as pd
from utils.utils import fetch_provider_features, run_provider_cleaning_script
-from sklearn.preprocessing import StandardScaler
import sys

View File

@@ -0,0 +1,33 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+import sys
+
+sys.path.append('/rapids/src/features/')
+from cr_features_helper_methods import extract_second_order_features
+
+sensor_data_files = dict(snakemake.input)
+
+provider = snakemake.params["provider"]
+provider_key = snakemake.params["provider_key"]
+sensor_key = snakemake.params["sensor_key"]
+
+pd.set_option('display.max_columns', None)
+
+if provider_key == "cr":
+    provider_main = snakemake.params["provider_main"]
+    prefix = sensor_key + "_" + provider_key + "_"
+
+    windows_features_data = pd.read_csv(sensor_data_files["windows_features_data"])
+    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime',
+                        prefix + "level_1"]
+
+    windows_features_data.loc[:, ~windows_features_data.columns.isin(excluded_columns)] = \
+        StandardScaler().fit_transform(windows_features_data.loc[:, ~windows_features_data.columns.isin(excluded_columns)])
+
+    windows_features_data.to_csv(snakemake.output[1], index=False)
+
+    if provider_main["WINDOWS"]["COMPUTE"] and "SECOND_ORDER_FEATURES" in provider_main["WINDOWS"]:
+        so_features_names = provider_main["WINDOWS"]["SECOND_ORDER_FEATURES"]
+        windows_so_features_data = extract_second_order_features(windows_features_data, so_features_names, prefix)
+        windows_so_features_data.to_csv(snakemake.output[0], index=False)
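The scaler call above z-scores every feature column across all of a participant's windows, leaving the segment identifier columns untouched. A quick self-contained illustration of what StandardScaler does per column:

```python
# StandardScaler subtracts each column's mean and divides by its (population)
# standard deviation, column by column.
import numpy as np
from sklearn.preprocessing import StandardScaler

x = np.array([[60.0], [70.0], [80.0]])  # e.g. one meanHr column over three windows
print(StandardScaler().fit_transform(x).ravel())
# -> approximately [-1.2247  0.      1.2247]
```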