diff --git a/Snakefile b/Snakefile index 7dc5ad3a..25e0fc08 100644 --- a/Snakefile +++ b/Snakefile @@ -354,6 +354,8 @@ for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/processed/features/{pid}/empatica_blood_volume_pulse.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + files_to_compute.extend(expand("data/interim/{pid}/empatica_blood_volume_pulse_features/standardization_empatica_blood_volume_pulse_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["EMPATICA_STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower())) + for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys(): if config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["COMPUTE"]: diff --git a/config.yaml b/config.yaml index c150b3cc..432b1364 100644 --- a/config.yaml +++ b/config.yaml @@ -484,7 +484,7 @@ EMPATICA_ACCELEROMETER: FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"] SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py CR: - COMPUTE: True + COMPUTE: False FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features WINDOWS: COMPUTE: True @@ -512,7 +512,7 @@ EMPATICA_TEMPERATURE: FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"] SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py CR: - COMPUTE: True + COMPUTE: False FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean", "stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"] WINDOWS: @@ -531,7 +531,7 @@ EMPATICA_ELECTRODERMAL_ACTIVITY: FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"] SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py CR: - COMPUTE: True + COMPUTE: False FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic', 'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore', 'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio', @@ -572,10 +572,10 @@ EMPATICA_INTER_BEAT_INTERVAL: FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"] SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py CR: - COMPUTE: True + COMPUTE: False FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features - PATCH_WITH_BVP: False + PATCH_WITH_BVP: True WINDOWS: COMPUTE: True WINDOW_LENGTH: 300 # specify window length in seconds @@ -674,6 +674,6 @@ STANDARDIZATION: EMPATICA_STANDARDIZATION: PROVIDERS: CR: - COMPUTE: False + STANDARDIZE: True TYPE: FROM_FIRST_ORDER # FROM_FIRST_ORDER or FROM_SECOND_ORDER(not implemented) - SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R + SRC_SCRIPT: src/features/standardization/empatica_standardization/main.py diff --git a/rules/features.smk b/rules/features.smk index 1b6e0ad8..6a1e0a97 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -896,6 +896,20 @@ rule empatica_blood_volume_pulse_r_features: script: "../src/features/entry.R" +rule empatica_blood_volume_pulse_python_cr_features_standardization: + input: + windows_features_data = "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows.csv" + params: + provider = config["EMPATICA_STANDARDIZATION"]["PROVIDERS"]["CR"], + provider_key = "{provider_key}", + sensor_key = "empatica_blood_volume_pulse", + provider_main = config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"]["CR"] + output: + "data/interim/{pid}/empatica_blood_volume_pulse_features/standardization_empatica_blood_volume_pulse_python_{provider_key}.csv", + "data/interim/{pid}/empatica_blood_volume_pulse_features/standardization_empatica_blood_volume_pulse_python_{provider_key}_windows.csv" + script: + "../src/features/standardization/empatica_standardization/main.py" + rule empatica_inter_beat_interval_python_features: input: sensor_data = "data/raw/{pid}/empatica_inter_beat_interval_with_datetime.csv", diff --git a/src/features/cr_features_helper_methods.py b/src/features/cr_features_helper_methods.py index bf25296d..c140fe99 100644 --- a/src/features/cr_features_helper_methods.py +++ b/src/features/cr_features_helper_methods.py @@ -3,37 +3,48 @@ import math as m import sys -def extract_second_order_features(intraday_features, so_features_names): +def extract_second_order_features(intraday_features, so_features_names, prefix=""): + + if prefix: + groupby_cols = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime'] + else: + groupby_cols = ['local_segment'] + if not intraday_features.empty: so_features = pd.DataFrame() #print(intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).nsmallest()) if "mean" in so_features_names: - so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).mean().add_suffix("_SO_mean")], axis=1) + so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean().add_suffix("_SO_mean")], axis=1) + if "median" in so_features_names: - so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).median().add_suffix("_SO_median")], axis=1) + so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median().add_suffix("_SO_median")], axis=1) + if "sd" in so_features_names: - so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).std().add_suffix("_SO_sd")], axis=1) + so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std().add_suffix("_SO_sd")], axis=1) + if "nlargest_mean" in so_features_names: # largest 5 -- maybe there is a faster groupby solution? - for column in intraday_features.columns[2:]: - so_features[column+"_SO_nlargest_mean"] = intraday_features.drop("level_1", axis=1).groupby("local_segment")[column].apply(lambda x: x.nlargest(5).mean()) + for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]: + so_features[column+"_SO_nlargest_mean"] = intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols)[column].apply(lambda x: x.nlargest(5).mean()) + if "nsmallest_mean" in so_features_names: # smallest 5 -- maybe there is a faster groupby solution? - for column in intraday_features.columns[2:]: - so_features[column+"_SO_nsmallest_mean"] = intraday_features.drop("level_1", axis=1).groupby("local_segment")[column].apply(lambda x: x.nsmallest(5).mean()) + for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]: + so_features[column+"_SO_nsmallest_mean"] = intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols)[column].apply(lambda x: x.nsmallest(5).mean()) + if "count_windows" in so_features_names: - so_features["SO_windowsCount"] = intraday_features.groupby(["local_segment"]).count()["level_1"] + so_features["SO_windowsCount"] = intraday_features.groupby(groupby_cols).count()[prefix+"level_1"] # numPeaksNonZero specialized for EDA sensor - if "eda_num_peaks_non_zero" in so_features_names and "numPeaks" in intraday_features.columns: - so_features["SO_numPeaksNonZero"] = intraday_features.groupby("local_segment")["numPeaks"].apply(lambda x: (x!=0).sum()) + if "eda_num_peaks_non_zero" in so_features_names and prefix+"numPeaks" in intraday_features.columns: + so_features[prefix+"SO_numPeaksNonZero"] = intraday_features.groupby(groupby_cols)[prefix+"numPeaks"].apply(lambda x: (x!=0).sum()) # numWindowsNonZero specialized for BVP and IBI sensors - if "hrv_num_windows_non_zero" in so_features_names and "meanHr" in intraday_features.columns: - so_features["SO_numWindowsNonZero"] = intraday_features.groupby("local_segment")["meanHr"].apply(lambda x: (x!=0).sum()) + if "hrv_num_windows_non_zero" in so_features_names and prefix+"meanHr" in intraday_features.columns: + so_features[prefix+"SO_numWindowsNonZero"] = intraday_features.groupby(groupby_cols)[prefix+"meanHr"].apply(lambda x: (x!=0).sum()) so_features.reset_index(inplace=True) else: - so_features = pd.DataFrame(columns=["local_segment"]) + so_features = pd.DataFrame(columns=groupby_cols) return so_features diff --git a/src/features/entry.py b/src/features/entry.py index 4cee0155..dea1e1df 100644 --- a/src/features/entry.py +++ b/src/features/entry.py @@ -1,6 +1,5 @@ import pandas as pd from utils.utils import fetch_provider_features, run_provider_cleaning_script -from sklearn.preprocessing import StandardScaler import sys diff --git a/src/features/standardization/empatica_standardization/main.py b/src/features/standardization/empatica_standardization/main.py new file mode 100644 index 00000000..2e2fb0f6 --- /dev/null +++ b/src/features/standardization/empatica_standardization/main.py @@ -0,0 +1,33 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import StandardScaler + +import sys + +sys.path.append('/rapids/src/features/') +from cr_features_helper_methods import extract_second_order_features + +sensor_data_files = dict(snakemake.input) + +provider = snakemake.params["provider"] +provider_key = snakemake.params["provider_key"] +sensor_key = snakemake.params["sensor_key"] + +pd.set_option('display.max_columns', None) + +if provider_key == "cr": + provider_main = snakemake.params["provider_main"] + prefix = sensor_key + "_" + provider_key + "_" + + windows_features_data = pd.read_csv(sensor_data_files["windows_features_data"]) + excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime', + prefix + "level_1"] + windows_features_data.loc[:, ~windows_features_data.columns.isin(excluded_columns)] = \ + StandardScaler().fit_transform(windows_features_data.loc[:, ~windows_features_data.columns.isin(excluded_columns)]) + + windows_features_data.to_csv(snakemake.output[1], index=False) + + if provider_main["WINDOWS"]["COMPUTE"] and "SECOND_ORDER_FEATURES" in provider_main["WINDOWS"]: + so_features_names = provider_main["WINDOWS"]["SECOND_ORDER_FEATURES"] + windows_so_features_data = extract_second_order_features(windows_features_data, so_features_names, prefix) + windows_so_features_data.to_csv(snakemake.output[0], index=False) \ No newline at end of file