From e1d7607de4e7d78bb2fb2ec4de30af84edac5b2d Mon Sep 17 00:00:00 2001
From: primoz
Date: Fri, 10 Jun 2022 12:34:48 +0000
Subject: [PATCH] Extract additional second-order (SO) features; replace the
 min/max aggregations with nsmallest/nlargest means.

---
 config.yaml                                 | 44 ++++++++++++-------
 src/features/cr_features_helper_methods.py  | 17 +++++--
 .../empatica_blood_volume_pulse/cr/main.py  |  2 +-
 .../empatica_inter_beat_interval/cr/main.py |  4 +-
 src/features/entry.py                       | 16 -------
 tests/scripts/zero_vals.py                  | 27 ++++++++++++
 6 files changed, 72 insertions(+), 38 deletions(-)
 create mode 100644 tests/scripts/zero_vals.py

diff --git a/config.yaml b/config.yaml
index 2c3b5ab7..ce7fafd0 100644
--- a/config.yaml
+++ b/config.yaml
@@ -484,13 +484,13 @@ EMPATICA_ACCELEROMETER:
       FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
       SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
     CR:
-      COMPUTE: False
+      COMPUTE: True
       FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features
       WINDOWS:
         COMPUTE: True
         WINDOW_LENGTH: 15 # specify window length in seconds
-        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'max', 'min']
-        STANDARDIZE_SO_FEATURES: True
+        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest_mean', 'nsmallest_mean', 'count_windows']
+        STANDARDIZE_FEATURES: False
       SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py
 
@@ -512,14 +512,14 @@ EMPATICA_TEMPERATURE:
       FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
       SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
     CR:
-      COMPUTE: False
+      COMPUTE: True
       FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean", "stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"]
       WINDOWS:
         COMPUTE: True
         WINDOW_LENGTH: 300 # specify window length in seconds
-        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'max', 'min']
-        STANDARDIZE_SO_FEATURES: True
+        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest_mean', 'nsmallest_mean', 'count_windows']
+        STANDARDIZE_FEATURES: False
       SRC_SCRIPT: src/features/empatica_temperature/cr/main.py
 
 # See https://www.rapids.science/latest/features/empatica-electrodermal-activity/
@@ -531,17 +531,17 @@ EMPATICA_ELECTRODERMAL_ACTIVITY:
       FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
       SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
     CR:
-      COMPUTE: False
+      COMPUTE: True
       FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic', 'sigTonicDifference', 'freqFeats', 'maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore', 'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio', 'avgPeakIncreaseTime', 'avgPeakDecreaseTime', 'avgPeakDuration', 'signalOverallChange', 'changeDuration', 'changeRate', 'significantIncrease', 'significantDecrease']
       WINDOWS:
-        COMPUTE: False
+        COMPUTE: True
         WINDOW_LENGTH: 60 # specify window length in seconds
-        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'max', 'min']
-        STANDARDIZE_SO_FEATURES: True
+        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest_mean', 'nsmallest_mean', 'count_windows', 'eda_num_peaks_non_zero']
+        STANDARDIZE_FEATURES: False
       SRC_SCRIPT: src/features/empatica_electrodermal_activity/cr/main.py
 
 # See https://www.rapids.science/latest/features/empatica-blood-volume-pulse/
@@ -559,8 +559,8 @@ EMPATICA_BLOOD_VOLUME_PULSE:
       WINDOWS:
         COMPUTE: True
         WINDOW_LENGTH: 300 # specify window length in seconds
-        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'max', 'min']
-        STANDARDIZE_SO_FEATURES: True
+        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest_mean', 'nsmallest_mean', 'count_windows']
+        STANDARDIZE_FEATURES: False
       SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py
 
 # See https://www.rapids.science/latest/features/empatica-inter-beat-interval/
@@ -579,8 +579,8 @@ EMPATICA_INTER_BEAT_INTERVAL:
       WINDOWS:
         COMPUTE: True
         WINDOW_LENGTH: 300 # specify window length in seconds
-        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'max', 'min']
-        STANDARDIZE_SO_FEATURES: True
+        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest_mean', 'nsmallest_mean', 'count_windows']
+        STANDARDIZE_FEATURES: False
       SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py
 
 # See https://www.rapids.science/latest/features/empatica-tags/
@@ -662,4 +662,18 @@ ALL_CLEANING_OVERALL:
         COMPUTE: True
         MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
         CORR_THRESHOLD: 0.95
-      SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
\ No newline at end of file
+      SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
+
+
+########################################################################################################################
+# Z-score standardization                                                                                              #
+########################################################################################################################
+STANDARDIZATION:
+  COMPUTE: True
+  EXCECUTE_FULL_PIPELINE: False # Standardization is computed from the feature extraction step onward, including the merge-all-sensors and merge-all-participants steps (handled in a separate standardization file)
+EMPATICA_STANDARDIZATION:
+  PROVIDERS:
+    CR:
+      COMPUTE: False
+      TYPE: FROM_FIRST_ORDER # FROM_FIRST_ORDER or FROM_SECOND_ORDER (not implemented)
+      SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
diff --git a/src/features/cr_features_helper_methods.py b/src/features/cr_features_helper_methods.py
index f8dd3456..6f2d3b2e 100644
--- a/src/features/cr_features_helper_methods.py
+++ b/src/features/cr_features_helper_methods.py
@@ -6,16 +6,24 @@ import sys
 def extract_second_order_features(intraday_features, so_features_names):
     if not intraday_features.empty:
         so_features = pd.DataFrame()
         if "mean" in so_features_names:
             so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).mean().add_suffix("_SO_mean")], axis=1)
         if "median" in so_features_names:
             so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).median().add_suffix("_SO_median")], axis=1)
         if "sd" in so_features_names:
             so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).std().add_suffix("_SO_sd")], axis=1)
-        if "max" in so_features_names:
-            so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).max().add_suffix("_SO_max")], axis=1)
-        if "min" in so_features_names:
-            so_features = pd.concat([so_features, intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).min().add_suffix("_SO_min")], axis=1)
+        if "nlargest_mean" in so_features_names:  # mean of the 5 largest values -- maybe there is a faster groupby solution?
+            for column in intraday_features.columns[2:]:
+                so_features[column + "_SO_nlargest_mean"] = intraday_features.groupby("local_segment")[column].apply(lambda x: x.nlargest(5).mean())
+        if "nsmallest_mean" in so_features_names:  # mean of the 5 smallest values -- maybe there is a faster groupby solution?
+            for column in intraday_features.columns[2:]:
+                so_features[column + "_SO_nsmallest_mean"] = intraday_features.groupby("local_segment")[column].apply(lambda x: x.nsmallest(5).mean())
+        if "count_windows" in so_features_names:
+            so_features["SO_windowsCount"] = intraday_features.groupby(["local_segment"]).count()["level_1"]
+
+        # numPeaksNonZero is specialized for the EDA sensor
+        if "eda_num_peaks_non_zero" in so_features_names and "numPeaks" in intraday_features.columns:
+            so_features["SO_numPeaksNonZero"] = intraday_features.groupby("local_segment")["numPeaks"].apply(lambda x: (x != 0).sum())
 
         so_features.reset_index(inplace=True)
 
diff --git a/src/features/empatica_blood_volume_pulse/cr/main.py b/src/features/empatica_blood_volume_pulse/cr/main.py
index 9e6db52b..edf52076 100644
--- a/src/features/empatica_blood_volume_pulse/cr/main.py
+++ b/src/features/empatica_blood_volume_pulse/cr/main.py
@@ -66,7 +66,7 @@ def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segmen
                                                                 requested_window_length, time_segment, filter_data_by_segment)
 
     if calc_windows:
-        if provider["WINDOWS"].get("STANDARDIZE_SO_FEATURES", False):
+        if provider["WINDOWS"].get("STANDARDIZE_FEATURES", False):
             fo_columns = bvp_intraday_features.columns.values[2:]
             fo_columns_z_score = [col + "_zscore" for col in fo_columns]
             bvp_intraday_features[fo_columns_z_score] = StandardScaler().fit_transform(bvp_intraday_features[fo_columns])
diff --git a/src/features/empatica_inter_beat_interval/cr/main.py b/src/features/empatica_inter_beat_interval/cr/main.py
index 41af4cf1..1137b802 100644
--- a/src/features/empatica_inter_beat_interval/cr/main.py
+++ b/src/features/empatica_inter_beat_interval/cr/main.py
@@ -33,7 +33,7 @@ def extract_ibi_features_from_intraday_data(ibi_intraday_data, features, window_
                                             signal_2D = \
                                                 convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], math.ceil(x['timings'].iloc[-1]))[0],
                                             ibi_timings = \
-                                                convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], math.ceil(x['timings'].iloc[-1]))[1], 
+                                                convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], math.ceil(x['timings'].iloc[-1]))[1],
                                             sampling=None, hampel_fiter=False, median_filter=False, mod_z_score_filter=True, feature_names=features))
     else:
         ibi_intraday_features = \
@@ -70,7 +70,7 @@ def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segmen
         ibi_intraday_features = extract_ibi_features_from_intraday_data(ibi_intraday_data, intraday_features_to_compute,
                                                                         requested_window_length, time_segment, filter_data_by_segment)
     if calc_windows:
-        if provider["WINDOWS"].get("STANDARDIZE_SO_FEATURES", False):
+        if provider["WINDOWS"].get("STANDARDIZE_FEATURES", False):
             fo_columns = ibi_intraday_features.columns.values[2:]
             fo_columns_z_score = [col + "_zscore" for col in fo_columns]
             ibi_intraday_features[fo_columns_z_score] = StandardScaler().fit_transform(ibi_intraday_features[fo_columns])
diff --git a/src/features/entry.py b/src/features/entry.py
index 46895c71..4cee0155 100644
--- a/src/features/entry.py
+++ b/src/features/entry.py
@@ -22,23 +22,7 @@ else:
 
     if calc_windows:
         window_features, second_order_features = fetch_provider_features(provider, provider_key, sensor_key,
                                                                            sensor_data_files, time_segments_file, calc_windows=True)
-
-        # # Get basic stats from all participant's windows
-        # fo_means_stds = pd.DataFrame({"mean": window_features.mean(), "median": window_features.median(), "sd": window_features.std(),
-        #                               "min": window_features.min(), "max": window_features.max()})
-
-        # fo_columns = window_features.columns.values[5:]
-        # fo_columns_z_score = [col + "_zscore" for col in fo_columns]
-        # window_features[fo_columns_z_score] = StandardScaler().fit_transform(window_features[fo_columns])
-
-        # print(fo_means_stds)
-        # Z-score SO features by columns
-        # if provider["WINDOWS"].get("STANDARDIZE_SO_FEATURES", False):
-        #     for indx, fo_mean_std in fo_means_stds.iterrows():
-        #         print(indx, fo_mean_std)
-
-        # sys.exit()
 
         window_features.to_csv(snakemake.output[1], index=False)
         second_order_features.to_csv(snakemake.output[0], index=False)
diff --git a/tests/scripts/zero_vals.py b/tests/scripts/zero_vals.py
new file mode 100644
index 00000000..9e13ecf5
--- /dev/null
+++ b/tests/scripts/zero_vals.py
@@ -0,0 +1,27 @@
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+# path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv"  # all features, all participants
+# path = "/rapids/data/interim/p03/empatica_accelerometer_features/empatica_accelerometer_python_cr_windows.csv"
+path = "/rapids/data/interim/p031/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_cr_windows.csv"
+# path = "/rapids/data/interim/p02/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_cr_windows.csv"
+# path = "/rapids/data/interim/p02/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_cr_windows.csv"
+# path = "/rapids/data/interim/p02/empatica_temperature_features/empatica_temperature_python_cr_windows.csv"
+
+df = pd.read_csv(path)
+print(df)
+
+# Keep only the windows in which no EDA peaks were detected
+df = df[df["empatica_electrodermal_activity_cr_numPeaks"] == 0]
+print(df)
+
+# is_NaN = df.isnull()
+# row_has_NaN = is_NaN.any(axis=1)
+# rows_with_NaN = df[row_has_NaN]
+# print(rows_with_NaN.size)
+
+# Visualize the missing-value pattern of the selected windows
+sns.heatmap(df.isna(), cbar=False)
+plt.savefig('eda_windows_p03_window_60_thresh_default.png', bbox_inches='tight')
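
Note (illustration only, not part of the patch): the sketch below shows how the new second-order aggregations behave. It mirrors the nlargest_mean, nsmallest_mean and count_windows logic added to extract_second_order_features in src/features/cr_features_helper_methods.py, but the toy DataFrame and the feature column "featA" are made up for the example.

import pandas as pd

# Toy per-window (first-order) features; column names are illustrative only.
intraday = pd.DataFrame({
    "local_segment": ["seg1"] * 6 + ["seg2"] * 3,
    "level_1": list(range(6)) + list(range(3)),
    "featA": [1.0, 5.0, 2.0, 9.0, 4.0, 7.0, 3.0, 3.0, 6.0],
})

grouped = intraday.groupby("local_segment")["featA"]
so_features = pd.DataFrame({
    # Mean of the 5 largest / 5 smallest window values per segment
    "featA_SO_nlargest_mean": grouped.apply(lambda x: x.nlargest(5).mean()),
    "featA_SO_nsmallest_mean": grouped.apply(lambda x: x.nsmallest(5).mean()),
    # Number of windows that contributed to each segment
    "SO_windowsCount": grouped.size(),
})
print(so_features.reset_index())

When a segment contributes fewer than five windows (seg2 above), nlargest(5)/nsmallest(5) simply return all available values, so both aggregations reduce to the plain mean for that segment.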
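
Note (illustration only, not part of the patch): a minimal sketch of the first-order z-scoring that the renamed STANDARDIZE_FEATURES flag toggles in the cr main.py scripts; the toy DataFrame and its feature columns are hypothetical.

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Toy window-level features; the first two columns are identifiers, as in the cr feature scripts.
features = pd.DataFrame({
    "local_segment": ["seg1", "seg1", "seg2"],
    "level_1": [0, 1, 0],
    "featA": [1.0, 3.0, 5.0],
    "featB": [10.0, 20.0, 60.0],
})

standardize_features = True  # stands in for provider["WINDOWS"].get("STANDARDIZE_FEATURES", False)
if standardize_features:
    fo_columns = features.columns.values[2:]                      # skip the identifier columns
    fo_columns_z_score = [col + "_zscore" for col in fo_columns]  # names for the z-scored copies
    features[fo_columns_z_score] = StandardScaler().fit_transform(features[fo_columns])

print(features)

StandardScaler standardizes each feature column over all rows it receives, i.e., over every extracted window in the DataFrame, and the z-scored copies are appended as new *_zscore columns next to the originals.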