diff --git a/config.yaml b/config.yaml index 77d7a8e5..503e5872 100644 --- a/config.yaml +++ b/config.yaml @@ -493,6 +493,7 @@ EMPATICA_ACCELEROMETER: WINDOWS: COMPUTE: True WINDOW_LENGTH: 10 # specify window length in seconds + SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'max', 'min'] SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py @@ -510,11 +511,11 @@ EMPATICA_TEMPERATURE: CONTAINER: TEMP PROVIDERS: DBDP: - COMPUTE: False + COMPUTE: True FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"] SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py CR: - COMPUTE: False + COMPUTE: True FEATURES: ["autocorrelations", "countAboveMean", "countBelowMean", "maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean", "stdDev", "median", "meanChange", "numberOfZeroCrossings", "absEnergy", "linearTrendSlope", "ratioBeyondRSigma", "binnedEntropy", "numOfPeaksAutocorr", "numberOfZeroCrossingsAutocorr", "areaAutocorr", @@ -523,6 +524,7 @@ EMPATICA_TEMPERATURE: WINDOWS: COMPUTE: True WINDOW_LENGTH: 90 # specify window length in seconds + SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'max', 'min'] SRC_SCRIPT: src/features/empatica_temperature/cr/main.py # See https://www.rapids.science/latest/features/empatica-electrodermal-activity/ @@ -530,11 +532,11 @@ EMPATICA_ELECTRODERMAL_ACTIVITY: CONTAINER: EDA PROVIDERS: DBDP: - COMPUTE: False + COMPUTE: True FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"] SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py CR: - COMPUTE: False + COMPUTE: True FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic', 'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore', 
'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio', @@ -543,6 +545,7 @@ EMPATICA_ELECTRODERMAL_ACTIVITY: WINDOWS: COMPUTE: True WINDOW_LENGTH: 80 # specify window length in seconds + SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'max', 'min'] SRC_SCRIPT: src/features/empatica_electrodermal_activity/cr/main.py # See https://www.rapids.science/latest/features/empatica-blood-volume-pulse/ @@ -550,16 +553,17 @@ EMPATICA_BLOOD_VOLUME_PULSE: CONTAINER: BVP PROVIDERS: DBDP: - COMPUTE: False + COMPUTE: True FEATURES: ["maxbvp", "minbvp", "avgbvp", "medianbvp", "modebvp", "stdbvp", "diffmaxmodebvp", "diffminmodebvp", "entropybvp"] SRC_SCRIPT: src/features/empatica_blood_volume_pulse/dbdp/main.py CR: - COMPUTE: False + COMPUTE: True FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features WINDOWS: COMPUTE: True WINDOW_LENGTH: 10 # specify window length in seconds + SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'max', 'min'] SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py # See https://www.rapids.science/latest/features/empatica-inter-beat-interval/ @@ -576,7 +580,8 @@ EMPATICA_INTER_BEAT_INTERVAL: 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features WINDOWS: COMPUTE: True - WINDOW_LENGTH: 120 # specify window length in seconds + WINDOW_LENGTH: 2000 # specify window length in seconds + SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'max', 'min'] SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py # See https://www.rapids.science/latest/features/empatica-tags/ diff --git a/environment.yml b/environment.yml index 7276ba1f..169138a1 100644 --- a/environment.yml +++ b/environment.yml @@ -110,12 +110,13 @@ dependencies: - bidict==0.22.0 - biosppy==0.8.0 - cached-property==1.5.2 - - 
calculatingfeatures==0.1.1 - configargparse==0.15.1 + - cr-features==0.1.8 - cycler==0.11.0 - decorator==4.4.2 - - fonttools==4.31.2 + - fonttools==4.33.2 - h5py==3.6.0 + - hmmlearn==0.2.7 - ipython-genutils==0.2.0 - jupyter-core==4.6.3 - kiwisolver==1.4.2 @@ -124,7 +125,7 @@ dependencies: - opencv-python==4.5.5.64 - packaging==21.3 - peakutils==1.3.3 - - pillow==9.0.1 + - pillow==9.1.0 - pulp==2.4 - pyparsing==2.4.7 - pyrsistent==0.15.5 @@ -134,5 +135,5 @@ dependencies: - snakemake==5.30.2 - toposort==1.5 - traitlets==4.3.3 - - typing-extensions==4.1.1 + - typing-extensions==4.2.0 prefix: /opt/conda/envs/rapids diff --git a/rules/features.smk b/rules/features.smk index 1b6e0ad8..b72e3cbb 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -771,7 +771,8 @@ rule empatica_accelerometer_python_features: sensor_key = "empatica_accelerometer" output: "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}.csv", - "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}_windows.csv" + "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}_windows.csv", + "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}_windows_SO_features.csv" script: "../src/features/entry.py" @@ -798,7 +799,8 @@ rule empatica_heartrate_python_features: sensor_key = "empatica_heartrate" output: "data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}.csv", - "data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}_windows.csv" + "data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}_windows.csv", + "data/interim/{pid}/empatica_heartrate_features/empatica_heartrate_python_{provider_key}_windows_SO_features.csv" script: "../src/features/entry.py" @@ -825,7 +827,8 @@ rule empatica_temperature_python_features: sensor_key = 
"empatica_temperature" output: "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}.csv", - "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}_windows.csv" + "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}_windows.csv", + "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}_windows_SO_features.csv" script: "../src/features/entry.py" @@ -852,7 +855,8 @@ rule empatica_electrodermal_activity_python_features: sensor_key = "empatica_electrodermal_activity" output: "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}.csv", - "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}_windows.csv" + "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}_windows.csv", + "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}_windows_SO_features.csv" script: "../src/features/entry.py" @@ -879,7 +883,8 @@ rule empatica_blood_volume_pulse_python_features: sensor_key = "empatica_blood_volume_pulse" output: "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}.csv", - "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows.csv" + "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows.csv", + "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows_SO_features.csv" script: "../src/features/entry.py" @@ -906,7 +911,8 @@ rule empatica_inter_beat_interval_python_features: sensor_key = "empatica_inter_beat_interval" output: 
"data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}.csv", - "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}_windows.csv" + "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}_windows.csv", + "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}_windows_SO_features.csv" script: "../src/features/entry.py" diff --git a/src/features/__init__.py b/src/features/__init__.py new file mode 100644 index 00000000..e69de29b
diff --git a/src/features/cr_features_helper_methods.py b/src/features/cr_features_helper_methods.py
new file mode 100644
index 00000000..c9ebceba
--- /dev/null
+++ b/src/features/cr_features_helper_methods.py
@@ -0,0 +1,64 @@
+import pandas as pd
+
+
+# Maps each second-order feature name accepted in config.yaml to the GroupBy
+# reduction that computes it; insertion order fixes the output column order.
+_SO_AGGREGATIONS = {
+    "mean": "mean",
+    "median": "median",
+    "sd": "std",
+    "max": "max",
+    "min": "min",
+}
+
+
+def extract_second_order_features(intraday_features, so_features_names):
+    """Aggregate windowed features into second-order features per segment.
+
+    Rows are grouped by "local_segment" and every remaining column (the
+    "level_1" column is dropped first) is reduced with each requested
+    statistic. Result columns are named "<column>_SO_<statistic>".
+
+    Args:
+        intraday_features: DataFrame of first-order (windowed) features that
+            contains a "local_segment" column.
+        so_features_names: Collection of requested statistics; recognised
+            names are "mean", "median", "sd", "max" and "min".
+
+    Returns:
+        DataFrame with a "local_segment" column followed by the aggregated
+        columns. When the input is empty or no recognised statistic is
+        requested, a DataFrame holding only an empty "local_segment" column.
+    """
+    if intraday_features.empty:
+        return pd.DataFrame(columns=["local_segment"])
+
+    # errors="ignore" keeps this usable for inputs without a "level_1" column.
+    grouped = intraday_features.drop("level_1", axis=1, errors="ignore").groupby(["local_segment"])
+    so_features = [
+        getattr(grouped, reduction)().add_suffix("_SO_" + name)
+        for name, reduction in _SO_AGGREGATIONS.items()
+        if name in so_features_names
+    ]
+    if not so_features:
+        # Nothing requested: still return the column downstream code indexes by.
+        return pd.DataFrame(columns=["local_segment"])
+
+    return pd.concat(so_features, axis=1).reset_index()
+
+
+def get_sample_rate(data):
+    """Return the mean sample rate of ``data`` truncated to an integer.
+
+    Computed as 1000 divided by the mean difference between consecutive
+    "timestamp" values (presumably milliseconds, i.e. a rate in Hz; confirm).
+
+    Raises:
+        Exception: If the mean timestamp difference cannot be computed.
+    """
+    try:
+        timestamps_diff = data['timestamp'].diff().dropna().mean()
+    except Exception as err:
+        raise Exception("Error occured while trying to get the mean sample rate from the data.") from err
+
+    return int(1000/timestamps_diff)
diff --git a/src/features/empatica_accelerometer/cr/main.py b/src/features/empatica_accelerometer/cr/main.py index ca2bf5ee..71c13827 100644 --- a/src/features/empatica_accelerometer/cr/main.py +++ b/src/features/empatica_accelerometer/cr/main.py @@ -3,6 +3,7 @@ from scipy.stats import entropy from cr_features.helper_functions import convert_to2d, accelerometer_features, frequency_features from cr_features.calculate_features import calculate_features +from cr_features_helper_methods import get_sample_rate, extract_second_order_features import sys @@ -69,4 +70,9 @@ def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segmen acc_intraday_features = extract_acc_features_from_intraday_data(acc_intraday_data, intraday_features_to_compute, requested_window_length, time_segment, filter_data_by_segment) + if calc_windows: + so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"] + acc_second_order_features = extract_second_order_features(acc_intraday_features, so_features_names) + return acc_intraday_features, acc_second_order_features + return acc_intraday_features \ No newline at end of file diff --git a/src/features/empatica_blood_volume_pulse/cr/main.py b/src/features/empatica_blood_volume_pulse/cr/main.py index daf3a079..0a78b41d 100644 --- a/src/features/empatica_blood_volume_pulse/cr/main.py +++ b/src/features/empatica_blood_volume_pulse/cr/main.py @@ -3,6 +3,7 @@ from scipy.stats import entropy from cr_features.helper_functions import convert_to2d, hrv_features, hrv_freq_features from cr_features.hrv import extract_hrv_features_2d_wrapper +from cr_features_helper_methods import get_sample_rate, 
extract_second_order_features import sys @@ -74,4 +75,9 @@ def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segmen bvp_intraday_features = extract_bvp_features_from_intraday_data(bvp_intraday_data, intraday_features_to_compute, requested_window_length, time_segment, filter_data_by_segment) + if calc_windows: + so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"] + bvp_second_order_features = extract_second_order_features(bvp_intraday_features, so_features_names) + return bvp_intraday_features, bvp_second_order_features + return bvp_intraday_features \ No newline at end of file diff --git a/src/features/empatica_electrodermal_activity/cr/main.py b/src/features/empatica_electrodermal_activity/cr/main.py index 7cb6e5b2..3e09b84d 100644 --- a/src/features/empatica_electrodermal_activity/cr/main.py +++ b/src/features/empatica_electrodermal_activity/cr/main.py @@ -3,6 +3,7 @@ from scipy.stats import entropy from cr_features.helper_functions import convert_to2d, gsr_features from cr_features.calculate_features import calculate_features +from cr_features_helper_methods import get_sample_rate, extract_second_order_features def getSampleRate(data): @@ -62,4 +63,9 @@ def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segmen eda_intraday_features = extract_eda_features_from_intraday_data(eda_intraday_data, intraday_features_to_compute, requested_window_length, time_segment, filter_data_by_segment) + if calc_windows: + so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"] + eda_second_order_features = extract_second_order_features(eda_intraday_features, so_features_names) + return eda_intraday_features, eda_second_order_features + return eda_intraday_features \ No newline at end of file diff --git a/src/features/empatica_inter_beat_interval/cr/main.py b/src/features/empatica_inter_beat_interval/cr/main.py index 8988f729..02c0cd94 100644 --- a/src/features/empatica_inter_beat_interval/cr/main.py +++ 
b/src/features/empatica_inter_beat_interval/cr/main.py @@ -3,6 +3,7 @@ import numpy as np from cr_features.helper_functions import convert_ibi_to2d_time, hrv_features, hrv_freq_features from cr_features.hrv import extract_hrv_features_2d_wrapper +from cr_features_helper_methods import get_sample_rate, extract_second_order_features import math import sys @@ -11,14 +12,6 @@ pd.set_option('display.max_rows', 1000) pd.set_option('display.max_columns', None) -def get_sample_rate(data): - try: - timestamps_diff = data['timestamp'].diff().dropna().mean() - except: - raise Exception("Error occured while trying to get the mean sample rate from the data.") - - return int(1000/timestamps_diff) - def extract_ibi_features_from_intraday_data(ibi_intraday_data, features, window_length, time_segment, filter_data_by_segment): ibi_intraday_features = pd.DataFrame(columns=["local_segment"] + features) @@ -30,13 +23,7 @@ def extract_ibi_features_from_intraday_data(ibi_intraday_data, features, window_ if not ibi_intraday_data.empty: ibi_intraday_features = pd.DataFrame() - # np.set_printoptions(threshold=sys.maxsize) - # print(ibi_intraday_data.groupby('local_segment').apply(lambda x: math.ceil(x['timings'].iloc[-1]))) - # nekaj = ibi_intraday_data.groupby('local_segment').apply(lambda x: \ - # convert_ibi_to2d_time(x[['timings', 'inter_beat_interval']], window_length)[1]) - - # sys.exit() - + # apply methods from calculate features module if window_length is None: ibi_intraday_features = \ @@ -82,5 +69,10 @@ def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segmen # extract features from intraday data ibi_intraday_features = extract_ibi_features_from_intraday_data(ibi_intraday_data, intraday_features_to_compute, requested_window_length, time_segment, filter_data_by_segment) + if calc_windows: + so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"] + ibi_second_order_features = extract_second_order_features(ibi_intraday_features, so_features_names) + 
return ibi_intraday_features, ibi_second_order_features + return ibi_intraday_features \ No newline at end of file diff --git a/src/features/empatica_temperature/cr/main.py b/src/features/empatica_temperature/cr/main.py index 78eaa952..7f556b32 100644 --- a/src/features/empatica_temperature/cr/main.py +++ b/src/features/empatica_temperature/cr/main.py @@ -3,6 +3,7 @@ from scipy.stats import entropy from cr_features.helper_functions import convert_to2d, generic_features from cr_features.calculate_features import calculate_features +from cr_features_helper_methods import get_sample_rate, extract_second_order_features import sys @@ -61,4 +62,10 @@ def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segmen # extract features from intraday data temperature_intraday_features = extract_temp_features_from_intraday_data(temperature_intraday_data, intraday_features_to_compute, requested_window_length, time_segment, filter_data_by_segment) + + if calc_windows: + so_features_names = provider["WINDOWS"]["SECOND_ORDER_FEATURES"] + temperature_second_order_features = extract_second_order_features(temperature_intraday_features, so_features_names) + return temperature_intraday_features, temperature_second_order_features + return temperature_intraday_features \ No newline at end of file diff --git a/src/features/entry.py b/src/features/entry.py index f7127f35..26aa1adc 100644 --- a/src/features/entry.py +++ b/src/features/entry.py @@ -22,13 +22,17 @@ else: del sensor_data_files["time_segments_labels"] time_segments_file = snakemake.input["time_segments_labels"] - sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=calc_windows) - # Calculation over multiple windows in case of Empatica's CR-features if calc_windows: - sensor_features.to_csv(snakemake.output[1], index=False) - sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, 
time_segments_file, calc_windows=False) + first_order_features, second_order_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=calc_windows) + first_order_features.to_csv(snakemake.output[1], index=False) + second_order_features.to_csv(snakemake.output[2], index=False) + calc_windows = False elif "empatica" in sensor_key: pd.DataFrame().to_csv(snakemake.output[1], index=False) + pd.DataFrame().to_csv(snakemake.output[2], index=False) + + + sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, time_segments_file, calc_windows=calc_windows) sensor_features.to_csv(snakemake.output[0], index=False) diff --git a/src/features/utils/utils.py b/src/features/utils/utils.py index 063afc9d..832620d6 100644 --- a/src/features/utils/utils.py +++ b/src/features/utils/utils.py @@ -93,6 +93,8 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file from importlib import import_module, util sensor_features = pd.DataFrame(columns=["local_segment"]) + sensor_fo_features = pd.DataFrame(columns=["local_segment"]) + sensor_so_features = pd.DataFrame(columns=["local_segment"]) time_segments_labels = pd.read_csv(time_segments_file, header=0) if "FEATURES" not in provider: raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key.upper())) @@ -106,23 +108,57 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file time_segments_labels["label"] = [""] for time_segment in time_segments_labels["label"]: print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment)) + features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes, calc_windows=calc_windows) - if not "local_segment" in features.columns: - raise ValueError("The 
dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)") - features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns] - sensor_features = pd.concat([sensor_features, features], axis=0, sort=False) + + # In case of calc_window = True + if isinstance(features, tuple): + if not "local_segment" in features[0].columns or not "local_segment" in features[1].columns: + raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 
2020-01-01, 2020-01-02, 2020-01-03, etc.)") + features[0].columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features[0].columns] + features[1].columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features[1].columns] + if not features[0].empty: + sensor_fo_features = pd.concat([sensor_fo_features, features[0]], axis=0, sort=False) # append to the running first-order table so earlier time segments are kept + if not features[1].empty: + sensor_so_features = pd.concat([sensor_so_features, features[1]], axis=0, sort=False) # append to the running second-order table so earlier time segments are kept + else: + if not "local_segment" in features.columns: + raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 
2020-01-01, 2020-01-02, 2020-01-03, etc.)") + features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns] + sensor_features = pd.concat([sensor_features, features], axis=0, sort=False) else: for feature in provider["FEATURES"]: sensor_features[feature] = None - segment_colums = pd.DataFrame() - sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '') - split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True) - new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"]) - segment_colums[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns - for i in range(segment_colums.shape[1]): - sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]]) + + if calc_windows: + segment_colums = pd.DataFrame() + sensor_fo_features['local_segment'] = sensor_fo_features['local_segment'].str.replace(r'_RR\d+SS', '') + split_segemnt_columns = sensor_fo_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True) + new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"]) + segment_colums[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns + for i in range(segment_colums.shape[1]): + sensor_fo_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]]) + + segment_colums = pd.DataFrame() + sensor_so_features['local_segment'] = sensor_so_features['local_segment'].str.replace(r'_RR\d+SS', '') + split_segemnt_columns = 
sensor_so_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True) + new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"]) + segment_colums[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns + for i in range(segment_colums.shape[1]): + sensor_so_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]]) - return sensor_features + return sensor_fo_features, sensor_so_features + + else: + segment_colums = pd.DataFrame() + sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '') + split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True) + new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"]) + segment_colums[["local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]] = new_segment_columns + for i in range(segment_colums.shape[1]): + sensor_features.insert(1 + i, segment_colums.columns[i], segment_colums[segment_colums.columns[i]]) + + return sensor_features def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files): from importlib import import_module, util