From d9a574c550f8beb1cf5662e164ffa66048106dc9 Mon Sep 17 00:00:00 2001 From: Primoz Date: Fri, 23 Sep 2022 13:24:50 +0000 Subject: [PATCH] Changes in the cleaning script and preparation of empatica data yield method. --- .../all_cleaning_individual/straw/main.py | 60 ++++++++++--------- src/features/empatica_data_yield.py | 29 +++++++++ 2 files changed, 61 insertions(+), 28 deletions(-) create mode 100644 src/features/empatica_data_yield.py diff --git a/src/features/all_cleaning_individual/straw/main.py b/src/features/all_cleaning_individual/straw/main.py index 0c41676f..e72b5637 100644 --- a/src/features/all_cleaning_individual/straw/main.py +++ b/src/features/all_cleaning_individual/straw/main.py @@ -25,9 +25,6 @@ def straw_cleaning(sensor_data_files, provider): target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True) - # TODO: add conditions that differentiates cleaning steps for standardized and nonstandardized features, for this - # the snakemake rules will also have to come with additional parameter (in rules/features.smk) - # TODO: imputate the rows where the participants have at least 2 rows (2 time segments) - error prevention (has to be tested) # TODO: because of different imputation logic (e.g., the phone_data_yield parameter for phone features) the imputation has to # be planned accordingly. Should the phone features first be imputated with 0 and only then general kNN imputation is executed @@ -38,12 +35,29 @@ def straw_cleaning(sensor_data_files, provider): # values still exist. # (2) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows) - # TODO: determine the threshold at which the column should be removed because of too many Nans. 
features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]] # (3.1) QUALITY CHECK (DATA YIELD COLUMN) which determines if the row stays or not (if either E4 or phone is low quality the row is useless - TODO: determine threshold) # Here, the imputation is still not executed - only quality check + + # ??? Drop rows with the value of phone_data_yield_column less than data_yield_ratio_threshold ??? + phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower() + phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit + + empatica_data_yield_column = "????????????" + + if not phone_data_yield_column in features.columns and not empatica_data_yield_column in features.columns: + raise KeyError(f"RAPIDS provider needs to clean the selected event features based on {phone_data_yield_column} column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{data_yield_unit}' in [FEATURES].") + + if provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]: + features = features[features[phone_data_yield_column] >= provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]] + + # Potrebno premisliti točno kako bi izgledal data_yield za E4: bo se ustvaril dodaten stolpec; bodo različne spremenljivke, podobno kot hour in minute pri phone? + if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]: + features = features[features['???????????????'] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]] + # ---> imputation ?? 
+ impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"] if True: #impute_phone_features["COMPUTE"]: @@ -64,22 +78,20 @@ def straw_cleaning(sensor_data_files, provider): features.loc[mask, phone_cols] = impute(features[mask][phone_cols], method=impute_phone_features["TYPE"].lower()) print(features[features['phone_data_yield_rapids_ratiovalidyieldedminutes'] > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE']][phone_cols]) - - # ??? Drop rows with the value of data_yield_column less than data_yield_ratio_threshold ??? - data_yield_unit = provider["DATA_YIELD_FEATURE"].split("_")[3].lower() - data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + data_yield_unit - - if not data_yield_column in features.columns: - raise KeyError(f"RAPIDS provider needs to impute the selected event features based on {data_yield_column} column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{data_yield_unit}' in [FEATURES].") - - if provider["DATA_YIELD_RATIO_THRESHOLD"]: - features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]] # (3.2) (optional) DOES ROW CONSIST OF ENOUGH NON-NAN VALUES? Possible some of these examples could still pass previous condition but not this one? 
+ # () Remove rows if threshold of NaN values is passed + min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row + features.dropna(axis=0, thresh=min_count, inplace=True) - # (4) IMPUTATION: IMPUTE DATA WITH KNN METHOD + # (4) IMPUTATION: IMPUTE DATA WITH KNN METHOD (TODO: for now only kNN) # - no other input restriction for this method except that rows are full enough and have reasonably high quality as assessed by data yield + graph_bf_af(features, "before_knn") + impute_cols = [col for col in features.columns if col not in excluded_columns] + features[impute_cols] = impute(features[impute_cols], method="knn") + + graph_bf_af(features, "after_knn") # (5) REMOVE COLS WHERE VARIANCE IS 0 if provider["COLS_VAR_THRESHOLD"]: @@ -105,25 +117,12 @@ def straw_cleaning(sensor_data_files, provider): features.drop(to_drop, axis=1, inplace=True) - # (7) Remove rows if threshold of NaN values is passed - min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row - features.dropna(axis=0, thresh=min_count, inplace=True) - - - sns.set(rc={"figure.figsize":(16, 8)}) - sns.heatmap(features.isna(), cbar=False) - plt.savefig(f'features_nans_bf_knn.png', bbox_inches='tight') - ## (8) STANDARDIZATION if provider["STANDARDIZATION"]: features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)]) - sns.set(rc={"figure.figsize":(16, 8)}) - sns.heatmap(features.isna(), cbar=False) - plt.savefig(f'features_nans_af_knn.png', bbox_inches='tight') - # (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME if features.isna().any().any(): raise ValueError @@ -132,6 +131,11 @@ def straw_cleaning(sensor_data_files, provider): return features +def graph_bf_af(features, phase_name): + sns.set(rc={"figure.figsize":(16, 8)}) + sns.heatmap(features.isna(), cbar=False) + 
plt.savefig(f'features_nans_{phase_name}.png', bbox_inches='tight')
+
 def impute(df, method='zero'):
 
     def k_nearest(df):
diff --git a/src/features/empatica_data_yield.py b/src/features/empatica_data_yield.py
new file mode 100644
index 00000000..517113ef
--- /dev/null
+++ b/src/features/empatica_data_yield.py
@@ -0,0 +1,50 @@
+import pandas as pd
+import numpy as np
+from datetime import datetime
+
+import sys
+
+# Output column -> (windows-count column, multiplier applied to that count).
+# The multiplier is presumably the sensor's window length in seconds - TODO confirm.
+EMPATICA_WINDOWS = {
+    'acc_data_yield': ('empatica_accelerometer_cr_SO_windowsCount', 15),
+    'temp_data_yield': ('empatica_temperature_cr_SO_windowsCount', 300),
+    'eda_data_yield': ('empatica_electrodermal_activity_cr_SO_windowsCount', 60),
+    'ibi_data_yield': ('empatica_inter_beat_interval_cr_SO_windowsCount', 300),
+}
+
+
+def calculate_empatica_data_yield(features):
+    """Return the per-sensor Empatica data yield for every row of `features`.
+
+    Each yield is the sensor's windows count times its multiplier from
+    EMPATICA_WINDOWS, divided by the row's time segment duration in seconds.
+
+    Args:
+        features: DataFrame holding `local_segment_start_datetime`,
+            `local_segment_end_datetime` and the windows-count columns named in
+            EMPATICA_WINDOWS.
+
+    Returns:
+        DataFrame indexed like `features` with the columns `acc_data_yield`,
+        `temp_data_yield`, `eda_data_yield` and `ibi_data_yield`.
+
+    Raises:
+        KeyError: if a required column is missing from `features`.
+    """
+    # Durations are computed per row, so no index label 0 is required and segments
+    # of different lengths are handled. pd.to_datetime infers the timestamp format,
+    # so timestamps with four-digit years are parsed as well.
+    segment_start = pd.to_datetime(features['local_segment_start_datetime'])
+    segment_end = pd.to_datetime(features['local_segment_end_datetime'])
+    tseg_duration = (segment_end - segment_start).dt.total_seconds()
+
+    # TODO: it may make sense to handle the sensors separately -> IBI may be of poor quality while the others are fine. Why discard the whole row just because of IBI ...
+    # an overall Empatica quality could also be computed, e.g. the mean of all data_yield results? Or a mean weighted by the number of columns each sensor contributes
+    # ... although the number of columns is not the best weight, since the importance of some features (e.g. EDA) is very questionable.
+    # TODO: better setting of the working hours; currently it is from 4 to 4 ... this causes a lot of missing data and consequently a low (phone and E4) data_yield ...
+
+    return pd.DataFrame({
+        name: (features[count_column] * window_seconds) / tseg_duration
+        for name, (count_column, window_seconds) in EMPATICA_WINDOWS.items()
+    })