From a34412a18dbd75aafa9f3bba9303ab6a962cec03 Mon Sep 17 00:00:00 2001
From: Primoz
Date: Wed, 5 Oct 2022 14:16:55 +0000
Subject: [PATCH] E4 data yield corrections. Changes in overall cleaning script
 - standardization.

---
 config.yaml                                  |   2 +-
 .../all_cleaning_overall/straw/main.py       | 106 ++++++++++--------
 src/features/empatica_data_yield.py          |  13 ++-
 3 files changed, 68 insertions(+), 53 deletions(-)

diff --git a/config.yaml b/config.yaml
index 23d7e82b..e50a5565 100644
--- a/config.yaml
+++ b/config.yaml
@@ -680,7 +680,7 @@ ALL_CLEANING_INDIVIDUAL:
 ALL_CLEANING_OVERALL:
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       IMPUTE_SELECTED_EVENT_FEATURES:
         COMPUTE: False
         MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
diff --git a/src/features/all_cleaning_overall/straw/main.py b/src/features/all_cleaning_overall/straw/main.py
index b07f9403..dba7c9fd 100644
--- a/src/features/all_cleaning_overall/straw/main.py
+++ b/src/features/all_cleaning_overall/straw/main.py
@@ -1,10 +1,9 @@
 import pandas as pd
 import numpy as np
-import math, sys, random
-import yaml
+import math, sys, random, warnings, yaml
 from sklearn.impute import KNNImputer
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import StandardScaler, minmax_scale
 import matplotlib.pyplot as plt
 import seaborn as sns
@@ -31,7 +30,7 @@ def straw_cleaning(sensor_data_files, provider):
 
     graph_bf_af(features, "2target_rows_after")
 
-    # (2.1) QUALITY CHECK (DATA YIELD COLUMN) deletes the rows where E4 or phone data is low quality
+    # (2) QUALITY CHECK (DATA YIELD COLUMN) drops the rows where E4 or phone data is low quality
     phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
     phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit
@@ -43,42 +42,26 @@ def straw_cleaning(sensor_data_files, provider):
     hist = features[["empatica_data_yield", phone_data_yield_column]].hist()
     plt.legend()
     plt.savefig(f'phone_E4_histogram.png', bbox_inches='tight')
 
+    # Drop rows where phone data yield is less than the given threshold
     if provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]:
         print("\nThreshold:", provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"])
         print("Phone features data yield stats:", features[phone_data_yield_column].describe(), "\n")
-        print(features[phone_data_yield_column].sort_values())
+        # print(features[phone_data_yield_column].sort_values())
         hist = features[phone_data_yield_column].hist(bins=5)
+        plt.close()
         features = features[features[phone_data_yield_column] >= provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)
 
     # Drop rows where empatica data yield is less then given threshold
     if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
         print("\nThreshold:", provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"])
         print("E4 features data yield stats:", features["empatica_data_yield"].describe(), "\n")
-        print(features["empatica_data_yield"].sort_values())
+        # print(features["empatica_data_yield"].sort_values())
         features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)
 
     graph_bf_af(features, "3data_yield_drop_rows")
-
-    # (2.2) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
-    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row
-    features.dropna(axis=0, thresh=min_count, inplace=True) # Thresh => at least this many not-nans
-    graph_bf_af(features, "4too_much_nans_rows")
-
-    # (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all NaN columns must be preserved - this solution now drops columns with all NaN rows)
-    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns
-
-    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
-
-    graph_bf_af(features, "5too_much_nans_cols")
-
-    # Preserve esm cols if deleted (has to come after drop cols operations)
-    for esm in esm_cols:
-        if esm not in features:
-            features[esm] = esm_cols[esm]
-
-    # (4) CONTEXTUAL IMPUTATION
+    # (3) CONTEXTUAL IMPUTATION
 
     # Impute selected phone features with a high number
     impute_w_hn = [col for col in features.columns if \
@@ -91,7 +74,7 @@ def straw_cleaning(sensor_data_files, provider):
                     "timelastmessages" in col]
     features[impute_w_hn] = impute(features[impute_w_hn], method="high_number")
 
-    graph_bf_af(features, "6high_number_imp")
+    graph_bf_af(features, "4high_number_imp")
 
     # Impute special case (mostcommonactivity) and (homelabel)
     impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
@@ -112,7 +95,7 @@ def straw_cleaning(sensor_data_files, provider):
                    col.startswith('phone_wifi_visible')]
     features[impute_zero] = impute(features[impute_zero], method="zero")
 
-    graph_bf_af(features, "7zero_imp")
+    graph_bf_af(features, "5zero_imp")
 
     # Impute phone locations with median - should this rather be imputed at kNN step??
     # impute_locations = [col for col in features.columns if "phone_locations_" in col]
@@ -127,30 +110,53 @@ def straw_cleaning(sensor_data_files, provider):
     # graph_bf_af(features[impute_locations], "phoneloc_before")
     # features[impute_locations] = features[impute_locations + ["pid"]].groupby("pid").transform(lambda x: x.fillna(x.median()))[impute_locations]
+
+    # (4) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all-NaN columns must be preserved - this solution currently drops columns that are entirely NaN)
+    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns
+    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]
 
-    # (5) STANDARDIZATION
-    if provider["STANDARDIZATION"]:
-        features.loc[:, ~features.columns.isin(excluded_columns + ["pid"])] = \
-            features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: 0 if (x.std() == 0) else (x - x.mean()) / x.std())
+    graph_bf_af(features, "6too_much_nans_cols")
 
-    graph_bf_af(features, "8standardization")
-
-    # (6) IMPUTATION: IMPUTE DATA WITH KNN METHOD
-    impute_cols = [col for col in features.columns if col not in excluded_columns and col != "pid"]
-    features[impute_cols] = impute(features[impute_cols], method="knn")
-
-    graph_bf_af(features, "9knn_after")
-
-    # (7) REMOVE COLS WHERE VARIANCE IS 0
-    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]
+    # (5) REMOVE COLS WHERE VARIANCE IS 0
     if provider["COLS_VAR_THRESHOLD"]:
         features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
 
-    graph_bf_af(features, "10variance_drop")
+    graph_bf_af(features, "7variance_drop")
+
+    # Preserve esm cols if deleted (has to come after drop cols operations)
+    for esm in esm_cols:
+        if esm not in features:
+            features[esm] = esm_cols[esm]
+
+    # (6) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
+    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1]) # minimal not nan values in row
+    features.dropna(axis=0, thresh=min_count, inplace=True) # Thresh => at least this many not-nans
+
+    graph_bf_af(features, "8too_much_nans_rows")
+
+    # (7) STANDARDIZATION
+    # I expect to see RuntimeWarnings in this block
+    if provider["STANDARDIZATION"]:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", category=RuntimeWarning)
+            features.loc[:, ~features.columns.isin(excluded_columns + ["pid"])] = \
+                features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: minmax_scale(x.astype(float)))
+
+    graph_bf_af(features, "9standardization")
+
+    # (8) IMPUTATION: IMPUTE DATA WITH KNN METHOD
+    features.reset_index(drop=True, inplace=True)
+    impute_cols = [col for col in features.columns if col not in excluded_columns and col != "pid"]
+    features[impute_cols] = impute(features[impute_cols], method="knn")
+
+    graph_bf_af(features, "10knn_after")
+
+
+    # (9) DROP HIGHLY CORRELATED FEATURES
+    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]
-    # (8) DROP HIGHLY CORRELATED FEATURES
 
     drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
     if drop_corr_features["COMPUTE"] and features.shape[0] > 5: # If small amount of segments (rows) is present, do not execute correlation check
@@ -163,6 +169,11 @@ def straw_cleaning(sensor_data_files, provider):
         upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
         to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]
 
+        sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)
+        plt.savefig(f'correlation_matrix.png', bbox_inches='tight')
+        plt.close()
+        # TODO: which features get dropped - do any E4:PHONE pairs correlate?
+
         features.drop(to_drop, axis=1, inplace=True)
 
     # Preserve esm cols if deleted (has to come after drop cols operations)
@@ -172,10 +183,11 @@ def straw_cleaning(sensor_data_files, provider):
 
     graph_bf_af(features, "11correlation_drop")
 
-    # (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
+    # (10) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
     if features.isna().any().any():
-        raise ValueError
+        raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")
+        sys.exit()
 
     return features
 
 def impute(df, method='zero'):
@@ -192,7 +204,7 @@ def impute(df, method='zero'):
         'knn': k_nearest(df)
     }[method]
 
-def graph_bf_af(features, phase_name, plt_flag=False):
+def graph_bf_af(features, phase_name, plt_flag=True):
     if plt_flag:
         sns.set(rc={"figure.figsize":(16, 8)})
         sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number)
@@ -201,5 +213,5 @@ def graph_bf_af(features, phase_name, plt_flag=False):
     print(f"\n-------------{phase_name}-------------")
     print("Rows number:", features.shape[0])
     print("Columns number:", len(features.columns))
+    print("NaN values:", features.isna().sum().sum())
     print("---------------------------------------------\n")
-
diff --git a/src/features/empatica_data_yield.py b/src/features/empatica_data_yield.py
index bd691a12..1c8681c9 100644
--- a/src/features/empatica_data_yield.py
+++ b/src/features/empatica_data_yield.py
@@ -10,12 +10,15 @@ def calculate_empatica_data_yield(features):
     datetime_end = datetime.strptime(features.loc[0, 'local_segment_end_datetime'], '%Y-%m-%d %H:%M:%S')
     tseg_duration = (datetime_end - datetime_start).total_seconds()
 
-    features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration
-    features["temp_data_yield"] = (features['empatica_temperature_cr_SO_windowsCount'] * 300) / tseg_duration
-    features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration
-    features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration
+    features["acc_data_yield"] = (features['empatica_accelerometer_cr_SO_windowsCount'] * 15) / tseg_duration \
+        if 'empatica_accelerometer_cr_SO_windowsCount' in features else 0
+    features["temp_data_yield"] = (features['empatica_temperature_cr_SO_windowsCount'] * 300) / tseg_duration \
+        if 'empatica_temperature_cr_SO_windowsCount' in features else 0
+    features["eda_data_yield"] = (features['empatica_electrodermal_activity_cr_SO_windowsCount'] * 60) / tseg_duration \
+        if 'empatica_electrodermal_activity_cr_SO_windowsCount' in features else 0
+    features["ibi_data_yield"] = (features['empatica_inter_beat_interval_cr_SO_windowsCount'] * 300) / tseg_duration \
+        if 'empatica_inter_beat_interval_cr_SO_windowsCount' in features else 0
 
-    # TODO: a better setting of the working hours is needed - it is now from 4:00 to 4:00... this causes a lot of missing data and consequently a low (phone and E4) data_yield ...
     empatica_data_yield_cols = ['acc_data_yield', 'temp_data_yield', 'eda_data_yield', 'ibi_data_yield']
     features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1).fillna(0)
    features.drop(empatica_data_yield_cols, axis=1, inplace=True) # In case of if the advanced operations will later not be needed (e.g., weighted average)
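
For context on the reworked step (7) in all_cleaning_overall/straw/main.py: the sketch below (not part of the patch) shows the behaviour the change relies on, sklearn's minmax_scale applied per participant through groupby("pid").transform with RuntimeWarnings silenced. The participant ids, column names, and values are made up for illustration; the real script scales every non-excluded feature column.

import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import minmax_scale

# Illustrative data: two participants, one feature column that is all NaN for p02.
features = pd.DataFrame({
    "pid": ["p01", "p01", "p01", "p02", "p02"],
    "phone_screen_rapids_sumduration": [120.0, 300.0, 60.0, 45.0, 80.0],
    "empatica_temperature_cr_mean": [31.2, 30.4, 33.0, np.nan, np.nan],
})

scale_cols = [c for c in features.columns if c != "pid"]

# When a column is entirely NaN within one participant (p02's temperature here),
# np.nanmin/np.nanmax inside minmax_scale emit "All-NaN slice" RuntimeWarnings;
# the patch silences them with catch_warnings/simplefilter, mirrored below.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    features[scale_cols] = (
        features.groupby("pid")[scale_cols]
        .transform(lambda x: minmax_scale(x.astype(float)))
    )

print(features)  # each feature now lies in [0, 1] within each pid; NaNs stay NaN

Note that NaNs survive the scaling, which is why the kNN imputation in step (8) still runs afterwards.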
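
Step (8) dispatches to impute(..., method="knn"), whose k_nearest helper is not shown in this patch; it presumably wraps the KNNImputer that main.py already imports from sklearn. A small sketch of what such a call does, with illustrative column names and an assumed n_neighbors value:

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

df = pd.DataFrame({
    "phone_light_rapids_avglux": [12.0, np.nan, 80.0, 75.0],
    "empatica_temperature_cr_mean": [31.0, 30.5, np.nan, 32.0],
})

# Each NaN is replaced by the (uniform) average of that column taken over the
# n_neighbors rows that are closest in the remaining, non-missing columns.
imputer = KNNImputer(n_neighbors=2)
imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print(imputed)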
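
The empatica_data_yield.py change can be read as the following standalone computation: each sensor's yield is its windowsCount times the window length in seconds, divided by the segment duration, now falling back to 0 when the windowsCount column is absent, and the final empatica_data_yield is the row-wise mean. The input values and the one-hour segment below are invented for illustration.

import pandas as pd

# Illustrative input: only two of the four windowsCount columns are present.
features = pd.DataFrame({
    "empatica_accelerometer_cr_SO_windowsCount": [200, 150],
    "empatica_temperature_cr_SO_windowsCount": [10, 8],
})
tseg_duration = 3600  # segment duration in seconds (one hour)

# (yield column, source windowsCount column, seconds covered per window),
# matching the constants used in empatica_data_yield.py.
window_specs = [
    ("acc_data_yield", "empatica_accelerometer_cr_SO_windowsCount", 15),
    ("temp_data_yield", "empatica_temperature_cr_SO_windowsCount", 300),
    ("eda_data_yield", "empatica_electrodermal_activity_cr_SO_windowsCount", 60),
    ("ibi_data_yield", "empatica_inter_beat_interval_cr_SO_windowsCount", 300),
]

for yield_col, count_col, seconds_per_window in window_specs:
    features[yield_col] = (features[count_col] * seconds_per_window) / tseg_duration \
        if count_col in features else 0

yield_cols = [spec[0] for spec in window_specs]
features["empatica_data_yield"] = features[yield_cols].mean(axis=1).fillna(0)
print(features[["empatica_data_yield"]])  # about 0.417 and 0.323 for these made-up rows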