import math

import pandas as pd
import numpy as np
import yaml

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

from src.features import empatica_data_yield as edy


def straw_cleaning(sensor_data_files, provider):

    features = pd.read_csv(sensor_data_files["sensor_data"][0])

    with open('config.yaml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']

    # (1) FILTER OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
    if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
        target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL']  # get target label from config
        features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)

    # Get target (esm) columns. Captured after the row filter above so that the index
    # stays aligned when these columns are restored further down.
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]

    # (2) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (use <= if even all-NaN columns
    # must be preserved; as written, this drops columns whose values are all NaN)
    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]

    # (3.1) QUALITY CHECK (DATA YIELD COLUMNS), which determines whether a row stays: if
    # either the E4 or the phone yield is low, the row is useless (TODO: determine threshold)
    phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
    phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit

    features = edy.calculate_empatica_data_yield(features)

    if phone_data_yield_column not in features.columns and "empatica_data_yield" not in features.columns:
        raise KeyError(f"RAPIDS provider needs to clean the selected event features based on the {phone_data_yield_column} column. Please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{phone_data_yield_unit}' in [FEATURES].")

    if provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]:
        features = features[features[phone_data_yield_column] >= provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]]

    if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
        features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]]
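    # Illustration (hypothetical numbers, not from any shipped config): with
    # PHONE_DATA_YIELD_RATIO_THRESHOLD = 0.5 and EMPATICA_DATA_YIELD_RATIO_THRESHOLD = 0.5,
    # a row with a phone yield of 0.4 is dropped even if its Empatica yield is 0.9;
    # the two quality gates are applied independently, and a row must pass both.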
    # ---> imputation? The block below is an earlier, disabled approach that imputed
    # selected phone features only for rows with enough yielded minutes; kept for reference.
    # impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]
    # if impute_phone_features["COMPUTE"]:
    #     if 'phone_data_yield_rapids_ratiovalidyieldedminutes' not in features.columns:
    #         raise KeyError("RAPIDS provider needs to impute the selected event features based on the phone_data_yield_rapids_ratiovalidyieldedminutes column. Please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES].")
    #
    #     phone_cols = [col for col in features if
    #                   col.startswith('phone_applications_foreground_rapids_') or
    #                   col.startswith('phone_battery_rapids_') or
    #                   col.startswith('phone_calls_rapids_') or
    #                   col.startswith('phone_keyboard_rapids_') or
    #                   col.startswith('phone_messages_rapids_') or
    #                   col.startswith('phone_screen_rapids_') or
    #                   col.startswith('phone_wifi_')]
    #
    #     mask = features['phone_data_yield_rapids_ratiovalidyieldedminutes'] > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE']
    #     features.loc[mask, phone_cols] = impute(features[mask][phone_cols], method=impute_phone_features["TYPE"].lower())

    # (3.2) (optional) DOES THE ROW CONSIST OF ENOUGH NON-NAN VALUES? A row can pass the
    # data yield check above and still fail this one.
    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1])  # minimal number of non-NaN values in a row
    features.dropna(axis=0, thresh=min_count, inplace=True)

    # (4) IMPUTATION: IMPUTE DATA WITH THE kNN METHOD (TODO: for now only kNN)
    # - no other input restriction for this method, except that rows are full enough and
    #   of reasonably high quality as assessed by data yield
    graph_bf_af(features, "before_knn")
    impute_cols = [col for col in features.columns if col not in excluded_columns]
    features[impute_cols] = impute(features[impute_cols], method="knn")
    graph_bf_af(features, "after_knn")

    # (5) REMOVE COLS WHERE VARIANCE IS 0
    if provider["COLS_VAR_THRESHOLD"]:
        feature_stds = features.std(numeric_only=True)
        features.drop(feature_stds[feature_stds == 0].index.values, axis=1, inplace=True)

    # Preserve esm cols if deleted (has to come after the drop-columns operations)
    for esm in esm_cols:
        if esm not in features:
            features[esm] = esm_cols[esm]

    # (6) DROP HIGHLY CORRELATED FEATURES
    drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
    if drop_corr_features["COMPUTE"]:

        numerical_cols = features.select_dtypes(include=np.number).columns.tolist()

        # Only correlate columns with enough non-NaN overlap
        valid_features = features[numerical_cols].loc[:, features[numerical_cols].isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * features[numerical_cols].shape[0]]

        # Keep the upper triangle of the absolute Spearman correlation matrix and drop
        # one column from every pair that correlates above the threshold
        cor_matrix = valid_features.corr(method='spearman').abs()
        upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > drop_corr_features["CORR_THRESHOLD"])]

        features.drop(to_drop, axis=1, inplace=True)

    # (7) STANDARDIZATION
    if provider["STANDARDIZATION"]:
        features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)])

    # (8) VERIFY THAT NO NANS ARE LEFT IN THE DATAFRAME
    if features.isna().any().any():
        raise ValueError("NaNs remain in the features dataframe after cleaning.")

    return features


def graph_bf_af(features, phase_name):
    """Save a heatmap of the NaN mask so missingness can be compared before/after imputation."""
    sns.set(rc={"figure.figsize": (16, 8)})
    sns.heatmap(features.isna(), cbar=False)
    plt.savefig(f'features_nans_{phase_name}.png', bbox_inches='tight')
    plt.close()  # release the figure so successive calls do not draw over each other

def impute(df, method='zero'):
    """Return a copy of df with NaNs filled using the requested strategy ('zero', 'mean', 'median', or 'knn')."""

    def k_nearest(df):
        imputer = KNNImputer(n_neighbors=3)
        # Preserve the original index: fit_transform returns a bare ndarray, and losing
        # the index would misalign the assignment back into the features dataframe
        return pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

    # Dispatch through lambdas so only the requested strategy is computed (a dict of
    # eagerly evaluated results would run the expensive kNN fit on every call)
    return {
        'zero': lambda: df.fillna(0),
        'mean': lambda: df.fillna(df.mean()),
        'median': lambda: df.fillna(df.median()),
        'knn': lambda: k_nearest(df),
    }[method]()
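
# A minimal, self-contained sketch of the impute() dispatcher on a toy dataframe.
# Everything below is illustrative only (hypothetical data, not part of the pipeline);
# straw_cleaning() itself expects the sensor CSVs, provider dict, and config.yaml that
# the surrounding pipeline supplies.
if __name__ == "__main__":
    toy = pd.DataFrame({
        'f1': [1.0, np.nan, 3.0, 4.0],
        'f2': [10.0, 20.0, np.nan, 40.0],
    })
    print(impute(toy, method='zero'))    # NaNs -> 0
    print(impute(toy, method='median'))  # NaNs -> column medians (3.0 and 20.0)
    print(impute(toy, method='knn'))     # NaNs -> mean over the 3 nearest rows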