import math
import sys

import numpy as np
import pandas as pd

# Column-name prefixes that select the phone event features to be imputed.
_PHONE_FEATURE_PREFIXES = (
    'phone_applications_foreground_rapids_',
    'phone_battery_rapids_',
    'phone_calls_rapids_',
    'phone_keyboard_rapids_',
    'phone_messages_rapids_',
    'phone_screen_rapids_',
    'phone_wifi_',
)


def straw_cleaning(sensor_data_files, provider):
    """Clean the sensor feature table according to the provider settings.

    Steps, in order:
      1. optionally impute the selected phone event features on rows whose
         ``phone_data_yield_rapids_ratiovalidyieldedminutes`` exceeds
         ``MIN_DATA_YIELDED_MINUTES_TO_IMPUTE``;
      2. drop rows whose data-yield ratio is below ``DATA_YIELD_RATIO_THRESHOLD``;
      3. drop columns whose NaN count reaches ``COLS_NAN_THRESHOLD`` * n_rows;
      4. optionally drop zero-variance columns;
      5. optionally drop highly (Spearman) correlated numeric columns;
      6. drop rows that have too few non-NaN values (``ROWS_NAN_THRESHOLD``).

    Args:
        sensor_data_files: mapping whose ``["sensor_data"][0]`` entry is passed
            to ``pandas.read_csv`` (a path or a file-like object).
        provider: mapping with the keys ``IMPUTE_PHONE_SELECTED_EVENT_FEATURES``,
            ``DATA_YIELD_FEATURE``, ``DATA_YIELD_RATIO_THRESHOLD``,
            ``COLS_NAN_THRESHOLD``, ``COLS_VAR_THRESHOLD``,
            ``DROP_HIGHLY_CORRELATED_FEATURES`` and ``ROWS_NAN_THRESHOLD``.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        KeyError: if a required data-yield column is missing from the features.
    """
    features = pd.read_csv(sensor_data_files["sensor_data"][0])

    # TODO: reorder the cleaning steps so it makes sense for the analysis
    # TODO: add conditions that differentiates cleaning steps for standardized and nonstandardized features, for this
    # the snakemake rules will also have to come with additional parameter (in rules/features.smk)

    # Impute selected features event
    impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]
    if impute_phone_features["COMPUTE"]:
        if 'phone_data_yield_rapids_ratiovalidyieldedminutes' not in features.columns:
            raise KeyError(
                "RAPIDS provider needs to impute the selected event features based on "
                "phone_data_yield_rapids_ratiovalidyieldedminutes column, please set "
                "config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include "
                "'ratiovalidyieldedminutes' in [FEATURES]."
            )

        # TODO: if the type of the imputation will vary for different groups of features make conditional imputations here
        phone_cols = [col for col in features if col.startswith(_PHONE_FEATURE_PREFIXES)]

        # Only rows with enough yielded minutes are imputed; the rest keep their NaNs.
        mask = (
            features['phone_data_yield_rapids_ratiovalidyieldedminutes']
            > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE']
        )
        features.loc[mask, phone_cols] = impute(
            features.loc[mask, phone_cols], method=impute_phone_features["TYPE"]
        )

    # Drop rows with the value of data_yield_column less than data_yield_ratio_threshold
    # The unit is the 4th "_"-separated token of DATA_YIELD_FEATURE, lower-cased.
    data_yield_unit = provider["DATA_YIELD_FEATURE"].split("_")[3].lower()
    data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + data_yield_unit

    if data_yield_column not in features.columns:
        raise KeyError(
            f"RAPIDS provider needs to impute the selected event features based on {data_yield_column} column, "
            f"please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include "
            f"'ratiovalidyielded{data_yield_unit}' in [FEATURES]."
        )

    features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]]

    # Remove cols if threshold of NaN values is passed
    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]

    # Remove cols where variance is 0
    if provider["COLS_VAR_THRESHOLD"]:
        # numeric_only=True: non-numeric columns have no std and would raise on pandas >= 2.0.
        col_std = features.std(numeric_only=True)
        features = features.drop(columns=col_std[col_std == 0].index)

    # Drop highly correlated features
    # TODO: one more threshold variable that lives in the config + how should NaNs be treated?
    drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
    if drop_corr_features["COMPUTE"]:
        numeric = features.select_dtypes(include=np.number)

        # Remove columns where NaN count threshold is passed
        nan_limit = drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * numeric.shape[0]
        valid_features = numeric.loc[:, numeric.isna().sum() < nan_limit]

        cor_matrix = valid_features.corr(method='spearman').abs()
        # Keep only the strict upper triangle so every pair is inspected once and
        # the later column of a correlated pair is the one that gets dropped.
        # Builtin bool is used because the np.bool alias was removed in NumPy 1.24.
        upper_mask = np.triu(np.ones(cor_matrix.shape), k=1).astype(bool)
        upper_tri = cor_matrix.where(upper_mask)
        corr_threshold = drop_corr_features["CORR_THRESHOLD"]
        to_drop = [column for column in upper_tri.columns if (upper_tri[column] > corr_threshold).any()]

        features = features.drop(columns=to_drop)

    # Remove rows if threshold of NaN values is passed
    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1])  # min not nan values in row
    features = features.dropna(axis=0, thresh=min_count)

    return features


def impute(df, method='zero'):
    """Fill the NaN values of ``df`` using the selected method.

    Args:
        df: ``pandas.DataFrame`` to impute.
        method: one of ``'zero'``, ``'mean'``, ``'median'`` or ``'k-nearest'``.

    Returns:
        A new DataFrame with NaNs filled. ``'k-nearest'`` is not implemented
        yet and returns ``None``.

    Raises:
        KeyError: if ``method`` is not one of the supported names.
    """

    def k_nearest(df):
        # TODO: if needed implement k-nearest imputation / interpolation
        # NOTE(review): currently a placeholder that returns None.
        pass

    # Strategies are wrapped in callables so only the requested one is computed
    # (e.g. the column means are not calculated when method == 'zero').
    strategies = {
        'zero': lambda: df.fillna(0),
        'mean': lambda: df.fillna(df.mean()),
        'median': lambda: df.fillna(df.median()),
        'k-nearest': lambda: k_nearest(df),
    }
    return strategies[method]()