From 1e38d9bf1e4d1caa10ff8c5245a59a9b20d4d7f9 Mon Sep 17 00:00:00 2001 From: Primoz Date: Thu, 6 Oct 2022 13:27:38 +0000 Subject: [PATCH] Standardization and correlation visualization in overall cleaning script. --- src/features/all_cleaning_overall/straw/main.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/features/all_cleaning_overall/straw/main.py b/src/features/all_cleaning_overall/straw/main.py index dba7c9fd..9324424d 100644 --- a/src/features/all_cleaning_overall/straw/main.py +++ b/src/features/all_cleaning_overall/straw/main.py @@ -137,12 +137,13 @@ def straw_cleaning(sensor_data_files, provider): graph_bf_af(features, "8too_much_nans_rows") # (7) STANDARDIZATION - # I expect to see RuntimeWarnings in this block + if provider["STANDARDIZATION"]: + # Expected warning within this code block with warnings.catch_warnings(): warnings.simplefilter("ignore", category=RuntimeWarning) features.loc[:, ~features.columns.isin(excluded_columns + ["pid"])] = \ - features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: minmax_scale(x.astype(float))) + features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: StandardScaler().fit_transform(x.values[:,np.newaxis]).ravel()) graph_bf_af(features, "9standardization") @@ -169,10 +170,16 @@ def straw_cleaning(sensor_data_files, provider): upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)) to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])] - sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True) + sns.heatmap(corr_matrix, cmap="YlGnBu") plt.savefig(f'correlation_matrix.png', bbox_inches='tight') plt.close() - # TODO: katere značilke se izbrišejo - ali korelirajo kakšni pari E4:PHONE? + + s = corr_matrix.unstack() + so = s.sort_values(ascending=False) + + pd.set_option('display.max_rows', None) + sorted_upper = upper.unstack().sort_values(ascending=False) + # print(sorted_upper[sorted_upper > drop_corr_features["CORR_THRESHOLD"]]) features.drop(to_drop, axis=1, inplace=True) @@ -204,7 +211,7 @@ def impute(df, method='zero'): 'knn': k_nearest(df) }[method] -def graph_bf_af(features, phase_name, plt_flag=True): +def graph_bf_af(features, phase_name, plt_flag=False): if plt_flag: sns.set(rc={"figure.figsize":(16, 8)}) sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number)