Standardization and correlation visualization in overall cleaning script.

notes
Primoz 2022-10-06 13:27:38 +00:00
parent a34412a18d
commit 1e38d9bf1e
1 changed file with 12 additions and 5 deletions

View File

@ -137,12 +137,13 @@ def straw_cleaning(sensor_data_files, provider):
graph_bf_af(features, "8too_much_nans_rows")
# (7) STANDARDIZATION
# I expect to see RuntimeWarnings in this block
if provider["STANDARDIZATION"]:
# Expected warning within this code block
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=RuntimeWarning)
features.loc[:, ~features.columns.isin(excluded_columns + ["pid"])] = \
features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: minmax_scale(x.astype(float)))
features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: StandardScaler().fit_transform(x.values[:,np.newaxis]).ravel())
graph_bf_af(features, "9standardization")
@ -169,10 +170,16 @@ def straw_cleaning(sensor_data_files, provider):
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]
sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)
sns.heatmap(corr_matrix, cmap="YlGnBu")
plt.savefig(f'correlation_matrix.png', bbox_inches='tight')
plt.close()
# TODO: which features get dropped - do any E4:PHONE pairs correlate with each other?
s = corr_matrix.unstack()
so = s.sort_values(ascending=False)
pd.set_option('display.max_rows', None)
sorted_upper = upper.unstack().sort_values(ascending=False)
# print(sorted_upper[sorted_upper > drop_corr_features["CORR_THRESHOLD"]])
features.drop(to_drop, axis=1, inplace=True)
@ -204,7 +211,7 @@ def impute(df, method='zero'):
'knn': k_nearest(df)
}[method]
def graph_bf_af(features, phase_name, plt_flag=True):
def graph_bf_af(features, phase_name, plt_flag=False):
if plt_flag:
sns.set(rc={"figure.figsize":(16, 8)})
sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number)