Standardization and correlation visualization in overall cleaning script.
parent
a34412a18d
commit
1e38d9bf1e
|
@ -137,12 +137,13 @@ def straw_cleaning(sensor_data_files, provider):
|
|||
graph_bf_af(features, "8too_much_nans_rows")
|
||||
|
||||
# (7) STANDARDIZATION
|
||||
# I expect to see RuntimeWarnings in this block
|
||||
|
||||
if provider["STANDARDIZATION"]:
|
||||
# Expected warning within this code block
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", category=RuntimeWarning)
|
||||
features.loc[:, ~features.columns.isin(excluded_columns + ["pid"])] = \
|
||||
features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: minmax_scale(x.astype(float)))
|
||||
features.loc[:, ~features.columns.isin(excluded_columns)].groupby('pid').transform(lambda x: StandardScaler().fit_transform(x.values[:,np.newaxis]).ravel())
|
||||
|
||||
graph_bf_af(features, "9standardization")
|
||||
|
||||
|
@ -169,10 +170,16 @@ def straw_cleaning(sensor_data_files, provider):
|
|||
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
|
||||
to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]
|
||||
|
||||
sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)
|
||||
sns.heatmap(corr_matrix, cmap="YlGnBu")
|
||||
plt.savefig(f'correlation_matrix.png', bbox_inches='tight')
|
||||
plt.close()
|
||||
# TODO: which features get dropped - do any E4:PHONE pairs correlate?
|
||||
|
||||
s = corr_matrix.unstack()
|
||||
so = s.sort_values(ascending=False)
|
||||
|
||||
pd.set_option('display.max_rows', None)
|
||||
sorted_upper = upper.unstack().sort_values(ascending=False)
|
||||
# print(sorted_upper[sorted_upper > drop_corr_features["CORR_THRESHOLD"]])
|
||||
|
||||
features.drop(to_drop, axis=1, inplace=True)
|
||||
|
||||
|
@ -204,7 +211,7 @@ def impute(df, method='zero'):
|
|||
'knn': k_nearest(df)
|
||||
}[method]
|
||||
|
||||
def graph_bf_af(features, phase_name, plt_flag=True):
|
||||
def graph_bf_af(features, phase_name, plt_flag=False):
|
||||
if plt_flag:
|
||||
sns.set(rc={"figure.figsize":(16, 8)})
|
||||
sns.heatmap(features.isna(), cbar=False) #features.select_dtypes(include=np.number)
|
||||
|
|
Loading…
Reference in New Issue