Reorganisation and reordering of the cleaning script.

notes
Primoz 2022-09-12 13:44:17 +00:00
parent 15d792089d
commit d27a4a71c8
2 changed files with 33 additions and 13 deletions


@@ -688,7 +688,7 @@ ALL_CLEANING_INDIVIDUAL:
       COMPUTE: True
       IMPUTE_PHONE_SELECTED_EVENT_FEATURES:
         COMPUTE: False
-        TYPE: median # options: zero, mean, median or k-nearest
+        TYPE: zero # options: zero, mean, median or k-nearest
         MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
       COLS_NAN_THRESHOLD: 1 # set to 1 to remove only columns that contain all NaN
       COLS_VAR_THRESHOLD: True
@@ -723,7 +723,7 @@ ALL_CLEANING_OVERALL:
       COMPUTE: True
       IMPUTE_PHONE_SELECTED_EVENT_FEATURES:
         COMPUTE: False
-        TYPE: median # options: zero, mean, median or k-nearest
+        TYPE: zero # options: zero, mean, median or k-nearest
         MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
       COLS_NAN_THRESHOLD: 1 # set to 1 to remove only columns that contain all NaN
       COLS_VAR_THRESHOLD: True
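The same one-line change lands in both sections: the default partial-imputation method flips from median to zero. TYPE feeds straight into the script's impute() helper (shown at the bottom of the diff below); a minimal sketch of that flow, with a toy provider dict mirroring the YAML above and a hypothetical phone_calls_count column:

import pandas as pd

provider = {"IMPUTE_PHONE_SELECTED_EVENT_FEATURES": {"TYPE": "zero"}}  # was "median"

df = pd.DataFrame({"phone_calls_count": [1.0, None, 3.0]})
method = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]["TYPE"].lower()

# the same dispatch the script's impute() uses for the non-knn options
imputed = {
    "zero": df.fillna(0),
    "mean": df.fillna(df.mean()),
    "median": df.fillna(df.median()),
}[method]
print(imputed["phone_calls_count"].tolist())  # [1.0, 0.0, 3.0]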


@@ -4,25 +4,26 @@ import math, sys
 import yaml
 from sklearn.impute import KNNImputer
+from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
 import seaborn as sns

 def straw_cleaning(sensor_data_files, provider):

     features = pd.read_csv(sensor_data_files["sensor_data"][0])

     esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')] # Get target (esm) columns

     with open('config.yaml', 'r') as stream:
         config = yaml.load(stream, Loader=yaml.FullLoader)

-    # Filter out all rows that do not have the target column available
+    # (1) FILTER OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
     if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
         target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL'] # get target label from config
         features = features[features['phone_esm_straw_' + target].notna()].reset_index()

     test_cols = [col for col in features.columns if 'phone_calls' in col or 'phone_messages' in col]

     # TODO: reorder the cleaning steps so it makes sense for the analysis
     # TODO: add conditions that differentiate cleaning steps for standardized and nonstandardized features; for this
     #       the snakemake rules will also have to come with an additional parameter (in rules/features.smk)
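Step (1) keeps only the rows whose target ESM column is present. A toy illustration, using 'PANAS_negative_affect' purely as a stand-in label (the real one comes from PARAMS_FOR_ANALYSIS.TARGET.LABEL in config.yaml):

import pandas as pd

features = pd.DataFrame({
    "phone_esm_straw_PANAS_negative_affect": [3.0, None, 1.0],  # hypothetical target
    "phone_calls_count": [2, 5, 0],
})

target = "PANAS_negative_affect"
features = features[features["phone_esm_straw_" + target].notna()].reset_index()
print(len(features))  # 2 -- the row with a missing target label is gone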
@@ -36,7 +37,7 @@ def straw_cleaning(sensor_data_files, provider):
     # because of the lack of the availability. Secondly, there's a high importance that features data frame is checked if any NaN
     # values still exist.

-    # Impute selected features event
+    # (2) PARTIAL IMPUTATION: IMPUTE DATA DEPENDENT ON THE FEATURES GROUP (e.g., phone or E4 features)
     impute_phone_features = provider["IMPUTE_PHONE_SELECTED_EVENT_FEATURES"]
     if impute_phone_features["COMPUTE"]:
         if not 'phone_data_yield_rapids_ratiovalidyieldedminutes' in features.columns:
@@ -55,7 +56,7 @@ def straw_cleaning(sensor_data_files, provider):
         mask = features['phone_data_yield_rapids_ratiovalidyieldedminutes'] > impute_phone_features['MIN_DATA_YIELDED_MINUTES_TO_IMPUTE']
         features.loc[mask, phone_cols] = impute(features[mask][phone_cols], method=impute_phone_features["TYPE"].lower())

-    # Drop rows with the value of data_yield_column less than data_yield_ratio_threshold
+    # ??? Drop rows with the value of data_yield_column less than data_yield_ratio_threshold ???
     data_yield_unit = provider["DATA_YIELD_FEATURE"].split("_")[3].lower()
     data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + data_yield_unit
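The mask-then-impute pattern above touches only the rows with enough yielded phone data; rows under MIN_DATA_YIELDED_MINUTES_TO_IMPUTE keep their NaNs for the later cleaning steps. A self-contained sketch, with zero imputation standing in for the configured TYPE:

import pandas as pd

features = pd.DataFrame({
    "phone_data_yield_rapids_ratiovalidyieldedminutes": [0.9, 0.1, 0.6],
    "phone_calls_count": [None, None, 4.0],
})

phone_cols = ["phone_calls_count"]
# only rows above the yield threshold are imputed; row 1 keeps its NaN
mask = features["phone_data_yield_rapids_ratiovalidyieldedminutes"] > 0.33
features.loc[mask, phone_cols] = features.loc[mask, phone_cols].fillna(0)
print(features["phone_calls_count"].tolist())  # [0.0, nan, 4.0]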
@@ -65,10 +66,10 @@ def straw_cleaning(sensor_data_files, provider):
     if provider["DATA_YIELD_RATIO_THRESHOLD"]:
         features = features[features[data_yield_column] >= provider["DATA_YIELD_RATIO_THRESHOLD"]]

-    # Remove cols if threshold of NaN values is passed (should be <= if even all NaN columns must be preserved)
+    # (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED (should be <= if even all-NaN columns must be preserved)
     features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]]

-    # Remove cols where variance is 0
+    # (4) REMOVE COLS WHERE VARIANCE IS 0
     if provider["COLS_VAR_THRESHOLD"]:
         features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
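Steps (3) and (4) are plain pandas column filters: with COLS_NAN_THRESHOLD: 1 only all-NaN columns fail the first filter, and constant (zero-variance) columns fail the second. For example:

import pandas as pd

features = pd.DataFrame({
    "all_nan":  [None, None, None, None],
    "some_nan": [1.0, None, 3.0, 4.0],
    "constant": [5.0, 5.0, 5.0, 5.0],
})

# (3) with COLS_NAN_THRESHOLD = 1: a column survives if it has any real value
features = features.loc[:, features.isna().sum() < 1 * features.shape[0]]

# (4) drop zero-variance columns
features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
print(features.columns.tolist())  # ['some_nan']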
@@ -77,7 +78,7 @@ def straw_cleaning(sensor_data_files, provider):
         if esm not in features:
             features[esm] = esm_cols[esm]

-    # Drop highly correlated features - To-Do: one more threshold var, which is in the config + how are NaNs treated?
+    # (5) DROP HIGHLY CORRELATED FEATURES
     drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
     if drop_corr_features["COMPUTE"]:
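The body of the DROP_HIGHLY_CORRELATED_FEATURES branch falls outside this hunk, so the following is only a common recipe for the technique (drop one column from each pair whose absolute Pearson correlation exceeds a threshold), not necessarily what the script itself does:

import numpy as np
import pandas as pd

def drop_highly_correlated(df, threshold=0.95):
    corr = df.corr().abs()  # pairwise correlations, NaNs ignored per pair
    # keep only the upper triangle so each pair is inspected once
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]
    return df.drop(columns=to_drop)

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 1, 3, 2]})
print(drop_highly_correlated(df).columns.tolist())  # ['a', 'c'] -- 'b' mirrors 'a'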
@@ -98,7 +99,26 @@ def straw_cleaning(sensor_data_files, provider):
     sns.set(rc={"figure.figsize":(16, 8)})
     sns.heatmap(features.isna(), cbar=False)
-    plt.savefig(f'features_nans.png', bbox_inches='tight')
+    plt.savefig(f'features_nans_bf_knn.png', bbox_inches='tight')
+
+    # KNN IMPUTATION
+    features = impute(features, method="knn")
+
+    sns.set(rc={"figure.figsize":(16, 8)})
+    sns.heatmap(features.isna(), cbar=False)
+    plt.savefig(f'features_nans_af_knn.png', bbox_inches='tight')
+
+    ## STANDARDIZATION - should it happen before or after kNN imputation?
+    # TODO: check if there are additional columns that need to be excluded from the standardization
+    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']
+    excluded_columns += [col for col in features.columns if "level_1" in col]
+    features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)])
+
+    # VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
+    if features.isna().any().any():
+        raise ValueError
+
     sys.exit()
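After the kNN imputation, the new standardization block scales every column in place except the listed segment-metadata columns. A self-contained sketch of that exclusion pattern:

import pandas as pd
from sklearn.preprocessing import StandardScaler

features = pd.DataFrame({
    "local_segment": ["seg0", "seg1", "seg2"],  # metadata: must not be scaled
    "phone_calls_count": [0.0, 5.0, 10.0],
    "phone_messages_count": [1.0, 1.0, 4.0],
})

excluded_columns = ['local_segment', 'local_segment_label',
                    'local_segment_start_datetime', 'local_segment_end_datetime']
keep = ~features.columns.isin(excluded_columns)
features.loc[:, keep] = StandardScaler().fit_transform(features.loc[:, keep])
print(features.round(2))  # scaled columns now have zero mean and unit variance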
@@ -106,14 +126,14 @@ def straw_cleaning(sensor_data_files, provider):
 def impute(df, method='zero'):

-    def k_nearest(df): # TODO: if needed, implement k-nearest imputation / interpolation
+    def k_nearest(df):
         imputer = KNNImputer(n_neighbors=3)
         return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

-    return { # rest of the columns should be imputed with the selected method
+    return {
         'zero': df.fillna(0),
         'mean': df.fillna(df.mean()),
         'median': df.fillna(df.median()),
-        'k-nearest': k_nearest(df)
+        'knn': k_nearest(df)
     }[method]
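Two caveats about this dispatch. First, a dict literal evaluates every value eagerly, so the KNNImputer fit runs even when method='zero'; a lazy variant (a sketch, not the committed code) is shown below. Second, renaming the key from 'k-nearest' to 'knn' matches the new impute(features, method="knn") call, but the config comment still lists 'k-nearest' as an option, so TYPE: k-nearest would now raise a KeyError.

import pandas as pd
from sklearn.impute import KNNImputer

def impute(df, method='zero'):
    def k_nearest(df):
        imputer = KNNImputer(n_neighbors=3)
        return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

    # each strategy is wrapped in a callable, so only the chosen one runs
    return {
        'zero': lambda d: d.fillna(0),
        'mean': lambda d: d.fillna(d.mean()),
        'median': lambda d: d.fillna(d.median()),
        'knn': k_nearest,
    }[method](df)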