From 26804cf8ea2e6582464ab482baf8c50dcefc7114 Mon Sep 17 00:00:00 2001 From: Primoz Date: Fri, 21 Apr 2023 13:24:31 +0200 Subject: [PATCH] Repair preprocessing one hot encoding of test set. --- machine_learning/preprocessing.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/machine_learning/preprocessing.py b/machine_learning/preprocessing.py index a11558c..1f55482 100644 --- a/machine_learning/preprocessing.py +++ b/machine_learning/preprocessing.py @@ -33,7 +33,7 @@ class Preprocessing: Args: categorical_features (DataFrame): DataFrame including only categorical columns. numerical_features (_type_): DataFrame including only numerical columns. - mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results + mode (int): Mode of the column with which DataFrame is filled. Returns: DataFrame: Hot-One Encoded DataFrame. @@ -46,7 +46,7 @@ class Preprocessing: if not categorical_features.empty: categorical_features = pd.get_dummies(categorical_features) - return pd.concat([numerical_features, categorical_features], axis=1) + return pd.concat([numerical_features, categorical_features], axis=1), categorical_features.columns.tolist() def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]): @@ -68,20 +68,27 @@ class Preprocessing: categorical_columns = [col for col in self.train_X.columns if col in categorical_columns] # For train set - train_X_categorical_features = self.train_X[categorical_columns].copy() train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1) mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0] - self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features) + self.train_X, train_cat_col_names = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features) + encoded_categorical_features = [col for col in self.train_X.columns if col.startswith(tuple(categorical_columns))] # For test set - test_X_categorical_features = self.test_X[categorical_columns].copy() test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1) - self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features) + self.test_X, test_cat_col_names = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features) + # Create categorical columns that were not found in test set and fill them with 0 + missing_cols = [col for col in train_cat_col_names if col not in test_cat_col_names] + self.test_X[missing_cols] = 0 + + # Sort column names alphabetically + self.train_X = self.train_X.reindex(sorted(self.train_X.columns), axis=1) + self.test_X = self.test_X.reindex(sorted(self.test_X.columns), axis=1) + def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):