Repair one-hot encoding of the test set in preprocessing.
parent
865225994b
commit
26804cf8ea
|
@ -33,7 +33,7 @@ class Preprocessing:
|
||||||
Args:
|
Args:
|
||||||
categorical_features (DataFrame): DataFrame including only categorical columns.
|
categorical_features (DataFrame): DataFrame including only categorical columns.
|
||||||
numerical_features (_type_): DataFrame including only numerical columns.
|
numerical_features (DataFrame): DataFrame including only numerical columns.
|
||||||
mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
|
mode (Series): Per-column modes of the categorical columns with which the DataFrame is filled.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
DataFrame: Hot-One Encoded DataFrame.
|
tuple: One-hot encoded DataFrame and the list of its categorical column names after encoding.
|
||||||
|
@ -46,7 +46,7 @@ class Preprocessing:
|
||||||
if not categorical_features.empty:
|
if not categorical_features.empty:
|
||||||
categorical_features = pd.get_dummies(categorical_features)
|
categorical_features = pd.get_dummies(categorical_features)
|
||||||
|
|
||||||
return pd.concat([numerical_features, categorical_features], axis=1)
|
return pd.concat([numerical_features, categorical_features], axis=1), categorical_features.columns.tolist()
|
||||||
|
|
||||||
|
|
||||||
def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
|
def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
|
||||||
|
@ -68,19 +68,26 @@ class Preprocessing:
|
||||||
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
|
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
|
||||||
|
|
||||||
# For train set
|
# For train set
|
||||||
|
|
||||||
train_X_categorical_features = self.train_X[categorical_columns].copy()
|
train_X_categorical_features = self.train_X[categorical_columns].copy()
|
||||||
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
|
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
|
||||||
mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
|
mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
|
||||||
|
|
||||||
self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
|
self.train_X, train_cat_col_names = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
|
||||||
|
encoded_categorical_features = [col for col in self.train_X.columns if col.startswith(tuple(categorical_columns))]
|
||||||
|
|
||||||
# For test set
|
# For test set
|
||||||
|
|
||||||
test_X_categorical_features = self.test_X[categorical_columns].copy()
|
test_X_categorical_features = self.test_X[categorical_columns].copy()
|
||||||
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
|
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
|
||||||
|
|
||||||
self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
|
self.test_X, test_cat_col_names = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
|
||||||
|
|
||||||
|
# Create categorical columns that were not found in test set and fill them with 0
|
||||||
|
missing_cols = [col for col in train_cat_col_names if col not in test_cat_col_names]
|
||||||
|
self.test_X[missing_cols] = 0
|
||||||
|
|
||||||
|
# Sort column names alphabetically
|
||||||
|
self.train_X = self.train_X.reindex(sorted(self.train_X.columns), axis=1)
|
||||||
|
self.test_X = self.test_X.reindex(sorted(self.test_X.columns), axis=1)
|
||||||
|
|
||||||
|
|
||||||
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
|
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
|
||||||
|
|
Loading…
Reference in New Issue