diff --git a/machine_learning/preprocessing.py b/machine_learning/preprocessing.py
index 5f17166..ef3fdd9 100644
--- a/machine_learning/preprocessing.py
+++ b/machine_learning/preprocessing.py
@@ -55,32 +55,100 @@ class Preprocessing:
         pass
 
-    def one_hot_encode(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
+    def one_hot_encoder(self, categorical_features, numerical_features, mode):
+        """
+        One-hot encode categorical columns and rejoin them with numerical ones.
+        Missing values in the categorical DataFrame are first filled with the
+        supplied (train-set) mode values using fillna(). Each column is then
+        cast to the "category" dtype with apply(), and the whole frame is
+        expanded into binary indicator columns by pd.get_dummies(). Finally
+        the numerical DataFrame and the encoded categorical DataFrame are
+        concatenated column-wise with pd.concat() and the combined DataFrame
+        is returned.
-        categorical_columns = [col for col in self.X.columns if col in categorical_columns]
+        Args:
+            categorical_features (DataFrame): DataFrame including only categorical columns.
+            numerical_features (DataFrame): DataFrame including only numerical columns.
+            mode (Series): Per-column mode values (from the TRAIN set) used to fill NaNs.
-        categorical_features = self.X[categorical_columns].copy()
-        mode_categorical_features = categorical_features.mode().iloc[0]
-
-        # fillna with mode
-        categorical_features = categorical_features.fillna(mode_categorical_features)
+        Returns:
+            DataFrame: One-Hot Encoded DataFrame.
+        """
+        # Fill missing categorical values with the train-set mode.
+        categorical_features = categorical_features.fillna(mode)
         # one-hot encoding
         categorical_features = categorical_features.apply(lambda col: col.astype("category"))
         if not categorical_features.empty:
             categorical_features = pd.get_dummies(categorical_features)
-        numerical_features = self.X.drop(categorical_columns, axis=1)
-        train_x = pd.concat([numerical_features, categorical_features], axis=1)
+        return pd.concat([numerical_features, categorical_features], axis=1)
 
-        # TODO: has to return a train set (or 54 participans in logo) and a test set (1 participant in logo)
-
-
-    def imputer(method="mean"):
-        # TODO: This has to be done in context of CV method - so that test data has only information to mean of train data (it is imputed with train data mean or median etc.)
-        # TODO: has to return train set (or 54 participans in logo) and test test (1 participant in logo)
-        pass
+    def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
+        """
+        This method transforms categorical data into numerical representations.
+        It first identifies the categorical columns, then copies them and saves them as
+        a new dataset. The missing data is filled with the mode (most frequent value in
+        the respective column) computed on the TRAIN set only, so that no information
+        leaks from the test set. This new dataset is then subjected to one-hot encoding,
+        which is a process of transforming categorical data into machine interpretable
+        numerical form by converting categories into multiple binary outcome variables.
+        These encoded values are then concatenated to the numerical features prior to
+        being assigned back to self.train_X and self.test_X.
+
+        Args:
+            categorical_columns (list, optional): List of categorical columns in the dataset.
+                Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"].
+
+        TODO: TESTING
+        """
+        categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
+
+        # For train set
+        train_X_categorical_features = self.train_X[categorical_columns].copy()
+        train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
+        # .iloc[0] turns the mode() DataFrame into a per-column Series of modes.
+        mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
+
+        self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
+
+        # For test set
+        test_X_categorical_features = self.test_X[categorical_columns].copy()
+        test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
+
+        # TRAIN-set mode is reused on purpose: the test set must not inform imputation.
+        self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
 
+    def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
+        """Impute missing values in self.train_X / self.test_X in place: interval
+        features are filled with means and other features with medians, either
+        overall (test filled from TRAIN statistics) or per group of groupby_feature."""
+        if groupby_feature:
+            # Group-wise imputation: statistics are computed within each group's
+            # own rows, separately for the train and the test set.
+
+            # Interval features: fill NaNs with each group's mean.
+            self.train_X[interval_feature_list] = self.train_X.groupby(groupby_feature)[interval_feature_list].transform(lambda x: x.fillna(x.mean()))
+            self.test_X[interval_feature_list] = self.test_X.groupby(groupby_feature)[interval_feature_list].transform(lambda x: x.fillna(x.mean()))
+
+            # Other features: fill NaNs with each group's median.
+            self.train_X[other_feature_list] = self.train_X.groupby(groupby_feature)[other_feature_list].transform(lambda x: x.fillna(x.median()))
+            self.test_X[other_feature_list] = self.test_X.groupby(groupby_feature)[other_feature_list].transform(lambda x: x.fillna(x.median()))
+
+            # NOTE(review): with grouping, the test set is imputed from its own
+            # groups, because in leave-one-group-out CV the test participant's
+            # group never occurs in the train set — confirm this is intended.
+        else:
+            # Interval numerical features (assign back: fillna(inplace=True) on a column selection hits a copy)
+            means = self.train_X[interval_feature_list].mean()
+            self.train_X[interval_feature_list] = self.train_X[interval_feature_list].fillna(means)
+            self.test_X[interval_feature_list] = self.test_X[interval_feature_list].fillna(means)
+
+            # Other features
+            medians = self.train_X[other_feature_list].median()
+            self.train_X[other_feature_list] = self.train_X[other_feature_list].fillna(medians)
+            self.test_X[other_feature_list] = self.test_X[other_feature_list].fillna(medians)
+