Add imputation and One-Hot Encoding Methods.

2023-02-22 18:05:01 +01:00 · 2023-02-22 18:05:01 +01:00 · 7f6ae9b323
parent 8f6cb3f444
commit 7f6ae9b323
1 changed files with 84 additions and 16 deletions
--- a/machine_learning/preprocessing.py
+++ b/machine_learning/preprocessing.py
@@ -55,32 +55,100 @@ class Preprocessing:
        pass


-    def one_hot_encode(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
+    def one_hot_encoder(categorical_features, numerical_features, mode):
+        """
+        This code is an implementation of one-hot encoding. It takes in two data sets, 
+        one with categorical features and one with numerical features and a mode parameter. 
+        First it uses the fillna() function to fill in any missing values present in the 
+        categorical data set with the mode value. Then it uses the apply() method to
+        convert each column of the data set into a category data type which is then 
+        transformed using the pd.get_dummies() function. Finally it concatenates the 
+        numerical data set and the transformed categorical data set using pd.concat() and 
+        returns it.

-        categorical_columns = [col for col in self.X.columns if col in categorical_columns]
+        Args:
+            categorical_features (DataFrame): DataFrame including only categorical columns.
+            numerical_features (DataFrame): DataFrame including only numerical columns.
+            mode (DataFrame): Mode value(s) used to fill missing values; the caller passes the result of DataFrame.mode() computed on the train set. TODO: check mode results

-        categorical_features = self.X[categorical_columns].copy()
-        mode_categorical_features = categorical_features.mode().iloc[0]
-
-        # fillna with mode
-        categorical_features = categorical_features.fillna(mode_categorical_features)
+        Returns:
+            DataFrame: One-hot encoded DataFrame.
+        """
+        # Fill missing values with the supplied mode
+        categorical_features = categorical_features.fillna(mode)

        # one-hot encoding
        categorical_features = categorical_features.apply(lambda col: col.astype("category"))
        if not categorical_features.empty:
            categorical_features = pd.get_dummies(categorical_features)

-        numerical_features = self.X.drop(categorical_columns, axis=1)
-        train_x = pd.concat([numerical_features, categorical_features], axis=1)
+        return pd.concat([numerical_features, categorical_features], axis=1)


-        # TODO: has to return a train set (or 54 participans in logo) and a test set (1 participant in logo)
+    def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
+        """
+        Transform the categorical columns of the train and test sets into numerical
+        representations. It first identifies which of the given categorical columns are
+        present in the train set, then copies those columns out of each set. Missing data
+        in both sets is filled using the mode computed on the train set (most frequent
+        value in the respective column). Each set is then one-hot encoded, which is a
+        process of transforming categorical data into machine-interpretable numerical
+        form by converting categories into multiple binary outcome variables. The encoded
+        columns are concatenated to the numerical features and the results are stored
+        back in self.train_X and self.test_X; nothing is returned.
+
+        Args:
+            categorical_columns (list, optional): List of categorical columns in the dataset. 
+                Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"].
+        
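+        TODO: TESTING. NOTE(review): one_hot_encoder is called below by bare name -- confirm that it resolves at runtime.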
+        """
+        categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
+
+        # For train set
+        
+        train_X_categorical_features = self.train_X[categorical_columns].copy()
+        train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
+        mode_train_X_categorical_features = train_X_categorical_features.mode()
+        
+        self.train_X = one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
+        
+        # For test set
+        
+        test_X_categorical_features = self.test_X[categorical_columns].copy()
+        test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
+        
+        self.test_X = one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)


-    def imputer(method="mean"):
-        # TODO: This has to be done in context of CV method - so that test data has only information to mean of train data (it is imputed with train data mean or median etc.)
-        # TODO: has to return train set (or 54 participans in logo) and test test (1 participant in logo)
-        pass
+    def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
+        
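+        # TODO: TESTING. NOTE(review): `groupby` tested below is not a parameter or local of this method -- presumably `groupby_feature` was intended; confirm.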
+        
+        if groupby:
+            # Interval numerical features # TODO: How can we get and assign appropriate groupby means and assign them to correct columns?
+            
+            # VVVVV ......  IN PROGRESS ...... VVVVV
+            means = self.train_X[interval_feature_list].groupby(groupby_feature).mean() 
+            self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + other_feature_list)]] = \
+                self.train_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
+                
+            self.test_X[self.test_X.loc[:, ~self.test_X.columns.isin([groupby_feature] + other_feature_list)]] = \
+                self.test_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
+                
+            # Other features
+            self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + interval_feature_list)]] = \
+                self.train_X[other_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.median()))
+            
+        else:
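+            # Interval numerical features. NOTE(review): the fillna(inplace=True) calls in this branch act on a column selection, which may be a copy -- verify that train_X/test_X are actually updated.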
+            means = self.train_X[interval_feature_list].mean()
+            self.train_X[interval_feature_list].fillna(means, inplace=True)
+            self.test_X[interval_feature_list].fillna(means, inplace=True)
+                    
+            # Other features
+            medians = self.train_X[other_feature_list].median()
+            self.train_X[other_feature_list].fillna(medians, inplace=True)
+            self.test_X[other_feature_list].fillna(medians, inplace=True)