stress_at_work_analysis/machine_learning/preprocessing.py

import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

class Preprocessing:
    """Leakage-free preprocessing for a single train/test split.

    The methods can be used inside one cross-validation iteration or on a
    one-off split. Every statistic (modes, means, medians, category levels) is
    learned from the train set only and then applied to both sets, so the
    transformations stay blind to the test data.

    Each transformation stores new DataFrames on the instance; the frames
    passed to the constructor are not modified. Retrieve the results with
    ``get_train_test_sets``.
    """

    # Columns treated as categorical when the caller does not name any.
    DEFAULT_CATEGORICAL_COLUMNS = ("gender", "startlanguage", "mostcommonactivity", "homelabel")

    def __init__(self, train_X, train_y, test_X, test_y):
        """Store the four parts of the split.

        Args:
            train_X (DataFrame): Train features.
            train_y: Train targets (stored and returned untouched).
            test_X (DataFrame): Test features.
            test_y: Test targets (stored and returned untouched).
        """
        self.train_X = train_X
        self.train_y = train_y
        self.test_X = test_X
        self.test_y = test_y

    def one_hot_encoder(self, categorical_features, numerical_features, mode, categories=None):
        """One-hot encode categorical columns and append them to numerical ones.

        Missing values in ``categorical_features`` are first filled with
        ``mode``; each column is then converted to a categorical dtype and
        expanded with ``pd.get_dummies``.

        Args:
            categorical_features (DataFrame): Only the categorical columns.
            numerical_features (DataFrame): Only the remaining columns.
            mode (Series): Fill value per categorical column, indexed by
                column name (e.g. the first row of ``DataFrame.mode()``).
            categories (dict, optional): Maps a column name to the category
                levels to encode. When given, the dummy columns are fixed to
                those levels: levels absent from the data yield all-zero
                columns and values outside the levels yield all-zero rows.
                Defaults to None, i.e. the levels observed in the data.

        Returns:
            DataFrame: ``numerical_features`` followed by the dummy columns.
        """
        categorical_features = categorical_features.fillna(mode)

        if categories is None:
            categorical_features = categorical_features.apply(lambda col: col.astype("category"))
        else:
            # CategoricalDtype(None) behaves like "category", so columns that
            # are missing from the mapping fall back to their observed levels.
            categorical_features = categorical_features.apply(
                lambda col: col.astype(pd.CategoricalDtype(categories.get(col.name)))
            )

        # get_dummies cannot handle a frame without any data to encode.
        if not categorical_features.empty:
            categorical_features = pd.get_dummies(categorical_features)

        return pd.concat([numerical_features, categorical_features], axis=1)

    def one_hot_encode_train_and_test_sets(self, categorical_columns=None):
        """One-hot encode the categorical columns of the train and test sets.

        Modes and category levels are learned from the train set. Missing
        values in both sets are filled with the train mode, and the test set
        is encoded against the train levels so both sets end up with exactly
        the same columns. A test value never seen in train becomes an
        all-zero row for that feature.

        Args:
            categorical_columns (iterable of str, optional): Names of the
                categorical columns; names not present in the train set are
                ignored. Defaults to ``DEFAULT_CATEGORICAL_COLUMNS``.
        """
        if categorical_columns is None:
            categorical_columns = self.DEFAULT_CATEGORICAL_COLUMNS
        requested = set(categorical_columns)
        categorical_columns = [col for col in self.train_X.columns if col in requested]
        if not categorical_columns:
            # Nothing to encode; leave both sets as they are.
            return

        train_categorical = self.train_X[categorical_columns].copy()
        train_numerical = self.train_X.drop(categorical_columns, axis=1)

        # mode() is empty when the train set has no rows; fall back to "no fill".
        modes = train_categorical.mode()
        fill_values = modes.iloc[0] if len(modes) else pd.Series(dtype="object")

        # The mode is always one of the observed values, so the levels seen
        # before filling are the levels the encoded train set will contain.
        train_levels = {
            col: train_categorical[col].astype("category").cat.categories
            for col in categorical_columns
        }

        test_categorical = self.test_X[categorical_columns].copy()
        test_numerical = self.test_X.drop(categorical_columns, axis=1)

        self.train_X = self.one_hot_encoder(train_categorical, train_numerical, fill_values)
        self.test_X = self.one_hot_encoder(
            test_categorical, test_numerical, fill_values, categories=train_levels
        )

    def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
        """Fill missing values using statistics learned from the train set.

        Interval features are filled with the train mean, all other listed
        features with the train median. When ``groupby_feature`` is given the
        statistic of the row's own group (as computed on the train set) is
        used first; rows whose group is unknown to the train set, or whose
        group has no observed value, fall back to the overall train statistic.

        Args:
            interval_feature_list (list of str): Columns imputed with the mean.
            other_feature_list (list of str): Columns imputed with the median.
            groupby_feature (str, optional): Column identifying the group of
                each row. Pass None to impute with the overall train
                statistics only. Defaults to "pid".

        Raises:
            KeyError: If ``groupby_feature`` is given but is not a column of
                both the train and the test set.
        """
        if groupby_feature is not None:
            for name in ("train_X", "test_X"):
                if groupby_feature not in getattr(self, name).columns:
                    raise KeyError(
                        f"groupby_feature {groupby_feature!r} is not a column of {name}"
                    )

        self._fill_from_train(interval_feature_list, "mean", groupby_feature)
        self._fill_from_train(other_feature_list, "median", groupby_feature)

    def _fill_from_train(self, columns, statistic, groupby_feature):
        """Fill NaNs in ``columns`` of both sets with a train-set statistic."""
        columns = list(columns)
        if not columns:
            return

        # Learn everything from the train set before any frame is changed.
        overall = self.train_X[columns].agg(statistic)
        per_group = None
        if groupby_feature is not None:
            per_group = self.train_X.groupby(groupby_feature)[columns].agg(statistic)

        for name in ("train_X", "test_X"):
            # Work on a copy so the caller's frame is never modified.
            frame = getattr(self, name).copy()
            values = frame[columns]
            if per_group is not None:
                # Look up the group statistic of every row; unknown groups give NaN.
                row_stats = per_group.reindex(frame[groupby_feature].to_numpy())
                # Positional (array) fill avoids any index alignment issues.
                values = values.where(values.notna(), row_stats.to_numpy())
            frame[columns] = values.fillna(overall)
            setattr(self, name, frame)

    def get_train_test_sets(self):
        """Return the current state of the split.

        Returns:
            tuple: ``(train_X, train_y, test_X, test_y)``, the traditional
            sklearn ordering.
        """
        return self.train_X, self.train_y, self.test_X, self.test_y
Add preprocessing class. 2023-02-22 13:44:03 +01:00			`import os`
			`import sys`

			`import numpy as np`
			`import matplotlib.pyplot as plt`
			`import pandas as pd`

			`class Preprocessing:`
			`"""This class presents Preprocessing methods which can be used in context of an individual CV iteration or, simply, on whole data.`
			`It's blind to the test data - e.g, it imputes the test data with train data mean.`
			`This means, it somehow needs an access to the information about data split. In context`
			`"""`
Clean and fix Preprocessing module. 2023-02-23 10:40:58 +01:00
Add preprocessing class. 2023-02-22 13:44:03 +01:00
			`def __init__(self, train_X, train_y, test_X, test_y):`
			`self.train_X = train_X`
			`self.train_y = train_y`
			`self.test_X = test_X`
			`self.test_y = test_y`


Clean and fix Preprocessing module. 2023-02-23 10:40:58 +01:00			`def one_hot_encoder(self, categorical_features, numerical_features, mode):`
Add imputation and One-Hot Encoding Methods. 2023-02-22 18:05:01 +01:00			`"""`
			`This code is an implementation of one-hot encoding. It takes in two data sets,`
			`one with categorical features and one with numerical features and a mode parameter.`
			`First it uses the fillna() function to fill in any missing values present in the`
			`categorical data set with the mode value. Then it uses the apply () method to`
			`convert each column of the data set into a category data type which is then`
			`transformed using the pd.get_dummies() function. Finally it concatenates the`
			`numerical data set and the transformed categorical data set using pd.concat() and`
			`returns it.`

			`Args:`
			`categorical_features (DataFrame): DataFrame including only categorical columns.`
			`numerical_features (_type_): DataFrame including only numerical columns.`
			`mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results`

			`Returns:`
			`DataFrame: Hot-One Encoded DataFrame.`
			`"""`
			`# Fill train set with mode`
			`categorical_features = categorical_features.fillna(mode)`
Add preprocessing class. 2023-02-22 13:44:03 +01:00
			`# one-hot encoding`
			`categorical_features = categorical_features.apply(lambda col: col.astype("category"))`
			`if not categorical_features.empty:`
			`categorical_features = pd.get_dummies(categorical_features)`

Add imputation and One-Hot Encoding Methods. 2023-02-22 18:05:01 +01:00			`return pd.concat([numerical_features, categorical_features], axis=1)`
Add preprocessing class. 2023-02-22 13:44:03 +01:00

Add imputation and One-Hot Encoding Methods. 2023-02-22 18:05:01 +01:00			`def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):`
			`"""`
			`This code is used to transform categorical data into numerical representations.`
			`It first identifies the categorical columns, then copies them and saves them as`
			`a new dataset. The missing data is filled with the mode (most frequent value in`
			`the respective column). This new dataset is then subjected to one-hot encoding,`
			`which is a process of transforming categorical data into machine interpretable`
			`numerical form by converting categories into multiple binary outcome variables.`
			`These encoded values are then concatenated to the numerical features prior to`
			`being returned as the final dataset.`
Add preprocessing class. 2023-02-22 13:44:03 +01:00
Add imputation and One-Hot Encoding Methods. 2023-02-22 18:05:01 +01:00			`Args:`
			`categorical_columns (list, optional): List of categorical columns in the dataset.`
			`Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"].`

			`"""`
			`categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]`
Add preprocessing class. 2023-02-22 13:44:03 +01:00
Add imputation and One-Hot Encoding Methods. 2023-02-22 18:05:01 +01:00			`# For train set`

			`train_X_categorical_features = self.train_X[categorical_columns].copy()`
			`train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)`
Clean and fix Preprocessing module. 2023-02-23 10:40:58 +01:00			`mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]`
Add imputation and One-Hot Encoding Methods. 2023-02-22 18:05:01 +01:00
Clean and fix Preprocessing module. 2023-02-23 10:40:58 +01:00			`self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)`
Add imputation and One-Hot Encoding Methods. 2023-02-22 18:05:01 +01:00
			`# For test set`

			`test_X_categorical_features = self.test_X[categorical_columns].copy()`
			`test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)`

Clean and fix Preprocessing module. 2023-02-23 10:40:58 +01:00			`self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)`
Add preprocessing class. 2023-02-22 13:44:03 +01:00

Add imputation and One-Hot Encoding Methods. 2023-02-22 18:05:01 +01:00			`def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):`

			`# TODO: TESTING`

			`if groupby:`
			`# Interval numerical features # TODO: How can we get and assign appropriate groupby means and assign them to correct columns?`

			`# VVVVV ...... IN PROGRES ...... VVVVV`
			`means = self.train_X[interval_feature_list].groupby(groupby_feature).mean()`
			`self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + other_feature_list)]] = \`
			`self.train_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))`

			`self.test_X[self.test_X.loc[:, ~self.test_X.columns.isin([groupby_feature] + other_feature_list)]] = \`
			`self.test_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))`

			`# Other features`
			`self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + interval_feature_list)]] = \`
			`self.train_X[other_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.median()))`

			`else:`
			`# Interval numerical features`
			`means = self.train_X[interval_feature_list].mean()`
			`self.train_X[interval_feature_list].fillna(means, inplace=True)`
			`self.test_X[interval_feature_list].fillna(means, inplace=True)`

			`# Other features`
			`medians = self.train_X[other_feature_list].median()`
			`self.train_X[other_feature_list].fillna(medians, inplace=True)`
			`self.test_X[other_feature_list].fillna(medians, inplace=True)`
Clean and fix Preprocessing module. 2023-02-23 10:40:58 +01:00

			`def get_train_test_sets(self):`
			`"""Train and test sets getter`

			`Returns:`
			`tuple of Pandas DataFrames: Gets train test sets in traditional sklearn format.`
			`"""`
			`return self.train_X, self.train_y, self.test_X, self.test_y`
Add imputation and One-Hot Encoding Methods. 2023-02-22 18:05:01 +01:00

Add preprocessing class. 2023-02-22 13:44:03 +01:00