Clean and fix Preprocessing module.

ml_pipeline
Primoz 2023-02-23 10:40:58 +01:00
parent 9ed863b7a1
commit bccc1cd1de
1 changed files with 14 additions and 45 deletions

View File

@ -5,13 +5,12 @@ import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
class Preprocessing: class Preprocessing:
"""This class presents Preprocessing methods which can be used in context of an individual CV iteration or, simply, on whole data. """This class presents Preprocessing methods which can be used in context of an individual CV iteration or, simply, on whole data.
It is blind to the test data - e.g., it imputes the test data with the train data mean. It is blind to the test data - e.g., it imputes the test data with the train data mean.
This means it needs access to the information about the data split. In context This means it needs access to the information about the data split. In context
""" """
def __init__(self, train_X, train_y, test_X, test_y): def __init__(self, train_X, train_y, test_X, test_y):
self.train_X = train_X self.train_X = train_X
@ -19,46 +18,8 @@ class Preprocessing:
self.test_X = test_X self.test_X = test_X
self.test_y = test_y self.test_y = test_y
# TODO This is probably NOT in the right place in this class ...
def prepare_data_for_cross_validation(self):
    """Split ``self.data`` into features, target and CV groups.

    Reads ``self.data`` (a DataFrame with at least the columns ``"pid"`` and
    ``"target"``) and ``self.cv_method``. The stored frame is not modified;
    all work happens on a copy.

    Grouping per ``self.cv_method``:
        * ``"logo"`` / ``"5kfold"``: groups are the ``"pid"`` column.
        * ``"half_logo"``: the rows of every ``pid`` are split by row order
          into a first part (rounded up for odd counts) and a second part,
          labelled ``"<pid>_1"`` and ``"<pid>_2"``.
          NOTE(review): this concatenates ``pid`` with a string, so it
          assumes ``pid`` values are strings - confirm against the caller.

    Returns:
        tuple: ``(data_X, data_y, data_groups)`` where ``data_X`` is the copy
        without ``"target"`` and ``"pid"``, ``data_y`` is the ``"target"``
        column and ``data_groups`` holds the group labels.

    Raises:
        ValueError: If ``self.cv_method`` is not one of the supported values.
    """
    data = self.data.copy()

    if self.cv_method in ("logo", "5kfold"):
        data_groups = data["pid"]
    elif self.cv_method == "half_logo":
        # Position of each row inside its pid group and the group size are
        # kept as local Series instead of temporary columns, so no helper
        # column (previously "pid_count") can leak into the feature matrix.
        position_in_group = data.groupby("pid").cumcount()
        group_size = data.groupby("pid")["pid"].transform("count")
        half_number = (position_in_group / group_size + 1).round().astype(int)
        data_groups = (data["pid"] + "_" + half_number.astype(str)).rename("pid_half")
    else:
        raise ValueError(
            f"Unknown cv_method: {self.cv_method!r}; "
            "expected 'logo', 'half_logo' or '5kfold'."
        )

    data_X = data.drop(["target", "pid"], axis=1)
    data_y = data["target"]
    return data_X, data_y, data_groups
# TODO This is probably NOT in the right place in this class ...
def initialize_cv_method(self, cv_method):
    """Select the cross-validation splitter and prepare the data for it.

    Stores ``cv_method`` on the instance, fills ``self.X``, ``self.y`` and
    ``self.groups`` from ``self.prepare_data_for_cross_validation()`` and
    builds the matching splitter.

    Args:
        cv_method: ``"logo"`` or ``"half_logo"`` (both use
            ``LeaveOneGroupOut``) or ``"5kfold"`` (``StratifiedKFold`` with
            5 shuffled splits).

    Returns:
        The splitter instance. It is also stored as ``self.cv``; previously
        it was created and then discarded, so callers could never reach it.

    Raises:
        ValueError: If ``cv_method`` is not one of the supported values.
    """
    self.cv_method = cv_method
    self.X, self.y, self.groups = self.prepare_data_for_cross_validation()

    if cv_method in ("logo", "half_logo"):
        cv = LeaveOneGroupOut()
    elif cv_method == "5kfold":
        # No random_state is set, so fold assignment differs between runs.
        cv = StratifiedKFold(n_splits=5, shuffle=True)
    else:
        raise ValueError(
            f"Unknown cv_method: {cv_method!r}; "
            "expected 'logo', 'half_logo' or '5kfold'."
        )

    self.cv = cv
    return cv
def get_cv_train_test_split():
    """Unimplemented placeholder; does nothing and returns ``None``."""
    # TODO: Loop over all possible LOSO splits? In short, this preprocessing
    # class already receives a single train-test instance (i.e. 55
    # participants versus 1 participant).
    # It would also be possible to build a class that handles both, but that
    # could lead to misinterpretations.
    return None
def one_hot_encoder(categorical_features, numerical_features, mode):
""" """
This code is an implementation of one-hot encoding. It takes in two data sets, This code is an implementation of one-hot encoding. It takes in two data sets,
one with categorical features and one with numerical features and a mode parameter. one with categorical features and one with numerical features and a mode parameter.
@ -103,7 +64,6 @@ class Preprocessing:
categorical_columns (list, optional): List of categorical columns in the dataset. categorical_columns (list, optional): List of categorical columns in the dataset.
Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"]. Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"].
TODO: TESTING
""" """
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns] categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
@ -111,16 +71,16 @@ class Preprocessing:
train_X_categorical_features = self.train_X[categorical_columns].copy() train_X_categorical_features = self.train_X[categorical_columns].copy()
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1) train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
mode_train_X_categorical_features = train_X_categorical_features.mode() mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
self.train_X = one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features) self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
# For test set # For test set
test_X_categorical_features = self.test_X[categorical_columns].copy() test_X_categorical_features = self.test_X[categorical_columns].copy()
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1) test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
self.test_X = one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features) self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"): def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
@ -152,6 +112,15 @@ class Preprocessing:
medians = self.train_X[other_feature_list].median() medians = self.train_X[other_feature_list].median()
self.train_X[other_feature_list].fillna(medians, inplace=True) self.train_X[other_feature_list].fillna(medians, inplace=True)
self.test_X[other_feature_list].fillna(medians, inplace=True) self.test_X[other_feature_list].fillna(medians, inplace=True)
def get_train_test_sets(self):
    """Return the train and test sets currently held by the instance.

    Returns:
        tuple: ``(train_X, train_y, test_X, test_y)`` in the traditional
        sklearn order (presumably pandas objects - verify against caller).
    """
    train_part = (self.train_X, self.train_y)
    test_part = (self.test_X, self.test_y)
    return train_part + test_part