From bccc1cd1de7fd825cf3f998a267b22ec802a77fa Mon Sep 17 00:00:00 2001 From: Primoz Date: Thu, 23 Feb 2023 10:40:58 +0100 Subject: [PATCH] Clean and fix Preprocessing module. --- machine_learning/preprocessing.py | 59 ++++++++----------------------- 1 file changed, 14 insertions(+), 45 deletions(-) diff --git a/machine_learning/preprocessing.py b/machine_learning/preprocessing.py index fdafa39..a11558c 100644 --- a/machine_learning/preprocessing.py +++ b/machine_learning/preprocessing.py @@ -5,13 +5,12 @@ import numpy as np import matplotlib.pyplot as plt import pandas as pd -from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold - class Preprocessing: """This class presents Preprocessing methods which can be used in context of an individual CV iteration or, simply, on whole data. It's blind to the test data - e.g, it imputes the test data with train data mean. This means, it somehow needs an access to the information about data split. In context """ + def __init__(self, train_X, train_y, test_X, test_y): self.train_X = train_X @@ -19,46 +18,8 @@ class Preprocessing: self.test_X = test_X self.test_y = test_y - # TODO This is probably NOT in the right place in this class ... - def prepare_data_for_cross_validation(self): - data = self.data.copy() - if self.cv_method == "logo": - data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"] - - elif self.cv_method == "half_logo": - data['pid_index'] = data.groupby('pid').cumcount() - data['pid_count'] = data.groupby('pid')['pid'].transform('count') - - data["pid_index"] = (data['pid_index'] / data['pid_count'] + 1).round() - data["pid_half"] = data["pid"] + "_" + data["pid_index"].astype(int).astype(str) - - data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"] - - elif self.cv_method == "5kfold": - data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"] - - return data_X, data_y, data_groups - - # TODO This is probably NOT in the right place in this class ... - def initialize_cv_method(self, cv_method): - self.cv_method = cv_method - self.X, self.y, self.groups = self.prepare_data_for_cross_validation() - - if cv_method in ["logo", "half_logo"]: - cv = LeaveOneGroupOut() - elif cv_method == "5kfold": - cv = StratifiedKFold(n_splits=5, shuffle=True) - - def get_cv_train_test_split(): - # TODO: for loop nad vsemi možnimi loso spliti? Skratka, ta preprocessin razred že dobi posamezno instanco train-testa - # (torej 55 udeležencev proti 1 udeležencu). - # Možno bi bilo tudi, da se naredi razred, ki handla oboje, vendar bi pri tem prišlo do morebitnih napačnih interpretacij. - - pass - - - def one_hot_encoder(categorical_features, numerical_features, mode): + def one_hot_encoder(self, categorical_features, numerical_features, mode): """ This code is an implementation of one-hot encoding. It takes in two data sets, one with categorical features and one with numerical features and a mode parameter. @@ -103,7 +64,6 @@ class Preprocessing: categorical_columns (list, optional): List of categorical columns in the dataset. Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"]. - TODO: TESTING """ categorical_columns = [col for col in self.train_X.columns if col in categorical_columns] @@ -111,16 +71,16 @@ class Preprocessing: train_X_categorical_features = self.train_X[categorical_columns].copy() train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1) - mode_train_X_categorical_features = train_X_categorical_features.mode() + mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0] - self.train_X = one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features) + self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features) # For test set test_X_categorical_features = self.test_X[categorical_columns].copy() test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1) - self.test_X = one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features) + self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features) def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"): @@ -152,6 +112,15 @@ class Preprocessing: medians = self.train_X[other_feature_list].median() self.train_X[other_feature_list].fillna(medians, inplace=True) self.test_X[other_feature_list].fillna(medians, inplace=True) + + + def get_train_test_sets(self): + """Train and test sets getter + + Returns: + tuple of Pandas DataFrames: Gets train test sets in traditional sklearn format. + """ + return self.train_X, self.train_y, self.test_X, self.test_y