Clean and fix Preprocessing module.
parent
9ed863b7a1
commit
bccc1cd1de
|
@ -5,13 +5,12 @@ import numpy as np
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
|
|
||||||
|
|
||||||
class Preprocessing:
|
class Preprocessing:
|
||||||
"""This class presents Preprocessing methods which can be used in context of an individual CV iteration or, simply, on whole data.
|
"""This class presents Preprocessing methods which can be used in context of an individual CV iteration or, simply, on whole data.
|
||||||
It's blind to the test data - e.g., it imputes the test data with the train data mean.
|
It's blind to the test data - e.g., it imputes the test data with the train data mean.
|
||||||
This means it needs access to the information about the data split. In context
|
This means it needs access to the information about the data split. In context
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, train_X, train_y, test_X, test_y):
|
def __init__(self, train_X, train_y, test_X, test_y):
|
||||||
self.train_X = train_X
|
self.train_X = train_X
|
||||||
|
@ -19,46 +18,8 @@ class Preprocessing:
|
||||||
self.test_X = test_X
|
self.test_X = test_X
|
||||||
self.test_y = test_y
|
self.test_y = test_y
|
||||||
|
|
||||||
# TODO This is probably NOT in the right place in this class ...
|
|
||||||
def prepare_data_for_cross_validation(self):
|
|
||||||
data = self.data.copy()
|
|
||||||
|
|
||||||
if self.cv_method == "logo":
|
def one_hot_encoder(self, categorical_features, numerical_features, mode):
|
||||||
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
|
|
||||||
|
|
||||||
elif self.cv_method == "half_logo":
|
|
||||||
data['pid_index'] = data.groupby('pid').cumcount()
|
|
||||||
data['pid_count'] = data.groupby('pid')['pid'].transform('count')
|
|
||||||
|
|
||||||
data["pid_index"] = (data['pid_index'] / data['pid_count'] + 1).round()
|
|
||||||
data["pid_half"] = data["pid"] + "_" + data["pid_index"].astype(int).astype(str)
|
|
||||||
|
|
||||||
data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
|
|
||||||
|
|
||||||
elif self.cv_method == "5kfold":
|
|
||||||
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
|
|
||||||
|
|
||||||
return data_X, data_y, data_groups
|
|
||||||
|
|
||||||
# TODO This is probably NOT in the right place in this class ...
|
|
||||||
def initialize_cv_method(self, cv_method):
|
|
||||||
self.cv_method = cv_method
|
|
||||||
self.X, self.y, self.groups = self.prepare_data_for_cross_validation()
|
|
||||||
|
|
||||||
if cv_method in ["logo", "half_logo"]:
|
|
||||||
cv = LeaveOneGroupOut()
|
|
||||||
elif cv_method == "5kfold":
|
|
||||||
cv = StratifiedKFold(n_splits=5, shuffle=True)
|
|
||||||
|
|
||||||
def get_cv_train_test_split():
|
|
||||||
# TODO: a for loop over all possible LOSO splits? In short, this preprocessing class already receives a single train-test instance
|
|
||||||
# (i.e., 55 participants versus 1 participant).
|
|
||||||
# It would also be possible to create a class that handles both, but that could lead to misinterpretations.
|
|
||||||
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def one_hot_encoder(categorical_features, numerical_features, mode):
|
|
||||||
"""
|
"""
|
||||||
This code is an implementation of one-hot encoding. It takes in two data sets,
|
This code is an implementation of one-hot encoding. It takes in two data sets,
|
||||||
one with categorical features and one with numerical features and a mode parameter.
|
one with categorical features and one with numerical features and a mode parameter.
|
||||||
|
@ -103,7 +64,6 @@ class Preprocessing:
|
||||||
categorical_columns (list, optional): List of categorical columns in the dataset.
|
categorical_columns (list, optional): List of categorical columns in the dataset.
|
||||||
Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"].
|
Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"].
|
||||||
|
|
||||||
TODO: TESTING
|
|
||||||
"""
|
"""
|
||||||
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
|
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
|
||||||
|
|
||||||
|
@ -111,16 +71,16 @@ class Preprocessing:
|
||||||
|
|
||||||
train_X_categorical_features = self.train_X[categorical_columns].copy()
|
train_X_categorical_features = self.train_X[categorical_columns].copy()
|
||||||
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
|
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
|
||||||
mode_train_X_categorical_features = train_X_categorical_features.mode()
|
mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
|
||||||
|
|
||||||
self.train_X = one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
|
self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
|
||||||
|
|
||||||
# For test set
|
# For test set
|
||||||
|
|
||||||
test_X_categorical_features = self.test_X[categorical_columns].copy()
|
test_X_categorical_features = self.test_X[categorical_columns].copy()
|
||||||
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
|
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
|
||||||
|
|
||||||
self.test_X = one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
|
self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
|
||||||
|
|
||||||
|
|
||||||
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
|
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
|
||||||
|
@ -152,6 +112,15 @@ class Preprocessing:
|
||||||
medians = self.train_X[other_feature_list].median()
|
medians = self.train_X[other_feature_list].median()
|
||||||
self.train_X[other_feature_list].fillna(medians, inplace=True)
|
self.train_X[other_feature_list].fillna(medians, inplace=True)
|
||||||
self.test_X[other_feature_list].fillna(medians, inplace=True)
|
self.test_X[other_feature_list].fillna(medians, inplace=True)
|
||||||
|
|
||||||
|
|
||||||
|
def get_train_test_sets(self):
|
||||||
|
"""Train and test sets getter
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple of pandas DataFrames: The train and test sets in the traditional sklearn order (train_X, train_y, test_X, test_y).
|
||||||
|
"""
|
||||||
|
return self.train_X, self.train_y, self.test_X, self.test_y
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue