# machine_learning/preprocessing.py
"""Preprocessing helpers meant to be used inside a cross-validation loop."""

import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold


class Preprocessing:
    """Preprocessing steps for a single CV iteration or for the whole data set.

    The class is intended to stay blind to the test data, e.g. test data
    should be imputed with statistics computed on the train data only. To do
    that it needs to know how the data is split; the split-aware parts
    (``get_cv_train_test_split`` and ``imputer``) are still TODO stubs.

    Attributes set by ``__init__``:
        train_X, train_y, test_X, test_y: the split handed in by the caller.
        data: optional full data frame with ``"target"`` and ``"pid"``
            columns, required by ``prepare_data_for_cross_validation``.
        cv_method: name of the selected CV scheme, ``None`` until
            ``initialize_cv_method`` is called.

    Attributes set by ``initialize_cv_method``:
        X, y, groups: features, target and group labels derived from ``data``.
        cv: the scikit-learn splitter object.
    """

    # CV schemes understood by this class.
    SUPPORTED_CV_METHODS = ("logo", "half_logo", "5kfold")

    def __init__(self, train_X, train_y, test_X, test_y, data=None):
        """Store the train/test split and, optionally, the full data frame.

        ``data`` is optional so existing four-argument callers keep working;
        it may also be assigned to ``self.data`` later.
        """
        self.train_X = train_X
        self.train_y = train_y
        self.test_X = test_X
        self.test_y = test_y
        self.data = data
        self.cv_method = None

    # TODO This is probably NOT in the right place in this class ...
    def prepare_data_for_cross_validation(self):
        """Split ``self.data`` into features, target and CV group labels.

        Returns:
            Tuple ``(data_X, data_y, data_groups)`` where ``data_X`` is
            ``self.data`` without the ``"target"`` and ``"pid"`` columns,
            ``data_y`` is the ``"target"`` column and ``data_groups`` is the
            per-row group label for ``self.cv_method``.

        Raises:
            AttributeError: if ``self.data`` has not been provided.
            ValueError: if ``self.cv_method`` is not a supported scheme.
        """
        if self.data is None:
            raise AttributeError(
                "Preprocessing.data is not set; pass `data=` to the constructor "
                "or assign it before preparing cross-validation data."
            )
        if self.cv_method not in self.SUPPORTED_CV_METHODS:
            raise ValueError(
                f"Unknown cv_method {self.cv_method!r}; "
                f"expected one of {self.SUPPORTED_CV_METHODS}."
            )

        data = self.data.copy()

        if self.cv_method == "half_logo":
            # Split every participant's rows into two halves by row order:
            # position / count lies in [0, 1), so "+ 1" then round() gives
            # 1 for the first half and 2 for the second.
            per_pid = data.groupby("pid")
            position = per_pid.cumcount()
            count = per_pid["pid"].transform("count")
            half = (position / count + 1).round().astype(int)
            # Helper values are kept in standalone Series so that none of
            # them can leak into the feature matrix.
            data_groups = (
                data["pid"].astype(str) + "_" + half.astype(str)
            ).rename("pid_half")
        else:
            # "logo" and "5kfold" both use the participant id as the group.
            data_groups = data["pid"]

        data_X = data.drop(["target", "pid"], axis=1)
        data_y = data["target"]
        return data_X, data_y, data_groups

    # TODO This is probably NOT in the right place in this class ...
    def initialize_cv_method(self, cv_method):
        """Select the CV scheme, prepare ``X``/``y``/``groups`` and the splitter.

        Args:
            cv_method: one of ``"logo"``, ``"half_logo"`` or ``"5kfold"``.

        Returns:
            The scikit-learn splitter, also stored on ``self.cv``.

        Raises:
            ValueError: if ``cv_method`` is not supported.
        """
        if cv_method not in self.SUPPORTED_CV_METHODS:
            raise ValueError(
                f"Unknown cv_method {cv_method!r}; "
                f"expected one of {self.SUPPORTED_CV_METHODS}."
            )

        self.cv_method = cv_method
        self.X, self.y, self.groups = self.prepare_data_for_cross_validation()

        if cv_method in ("logo", "half_logo"):
            self.cv = LeaveOneGroupOut()
        else:  # "5kfold"
            self.cv = StratifiedKFold(n_splits=5, shuffle=True)
        return self.cv

    def get_cv_train_test_split(self):
        """Placeholder for producing the train/test split of a CV iteration."""
        # TODO: for loop over all possible LOSO splits? How will this be
        # integrated here?
        pass

    def one_hot_encode(
        self,
        categorical_columns=("gender", "startlanguage", "mostcommonactivity", "homelabel"),
    ):
        """One-hot encode the categorical columns of ``self.X``.

        Missing values in a categorical column are filled with that column's
        mode before encoding. Names in ``categorical_columns`` that are not
        columns of ``self.X`` are ignored.

        Args:
            categorical_columns: names of the columns to treat as categorical.

        Returns:
            A new DataFrame holding the remaining columns of ``self.X``
            followed by the dummy columns. ``self.X`` is not modified.
        """
        present = [col for col in self.X.columns if col in categorical_columns]
        numerical_features = self.X.drop(present, axis=1)
        if not present:
            # Nothing to encode; avoids mode().iloc[0] on an empty frame.
            return numerical_features

        categorical_features = self.X[present].copy()

        # fillna with mode; mode() has no rows when every value is missing
        modes = categorical_features.mode()
        if not modes.empty:
            categorical_features = categorical_features.fillna(modes.iloc[0])

        # one-hot encoding
        categorical_features = categorical_features.apply(
            lambda col: col.astype("category")
        )
        if not categorical_features.empty:
            categorical_features = pd.get_dummies(categorical_features)

        # TODO: has to return a train set (or 54 participants in logo) and a
        # test set (1 participant in logo)
        return pd.concat([numerical_features, categorical_features], axis=1)

    def imputer(self, method="mean"):
        """Placeholder for imputation that is fitted on train data only."""
        # TODO: This has to be done in context of CV method - so that test
        # data only sees the train-data statistic (it is imputed with the
        # train data mean or median etc.)
        # TODO: has to return a train set (or 54 participants in logo) and a
        # test set (1 participant in logo)
        pass