Add preprocessing class.

2023-02-22 13:44:03 +01:00 · 2023-02-22 13:44:03 +01:00 · 8f6cb3f444
parent ef12f64fe5
commit 8f6cb3f444
1 changed files with 86 additions and 0 deletions
--- a/machine_learning/preprocessing.py
+++ b/machine_learning/preprocessing.py
@ -0,0 +1,86 @@
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
+
+class Preprocessing:
+    """Preprocessing helpers for one cross-validation iteration or for a whole data set.
+
+    The class is intended to be blind to the test data, e.g. the test data is
+    meant to be imputed with the mean of the train data. It therefore needs
+    access to the train/test split, which is handed to the constructor.
+
+    The cross-validation helpers read ``self.data``, which ``__init__`` does
+    not set: the caller has to assign a DataFrame containing at least the
+    columns ``target`` and ``pid`` before using them.
+    """
+
+    def __init__(self, train_X, train_y, test_X, test_y):
+        """Store the train/test split on the instance."""
+        self.train_X = train_X
+        self.train_y = train_y
+        self.test_X = test_X
+        self.test_y = test_y
+
+    # TODO This is probably NOT in the right place in this class ...
+    def prepare_data_for_cross_validation(self):
+        """Split ``self.data`` into features, target and CV groups.
+
+        Reads ``self.cv_method`` and a copy of ``self.data``; ``self.data``
+        itself is not modified.
+
+        * ``"logo"`` and ``"5kfold"``: the groups are the ``pid`` column.
+        * ``"half_logo"``: the rows of every ``pid`` are cut into two halves by
+          row position and the groups are labelled ``"<pid>_1"`` / ``"<pid>_2"``.
+
+        Returns:
+            Tuple ``(data_X, data_y, data_groups)``: ``data`` without the
+            ``target`` and ``pid`` columns, the ``target`` column, and the
+            group labels.
+
+        Raises:
+            AttributeError: if ``self.data`` (or ``self.cv_method``) is not set.
+            ValueError: if ``self.cv_method`` is not a supported method.
+        """
+        data = getattr(self, "data", None)
+        if data is None:
+            raise AttributeError(
+                "Preprocessing.data must be assigned before preparing data "
+                "for cross validation"
+            )
+        data = data.copy()
+        cv_method = self.cv_method
+
+        if cv_method in ("logo", "5kfold"):
+            data_groups = data["pid"]
+        elif cv_method == "half_logo":
+            # Helper values are kept out of ``data`` so that none of them can
+            # leak into the feature matrix.
+            position = data.groupby("pid").cumcount()
+            group_size = data.groupby("pid")["pid"].transform("count")
+            # position / group_size lies in [0, 1), so this yields 1 or 2.
+            half = (position / group_size + 1).round().astype(int)
+            data_groups = (
+                data["pid"].astype(str) + "_" + half.astype(str)
+            ).rename("pid_half")
+        else:
+            raise ValueError(
+                f"Unknown cv_method {cv_method!r}; "
+                "expected 'logo', 'half_logo' or '5kfold'"
+            )
+
+        data_X = data.drop(["target", "pid"], axis=1)
+        data_y = data["target"]
+        return data_X, data_y, data_groups
+
+    # TODO This is probably NOT in the right place in this class ...
+    def initialize_cv_method(self, cv_method, random_state=None):
+        """Prepare the data and create the splitter for ``cv_method``.
+
+        Sets ``self.cv_method``, ``self.X``, ``self.y``, ``self.groups`` and
+        ``self.cv``.
+
+        Args:
+            cv_method: ``"logo"``, ``"half_logo"`` or ``"5kfold"``.
+            random_state: passed to ``StratifiedKFold`` for ``"5kfold"`` so the
+                shuffled folds can be reproduced; unused for the other methods.
+
+        Returns:
+            The created splitter (also stored as ``self.cv``).
+
+        Raises:
+            ValueError: if ``cv_method`` is not a supported method.
+        """
+        self.cv_method = cv_method
+        self.X, self.y, self.groups = self.prepare_data_for_cross_validation()
+
+        if cv_method in ("logo", "half_logo"):
+            self.cv = LeaveOneGroupOut()
+        elif cv_method == "5kfold":
+            self.cv = StratifiedKFold(
+                n_splits=5, shuffle=True, random_state=random_state
+            )
+        else:
+            raise ValueError(
+                f"Unknown cv_method {cv_method!r}; "
+                "expected 'logo', 'half_logo' or '5kfold'"
+            )
+        return self.cv
+
+    def get_cv_train_test_split(self):
+        """Placeholder, not implemented yet; currently returns ``None``."""
+        # TODO: loop over all possible LOSO splits? How will that be integrated here?
+        pass
+
+    def one_hot_encode(
+        self,
+        categorical_columns=("gender", "startlanguage", "mostcommonactivity", "homelabel"),
+    ):
+        """One-hot encode the categorical columns of ``self.X``.
+
+        Names in ``categorical_columns`` that are not columns of ``self.X`` are
+        ignored. Missing values of a categorical column are filled with that
+        column's mode before encoding. ``self.X`` is not modified.
+
+        Args:
+            categorical_columns: names of the columns to treat as categorical.
+
+        Returns:
+            DataFrame with the remaining columns of ``self.X`` followed by the
+            dummy columns.
+        """
+        present = [col for col in self.X.columns if col in categorical_columns]
+        if not present:
+            # Nothing to encode; ``mode().iloc[0]`` would fail on an empty frame.
+            return self.X.copy()
+
+        categorical_features = self.X[present].copy()
+
+        # fillna with mode (the mode frame has no rows if every value is missing)
+        modes = categorical_features.mode()
+        if not modes.empty:
+            categorical_features = categorical_features.fillna(modes.iloc[0])
+
+        # one-hot encoding
+        categorical_features = categorical_features.apply(
+            lambda col: col.astype("category")
+        )
+        if not categorical_features.empty:
+            categorical_features = pd.get_dummies(categorical_features)
+
+        numerical_features = self.X.drop(present, axis=1)
+
+        # TODO: has to return a train set (or 54 participants in logo) and a test set (1 participant in logo)
+        return pd.concat([numerical_features, categorical_features], axis=1)
+
+    def imputer(self, method="mean"):
+        """Placeholder, not implemented yet; currently returns ``None``."""
+        # TODO: This has to be done in context of CV method - so that test data only has access to the train data statistics (it is imputed with train data mean or median etc.)
+        # TODO: has to return a train set (or 54 participants in logo) and a test set (1 participant in logo)
+        pass
+
+
+