Merge branch 'ml_pipeline'

2023-05-10 15:02:17 +02:00 · 2023-05-10 15:02:17 +02:00 · 3e38b64b45
parent 76071fd550 26804cf8ea
commit 3e38b64b45
4 changed files with 219 additions and 137 deletions
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@ -20,30 +20,74 @@ import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import recall_score, f1_score
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 from machine_learning.cross_validation import CrossValidation
 from machine_learning.preprocessing import Preprocessing
 from machine_learning.feature_selection import FeatureSelection
 # %% 
 df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 df.set_index(index_columns, inplace=True)
 # Create binary target 
 bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
 nan_cols = df.columns[df.isna().any()].tolist()
 df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
 cv = CrossValidation(data=df, cv_method="logo")
 categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
 interval_feature_list, other_feature_list = [], []
-print(df.columns.tolist())
+# %%
 for split in cv.get_splits():
    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
    pre = Preprocessing(train_X, train_y, test_X, test_y)
    pre.one_hot_encode_train_and_test_sets(categorical_columns)
    train_X, train_y, test_X, test_y = pre.get_train_test_sets()
    print(train_X.shape, test_X.shape)
    # Predict before feature selection
    rfc = RandomForestClassifier(n_estimators=10)
    rfc.fit(train_X, train_y)
    predictions = rfc.predict(test_X)
    print("Recall:", recall_score(test_y, predictions))
    print("F1:", f1_score(test_y, predictions))
    # Feature selection on train set
    train_groups, test_groups = cv.get_groups_sets(split)
    fs = FeatureSelection(train_X, train_y, train_groups) 
    selected_features = fs.select_features(n_min=20, n_max=29, k=40,
                                           ml_type="classification_bin", 
                                           metric="recall", n_tolerance=20)
    train_X = train_X[selected_features]
    test_X = test_X[selected_features]
    print(selected_features)
    print(len(selected_features))
    # Predict after feature selection    
    rfc = RandomForestClassifier(n_estimators=500)
    rfc.fit(train_X, train_y)
    predictions = rfc.predict(test_X)
    print("Recall:", recall_score(test_y, predictions))
    print("F1:", f1_score(test_y, predictions))
    break
 # %%
--- a/machine_learning/cross_validation.py
+++ b/machine_learning/cross_validation.py
@ -49,8 +49,8 @@ class CrossValidation():
            data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
-        elif self.cv_method == "5kfold":
+        elif self.cv_method == "Stratified5kfold":
-            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
+            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None
        self.X, self.y, self.groups = data_X, data_y, data_groups
@ -71,7 +71,7 @@ class CrossValidation():
        if self.cv_method in ["logo", "half_logo"]:
            self.cv = LeaveOneGroupOut()
-        elif self.cv_method == "5kfold":
+        elif self.cv_method == "Stratified5kfold":
            self.cv = StratifiedKFold(n_splits=5, shuffle=True)
@ -118,4 +118,11 @@ class CrossValidation():
        """
        return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
    def get_groups_sets(self, split):
        if self.groups is None:
            return None, None
        else:
            return self.groups.iloc[split[0]], self.groups.iloc[split[1]]
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@ -1,11 +1,13 @@
 import os
 import sys
 import warnings
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
-from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
 from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold
 from sklearn.naive_bayes import GaussianNB
 from sklearn.linear_model import Lasso 
@ -21,13 +23,15 @@ from sklearn.linear_model import Lasso
 class FeatureSelection:
-    def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
+    def __init__(self, X, y, groups):
-        pass # TODO.... 
+        self.X = X
        self.y = y
        self.groups = groups
-    def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
+    def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
        """The method selects the best feature by testing the prediction on the feature set with or without the current feature.
-        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat 
+        The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular 
        feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
        specified as a parameter.
@ -35,7 +39,11 @@ class FeatureSelection:
            df (DataFrame): Input data on which the predictions will be made.
            features (list): List of features to select the best/worst from
            method (str, optional): remove or add features.  Defaults to "remove".
-            ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and  metric. Defaults to "classification".
+            ml_category (str, optional): Either "classification" or "regression"; controls the ML algorithm and the valid metrics.
                Defaults to "classification".
            ml_subcategory (str, optional): In case of classification, 'bin' for binary classification
                and 'multi' for multiclass classification. For regression an empty string '' is sufficient. 
                Defaults to "bin".
            metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
            stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to [].
@ -49,173 +57,189 @@ class FeatureSelection:
        best_feature = None
-        if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
+        # Validate the ML type against the specified metric
-            raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
+        if ml_category == "classification":
-        elif ml_type == "regression" and metric not in ['r2']:
+            if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
                raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
            elif ml_subcategory == "multi":
                ml_subcategory_error = False
                if metric != "accuracy" and "_" in metric:          
                    metric_s, metric_t = metric.split("_")
                    if metric_s not in ['accuracy', 'precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
                        ml_subcategory_error = True
                else:
                    ml_subcategory_error = True
                if ml_subcategory_error:
                    raise ValueError(""""Classification metric for multi-class classification must be specified precisely.
                                     Available metric are: 'accuracy', 'precision', 'recall' and 'f1'.
                                     Only accuracy must be specified as 'accuracy'.
                                     For others please add appropriate suffixes: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'""")
        elif ml_category == "regression" and metric not in ['r2']:
            raise ValueError("Regression metric not recognized. Please choose 'r2'")
        for feat in features:
            if method == "remove":
-                pred_features = [col for col in df.columns if feat != col] # All but feat
+                pred_features = [col for col in self.X.columns if feat != col] # All but feat
            elif method == "add":
                pred_features = [feat] + stored_features # Feat with stored features
-            X, y  = df.drop(columns=['target', 'pid'])[pred_features], df['target']
+            X  = self.X[pred_features].copy()
-            if ml_type == "classification":
+            if self.groups is not None:
                cv = GroupKFold(n_splits=5)
            else:
                cv = StratifiedKFold(n_splits=5, shuffle=True)
            # See link about scoring for multiclass classification
            # http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
            if ml_category == "classification":
                nb = GaussianNB()
                model_cv = cross_validate(
                    nb,
                    X=X,
-                    y=y,
+                    y=self.y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    cv=cv,
                    groups=self.groups,
                    n_jobs=-1,
-                    scoring=('accuracy', 'precision', 'recall', 'f1')
+                    scoring=(metric)
                )
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
-                    if metric == "accuracy":
+            elif ml_category == "regression":
                        acc = np.mean(model_cv['test_accuracy'])
                        acc_std = np.std(model_cv['test_accuracy'])
                        if not best_feature or (acc > best_metric_score):
                            best_feature = feat
                            best_metric_score = acc
                            best_metric_score_std = acc_std
                    elif metric == "precision":
                        prec = np.mean(model_cv['test_precision'])
                        prec_std = np.std(model_cv['test_precision'])
                        if not best_feature or (prec > best_metric_score):
                            best_feature = feat
                            best_metric_score = prec
                            best_metric_score_std = prec_std
                    elif metric == "recall":
                        rec = np.mean(model_cv['test_recall'])
                        rec_std = np.std(model_cv['test_recall'])
                        if not best_feature or (rec > best_metric_score):
                            best_feature = feat
                            best_metric_score = rec
                            best_metric_score_std = rec_std
                    else:
                        f1 = np.mean(model_cv['test_f1'])
                        f1_std = np.std(model_cv['test_f1'])
                        if not best_feature or (f1 > best_metric_score):
                            best_feature = feat
                            best_metric_score = f1
                            best_metric_score_std = f1_std 
            elif ml_type == "regression":
                lass = Lasso()
                model_cv = cross_validate(
                    lass,
                    X=X,
                    y=y,
-                    cv=StratifiedKFold(n_splits=5, shuffle=True),
+                    cv=cv,
                    groups=self.groups,
                    n_jobs=-1,
                    scoring=('r2')
                )
                if metric == "r2":
                    r2 = np.mean(model_cv['test_r2'])
                    r2_std = np.std(model_cv['test_r2'])
                    if not best_feature or (r2 > best_metric_score):
                        best_feature = feat
                        best_metric_score = r2
                        best_metric_score_std = r2_std
            else:
                raise ValueError("ML type not yet implemented!")
            # Section of metrics' scores comparison. 
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
                metric_score = np.nanmean(model_cv["test_score"])
                metric_score_std = np.nanstd(model_cv["test_score"])
                if not best_feature or (metric_score > best_metric_score):
                    best_feature = feat
                    best_metric_score = metric_score
                    best_metric_score_std = metric_score_std
        return best_feature, best_metric_score, best_metric_score_std
-    def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
+    def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
        """This method selects a set of features and returns them as a list. It returns number of features 
        determined in the interval of [n_min, n_max]. 
-        n_features = df.shape[1] - 2 # -2 beacause pid and target are not considered
+        The method consists of two steps: 
-        if n_max > n_features:
+        (1) The method uses sklearn kBest method which selects k best features dependent on the ml_type parameter.
-            n_max = n_features
+        (2) The sequential feature removal procedure is executed, using the remaining features from (1).
            The best score is detected using a removal procedure. The procedure sequentially removes the features
            that contribute the least to the chosen evaluation metric. If the ML score improves at a given step,
            the next feature is removed; otherwise a tolerance criterion (n_tolerance) applies: the next
            n_tolerance removals are inspected to see whether the current best score is improved.
        Args:
            n_min (int, optional): Minimal amount of features returned.
            n_max (int, optional): Maximal amount of features returned.
            k (int, optional): Determines the k in the k-best features method. 
                If None, SelectKBest feature selection does not execute.
            ml_type(str, optional): Type of ML problem. Currently implemented options: 
                'classification_bin', 'classification_multi', and 'regression_'
            method (str, optional): "remove" or "add" features.  Defaults to "remove".
            n_tolerance (int, optional): Number of consecutive removals without an improvement of the best score
                after which the search stops; the feature set at the current best score is then selected.
        Returns:
            list: list of selected features
        """        
        if k is not None and k <= n_max:
            raise ValueError("The k parameter needs to be greater than the n_max parameter.")
        # Select k-best feature dependent on the type of ML task
        ml_category, ml_subcategory = ml_type.split("_")
        if k is not None:
            if ml_category == "classification":
                if ml_subcategory== "bin":
                    selector = SelectKBest(mutual_info_classif, k=k)
                elif ml_subcategory== "multi":
                    selector = SelectKBest(f_classif, k=k)
                else:
                    raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
            elif ml_category == "regression":
                selector = SelectKBest(f_regression, k=k)
            else:
                raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
            selector.fit(self.X, self.y)
            cols_idxs = selector.get_support(indices=True)
            self.X = self.X.iloc[:,cols_idxs]
        print("All columns (after SelectKBest method):")
        print(self.X.columns)
        # Sequential feature addition / removal
        n_features = self.X.shape[1]
        if n_max >= n_features:
            n_max = n_features-1 # The algorithm removes at least one feature
        if n_min > n_features:
-            raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
+            raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")
        if n_max < n_min:
            raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
-        features = df.columns.tolist()
+        features = self.X.columns.tolist()
        features.remove("pid")
        features.remove("target")
        feature_importance = []
        if method == "remove":
            best_score = 0
            best_feature_indx = None
            i_worse = 0
            for i in reversed(range(n_features)):
                if i+1 == n_min:
                    break
                best_feature, best_metric_score, best_metric_score_std = \
-                    self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
+                    self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric)
-                feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
+                    
                feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
                features.remove(best_feature)
                print("Features left:", i) 
                if i <= n_max:
                    if best_metric_score >= best_score:
                        best_score = best_metric_score
                        best_feature_indx = i+1
                        i_worse = 0
                    else:
                        i_worse += 1
                    if i_worse == n_tolerance: 
                        break  
            feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
-            # Selekcijski kriterij značilk v rangu max-min
+            print(feature_importance_df)
-            # Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
+            print("best_feature_indx", best_feature_indx)
            print("best_score", best_score)
-            # Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
+            features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
            selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]    
-            # "Tipping point" značilka mora biti v rangu max-min
+            return selected_features
-            selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
+        else:
-            selection_area.set_index(["i", "name"], inplace=True)
+            raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
            diffrences = selection_area.diff()
            diffrences.dropna(how='any', inplace=True)
            # Perhaps a cumulative sum as well? Simply take the index with the highest value
            cumulative_sumation = diffrences.cumsum()
            tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
            # A very conservative method that stops searching for a better alternative at the first non-improving result
            tipping_feature_indx_2 = None
            for indx, row in diffrences.iterrows():
                if row["metric"] > 0:
                    tipping_feature_indx_2 = indx
                else: 
                    break
            # A method that gives n_not_improve features a chance to beat the best score so far
            tipping_feature_indx_3 = None
            cum_sum_score = 0
            i_worse = 0
            # TODO: it might make sense to merge the diff, cumsum, and scores columns ...
            for indx, row in selection_area.iterrows():
                if row["metric"] > 0:
                    tipping_feature_indx_3 = indx
                    cum_sum_score += row["metric"]
                    i_worse = 0
                else:
                    i_worse += 1
                if i_worse == n_not_improve:
                    break
    def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
        pass
    def vizualize_feature_selection_process():
        pass
    def execute_feature_selection_step():
        pass
--- a/machine_learning/preprocessing.py
+++ b/machine_learning/preprocessing.py
@ -33,7 +33,7 @@ class Preprocessing:
        Args:
            categorical_features (DataFrame): DataFrame including only categorical columns.
            numerical_features (DataFrame): DataFrame including only numerical columns.
-            mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
+            mode (int): Mode of the column with which DataFrame is filled.
        Returns:
            tuple: One-hot encoded DataFrame and the list of encoded categorical column names.
@ -46,7 +46,7 @@ class Preprocessing:
        if not categorical_features.empty:
            categorical_features = pd.get_dummies(categorical_features)
-        return pd.concat([numerical_features, categorical_features], axis=1)
+        return pd.concat([numerical_features, categorical_features], axis=1), categorical_features.columns.tolist()
    def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
@ -68,19 +68,26 @@ class Preprocessing:
        categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
        # For train set
        train_X_categorical_features = self.train_X[categorical_columns].copy()
        train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
        mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
-        self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
+        self.train_X, train_cat_col_names = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
        encoded_categorical_features = [col for col in self.train_X.columns if col.startswith(tuple(categorical_columns))]
        # For test set
        test_X_categorical_features = self.test_X[categorical_columns].copy()
        test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
-        self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
+        self.test_X, test_cat_col_names = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
        # Create categorical columns that were not found in test set and fill them with 0        
        missing_cols = [col for col in train_cat_col_names if col not in test_cat_col_names]
        self.test_X[missing_cols] = 0
        # Sort column names alphabetically        
        self.train_X = self.train_X.reindex(sorted(self.train_X.columns), axis=1)
        self.test_X = self.test_X.reindex(sorted(self.test_X.columns), axis=1)
    def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):