import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold, StratifiedKFold, cross_validate
from sklearn.naive_bayes import GaussianNB
2023-02-15 12:27:39 +01:00
""" Feature selection pipeline: a methods that can be used in the wrapper metod alongside other wrapper contents (hyperparameter tuning etc.).
2023-04-14 17:20:22 +02:00
( 1 ) Establish methods for each of the steps in feature selection protocol .
2023-02-15 12:27:39 +01:00
( 2 ) Ensure that above methods are given only a part of data and use appropriate random seeds - to later simulate use case in production .
( 3 ) Implement a method which gives graphical exploration of ( 1 ) ( a ) and ( b ) steps of the feature selection .
( 4 ) Prepare a core method that can be fit into a wrapper ( see sklearn wrapper methods ) and integrates methods from ( 1 )
2023-02-20 11:51:34 +01:00
"""
class FeatureSelection:
    """Greedy wrapper-style feature selection over a feature matrix X and target y.

    `select_best_feature` cross-validates candidate feature sets one candidate
    at a time and picks the best according to a chosen metric;
    `select_features` applies it repeatedly to produce a final feature list.
    """

    def __init__(self, X, y):
        # X: pandas DataFrame of features; y: target aligned with X's rows.
        self.X = X
        self.y = y

    def select_best_feature(self, features, method="remove", ml_type="classification",
                            metric="recall", stored_features=None):
        """Select the best feature by testing predictions with or without each candidate.

        The "remove" method drops a particular feature and cross-validates on the
        remaining columns; the "add" method adds a particular feature to the
        previously established feature set (`stored_features`). The best feature
        is the one whose candidate set maximizes the chosen metric.

        Args:
            features (list): Features to select the best/worst from.
            method (str, optional): "remove" or "add" features. Defaults to "remove".
            ml_type (str, optional): "classification" (GaussianNB) or "regression"
                (Lasso); controls the ML algorithm and the allowed metrics.
                Defaults to "classification".
            metric (str, optional): Metric used to rank candidates. Defaults to "recall".
            stored_features (list, optional): When method is "add", the features
                previously selected. Defaults to None (treated as an empty list).

        Raises:
            ValueError: If the metric is not valid for the chosen ml_type.
            ValueError: If an unknown ml_type is chosen.

        Returns:
            tuple: (best feature name, best mean CV score, best CV score std);
                (None, None, None) when `features` is empty.
        """
        # None sentinel instead of a mutable default argument.
        if stored_features is None:
            stored_features = []

        clf_metrics = ('accuracy', 'precision', 'recall', 'f1')
        if ml_type == "classification" and metric not in clf_metrics:
            raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
        elif ml_type == "regression" and metric not in ('r2',):
            raise ValueError("Regression metric not recognized. Please choose 'r2'")

        best_feature = None
        best_metric_score = None      # initialized so empty `features` cannot NameError
        best_metric_score_std = None
        for feat in features:
            if method == "remove":
                pred_features = [col for col in self.X.columns if col != feat]  # all but feat
            elif method == "add":
                pred_features = [feat] + stored_features  # feat with stored features
            X = self.X[pred_features].copy()

            if ml_type == "classification":
                # The filter must be active while cross_validate runs - that is
                # where the ill-defined-precision warning is emitted.
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.",
                    )
                    model_cv = cross_validate(
                        GaussianNB(),
                        X=X,
                        y=self.y,
                        cv=StratifiedKFold(n_splits=5, shuffle=True),
                        n_jobs=-1,
                        scoring=clf_metrics,
                    )
            elif ml_type == "regression":
                model_cv = cross_validate(
                    Lasso(),
                    X=X,
                    y=self.y,  # fixed: previously referenced an undefined local `y`
                    # KFold, not StratifiedKFold: stratification is undefined
                    # for continuous regression targets.
                    cv=KFold(n_splits=5, shuffle=True),
                    n_jobs=-1,
                    # Tuple (not bare string) so the result key is 'test_r2'.
                    scoring=('r2',),
                )
            else:
                raise ValueError("ML type not yet implemented!")

            # One generic lookup replaces four copy-pasted metric branches.
            scores = model_cv[f'test_{metric}']
            score, score_std = np.mean(scores), np.std(scores)
            if best_feature is None or score > best_metric_score:
                best_feature = feat
                best_metric_score = score
                best_metric_score_std = score_std
        return best_feature, best_metric_score, best_metric_score_std

    def select_features(self, n_min=20, n_max=50, method="remove", n_not_improve=10):
        """Select a set of features and return them as a list.

        Features are removed sequentially (the least harmful removal first, as
        judged by `select_best_feature` on recall). While the score improves the
        removal continues; once the best score has not been beaten for
        `n_not_improve` consecutive removals inside the [n_min, n_max] window,
        the search stops and every feature removed at or after the best-scoring
        step is dropped from the final selection.

        Args:
            n_min (int): Minimal amount of features returned.
            n_max (int): Maximal amount of features returned.
            method (str, optional): "remove" or "add" features. Defaults to
                "remove" (only "remove" is currently implemented).
            n_not_improve (int): Number of further removals allowed to beat the
                current best score before the search stops.

        Raises:
            ValueError: If n_min exceeds the number of features.
            ValueError: If n_max is smaller than n_min.

        Returns:
            list: List of selected feature names.
        """
        n_features = self.X.shape[1]
        if n_max >= n_features:
            n_max = n_features - 1  # the algorithm removes at least one feature
        if n_min > n_features:
            raise ValueError("The number of features in the dataframe must be at least as n_min+1 parameter.")
        if n_max < n_min:
            raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")

        features = self.X.columns.tolist()
        feature_importance = []
        best_score = 0
        best_feature_indx = None
        if method == "remove":
            i_worse = 0
            for i in reversed(range(n_features)):
                if i + 1 == n_min:  # never go below n_min remaining features
                    break
                best_feature, best_metric_score, best_metric_score_std = \
                    self.select_best_feature(features, method=method, ml_type="classification", metric="recall")
                feature_importance.append((i + 1, best_feature, best_metric_score, best_metric_score_std))
                features.remove(best_feature)
                if i <= n_max:  # only track scores inside the [n_min, n_max] window
                    if best_metric_score >= best_score:
                        best_score = best_metric_score
                        best_feature_indx = i + 1
                        i_worse = 0
                    else:
                        i_worse += 1
                        if i_worse == n_not_improve:
                            break

        feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
        print(feature_importance_df)
        print("best_feature_indx", best_feature_indx)
        print("best_score", best_score)
        features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
        selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
        return selected_features