Prepare scripts for feature importance analysis.

2023-01-19 16:20:43 +01:00 · 2023-01-19 16:20:43 +01:00 · 6a98c8cdcf
parent d263b32564
commit 6a98c8cdcf
4 changed files with 310 additions and 8 deletions
--- a/exploration/expl_features_analysis.py
+++ b/exploration/expl_features_analysis.py
@ -21,6 +21,12 @@ import os, sys, math
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 from sklearn.tree import DecisionTreeClassifier
 from sklearn import tree
 from sklearn.impute import SimpleImputer
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 def calc_entropy(column):
@ -87,15 +93,14 @@ def n_features_with_highest_info_gain(info_gain_dict, n=50):
    Get n-features that have highest information gain
    """
    import heapq
-    return heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
+    n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
    return {feature[0]: feature[1] for feature in n_largest}
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 # Columns that are moved into the (multi-level) index of the loaded table.
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
 # %% 
 # Categorical columns: two fixed names plus every column whose name contains
 # "mostcommonactivity" or "homelabel".
 categorical_feature_colnames = ["gender", "startlanguage"]
 additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
 categorical_feature_colnames += additional_categorical_features
@ -114,12 +119,161 @@ if not categorical_features.empty:
 # Numerical part: every column that is not in the categorical column list.
 numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
 # Re-assemble the model input column-wise from the numerical and categorical parts.
 model_input = pd.concat([numerical_features, categorical_features], axis=1)
 # Binarize the target
 bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 # With right=True the intervals are (-1, 0] -> label 0 and (0, 4] -> label 1;
 # `edges` receives the bin edges because retbins=True.
 model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
 print(model_input['target'].value_counts(), edges)
 # %%
 # Information gain of every column with respect to the binarized target,
 # then keep the 150 columns with the highest gain.
 info_gains = get_information_gains(model_input, 'target')
 selected_features = n_features_with_highest_info_gain(info_gains, n=150)
 selected_features
 # TODO: binarize the target
 # NOTE(review): the target already appears to be binarized in the cell above - confirm and drop this TODO.
 # %% [markdown]
 # Present the feature importance results
 # %%
 print("Total columns:", len(info_gains))
 # How many columns share each information-gain value.
 print(pd.Series(info_gains).value_counts())
 n_features_with_highest_info_gain(info_gains, n=189)
 # %%
 def compute_impurity(feature, impurity_criterion):
    """
    Calculate the impurity of a feature.

    Supported impurity criteria: 'entropy', 'gini'

    input: feature (this needs to be a Pandas series)
           impurity_criterion ('entropy' or 'gini')
    output: feature impurity
    raises: ValueError if the impurity criterion is not supported
    """
    probs = feature.value_counts(normalize=True)
    # A categorical series also lists its unobserved categories, with
    # probability 0. Drop them: 0 * log2(0) evaluates to NaN instead of 0
    # and would turn the entropy of a pure partition into NaN.
    probs = probs[probs > 0]
    if impurity_criterion == 'entropy':
        impurity = -1 * np.sum(np.log2(probs) * probs)
    elif impurity_criterion == 'gini':
        impurity = 1 - np.sum(np.square(probs))
    else:
        raise ValueError('Unknown impurity criterion')
    return impurity
 def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
    """
    Calculate the information gain for splitting a dataset on a particular
    descriptive feature under a given impurity criterion.

    Supported split criterion: 'entropy', 'gini'

    input: df (Pandas data frame holding both columns)
           target (name of the target column)
           descriptive_feature (name of the column to split on)
           split_criterion ('entropy' or 'gini')
           print_flag (print the intermediate values when True)
    output: information gain (target impurity minus the weighted impurity
            of the partitions)

    Rows whose descriptive feature is missing (NaN) belong to no partition.
    """
    if print_flag:
        print('target feature:', target)
        print('descriptive_feature:', descriptive_feature)
        print('split criterion:', split_criterion)
    target_entropy = compute_impurity(df[target], split_criterion)
    # entropy_list stores the impurity of each partition,
    # weight_list stores the relative number of observations in each partition
    entropy_list = []
    weight_list = []
    n_rows = len(df)
    feature_values = df[descriptive_feature]
    # One partition per observed level of the descriptive feature. NaN is
    # skipped because `== NaN` never matches and would give an empty partition.
    for level in feature_values.dropna().unique():
        df_feature_level = df[feature_values == level]
        # Keep full precision here: rounding the terms before they are
        # summed distorts the gain, especially for features with many levels.
        entropy_list.append(compute_impurity(df_feature_level[target], split_criterion))
        weight_list.append(len(df_feature_level) / n_rows)
    feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
    information_gain = target_entropy - feature_remaining_impurity
    if print_flag:
        # Rounding is applied for display only.
        print('impurity of partitions:', [round(entropy, 3) for entropy in entropy_list])
        print('weights of partitions:', [round(weight, 3) for weight in weight_list])
        print('remaining impurity:', feature_remaining_impurity)
        print('information gain:', information_gain)
        print('====================')
    return information_gain
 def calc_information_gain_2(data, split_name, target_name, split_criterion):
    """
    Calculate information gain given a data set, column to split on, and target.

    input: data (Pandas data frame)
           split_name (name of the column to split on)
           target_name (name of the target column)
           split_criterion ('entropy' or 'gini')
    output: impurity of the target minus the weighted impurity of the
            partitions induced by the split column

    Every distinct value of the split column forms its own partition, so
    constant columns and columns with more than two values are handled too.
    Rows with a missing (NaN) split value belong to no partition.
    """
    # Calculate the original impurity
    original_impurity = compute_impurity(data[target_name], split_criterion)
    split_values = data[split_name]
    n_rows = data.shape[0]
    # Accumulate the weighted impurity of each partition
    to_subtract = 0
    for value in split_values.dropna().unique():
        subset = data[split_values == value]
        prob = subset.shape[0] / n_rows
        to_subtract += prob * compute_impurity(subset[target_name], split_criterion)
    # Return information gain
    return original_impurity - to_subtract
 def get_information_gains_2(data, target_name, split_criterion):
    """
    Calculate the information gain of every column of `data` with respect
    to the target column.

    input: data (Pandas data frame)
           target_name (name of the target column)
           split_criterion ('entropy' or 'gini')
    output: dictionary {column name: information gain}; the target column
            itself is not excluded
    """
    # Split on each column of the data frame that was passed in (not on a
    # global one) and measure the impurity of the target.
    return {
        feature: calc_information_gain_2(data, feature, target_name, split_criterion)
        for feature in data.columns
    }
 # %% [markdown]
 # Present the feature importance results from other methods
 # %%
 split_criterion = 'entropy'
 target_impurity = compute_impurity(model_input['target'], split_criterion)
 print("Target impurity:", target_impurity)
 information_gains = get_information_gains_2(model_input, 'target', split_criterion)
 # Number of columns per information-gain value, highest gain first.
 gain_value_counts = pd.Series(information_gains).value_counts()
 print(gain_value_counts.sort_index(ascending=False))
 n_features_with_highest_info_gain(information_gains, n=19)
 # %%
 # Fit a decision tree on the median-imputed features and plot it.
 tree_features = model_input.drop(columns=['target', 'pid'])
 X, y = tree_features, model_input['target']
 imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 X = imputer.fit_transform(X)
 clf = DecisionTreeClassifier()
 clf.fit(X, y)
 feat_importance = clf.tree_.compute_feature_importances(normalize=False)
 print("feat importance = " + str(feat_importance))
 plt.figure(figsize=(12,12))
 tree.plot_tree(
    clf,
    feature_names=list(tree_features.columns),
    class_names=True,
    filled=True,
    fontsize=2,
    max_depth=10,
 )
 plt.savefig('tree_high_dpi', dpi=800)
 # %%
 print(model_input['target'])
 # Absolute correlation of every feature column with the integer-coded target.
 target_as_int = model_input.target.astype(int)
 corr_features = model_input.drop(columns=["target", 'pid'], axis=1)
 corrs = abs(corr_features.apply(lambda feature_col: feature_col.corr(target_as_int)))
 list(corrs.sort_values(ascending=False).index)
 # %%
--- a/exploration/expl_features_groups_analysis.py
+++ b/exploration/expl_features_groups_analysis.py
@ -0,0 +1,149 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # %matplotlib inline
 import os, sys, math
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 from sklearn.impute import SimpleImputer
 from sklearn.ensemble import RandomForestClassifier 
 from sklearn.model_selection import train_test_split
 from sklearn import metrics 
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 # Columns that are moved into the (multi-level) index of the loaded table.
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
 # Categorical columns: two fixed names plus every column whose name contains
 # "mostcommonactivity" or "homelabel".
 categorical_feature_colnames = ["gender", "startlanguage"]
 additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
 categorical_feature_colnames += additional_categorical_features
 categorical_features = model_input[categorical_feature_colnames].copy()
 # First row of .mode() holds one most-frequent value per column.
 mode_categorical_features = categorical_features.mode().iloc[0]
 # fillna with mode
 categorical_features = categorical_features.fillna(mode_categorical_features)
 # one-hot encoding
 categorical_features = categorical_features.apply(lambda col: col.astype("category"))
 if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
 # Numerical part: every column that is not in the categorical column list.
 numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
 model_input = pd.concat([numerical_features, categorical_features], axis=1)
 # Binarize the target
 bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 # With right=True the intervals are (-1, 0] -> label 0 and (0, 4] -> label 1.
 model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
 # Sanity check: list the columns whose dtype is not numeric and compare the
 # shape of the full frame with the shape of its numeric-only subset.
 print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
 print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
 # %%
 # Get phone and non-phone columns
 def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
    """
    Train and evaluate a random forest classifier per sensor feature group.

    For every substring in `groups_substrings` the feature columns are the
    columns of `df` (excluding 'pid' and 'target') whose name contains the
    substring (include_group=True) or does not contain it
    (include_group=False). 25 % of the rows are set aside and not used; the
    remaining rows are split into a training set and a test set (test size
    0.2). Missing values are replaced by the per-column median, which is
    fitted on the training set only so that no information from the test
    rows leaks into the model. A RandomForestClassifier is fitted on the
    training set, and accuracy, precision, recall and F1 on the test set
    are printed.

    input: df (Pandas data frame with a 'target' column)
           groups_substrings (iterable of substrings, or a single string)
           include_group (select the columns with / without the substring)
    output: None (the scores are printed)
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(groups_substrings, str):
        groups_substrings = [groups_substrings]
    excluded_cols = ('pid', 'target')
    for fgroup_substr in groups_substrings:
        if include_group:
            feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in excluded_cols]
        else:
            feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in excluded_cols]
        if not feature_group_cols:
            # Nothing to train on; report it instead of failing in the imputer.
            print("\nNo feature columns selected for", fgroup_substr)
            continue
        X, y = df[feature_group_cols], df['target']
        # Set aside 25 % of the rows; they are not used in this function.
        X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)
        # Fit the imputer on the training rows only, then apply it to both sets.
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)
        rfc = RandomForestClassifier(random_state=0)
        rfc.fit(X_train, y_train)
        y_pred = rfc.predict(X_test)
        if include_group:
            print("\nPrediction with", fgroup_substr)
        else:
            print("\nPrediction without", fgroup_substr)
        print("************************************************")
        print("Accuracy", metrics.accuracy_score(y_test, y_pred))
        print("Precision", metrics.precision_score(y_test, y_pred))
        print("Recall", metrics.recall_score(y_test, y_pred))
        print("F1", metrics.f1_score(y_test, y_pred), "\n")
 # %%
 model_input
 # Predictions without each of the listed column groups.
 groups_substr = ["_", "phone_", "empatica_"]
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
 # %%
 # Baseline on every column whose name contains "_". The substring is passed
 # as a one-element list because the function iterates over its argument.
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=["_"], include_group=True)
 groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
 # %%
 # NOTE(review): this cell repeats the previous one - confirm whether both are needed.
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=["_"], include_group=True)
 groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
 # %%
 # Collect every '_'-terminated prefix of every column name, in order of
 # first appearance (e.g. "a_b" contributes "a_" and "a_b_").
 feature_column_groups = []
 # Set used for the membership test, so it does not scan the growing list
 seen_prefixes = set()
 # Iterate through each column in model_input
 for column in model_input.columns:
    # Split the column name by '_'
    split_column = column.split('_')
    # Prefix built up from the parts seen so far
    prefix = ''
    # Iterate through each part of the split column name
    for part in split_column:
        # Add the part to the prefix variable
        prefix += part + '_'
        # Record the prefix the first time it is encountered
        if prefix not in seen_prefixes:
            seen_prefixes.add(prefix)
            feature_column_groups.append(prefix)
 # Print out all collected prefixes (no filtering by group size is applied)
 print(feature_column_groups)
 # %%
 # Write out all the sensors (phone, empatica); also separate the other (demographic) columns
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@ -460,4 +460,3 @@ print("F1", np.mean(xgb_classifier_scores['test_f1']))
 # Negating the scores makes np.partition move the n_sl largest accuracies to
 # the front; they are negated back and printed in descending order.
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 # np.partition moves the n_sl smallest accuracies to the front; they are
 # printed in ascending order.
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
--- a/exploration/tree_high_dpi.png
+++ b/exploration/tree_high_dpi.png