Explore features with entropy and information gain (IG).
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline

import heapq
import math
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
def calc_entropy(column):
    """
    Calculate entropy given a pandas series, list, or numpy array.

    Note that np.bincount expects non-negative integers, so the values
    must be integer-coded class labels.
    """
    # Compute the counts of each unique value in the column
    counts = np.bincount(column)
    # Divide by the total column length to get a probability
    probabilities = counts / len(column)

    # Initialize the entropy to 0
    entropy = 0
    # Loop through the probabilities and add each one to the total entropy
    for prob in probabilities:
        if prob > 0:
            # Use math.log with the base set to 2
            entropy += prob * math.log(prob, 2)

    return -entropy
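
# A quick, illustrative check (example values, not from the source): a fair
# 50/50 class split should yield exactly one bit of entropy.
print(calc_entropy([0, 1, 0, 1]))  # -> 1.0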


def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, a column to split on,
    and a target column. A binary split column is assumed.
    """
    # Calculate the original entropy of the target
    original_entropy = calc_entropy(data[target_name])
    # Find the unique values in the column
    values = data[split_name].unique()
    if len(values) < 2:
        # A constant column cannot split the data, so it yields no information gain
        return 0

    # Make two subsets of the data, based on the unique values
    left_split = data[data[split_name] == values[0]]
    right_split = data[data[split_name] == values[1]]

    # Loop through the splits and calculate the subset entropies,
    # weighting each one by the relative size of its subset
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = subset.shape[0] / data.shape[0]
        to_subtract += prob * calc_entropy(subset[target_name])

    # Return the information gain
    return original_entropy - to_subtract
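
# Illustrative check (toy values, not from the source): a feature that splits
# the classes perfectly should recover the full entropy of the target.
_toy = pd.DataFrame({"feature": [0, 0, 1, 1], "target": [0, 0, 1, 1]})
print(calc_information_gain(_toy, "feature", "target"))  # -> 1.0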


def get_information_gains(data, target_name):
    # Initialize an empty dictionary for information gains
    information_gains = {}

    # Iterate through each column name in our list
    for col in list(data.columns):
        # Skip the target itself; splitting on it would trivially maximise the gain
        if col == target_name:
            continue
        # Find the information gain for the column
        information_gain = calc_information_gain(data, col, target_name)
        # Add the information gain to our dictionary, using the column name as the key
        information_gains[col] = information_gain

    # To return only the key with the highest value:
    # return max(information_gains, key=information_gains.get)
    return information_gains


def n_features_with_highest_info_gain(info_gain_dict, n=50):
    """
    Get the n features with the highest information gain.
    """
    return heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
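
# Example usage (illustrative values, not from the source): keep the two
# entries with the largest gains as (feature, gain) pairs.
print(n_features_with_highest_info_gain({"a": 0.1, "b": 0.9, "c": 0.5}, n=2))
# -> [('b', 0.9), ('c', 0.5)]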


# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv(
    "../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv"
).set_index(index_columns)


# %%
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [
    col for col in model_input.columns
    if "mostcommonactivity" in col or "homelabel" in col
]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]

# Fill missing values with the mode
categorical_features = categorical_features.fillna(mode_categorical_features)

# One-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)
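
# Sanity check (added as a suggestion, not in the source): after mode
# imputation and one-hot encoding the dummy columns should contain no NaNs.
print(categorical_features.isna().sum().sum())  # expected: 0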

# %%
info_gains = get_information_gains(model_input, "target")
selected_features = n_features_with_highest_info_gain(info_gains, n=150)
selected_features

# TODO: binarize the target
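
# %%
# A minimal sketch of the binarization the TODO above asks for; the threshold
# of 2 is an assumption, not the author's choice. An integer 0/1 target also
# satisfies calc_entropy's np.bincount requirement of integer-coded classes.
binarized_target = (model_input["target"] >= 2).astype(int)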


# %%