Explore features with Entropy and IG.

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline

import os, sys, math

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
def calc_entropy(column):
    """
    Calculate entropy given a pandas series, list, or numpy array of
    non-negative integer values (np.bincount requires integer codes).
    """
    # Compute the counts of each unique value in the column
    counts = np.bincount(column)
    # Divide by the total column length to get a probability
    probabilities = counts / len(column)

    # Initialize the entropy to 0
    entropy = 0
    # Loop through the probabilities and add each one to the total entropy
    for prob in probabilities:
        if prob > 0:
            # use log from math and set base to 2
            entropy += prob * math.log(prob, 2)

    return -entropy
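
# Quick illustrative check on hypothetical values (not study data): a balanced
# binary column carries exactly one bit of entropy, a constant column none.
assert math.isclose(calc_entropy(pd.Series([0, 1, 0, 1])), 1.0)
assert math.isclose(calc_entropy(pd.Series([1, 1, 1, 1])), 0.0, abs_tol=1e-12)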


def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, a column to split on, and a
    target column. Assumes a binary split: only the first two unique values
    of the split column are used.
    """
    # Calculate the original entropy
    original_entropy = calc_entropy(data[target_name])
    # Find the unique values in the column
    values = data[split_name].unique()

    # Make two subsets of the data, based on the unique values
    left_split = data[data[split_name] == values[0]]
    right_split = data[data[split_name] == values[1]]

    # Loop through the splits and calculate the subset entropies,
    # weighted by the share of rows in each subset
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = subset.shape[0] / data.shape[0]
        to_subtract += prob * calc_entropy(subset[target_name])

    # Return information gain
    return original_entropy - to_subtract
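
# Illustrative check on a small hypothetical frame (not the study data): a
# column identical to the target recovers its full entropy as gain, while an
# unrelated column yields no gain.
_toy = pd.DataFrame(
    {
        "informative": [0, 0, 1, 1],
        "uninformative": [0, 1, 0, 1],
        "target": [0, 0, 1, 1],
    }
)
assert math.isclose(calc_information_gain(_toy, "informative", "target"), 1.0)
assert math.isclose(calc_information_gain(_toy, "uninformative", "target"), 0.0, abs_tol=1e-12)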


def get_information_gains(data, target_name):
    # Initialize an empty dictionary for information gains
    information_gains = {}

    # Iterate through each column name in our list
    for col in list(data.columns):
        # Skip the target itself; splitting on it would trivially yield the
        # maximum possible gain
        if col == target_name:
            continue
        # Find the information gain for the column
        information_gain = calc_information_gain(data, col, target_name)
        # Add the information gain to our dictionary using the column name as the key
        information_gains[col] = information_gain

    # Return the key with the highest value
    # return max(information_gains, key=information_gains.get)

    return information_gains


def n_features_with_highest_info_gain(info_gain_dict, n=50):
    """
    Get the n features with the highest information gain.
    """
    import heapq

    return heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
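
# A hypothetical end-to-end run on the same toy frame, ranking columns by
# information gain and keeping the top two; the real run further below uses
# the sensing features and the stressfulness target.
_toy_gains = get_information_gains(_toy, "target")
n_features_with_highest_info_gain(_toy_gains, n=2)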


# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)


# %%
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]

# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)

# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)


# %%
info_gains = get_information_gains(model_input, 'target')
selected_features = n_features_with_highest_info_gain(info_gains, n=150)
selected_features


# TODO: binarize the target


# %%
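# A possible next step (sketch only; the names below are illustrative and not
# part of the original analysis): unpack the (column, gain) pairs returned by
# n_features_with_highest_info_gain and keep only those columns.
selected_feature_names = [name for name, gain in selected_features]
model_input_selected = model_input[selected_feature_names]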