# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline
import os, sys, math

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
def calc_entropy(column):
    """
    Calculate entropy given a pandas series, list, or numpy array
    of non-negative integer labels (as required by np.bincount).
    """
    # Compute the counts of each unique value in the column
    counts = np.bincount(column)
    # Divide by the total column length to get a probability
    probabilities = counts / len(column)

    # Initialize the entropy to 0
    entropy = 0
    # Loop through the probabilities and add each one to the total entropy
    for prob in probabilities:
        if prob > 0:
            # Use log from math and set the base to 2
            entropy += prob * math.log(prob, 2)

    return -entropy


def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, a column to split on, and a target.
    """
    # Calculate the original entropy
    original_entropy = calc_entropy(data[target_name])

    # Find the unique values in the column
    values = data[split_name].unique()
    # A column with fewer than two unique values cannot be split
    if len(values) < 2:
        return 0

    # Make two subsets of the data, based on the first two unique values
    left_split = data[data[split_name] == values[0]]
    right_split = data[data[split_name] == values[1]]

    # Loop through the splits and calculate the subset entropies
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = subset.shape[0] / data.shape[0]
        to_subtract += prob * calc_entropy(subset[target_name])

    # Return information gain
    return original_entropy - to_subtract


def get_information_gains(data, target_name):
    # Initialize an empty dictionary for information gains
    information_gains = {}

    # Iterate through each column name
    for col in list(data.columns):
        # Skip the target column itself
        if col == target_name:
            continue
        # Find the information gain for the column
        information_gain = calc_information_gain(data, col, target_name)
        # Add the information gain to our dictionary, using the column name as the key
        information_gains[col] = information_gain

    # Return the dictionary of information gains
    # (to get only the column with the highest gain: max(information_gains, key=information_gains.get))
    return information_gains


def n_features_with_highest_info_gain(info_gain_dict, n=50):
    """
    Get the n features with the highest information gain.
    """
    import heapq

    return heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])


# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
model_input = pd.read_csv(
    "../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv"
).set_index(index_columns)

# %%
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [
    col
    for col in model_input.columns
    if "mostcommonactivity" in col or "homelabel" in col
]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
# Fill NA values with the mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# One-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# %%
info_gains = get_information_gains(model_input, "target")
selected_features = n_features_with_highest_info_gain(info_gains, n=150)
selected_features

# TODO: binarize the target

# %%
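# %%
# Illustrative sanity check of the entropy/information-gain helpers defined
# above (not part of the original analysis): the toy DataFrame is made up
# purely to show the expected behaviour on binary labels.
_toy = pd.DataFrame(
    {
        "feature": [0, 0, 1, 1, 1, 1],
        "target": [0, 0, 0, 1, 1, 1],
    }
)
# Entropy of a perfectly balanced binary target is 1 bit
print(calc_entropy(_toy["target"]))
# The feature separates part of the labels, so the gain should be positive
print(calc_information_gain(_toy, "feature", "target"))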
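# %%
# Possible follow-up for the "binarize the target" TODO above. This is only a
# sketch: it assumes the target is a numeric stressfulness appraisal and uses
# the median as the cutoff, which is an assumption rather than a choice made
# in this notebook. The result is stored in a new column so the original
# target stays intact.
target_cutoff = model_input["target"].median()  # assumed cutoff
model_input["target_binary"] = (model_input["target"] > target_cutoff).astype(int)
model_input["target_binary"].value_counts()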