From ad2fab133f6397a161133e67831314100a9c48cc Mon Sep 17 00:00:00 2001
From: Primoz
Date: Fri, 13 Jan 2023 17:08:56 +0100
Subject: [PATCH] Explore features with Entropy and IG.

---
 exploration/expl_features_analysis.py | 152 ++++++++++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100644 exploration/expl_features_analysis.py

diff --git a/exploration/expl_features_analysis.py b/exploration/expl_features_analysis.py
new file mode 100644
index 0000000..a498185
--- /dev/null
+++ b/exploration/expl_features_analysis.py
@@ -0,0 +1,152 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---

+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %matplotlib inline
+
+import heapq
+import math
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+def calc_entropy(column):
+    """
+    Calculate the Shannon entropy (in bits) of a pandas series, list,
+    or numpy array of non-negative integers.
+    """
+    # Count each unique value; np.bincount expects integer-coded input
+    counts = np.bincount(column)
+    # Divide by the column length to get the probability of each value
+    probabilities = counts / len(column)
+
+    # Sum p * log2(p) over the values that actually occur
+    entropy = 0
+    for prob in probabilities:
+        if prob > 0:
+            entropy += prob * math.log(prob, 2)
+
+    return -entropy
+
+
+def calc_information_gain(data, split_name, target_name):
+    """
+    Calculate the information gain of splitting data on the column
+    split_name, measured against the target column target_name.
+    """
+    # Entropy of the target before the split
+    original_entropy = calc_entropy(data[target_name])
+
+    # Split the data into one subset per unique value of the split column
+    # and accumulate the weighted entropy of the target within each subset
+    to_subtract = 0
+    for value in data[split_name].unique():
+        subset = data[data[split_name] == value]
+        prob = subset.shape[0] / data.shape[0]
+        to_subtract += prob * calc_entropy(subset[target_name])
+
+    # Information gain is the reduction in entropy achieved by the split
+    return original_entropy - to_subtract
+
+
+def get_information_gains(data, target_name):
+    # Initialize an empty dictionary for information gains
+    information_gains = {}
+
+    # Find the information gain of each column, skipping the target itself
+    for col in data.columns.drop(target_name):
+        information_gains[col] = calc_information_gain(data, col, target_name)
+
+    # To return only the name of the best column instead:
+    # return max(information_gains, key=information_gains.get)
+    return information_gains
+
+
+def n_features_with_highest_info_gain(info_gain_dict, n=50):
+    """
+    Return the n features with the highest information gain.
+    """
+    return heapq.nlargest(n, info_gain_dict.items(), key=lambda item: item[1])
+
+
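+# %%
+# Toy sanity check of the helpers above (illustration only, with made-up
+# column names): "weather" separates the binary "outcome" perfectly, so its
+# information gain should equal the entropy of "outcome", while "noise"
+# should gain nothing.
+toy = pd.DataFrame(
+    {
+        "weather": [0, 0, 1, 1],
+        "noise": [0, 1, 0, 1],
+        "outcome": [0, 0, 1, 1],
+    }
+)
+print(calc_entropy(toy["outcome"]))  # 1.0 bit for a balanced binary column
+print(calc_information_gain(toy, "weather", "outcome"))  # 1.0: a perfect split
+print(calc_information_gain(toy, "noise", "outcome"))  # 0.0: no information
+
+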
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+index_columns = [
+    "local_segment",
+    "local_segment_label",
+    "local_segment_start_datetime",
+    "local_segment_end_datetime",
+]
+model_input = pd.read_csv(
+    "../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv"
+).set_index(index_columns)
+
+
+# %%
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [
+    col
+    for col in model_input.columns
+    if "mostcommonactivity" in col or "homelabel" in col
+]
+categorical_feature_colnames += additional_categorical_features
+
+categorical_features = model_input[categorical_feature_colnames].copy()
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# Fill missing values with the most frequent value of each column
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# One-hot encode the categorical features
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
+model_input = pd.concat([numerical_features, categorical_features], axis=1)
+
+# %%
+info_gains = get_information_gains(model_input, "target")
+selected_features = n_features_with_highest_info_gain(info_gains, n=150)
+selected_features
+
+# TODO: binarize the target
+
+
+# %%
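+# A possible sketch for the TODO above (an assumption, not the author's
+# implementation): binarize the target by thresholding it. The cut-off of 2
+# is an arbitrary placeholder; a sensible value would come from the actual
+# target distribution.
+binarized_target = (model_input["target"] > 2).astype(int)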