diff --git a/exploration/expl_features_analysis.py b/exploration/expl_features_analysis.py
new file mode 100644
index 0000000..a498185
--- /dev/null
+++ b/exploration/expl_features_analysis.py
@@ -0,0 +1,125 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline

import heapq
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
def calc_entropy(column):
    """
    Calculate entropy given a pandas series, list, or numpy array.
    """
    # Compute the probability of each unique value in the column;
    # value_counts also handles non-integer values, which np.bincount cannot
    probabilities = pd.Series(column).value_counts() / len(column)

    # Initialize the entropy to 0
    entropy = 0
    # Loop through the probabilities, and add each one to the total entropy
    for prob in probabilities:
        if prob > 0:
            # use log from math and set base to 2
            entropy += prob * math.log(prob, 2)

    return -entropy


def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, column to split on, and target.
    """
    # Calculate the original entropy of the target
    original_entropy = calc_entropy(data[target_name])
    # Find the unique values in the column
    values = data[split_name].unique()

    # Make one subset per unique value, so non-binary (and constant)
    # columns are partitioned correctly, and calculate the subset entropies
    to_subtract = 0
    for value in values:
        subset = data[data[split_name] == value]
        prob = subset.shape[0] / data.shape[0]
        to_subtract += prob * calc_entropy(subset[target_name])

    # Return information gain
    return original_entropy - to_subtract


def get_information_gains(data, target_name):
    # Initialize an empty dictionary for information gains
    information_gains = {}

    # Iterate through each feature column, skipping the target itself
    # (splitting on the target would trivially yield the maximal gain)
    for col in data.columns.drop(target_name):
        # Find the information gain for the column
        information_gain = calc_information_gain(data, col, target_name)
        # Add the information gain to our dictionary using the column name as the key
        information_gains[col] = information_gain

    # Return the mapping of column name to information gain
    # (max(information_gains, key=information_gains.get) gives the single best column)
    return information_gains


def n_features_with_highest_info_gain(info_gain_dict, n=50):
    """
    Return the n features with the highest information gain.
    """
    return heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
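# %%
# Quick sanity check of the helpers above on a toy frame. Purely
# illustrative: the column names below are hypothetical, not STRAW features.
toy = pd.DataFrame(
    {
        "feat_binary": [0, 0, 1, 1, 1, 0],
        "feat_constant": [1, 1, 1, 1, 1, 1],
        "target": [0, 0, 1, 1, 0, 0],
    }
)
# Entropy of a 2-in-6 target: -(2/6)*log2(2/6) - (4/6)*log2(4/6) ≈ 0.918 bits
print(calc_entropy(toy["target"]))
# The binary feature should show a positive gain, the constant one zero
print(get_information_gains(toy, "target"))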
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
model_input = pd.read_csv(
    "../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv"
).set_index(index_columns)

# %%
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [
    col
    for col in model_input.columns
    if "mostcommonactivity" in col or "homelabel" in col
]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]

# Fill missing values with the per-column mode
categorical_features = categorical_features.fillna(mode_categorical_features)

# One-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# %%
info_gains = get_information_gains(model_input, "target")
selected_features = n_features_with_highest_info_gain(info_gains, n=150)
selected_features

# TODO: binarize the target


# %%
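# %%
# A sketch for the TODO above, not a final decision: binarize the appraisal
# target before computing information gains, since calc_entropy treats each
# distinct target value as its own class. The threshold is an assumption
# chosen for illustration, not a value taken from the study design.
STRESS_THRESHOLD = 2  # hypothetical cut-off on the appraisal scale
binarized_input = model_input.assign(
    target=(model_input["target"] >= STRESS_THRESHOLD).astype(int)
)
info_gains_bin = get_information_gains(binarized_input, "target")
n_features_with_highest_info_gain(info_gains_bin, n=20)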