Prepare scripts for feature importance analysis.
parent d263b32564
commit 6a98c8cdcf
@@ -21,6 +21,12 @@ import os, sys, math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.impute import SimpleImputer

# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
def calc_entropy(column):
@@ -87,15 +93,14 @@ def n_features_with_highest_info_gain(info_gain_dict, n=50):
    Get the n features with the highest information gain.
    """
    import heapq
-    return heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
+    n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
+    return {feature[0]: feature[1] for feature in n_largest}

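# %%
# A quick toy-data check of the helper above (illustrative values only): it returns a
# dict of the n largest information gains, keyed by feature name.
toy_gains = {"f1": 0.10, "f2": 0.42, "f3": 0.05}
n_features_with_highest_info_gain(toy_gains, n=2)  # {'f2': 0.42, 'f1': 0.10}
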
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)


# %%
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features

@@ -114,12 +119,161 @@ if not categorical_features.empty:
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# Binarize the target: values <= 0 become class 0, values 1-4 become class 1
bins = [-1, 0, 4]  # bins for the stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
print(model_input['target'].value_counts(), edges)

# %%
info_gains = get_information_gains(model_input, 'target')
selected_features = n_features_with_highest_info_gain(info_gains, n=150)
selected_features

# TODO: binarize the target

# %% [markdown]
# Present the feature importance results

# %%
print("Total columns:", len(info_gains))
print(pd.Series(info_gains).value_counts())

n_features_with_highest_info_gain(info_gains, n=189)


# %%
def compute_impurity(feature, impurity_criterion):
    """
    This function calculates the impurity of a feature.
    Supported impurity criteria: 'entropy', 'gini'
    input: feature (this needs to be a Pandas series)
    output: feature impurity
    """
    probs = feature.value_counts(normalize=True)

    if impurity_criterion == 'entropy':
        impurity = -1 * np.sum(np.log2(probs) * probs)
    elif impurity_criterion == 'gini':
        impurity = 1 - np.sum(np.square(probs))
    else:
        raise ValueError('Unknown impurity criterion')

    return impurity

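# Toy sanity check of compute_impurity (illustrative values): for a series with class
# probabilities p = [0.5, 0.5], entropy = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0
# and gini = 1 - (0.5**2 + 0.5**2) = 0.5.
toy_col = pd.Series([0, 0, 1, 1])
print(compute_impurity(toy_col, 'entropy'))  # 1.0
print(compute_impurity(toy_col, 'gini'))     # 0.5
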
def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
    """
    This function calculates the information gain for splitting on
    a particular descriptive feature for a given dataset
    and a given impurity criterion.
    Supported split criteria: 'entropy', 'gini'
    """
    if print_flag:
        print('target feature:', target)
        print('descriptive_feature:', descriptive_feature)
        print('split criterion:', split_criterion)

    target_entropy = compute_impurity(df[target], split_criterion)

    # we define two lists below:
    # entropy_list to store the entropy of each partition
    # weight_list to store the relative number of observations in each partition
    entropy_list = list()
    weight_list = list()

    # loop over each level of the descriptive feature
    # to partition the dataset with respect to that level
    # and compute the entropy and the weight of the level's partition
    for level in df[descriptive_feature].unique():
        df_feature_level = df[df[descriptive_feature] == level]
        entropy_level = compute_impurity(df_feature_level[target], split_criterion)
        entropy_list.append(round(entropy_level, 3))
        weight_level = len(df_feature_level) / len(df)
        weight_list.append(round(weight_level, 3))

    # print('impurity of partitions:', entropy_list)
    # print('weights of partitions:', weight_list)

    # remaining impurity is the weighted sum of the partition impurities
    feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))

    information_gain = target_entropy - feature_remaining_impurity

    if print_flag:
        print('impurity of partitions:', entropy_list)
        print('weights of partitions:', weight_list)
        print('remaining impurity:', feature_remaining_impurity)
        print('information gain:', information_gain)
        print('====================')

    return information_gain

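# Toy example for comp_feature_information_gain (illustrative data): a feature that
# perfectly separates a balanced binary target recovers the full target entropy (1.0)
# as its information gain.
toy_df = pd.DataFrame({"feat": ["a", "a", "b", "b"], "target": [0, 0, 1, 1]})
print(comp_feature_information_gain(toy_df, "target", "feat", "entropy"))  # 1.0
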
def calc_information_gain_2(data, split_name, target_name, split_criterion):
    """
    Calculate information gain given a data set, a column to split on, and a target.
    Note: this assumes the split column is binary (only its first two unique values are used).
    """
    # Calculate the original impurity
    original_impurity = compute_impurity(data[target_name], split_criterion)

    # Find the unique values in the column
    values = data[split_name].unique()

    # Make two subsets of the data, based on the unique values
    left_split = data[data[split_name] == values[0]]
    right_split = data[data[split_name] == values[1]]

    # Loop through the splits and calculate the subset impurities
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0])
        to_subtract += prob * compute_impurity(subset[target_name], split_criterion)

    # Return information gain
    return original_impurity - to_subtract

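# Illustrative check of calc_information_gain_2 on a two-valued (binary) column,
# which is the case it is written for (it only uses the first two unique values).
toy_bin = pd.DataFrame({"feat": [0, 0, 1, 1], "target": [0, 1, 1, 1]})
print(calc_information_gain_2(toy_bin, "feat", "target", "entropy"))  # ~0.31
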
def get_information_gains_2(data, target_name, split_criterion):
    # Initialize an empty dictionary for information gains
    information_gains = {}

    # Iterate through each column name in our list
    for feature in list(data.columns):
        # Find the information gain for the column
        information_gain = calc_information_gain_2(data, feature, target_name, split_criterion)
        # Add the information gain to our dictionary using the column name as the key
        information_gains[feature] = information_gain

    # Return the key with the highest value
    # return max(information_gains, key=information_gains.get)

    return information_gains

# %% [markdown]
# Present the feature importance results from other methods

# %%
split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
information_gains = get_information_gains_2(model_input, 'target', split_criterion)
print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
n_features_with_highest_info_gain(information_gains, n=19)

# %%
X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)

clf = DecisionTreeClassifier()
clf.fit(X, y)

feat_importance = clf.tree_.compute_feature_importances(normalize=False)
print("feat importance = " + str(feat_importance))

plt.figure(figsize=(12, 12))
tree.plot_tree(clf,
               feature_names=list(model_input.drop(columns=['target', 'pid']).columns),
               class_names=True,
               filled=True, fontsize=2, max_depth=10)

plt.savefig('tree_high_dpi', dpi=800)
# %%
print(model_input['target'])
# Absolute correlation of each feature with the binarized target
corrs = abs(model_input.drop(columns=["target", "pid"]).apply(lambda x: x.corr(model_input.target.astype(int))))
list(corrs.sort_values(ascending=False).index)

# %%
@@ -0,0 +1,149 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline

import os, sys, math

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)

categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]

# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)

# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# Binarize the target: values <= 0 become class 0, values 1-4 become class 1
bins = [-1, 0, 4]  # bins for the stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)

print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)

# %%

# Get phone and non-phone columns

def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
    """
    This function makes predictions with sensor groups.
    It takes in a dataframe (df), a list of group substrings (groups_substrings)
    and an optional parameter include_group (default is True).

    For each substring, it builds the list of feature columns that contain (or,
    if include_group is False, do not contain) that substring, excluding the
    'pid' and 'target' columns. It then splits the data into training and test
    sets, using a test size of 0.25 for the first split and 0.2 for the second split.
    A SimpleImputer is used to fill in missing values with median values.
    A RandomForestClassifier is then fit on the training set and used to make
    predictions on the test set. Finally, accuracy, precision, recall and F1
    scores are printed for each substring group.
    """
    for fgroup_substr in groups_substrings:
        if include_group:
            feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
        else:
            feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]

        X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target']
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        X = imputer.fit_transform(X)

        # First split discards 25% of the data; second split holds out 20% for testing
        X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)

        rfc = RandomForestClassifier(random_state=0)
        rfc.fit(X_train, y_train)
        y_pred = rfc.predict(X_test)

        if include_group:
            print("\nPrediction with", fgroup_substr)
        else:
            print("\nPrediction without", fgroup_substr)

        print("************************************************")
        print("Accuracy", metrics.accuracy_score(y_test, y_pred))
        print("Precision", metrics.precision_score(y_test, y_pred))
        print("Recall", metrics.recall_score(y_test, y_pred))
        print("F1", metrics.f1_score(y_test, y_pred), "\n")
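
# %%
# Quick look at which columns a substring group actually selects (illustrative;
# same filtering rule as in make_predictions_with_sensor_groups, assuming phone_*
# feature columns are present in model_input).
phone_cols = [col for col in model_input.columns if "phone_" in col and col not in ['pid', 'target']]
print(len(phone_cols), phone_cols[:5])
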
# %%
model_input
groups_substr = ["_", "phone_", "empatica_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

# %%
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

# %%
# Create an empty list to store the feature column groups
feature_column_groups = []

# Iterate through each column in model_input
for column in model_input.columns:

    # Split the column name by '_'
    split_column = column.split('_')

    # Create a variable to store the prefix of the current column
    prefix = ''

    # Iterate through each part of the split column name
    for part in split_column:

        # Add the part to the prefix variable
        prefix += part + '_'

        # Check if the prefix is already in our feature column groups list
        if prefix not in feature_column_groups:

            # If not, add it to our list of feature column groups
            feature_column_groups.append(prefix)

# Print out all possible feature column groups
print(feature_column_groups)

# %%
# Write out all the sensors (phone, empatica); also separate the other (demographic) cols
@@ -460,4 +460,3 @@ print("F1", np.mean(xgb_classifier_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
Binary file not shown. (Image: 2.5 MiB)