Prepare scripts for feature importance analysis.

ml_pipeline
Primoz 2023-01-19 16:20:43 +01:00
parent d263b32564
commit 6a98c8cdcf
4 changed files with 310 additions and 8 deletions


@@ -21,6 +21,12 @@ import os, sys, math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.impute import SimpleImputer
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}} # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
def calc_entropy(column): def calc_entropy(column):
@@ -87,15 +93,14 @@ def n_features_with_highest_info_gain(info_gain_dict, n=50):
    Get the n features with the highest information gain
    """
    import heapq
    n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
    return {feature[0]: feature[1] for feature in n_largest}
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
# %%
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
@@ -114,12 +119,161 @@ if not categorical_features.empty:
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)
# Binarize the target
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
print(model_input['target'].value_counts(), edges)
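# %%
# Quick illustrative check of the binarization above (toy values, not study data):
# with bins = [-1, 0, 4] and right=True, pd.cut puts a raw stressfulness of 0 into
# class 0 and values 1-4 into class 1.
print(pd.cut(pd.Series([0, 1, 2, 3, 4]), bins=bins, labels=[0, 1], right=True).tolist())  # [0, 1, 1, 1, 1]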
# %%
info_gains = get_information_gains(model_input, 'target')
selected_features = n_features_with_highest_info_gain(info_gains, n=150)
selected_features
# TODO: binarize the target
# %% [markdown]
# Present the feature importance results
# %%
print("Total columns:", len(info_gains))
print(pd.Series(info_gains).value_counts())
n_features_with_highest_info_gain(info_gains, n=189)
# %%
def compute_impurity(feature, impurity_criterion):
    """
    This function calculates impurity of a feature.
    Supported impurity criteria: 'entropy', 'gini'
    input: feature (this needs to be a Pandas series)
    output: feature impurity
    """
    probs = feature.value_counts(normalize=True)
    if impurity_criterion == 'entropy':
        impurity = -1 * np.sum(np.log2(probs) * probs)
    elif impurity_criterion == 'gini':
        impurity = 1 - np.sum(np.square(probs))
    else:
        raise ValueError('Unknown impurity criterion')
    return impurity
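# Minimal sanity check for compute_impurity (illustrative toy series, not project data):
# a perfectly balanced binary series has entropy 1.0 and Gini impurity 0.5.
print(compute_impurity(pd.Series([0, 1, 0, 1]), 'entropy'))  # -> 1.0
print(compute_impurity(pd.Series([0, 1, 0, 1]), 'gini'))     # -> 0.5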
def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
    """
    This function calculates information gain for splitting on
    a particular descriptive feature for a given dataset
    and a given impurity criterion.
    Supported split criteria: 'entropy', 'gini'
    """
    if print_flag:
        print('target feature:', target)
        print('descriptive_feature:', descriptive_feature)
        print('split criterion:', split_criterion)
    target_entropy = compute_impurity(df[target], split_criterion)
    # we define two lists below:
    # entropy_list to store the entropy of each partition
    # weight_list to store the relative number of observations in each partition
    entropy_list = list()
    weight_list = list()
    # loop over each level of the descriptive feature
    # to partition the dataset with respect to that level
    # and compute the entropy and the weight of the level's partition
    for level in df[descriptive_feature].unique():
        df_feature_level = df[df[descriptive_feature] == level]
        entropy_level = compute_impurity(df_feature_level[target], split_criterion)
        entropy_list.append(round(entropy_level, 3))
        weight_level = len(df_feature_level) / len(df)
        weight_list.append(round(weight_level, 3))
    # print('impurity of partitions:', entropy_list)
    # print('weights of partitions:', weight_list)
    feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
    information_gain = target_entropy - feature_remaining_impurity
    if print_flag:
        print('impurity of partitions:', entropy_list)
        print('weights of partitions:', weight_list)
        print('remaining impurity:', feature_remaining_impurity)
        print('information gain:', information_gain)
        print('====================')
    return information_gain
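# Illustrative toy example (assumed data, not from the study) of how
# comp_feature_information_gain is called: a feature that perfectly separates the
# target yields an information gain equal to the target entropy (1.0 here).
_toy = pd.DataFrame({'feature': ['a', 'a', 'b', 'b'], 'target': [0, 0, 1, 1]})
print(comp_feature_information_gain(_toy, 'target', 'feature', 'entropy'))  # -> 1.0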
def calc_information_gain_2(data, split_name, target_name, split_criterion):
    """
    Calculate information gain given a data set, column to split on, and target
    """
    # Calculate the original impurity
    original_impurity = compute_impurity(data[target_name], split_criterion)
    # Find the unique values in the column
    values = data[split_name].unique()
    # Make two subsets of the data, based on the unique values
    left_split = data[data[split_name] == values[0]]
    right_split = data[data[split_name] == values[1]]
    # Loop through the splits and calculate the subset impurities
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0])
        to_subtract += prob * compute_impurity(subset[target_name], split_criterion)
    # Return information gain
    return original_impurity - to_subtract
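# Illustrative note (toy data, assumed): calc_information_gain_2 only splits on the
# first two unique values of a column, so it effectively assumes a binary split feature.
_toy_binary = pd.DataFrame({'feature': ['a', 'a', 'b', 'b'], 'target': [0, 0, 1, 1]})
print(calc_information_gain_2(_toy_binary, 'feature', 'target', 'entropy'))  # -> 1.0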
def get_information_gains_2(data, target_name, split_criterion):
    # Initialize an empty dictionary for information gains
    information_gains = {}
    # Iterate through each column name in our list
    for feature in list(data.columns):
        # Find the information gain for the column
        information_gain = calc_information_gain_2(data, feature, target_name, split_criterion)
        # Add the information gain to our dictionary using the column name as the key
        information_gains[feature] = information_gain
    # Return the key with the highest value
    # return max(information_gains, key=information_gains.get)
    return information_gains
# %% [markdown]
# Present the feature importance results from other methods
# %%
split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
information_gains = get_information_gains_2(model_input, 'target', split_criterion)
print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
n_features_with_highest_info_gain(information_gains, n=19)
# %%
X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)
clf = DecisionTreeClassifier()
clf.fit(X, y)
feat_importance = clf.tree_.compute_feature_importances(normalize=False)
print("feat importance = " + str(feat_importance))
plt.figure(figsize=(12,12))
tree.plot_tree(clf,
               feature_names=list(model_input.drop(columns=['target', 'pid']).columns),
               class_names=True,
               filled=True, fontsize=2, max_depth=10)
plt.savefig('tree_high_dpi', dpi=800)
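# %%
# A minimal sketch (assumes clf and model_input from the cells above, and that the
# imputer did not drop any all-NaN columns) mapping the importance array back to
# feature names so the strongest features can be read off directly.
feature_names = list(model_input.drop(columns=['target', 'pid']).columns)
importances = pd.Series(clf.feature_importances_, index=feature_names)
print(importances.sort_values(ascending=False).head(20))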
# %%
print(model_input['target'])
corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
list(corrs.sort_values(ascending=False).index)
# %%


@@ -0,0 +1,149 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline
import os, sys, math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)
# Binarize the target
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
# %%
# Get phone and non-phone columns
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
    """
    This function makes predictions with sensor groups.
    It takes in a dataframe (df), a list of group substrings (groups_substrings)
    and an optional parameter include_group (default is True).
    It creates a list of columns in the dataframe that contain the group substrings,
    while excluding the 'pid' and 'target' columns. It then splits the data into training
    and test sets, using a test size of 0.25 for the first split and 0.2 for the second split.
    A SimpleImputer is used to fill in missing values with median values.
    A RandomForestClassifier is then used to fit the training set and make predictions
    on the test set. Finally, accuracy, precision, recall and F1 scores are printed
    for each substring group, depending on whether include_group is set to True or False.
    """
    for fgroup_substr in groups_substrings:
        if include_group:
            feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
        else:
            feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
        X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target']
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        X = imputer.fit_transform(X)
        X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)
        rfc = RandomForestClassifier(random_state=0)
        rfc.fit(X_train, y_train)
        y_pred = rfc.predict(X_test)
        if include_group:
            print("\nPrediction with", fgroup_substr)
        else:
            print("\nPrediction without", fgroup_substr)
        print("************************************************")
        print("Accuracy", metrics.accuracy_score(y_test, y_pred))
        print("Precision", metrics.precision_score(y_test, y_pred))
        print("Recall", metrics.recall_score(y_test, y_pred))
        print("F1", metrics.f1_score(y_test, y_pred), "\n")
# %%
model_input
groups_substr = ["_", "phone_", "empatica_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
# %%
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
# %%
# Create an empty list to store the feature column groups
feature_column_groups = []
# Iterate through each column in model_input
for column in model_input.columns:
    # Split the column name by '_'
    split_column = column.split('_')
    # Create a variable to store the prefix of the current column
    prefix = ''
    # Iterate through each part of the split column name
    for part in split_column:
        # Add the part to the prefix variable
        prefix += part + '_'
        # Check if the prefix is already in our feature column groups list
        if prefix not in feature_column_groups:
            # If not, add it to our list of feature column groups
            feature_column_groups.append(prefix)
# Print out all prefix groups found across the columns
print(feature_column_groups)
# %%
# Write all the sensors (phone, empatica); also separate out the other (demographic) columns


@@ -460,4 +460,3 @@ print("F1", np.mean(xgb_classifier_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1]) print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])) print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
# %% jupyter={"outputs_hidden": false, "source_hidden": false}

Binary file not shown (new image added, 2.5 MiB).