Merge remote-tracking branch 'origin/ml_pipeline' into ml_pipeline

ml_pipeline
junos 2023-03-01 13:37:18 +01:00
commit 633d029dc2
11 changed files with 1068 additions and 17 deletions

View File

@ -0,0 +1,318 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline
import os, sys, math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
def calc_entropy(column):
"""
Calculate entropy given a pandas series, list, or numpy array.
"""
# Compute the counts of each unique value in the column
counts = np.bincount(column)
# Divide by the total column length to get a probability
probabilities = counts / len(column)
# Initialize the entropy to 0
entropy = 0
# Loop through the probabilities, and add each one to the total entropy
for prob in probabilities:
if prob > 0:
# use log from math and set base to 2
entropy += prob * math.log(prob, 2)
return -entropy
def calc_information_gain(data, split_name, target_name):
"""
Calculate information gain given a data set, column to split on, and target
"""
# Calculate the original entropy
original_entropy = calc_entropy(data[target_name])
#Find the unique values in the column
values = data[split_name].unique()
# Make two subsets of the data, based on the unique values
left_split = data[data[split_name] == values[0]]
right_split = data[data[split_name] == values[1]]
# Loop through the splits and calculate the subset entropies
to_subtract = 0
for subset in [left_split, right_split]:
prob = (subset.shape[0] / data.shape[0])
to_subtract += prob * calc_entropy(subset[target_name])
# Return information gain
return original_entropy - to_subtract
def get_information_gains(data, target_name):
#Intialize an empty dictionary for information gains
information_gains = {}
#Iterate through each column name in our list
for col in list(data.columns):
#Find the information gain for the column
information_gain = calc_information_gain(data, col, target_name)
#Add the information gain to our dictionary using the column name as the ekey
information_gains[col] = information_gain
#Return the key with the highest value
#return max(information_gains, key=information_gains.get)
return information_gains
def n_features_with_highest_info_gain(info_gain_dict, n=None):
"""
Get n-features that have highest information gain
"""
if n is None:
n = len(info_gain_dict)
import heapq
n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
return {feature[0]: feature[1] for feature in n_largest}
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)
# Binarizacija targeta
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
print(model_input['target'].value_counts(), edges)
# %%
info_gains = get_information_gains(model_input, 'target')
# %% [markdown]
# Present the feature importance results
# %%
print("Total columns:", len(info_gains))
print(pd.Series(info_gains).value_counts())
n_features_with_highest_info_gain(info_gains, n=189)
# %%
def compute_impurity(feature, impurity_criterion):
"""
This function calculates impurity of a feature.
Supported impurity criteria: 'entropy', 'gini'
input: feature (this needs to be a Pandas series)
output: feature impurity
"""
probs = feature.value_counts(normalize=True)
if impurity_criterion == 'entropy':
impurity = -1 * np.sum(np.log2(probs) * probs)
elif impurity_criterion == 'gini':
impurity = 1 - np.sum(np.square(probs))
else:
raise ValueError('Unknown impurity criterion')
return impurity
def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
"""
This function calculates information gain for splitting on
a particular descriptive feature for a given dataset
and a given impurity criteria.
Supported split criterion: 'entropy', 'gini'
"""
if print_flag:
print('target feature:', target)
print('descriptive_feature:', descriptive_feature)
print('split criterion:', split_criterion)
target_entropy = compute_impurity(df[target], split_criterion)
# we define two lists below:
# entropy_list to store the entropy of each partition
# weight_list to store the relative number of observations in each partition
entropy_list = list()
weight_list = list()
# loop over each level of the descriptive feature
# to partition the dataset with respect to that level
# and compute the entropy and the weight of the level's partition
for level in df[descriptive_feature].unique():
df_feature_level = df[df[descriptive_feature] == level]
entropy_level = compute_impurity(df_feature_level[target], split_criterion)
entropy_list.append(round(entropy_level, 3))
weight_level = len(df_feature_level) / len(df)
weight_list.append(round(weight_level, 3))
# print('impurity of partitions:', entropy_list)
# print('weights of partitions:', weight_list)
feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
information_gain = target_entropy - feature_remaining_impurity
if print_flag:
print('impurity of partitions:', entropy_list)
print('weights of partitions:', weight_list)
print('remaining impurity:', feature_remaining_impurity)
print('information gain:', information_gain)
print('====================')
return information_gain
def calc_information_gain_2(data, split_name, target_name, split_criterion):
"""
Calculate information gain given a data set, column to split on, and target
"""
# Calculate the original impurity
original_impurity = compute_impurity(data[target_name], split_criterion)
#Find the unique values in the column
values = data[split_name].unique()
# Make two subsets of the data, based on the unique values
left_split = data[data[split_name] == values[0]]
right_split = data[data[split_name] == values[1]]
# Loop through the splits and calculate the subset impurities
to_subtract = 0
for subset in [left_split, right_split]:
prob = (subset.shape[0] / data.shape[0])
to_subtract += prob * compute_impurity(subset[target_name], split_criterion)
# Return information gain
return original_impurity - to_subtract
def get_information_gains_2(data, target_name, split_criterion):
#Intialize an empty dictionary for information gains
information_gains = {}
#Iterate through each column name in our list
for feature in list(data.columns):
#Find the information gain for the column
information_gain = calc_information_gain_2(model_input, target_name, feature, split_criterion)
#Add the information gain to our dictionary using the column name as the ekey
information_gains[feature] = information_gain
#Return the key with the highest value
#return max(information_gains, key=information_gains.get)
return information_gains
# %% [markdown]
# Present the feature importance results from other methods
# %%
split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
information_gains = get_information_gains_2(model_input, 'target', split_criterion)
print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
n_features_with_highest_info_gain(information_gains)
# %%
# Present the feature importance using a tree (that uses gini imputity measure)
split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)
X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
clf = DecisionTreeClassifier(criterion=split_criterion)
clf.fit(X, y)
feat_importance = clf.tree_.compute_feature_importances(normalize=False)
print("feat importance = ", feat_importance)
print("shape", feat_importance.shape)
tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
info_gains_dict[info_gains_dict > 0]
# %%
# Binarizacija vrednosti tree Information Gain-a
bins = [-0.1, 0, 0.1] # bins for target's correlations with features
cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True)
plt.title(f"Tree information gains by value ({split_criterion})")
cut_info_gains.value_counts().plot(kind='bar', color='purple')
plt.xticks(rotation=45, ha='right')
print(cut_info_gains.value_counts())
pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))
# %%
# Plot feature importance tree graph
plt.figure(figsize=(12,12))
tree.plot_tree(clf,
feature_names = list(model_input.drop(columns=['target', 'pid']).columns),
class_names=True,
filled = True, fontsize=5, max_depth=3)
plt.savefig('tree_high_dpi', dpi=800)
# %% [markdown]
# Present the feature importance by correlation with target
corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
# corrs.sort_values(ascending=False)
# Binarizacija vrednosti korelacij
bins = [0, 0.1, 0.2, 0.3] # bins for target's correlations with features
cut_corrs = pd.cut(corrs, bins=bins, labels=['very week (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True)
plt.title("Target's correlations with features")
cut_corrs.value_counts().plot(kind='bar')
plt.xticks(rotation=45, ha='right')
print(cut_corrs.value_counts())
print(corrs[corrs > 0.1]) # or corrs < -0.1])
# %%
# %%

View File

@ -0,0 +1,328 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline
import os, sys, math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn import metrics
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)
# Binarizacija targeta
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
# %%
# Add prefix to demographical features
demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile',
'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
new_names = [(col, "demo_"+col) for col in demo_features]
model_input.rename(columns=dict(new_names), inplace=True)
demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio',
'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M',
'demo_startlanguage_nl', 'demo_startlanguage_sl']
# %%
# Get phone and non-phone columns
import warnings
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
"""
This function makes predictions with sensor groups.
It takes in a dataframe (df), a list of group substrings (groups_substrings)
and an optional parameter include_group (default is True).
It creates a list of columns in the dataframe that contain the group substrings,
while excluding the 'pid' and 'target' columns. It then splits the data into training
and test sets, using a test size of 0.25 for the first split and 0.2 for the second split.
A SimpleImputer is used to fill in missing values with median values.
A LogisticRegression is then used to fit the training set and make predictions
on the test set. Finally, accuracy, precision, recall and F1 scores are printed
for each substring group depending on whether or not include_group
is set to True or False.
"""
best_sensor = None
best_recall_score, best_f1_score = None, None
for fgroup_substr in groups_substrings:
if fgroup_substr is None:
feature_group_cols = list(df.columns)
feature_group_cols.remove("pid")
feature_group_cols.remove("target")
else:
if include_group:
feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
else:
feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
nb = GaussianNB()
model_cv = cross_validate(
nb,
X=imputer.fit_transform(X),
y=y,
cv=StratifiedKFold(n_splits=5, shuffle=True),
n_jobs=-1,
scoring=('accuracy', 'precision', 'recall', 'f1')
)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
if print_flag:
if include_group:
print("\nPrediction with", fgroup_substr)
else:
print("\nPrediction without", fgroup_substr)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
acc = np.mean(model_cv['test_accuracy'])
acc_std = np.std(model_cv['test_accuracy'])
prec = np.mean(model_cv['test_precision'])
prec_std = np.std(model_cv['test_precision'])
rec = np.mean(model_cv['test_recall'])
rec_std = np.std(model_cv['test_recall'])
f1 = np.mean(model_cv['test_f1'])
f1_std = np.std(model_cv['test_f1'])
if print_flag:
print("************************************************")
print(f"Accuracy: {acc} (sd={acc_std})")
print(f"Precison: {prec} (sd={prec_std})")
print(f"Recall: {rec} (sd={rec_std})")
print(f"F1: {f1} (sd={f1_std})\n")
if (not best_recall_score and not best_f1_score) or (rec > best_recall_score):
best_sensor = fgroup_substr
best_recall_score, best_f1_score = rec, f1
best_recall_score_std, best_f1_score_std = rec_std, f1_std
return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std
# %% [markdown]
# ### sensor big feature groups (phone, empatica, demographical)
big_groups_substr = ["phone_", "empatica_", "demo_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)
# %% [markdown]
# ### Empatica sezor groups
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
# %% [markdown]
# ### Phone sensor groups
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_",
# "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
# %%
# Write all the sensors (phone, empatica), seperate other (demographical) cols also
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
"phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
# %%
def find_sensor_group_features_importance(model_input, sensor_groups_strings):
"""
This function finds the importance of sensor groups for a given model input. It takes two parameters:
model_input and sensor_groups_strings. It creates an empty list called sensor_importance_scores,
which will be populated with tuples containing the best sensor, its recall score, and its F1 score.
It then makes a copy of the model input and the sensor groups strings. It then loops through each group
in the list of strings, creating a list of important columns from the sensor importance scores list.
It then calls make_predictions_with_sensor_groups to determine the best sensor, its recall score,
and its F1 score. These values are added to the sensor importance scores list as a tuple. The function
then removes that best sensor from the list of strings before looping again until all groups have been evaluated.
Finally, it returns the populated list of tuples containing all sensors' scores.
"""
sensor_importance_scores = []
model_input = model_input.copy()
sensor_groups_strings = sensor_groups_strings.copy()
groups_len = len(sensor_groups_strings)
for i in range(groups_len):
important_cols = [col[0] for col in sensor_importance_scores]
with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std = \
make_predictions_with_sensor_groups(model_input,
groups_substrings=sensor_groups_strings, include_group=True,
with_cols=with_cols)
sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std ))
print(f"\nAdded sensor: {best_sensor}\n")
sensor_groups_strings.remove(best_sensor)
return sensor_importance_scores
# %%
# Method for sorting list of tuples into 3 lists
def sort_tuples_to_lists(list_of_tuples):
"""
sort_tuples_to_lists(list_of_tuples) is a method that takes in a list of tuples as an argument
and sorts them into three separate lists. The first list, xs, contains the first element
of each tuple. The second list, yrecall, contains the second element of each tuple rounded
to 4 decimal places. The third list, y_fscore, contains the third element of each tuple
rounded to 4 decimal places. The method returns all three lists.
"""
xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
for a_tuple in list_of_tuples:
xs.append(a_tuple[0])
y_recall.append(round(a_tuple[1], 4))
y_fscore.append(round(a_tuple[2], 4))
recall_std.append(round(a_tuple[3], 4))
fscore_std.append(round(a_tuple[4], 4))
return xs, y_recall, y_fscore, recall_std, fscore_std
def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
title="Sequential addition of features and its F1, and recall scores"):
"""
This function plots the sequential progress of feature addition scores using two subplots.
The first subplot is for recall scores and the second subplot is for F1-scores.
The parameters xs, yrecall, and yfscore are used to plot the data on the respective axes.
The title of the plot can be specified by the user using the parameter title.
The maximum recall index and maximum F1-score index are also plotted using a black dot.
The figure size is set to 18.5 inches in width and 10.5 inches in height,
and the x-axis labels are rotated by 90 degrees. Finally, the plot is displayed
using plt.show().
"""
fig, ax = plt.subplots(nrows=2, sharex=True)
ax[0].plot(xs, np.array(y_recall)+np.array(recall_std), linestyle=":", color='m') # Upper SD
ax[0].plot(xs, y_recall, color='red')
ax[0].plot(xs, np.array(y_recall)-np.array(recall_std), linestyle=":", color='m') # Lower SD
mrec_indx = np.argmax(y_recall)
ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
ax[0].legend(["Upper std", "Mean Recall", "Lower std"])
ax[1].plot(xs, np.array(y_fscore)+np.array(fscore_std), linestyle=":", color='c') # Upper SD
ax[1].plot(xs, y_fscore)
ax[1].plot(xs, np.array(y_fscore)-np.array(fscore_std), linestyle=":", color='c') # Lower SD
mfscore_indx = np.argmax(y_fscore)
ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])
fig.set_size_inches(18.5, 10.5)
ax[0].title.set_text('Recall scores')
ax[1].title.set_text('F1-scores')
plt.suptitle(title, fontsize=14)
plt.xticks(rotation=90)
plt.show()
# %%
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
"phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
# sensors_features_groups = ["phone_", "empatica_", "demo_"]
# %%
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)
# %% [markdown]
# ### Visualize sensors groups F1 and recall scores
print(sensor_groups_importance_scores)
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
title="Sequential addition of sensors and its F1, and recall scores")
# %%
# Take the most important feature group and investigate it feature-by-feature
best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
# best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
# xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(best_sensor_features_scores)
# %% [markdown]
# ### Visualize best sensor's F1 and recall scores
# print(best_sensor_features_scores)
# plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
# title="Best sensor addition it's features with F1 and recall scores")
# %%
# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
# It also saves the sequence of scores for all sensors' features in excel file
seq_columns = ["sensor_name", "feature_sequence", "recall", "f1_score"]
feature_sequence = pd.DataFrame(columns=seq_columns)
for i, sensor_group in enumerate(sensor_groups_importance_scores):
current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(current_sensor_features_scores)
feature_sequence = pd.concat([feature_sequence, pd.DataFrame({"sensor_name":sensor_group[0], "feature_sequence": [xs], "recall": [y_recall],
"f1_score": [y_fscore], "recall_std": [recall_std], "f1_std": [fscore_std]})])
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores")
feature_sequence.to_excel("all_sensors_sequential_addition_scores.xlsx", index=False)
# %%
# TODO: method that reads data from the excel file, specified above, and then the method,
# that selects only features that are max a thresh[%] below the max value (best for recall
# possibly for f1). This method should additionally take threshold parameter.
# %%

View File

@ -0,0 +1,49 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
from machine_learning.cross_validation import CrossValidation
from machine_learning.preprocessing import Preprocessing
# %%
df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
df.set_index(index_columns, inplace=True)
cv = CrossValidation(data=df, cv_method="logo")
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
interval_feature_list, other_feature_list = [], []
print(df.columns.tolist())
for split in cv.get_splits():
train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
pre = Preprocessing(train_X, train_y, test_X, test_y)
pre.one_hot_encode_train_and_test_sets(categorical_columns)
train_X, train_y, test_X, test_y = pre.get_train_test_sets()
break
# %%

View File

@ -46,9 +46,9 @@ import machine_learning.helper
#
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
cv_method_str = '5kfold' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method)
undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
@ -72,20 +72,27 @@ model_input['target'].value_counts()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# UnderSampling
if undersampling:
model_input_new = pd.DataFrame(columns=model_input.columns)
for pid in model_input["pid"].unique():
stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
if (len(stress) == 0):
continue
if (len(no_stress) == 0):
continue
model_input_new = pd.concat([model_input_new, stress], axis=0)
no_stress = model_input[model_input['target'] == 0]
stress = model_input[model_input['target'] == 1]
no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
# In case there are more stress samples than no_stress, take all instances of no_stress.
model_input_new = pd.concat([model_input_new, no_stress], axis=0)
model_input = model_input_new
no_stress = no_stress.sample(n=len(stress))
model_input = pd.concat([stress,no_stress], axis=0)
# model_input_new = pd.DataFrame(columns=model_input.columns)
# for pid in model_input["pid"].unique():
# stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
# no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
# if (len(stress) == 0):
# continue
# if (len(no_stress) == 0):
# continue
# model_input_new = pd.concat([model_input_new, stress], axis=0)
# no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
# # In case there are more stress samples than no_stress, take all instances of no_stress.
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
# model_input = model_input_new
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
@ -170,7 +177,7 @@ final_scores = machine_learning.helper.run_all_classification_models(imputer.fit
# %%
final_scores.index.name = "metric"
final_scores = final_scores.set_index(["method", final_scores.index])
final_scores.to_csv("../presentation/event_stressful_detection_5fold.csv")
final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
# %% [markdown]
# ### Logistic Regression
@ -453,4 +460,3 @@ print("F1", np.mean(xgb_classifier_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
# %% jupyter={"outputs_hidden": false, "source_hidden": false}

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.5 MiB

View File

@ -0,0 +1,121 @@
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
class CrossValidation():
"""This code implements a CrossValidation class for creating cross validation splits.
"""
def __init__(self, data=None, cv_method='logo'):
"""This method initializes the cv_method argument and optionally prepares the data if supplied.
Args:
cv_method (str, optional): String of cross validation method; options are 'logo', 'half_logo' and '5kfold'.
Defaults to 'logo'.
data (DataFrame, optional): Pandas DataFrame with target, pid columns and other features as columns.
Defaults to None.
"""
self.initialize_cv_method(cv_method)
if data is not None:
self.prepare_data(data)
def prepare_data(self, data):
"""Prepares the data ready to be passed to the cross-validation algorithm, depending on the cv_method type.
For example, if cv_method is set to 'half_logo' new columns 'pid_index', 'pid_count', 'pid_half'
are added and used in the process.
Args:
data (_type_): Pandas DataFrame with target, pid columns and other features as columns.
"""
self.data = data
if self.cv_method == "logo":
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
elif self.cv_method == "half_logo":
data['pid_index'] = data.groupby('pid').cumcount()
data['pid_count'] = data.groupby('pid')['pid'].transform('count')
data["pid_index"] = (data['pid_index'] / data['pid_count'] + 1).round()
data["pid_half"] = data["pid"] + "_" + data["pid_index"].astype(int).astype(str)
data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
elif self.cv_method == "5kfold":
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
self.X, self.y, self.groups = data_X, data_y, data_groups
def initialize_cv_method(self, cv_method):
"""Initializes the given cv_method type. Depending on the type, the appropriate splitting technique is used.
Args:
cv_method (str): The type of cross-validation method to use; options are 'logo', 'half_logo' and '5kfold'.
Raises:
ValueError: If cv_method is not in the list of available methods, it raises an ValueError.
"""
self.cv_method = cv_method
if self.cv_method not in ["logo", "half_logo", "5kfold"]:
raise ValueError("Invalid cv_method input. Correct values are: 'logo', 'half_logo', '5kfold'")
if self.cv_method in ["logo", "half_logo"]:
self.cv = LeaveOneGroupOut()
elif self.cv_method == "5kfold":
self.cv = StratifiedKFold(n_splits=5, shuffle=True)
def get_splits(self):
"""Returns a generator object containing the cross-validation splits.
Raises:
ValueError: Raises ValueError if no data has been set.
"""
if not self.data.empty:
return self.cv.split(self.X, self.y, self.groups)
else:
raise ValueError("No data has been set. Use 'prepare_data(data)' method to set the data.")
def get_data(self):
"""data getter
Returns:
Pandas DataFrame: Returns the data from the class instance.
"""
return self.data
def get_x_y_groups(self):
"""X, y, and groups data getter
Returns:
Pandas DataFrame: Returns the data from the class instance.
"""
return self.X, self.y, self.groups
def get_train_test_sets(self, split):
"""Gets train and test sets, dependent on the split parameter. This method can be used in a specific splitting context,
where by index we can get train and test sets.
Args:
split (tuple of indices): It represents one iteration of the split generator (see get_splits method).
Returns:
tuple of Pandas DataFrames: This method returns train_X, train_y, test_X, test_y, with correctly indexed rows by split param.
"""
return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]

View File

@ -0,0 +1,45 @@
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.naive_bayes import GaussianNB
""" Feature selection pipeline: a methods that can be used in the wrapper metod alongside other wrapper contents (hyperparameter tuning etc.).
(1) Establish methods for each of the steps in feature selection protocol:
(a) feature selection inside specific sensors (sklearn method): returns most important features from all sensors
(b) feature selection between "tuned" sensors: returns filtered sensors, containing most important features retured with (a)
(2) Ensure that above methods are given only a part of data and use appropriate random seeds - to later simulate use case in production.
(3) Implement a method which gives graphical exploration of (1) (a) and (b) steps of the feature selection.
(4) Prepare a core method that can be fit into a wrapper (see sklearn wrapper methods) and integrates methods from (1)
"""
class FeatureSelection:
def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
pass
def within_sensors_feature_selection(estimator, scoring, tol):
features_list = []
nb = GaussianNB()
sfs = SequentialFeatureSelector(nb, n_features_to_select='auto', tol=0.02) # Can set n_features to an absolute value -> then remove tol parameter.
return features_list
def between_sensors_feature_selection():
pass
def vizualize_feature_selection_process():
pass
def execute_feature_selection_step():
pass

View File

@ -0,0 +1,126 @@
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
class Preprocessing:
"""This class presents Preprocessing methods which can be used in context of an individual CV iteration or, simply, on whole data.
It's blind to the test data - e.g, it imputes the test data with train data mean.
This means, it somehow needs an access to the information about data split. In context
"""
def __init__(self, train_X, train_y, test_X, test_y):
self.train_X = train_X
self.train_y = train_y
self.test_X = test_X
self.test_y = test_y
def one_hot_encoder(self, categorical_features, numerical_features, mode):
"""
This code is an implementation of one-hot encoding. It takes in two data sets,
one with categorical features and one with numerical features and a mode parameter.
First it uses the fillna() function to fill in any missing values present in the
categorical data set with the mode value. Then it uses the apply () method to
convert each column of the data set into a category data type which is then
transformed using the pd.get_dummies() function. Finally it concatenates the
numerical data set and the transformed categorical data set using pd.concat() and
returns it.
Args:
categorical_features (DataFrame): DataFrame including only categorical columns.
numerical_features (_type_): DataFrame including only numerical columns.
mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
Returns:
DataFrame: Hot-One Encoded DataFrame.
"""
# Fill train set with mode
categorical_features = categorical_features.fillna(mode)
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
return pd.concat([numerical_features, categorical_features], axis=1)
def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
"""
This code is used to transform categorical data into numerical representations.
It first identifies the categorical columns, then copies them and saves them as
a new dataset. The missing data is filled with the mode (most frequent value in
the respective column). This new dataset is then subjected to one-hot encoding,
which is a process of transforming categorical data into machine interpretable
numerical form by converting categories into multiple binary outcome variables.
These encoded values are then concatenated to the numerical features prior to
being returned as the final dataset.
Args:
categorical_columns (list, optional): List of categorical columns in the dataset.
Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"].
"""
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
# For train set
train_X_categorical_features = self.train_X[categorical_columns].copy()
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
# For test set
test_X_categorical_features = self.test_X[categorical_columns].copy()
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
# TODO: TESTING
if groupby:
# Interval numerical features # TODO: How can we get and assign appropriate groupby means and assign them to correct columns?
# VVVVV ...... IN PROGRES ...... VVVVV
means = self.train_X[interval_feature_list].groupby(groupby_feature).mean()
self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + other_feature_list)]] = \
self.train_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
self.test_X[self.test_X.loc[:, ~self.test_X.columns.isin([groupby_feature] + other_feature_list)]] = \
self.test_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
# Other features
self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + interval_feature_list)]] = \
self.train_X[other_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.median()))
else:
# Interval numerical features
means = self.train_X[interval_feature_list].mean()
self.train_X[interval_feature_list].fillna(means, inplace=True)
self.test_X[interval_feature_list].fillna(means, inplace=True)
# Other features
medians = self.train_X[other_feature_list].median()
self.train_X[other_feature_list].fillna(medians, inplace=True)
self.test_X[other_feature_list].fillna(medians, inplace=True)
def get_train_test_sets(self):
"""Train and test sets getter
Returns:
tuple of Pandas DataFrames: Gets train test sets in traditional sklearn format.
"""
return self.train_X, self.train_y, self.test_X, self.test_y

View File

@ -0,0 +1,29 @@
method,metric,max,mean
Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
Dummy,test_recall,0.0,0.0
Dummy,test_f1,0.0,0.0
logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
logistic_reg,test_f1,0.3909774436090226,0.318943511424051
svc,test_accuracy,0.8557046979865772,0.8548446932649828
svc,test_average_precision,0.44514416839823046,0.4068200938341621
svc,test_recall,0.0,0.0
svc,test_f1,0.0,0.0
gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
random_forest,test_recall,0.4069767441860465,0.35356856455493185
random_forest,test_f1,0.5691056910569107,0.5078402513053142
xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
xgboost,test_average_precision,0.7366643049075349,0.698622165966308
xgboost,test_recall,0.5287356321839081,0.44346431435445066
xgboost,test_f1,0.638888888888889,0.5633957169928393
1 method metric max mean
2 Dummy test_accuracy 0.8557046979865772 0.8548446932649828
3 Dummy test_average_precision 0.1457286432160804 0.14515530673501736
4 Dummy test_recall 0.0 0.0
5 Dummy test_f1 0.0 0.0
6 logistic_reg test_accuracy 0.8640939597315436 0.8504895843872606
7 logistic_reg test_average_precision 0.44363425265068757 0.37511495347389834
8 logistic_reg test_recall 0.3023255813953488 0.24266238973536486
9 logistic_reg test_f1 0.3909774436090226 0.318943511424051
10 svc test_accuracy 0.8557046979865772 0.8548446932649828
11 svc test_average_precision 0.44514416839823046 0.4068200938341621
12 svc test_recall 0.0 0.0
13 svc test_f1 0.0 0.0
14 gaussian_naive_bayes test_accuracy 0.7684563758389261 0.7479123806954234
15 gaussian_naive_bayes test_average_precision 0.2534828030085334 0.23379392278901853
16 gaussian_naive_bayes test_recall 0.42528735632183906 0.3924619085805935
17 gaussian_naive_bayes test_f1 0.34285714285714286 0.3107236284017699
18 stochastic_gradient_descent test_accuracy 0.8576214405360134 0.7773610783222601
19 stochastic_gradient_descent test_average_precision 0.3813093757959869 0.3617503752215592
20 stochastic_gradient_descent test_recall 0.686046511627907 0.2822507350975675
21 stochastic_gradient_descent test_f1 0.3652173913043478 0.21849107443075583
22 random_forest test_accuracy 0.9110738255033557 0.9011129472867694
23 random_forest test_average_precision 0.6998372262021191 0.6619275281099584
24 random_forest test_recall 0.4069767441860465 0.35356856455493185
25 random_forest test_f1 0.5691056910569107 0.5078402513053142
26 xgboost test_accuracy 0.9128978224455612 0.9007711937764886
27 xgboost test_average_precision 0.7366643049075349 0.698622165966308
28 xgboost test_recall 0.5287356321839081 0.44346431435445066
29 xgboost test_f1 0.638888888888889 0.5633957169928393

View File

@ -0,0 +1,29 @@
method,metric,max,mean
Dummy,test_accuracy,1.0,0.8524114578096439
Dummy,test_average_precision,0.7,0.14758854219035614
Dummy,test_recall,0.0,0.0
Dummy,test_f1,0.0,0.0
logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
logistic_reg,test_average_precision,1.0,0.44605167668563583
logistic_reg,test_recall,1.0,0.25353566685532386
logistic_reg,test_f1,0.823529411764706,0.27951926390778625
svc,test_accuracy,1.0,0.8524114578096439
svc,test_average_precision,0.9612401707068228,0.44179454944271934
svc,test_recall,0.0,0.0
svc,test_f1,0.0,0.0
gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
random_forest,test_accuracy,1.0,0.8722158105763481
random_forest,test_average_precision,1.0,0.49817066323226833
random_forest,test_recall,1.0,0.18161263127840668
random_forest,test_f1,1.0,0.2508096532365307
xgboost,test_accuracy,1.0,0.8812627400277729
xgboost,test_average_precision,1.0,0.5505695112208401
xgboost,test_recall,1.0,0.2896161238315027
xgboost,test_f1,0.9411764705882353,0.36887408735855665
1 method metric max mean
2 Dummy test_accuracy 1.0 0.8524114578096439
3 Dummy test_average_precision 0.7 0.14758854219035614
4 Dummy test_recall 0.0 0.0
5 Dummy test_f1 0.0 0.0
6 logistic_reg test_accuracy 0.9824561403508771 0.8445351955631311
7 logistic_reg test_average_precision 1.0 0.44605167668563583
8 logistic_reg test_recall 1.0 0.25353566685532386
9 logistic_reg test_f1 0.823529411764706 0.27951926390778625
10 svc test_accuracy 1.0 0.8524114578096439
11 svc test_average_precision 0.9612401707068228 0.44179454944271934
12 svc test_recall 0.0 0.0
13 svc test_f1 0.0 0.0
14 gaussian_naive_bayes test_accuracy 0.9 0.7491301746887129
15 gaussian_naive_bayes test_average_precision 0.9189430193277607 0.2833170327386991
16 gaussian_naive_bayes test_recall 1.0 0.3743761174081108
17 gaussian_naive_bayes test_f1 0.7000000000000001 0.2698456659235668
18 stochastic_gradient_descent test_accuracy 1.0 0.7926428596764739
19 stochastic_gradient_descent test_average_precision 1.0 0.4421948838597582
20 stochastic_gradient_descent test_recall 1.0 0.30156420704502945
21 stochastic_gradient_descent test_f1 0.8148148148148148 0.24088393234361388
22 random_forest test_accuracy 1.0 0.8722158105763481
23 random_forest test_average_precision 1.0 0.49817066323226833
24 random_forest test_recall 1.0 0.18161263127840668
25 random_forest test_f1 1.0 0.2508096532365307
26 xgboost test_accuracy 1.0 0.8812627400277729
27 xgboost test_average_precision 1.0 0.5505695112208401
28 xgboost test_recall 1.0 0.2896161238315027
29 xgboost test_f1 0.9411764705882353 0.36887408735855665