Compare commits

..

No commits in common. "0b16aa6fe4996b34f617361f87c257cbf510ff45" and "12f2c927fa994791a8100f6e86b2e92c5f052365" have entirely different histories.

43 changed files with 631 additions and 2302 deletions

10
.gitignore vendored
View File

@ -5,19 +5,9 @@ __pycache__/
/exploration/*.ipynb
/config/*.ipynb
/statistical_analysis/*.ipynb
/presentation/*.ipynb
/machine_learning/intermediate_results/
/data/features/
/data/baseline/
/data/*input*.csv
/data/daily*
/data/intradaily*
/data/stressfulness_event*
/data/30min*
/presentation/*scores.csv
/presentation/Results.ods
.Rproj.user
.Rhistory
/presentation/*.nb.html
presentation/event_stressful_detection_half_loso.csv
presentation/event_stressful_detection_loso.csv

View File

@ -3,6 +3,5 @@
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
<mapping directory="$PROJECT_DIR$/rapids" vcs="Git" />
<mapping directory="$PROJECT_DIR$/rapids/calculatingfeatures" vcs="Git" />
</component>
</project>

View File

@ -7,10 +7,8 @@ dependencies:
- black
- isort
- flake8
- imbalanced-learn=0.10.0
- jupyterlab
- jupytext
- lightgbm
- mypy
- nodejs
- pandas

View File

@ -1,318 +0,0 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline
import os, sys, math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
def calc_entropy(column):
"""
Calculate entropy given a pandas series, list, or numpy array.
"""
# Compute the counts of each unique value in the column
counts = np.bincount(column)
# Divide by the total column length to get a probability
probabilities = counts / len(column)
# Initialize the entropy to 0
entropy = 0
# Loop through the probabilities, and add each one to the total entropy
for prob in probabilities:
if prob > 0:
# use log from math and set base to 2
entropy += prob * math.log(prob, 2)
return -entropy
def calc_information_gain(data, split_name, target_name):
"""
Calculate information gain given a data set, column to split on, and target
"""
# Calculate the original entropy
original_entropy = calc_entropy(data[target_name])
#Find the unique values in the column
values = data[split_name].unique()
# Make two subsets of the data, based on the unique values
left_split = data[data[split_name] == values[0]]
right_split = data[data[split_name] == values[1]]
# Loop through the splits and calculate the subset entropies
to_subtract = 0
for subset in [left_split, right_split]:
prob = (subset.shape[0] / data.shape[0])
to_subtract += prob * calc_entropy(subset[target_name])
# Return information gain
return original_entropy - to_subtract
def get_information_gains(data, target_name):
#Intialize an empty dictionary for information gains
information_gains = {}
#Iterate through each column name in our list
for col in list(data.columns):
#Find the information gain for the column
information_gain = calc_information_gain(data, col, target_name)
#Add the information gain to our dictionary using the column name as the ekey
information_gains[col] = information_gain
#Return the key with the highest value
#return max(information_gains, key=information_gains.get)
return information_gains
def n_features_with_highest_info_gain(info_gain_dict, n=None):
"""
Get n-features that have highest information gain
"""
if n is None:
n = len(info_gain_dict)
import heapq
n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
return {feature[0]: feature[1] for feature in n_largest}
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)
# Binarizacija targeta
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
print(model_input['target'].value_counts(), edges)
# %%
info_gains = get_information_gains(model_input, 'target')
# %% [markdown]
# Present the feature importance results
# %%
print("Total columns:", len(info_gains))
print(pd.Series(info_gains).value_counts())
n_features_with_highest_info_gain(info_gains, n=189)
# %%
def compute_impurity(feature, impurity_criterion):
"""
This function calculates impurity of a feature.
Supported impurity criteria: 'entropy', 'gini'
input: feature (this needs to be a Pandas series)
output: feature impurity
"""
probs = feature.value_counts(normalize=True)
if impurity_criterion == 'entropy':
impurity = -1 * np.sum(np.log2(probs) * probs)
elif impurity_criterion == 'gini':
impurity = 1 - np.sum(np.square(probs))
else:
raise ValueError('Unknown impurity criterion')
return impurity
def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
"""
This function calculates information gain for splitting on
a particular descriptive feature for a given dataset
and a given impurity criteria.
Supported split criterion: 'entropy', 'gini'
"""
if print_flag:
print('target feature:', target)
print('descriptive_feature:', descriptive_feature)
print('split criterion:', split_criterion)
target_entropy = compute_impurity(df[target], split_criterion)
# we define two lists below:
# entropy_list to store the entropy of each partition
# weight_list to store the relative number of observations in each partition
entropy_list = list()
weight_list = list()
# loop over each level of the descriptive feature
# to partition the dataset with respect to that level
# and compute the entropy and the weight of the level's partition
for level in df[descriptive_feature].unique():
df_feature_level = df[df[descriptive_feature] == level]
entropy_level = compute_impurity(df_feature_level[target], split_criterion)
entropy_list.append(round(entropy_level, 3))
weight_level = len(df_feature_level) / len(df)
weight_list.append(round(weight_level, 3))
# print('impurity of partitions:', entropy_list)
# print('weights of partitions:', weight_list)
feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
information_gain = target_entropy - feature_remaining_impurity
if print_flag:
print('impurity of partitions:', entropy_list)
print('weights of partitions:', weight_list)
print('remaining impurity:', feature_remaining_impurity)
print('information gain:', information_gain)
print('====================')
return information_gain
def calc_information_gain_2(data, split_name, target_name, split_criterion):
"""
Calculate information gain given a data set, column to split on, and target
"""
# Calculate the original impurity
original_impurity = compute_impurity(data[target_name], split_criterion)
#Find the unique values in the column
values = data[split_name].unique()
# Make two subsets of the data, based on the unique values
left_split = data[data[split_name] == values[0]]
right_split = data[data[split_name] == values[1]]
# Loop through the splits and calculate the subset impurities
to_subtract = 0
for subset in [left_split, right_split]:
prob = (subset.shape[0] / data.shape[0])
to_subtract += prob * compute_impurity(subset[target_name], split_criterion)
# Return information gain
return original_impurity - to_subtract
def get_information_gains_2(data, target_name, split_criterion):
#Intialize an empty dictionary for information gains
information_gains = {}
#Iterate through each column name in our list
for feature in list(data.columns):
#Find the information gain for the column
information_gain = calc_information_gain_2(model_input, target_name, feature, split_criterion)
#Add the information gain to our dictionary using the column name as the ekey
information_gains[feature] = information_gain
#Return the key with the highest value
#return max(information_gains, key=information_gains.get)
return information_gains
# %% [markdown]
# Present the feature importance results from other methods
# %%
split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
information_gains = get_information_gains_2(model_input, 'target', split_criterion)
print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
n_features_with_highest_info_gain(information_gains)
# %%
# Present the feature importance using a tree (that uses gini imputity measure)
split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)
X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
clf = DecisionTreeClassifier(criterion=split_criterion)
clf.fit(X, y)
feat_importance = clf.tree_.compute_feature_importances(normalize=False)
print("feat importance = ", feat_importance)
print("shape", feat_importance.shape)
tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
info_gains_dict[info_gains_dict > 0]
# %%
# Binarizacija vrednosti tree Information Gain-a
bins = [-0.1, 0, 0.1] # bins for target's correlations with features
cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True)
plt.title(f"Tree information gains by value ({split_criterion})")
cut_info_gains.value_counts().plot(kind='bar', color='purple')
plt.xticks(rotation=45, ha='right')
print(cut_info_gains.value_counts())
pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))
# %%
# Plot feature importance tree graph
plt.figure(figsize=(12,12))
tree.plot_tree(clf,
feature_names = list(model_input.drop(columns=['target', 'pid']).columns),
class_names=True,
filled = True, fontsize=5, max_depth=3)
plt.savefig('tree_high_dpi', dpi=800)
# %% [markdown]
# Present the feature importance by correlation with target
corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
# corrs.sort_values(ascending=False)
# Binarizacija vrednosti korelacij
bins = [0, 0.1, 0.2, 0.3] # bins for target's correlations with features
cut_corrs = pd.cut(corrs, bins=bins, labels=['very week (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True)
plt.title("Target's correlations with features")
cut_corrs.value_counts().plot(kind='bar')
plt.xticks(rotation=45, ha='right')
print(cut_corrs.value_counts())
print(corrs[corrs > 0.1]) # or corrs < -0.1])
# %%
# %%

View File

@ -1,328 +0,0 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline
import os, sys, math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn import metrics
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)
# Binarizacija targeta
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
# %%
# Add prefix to demographical features
demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile',
'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
new_names = [(col, "demo_"+col) for col in demo_features]
model_input.rename(columns=dict(new_names), inplace=True)
demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio',
'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M',
'demo_startlanguage_nl', 'demo_startlanguage_sl']
# %%
# Get phone and non-phone columns
import warnings
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
"""
This function makes predictions with sensor groups.
It takes in a dataframe (df), a list of group substrings (groups_substrings)
and an optional parameter include_group (default is True).
It creates a list of columns in the dataframe that contain the group substrings,
while excluding the 'pid' and 'target' columns. It then splits the data into training
and test sets, using a test size of 0.25 for the first split and 0.2 for the second split.
A SimpleImputer is used to fill in missing values with median values.
A LogisticRegression is then used to fit the training set and make predictions
on the test set. Finally, accuracy, precision, recall and F1 scores are printed
for each substring group depending on whether or not include_group
is set to True or False.
"""
best_sensor = None
best_recall_score, best_f1_score = None, None
for fgroup_substr in groups_substrings:
if fgroup_substr is None:
feature_group_cols = list(df.columns)
feature_group_cols.remove("pid")
feature_group_cols.remove("target")
else:
if include_group:
feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
else:
feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
nb = GaussianNB()
model_cv = cross_validate(
nb,
X=imputer.fit_transform(X),
y=y,
cv=StratifiedKFold(n_splits=5, shuffle=True),
n_jobs=-1,
scoring=('accuracy', 'precision', 'recall', 'f1')
)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
if print_flag:
if include_group:
print("\nPrediction with", fgroup_substr)
else:
print("\nPrediction without", fgroup_substr)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
acc = np.mean(model_cv['test_accuracy'])
acc_std = np.std(model_cv['test_accuracy'])
prec = np.mean(model_cv['test_precision'])
prec_std = np.std(model_cv['test_precision'])
rec = np.mean(model_cv['test_recall'])
rec_std = np.std(model_cv['test_recall'])
f1 = np.mean(model_cv['test_f1'])
f1_std = np.std(model_cv['test_f1'])
if print_flag:
print("************************************************")
print(f"Accuracy: {acc} (sd={acc_std})")
print(f"Precison: {prec} (sd={prec_std})")
print(f"Recall: {rec} (sd={rec_std})")
print(f"F1: {f1} (sd={f1_std})\n")
if (not best_recall_score and not best_f1_score) or (rec > best_recall_score):
best_sensor = fgroup_substr
best_recall_score, best_f1_score = rec, f1
best_recall_score_std, best_f1_score_std = rec_std, f1_std
return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std
# %% [markdown]
# ### sensor big feature groups (phone, empatica, demographical)
big_groups_substr = ["phone_", "empatica_", "demo_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)
# %% [markdown]
# ### Empatica sezor groups
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
# %% [markdown]
# ### Phone sensor groups
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_",
# "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
# %%
# Write all the sensors (phone, empatica), seperate other (demographical) cols also
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
"phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
# %%
def find_sensor_group_features_importance(model_input, sensor_groups_strings):
"""
This function finds the importance of sensor groups for a given model input. It takes two parameters:
model_input and sensor_groups_strings. It creates an empty list called sensor_importance_scores,
which will be populated with tuples containing the best sensor, its recall score, and its F1 score.
It then makes a copy of the model input and the sensor groups strings. It then loops through each group
in the list of strings, creating a list of important columns from the sensor importance scores list.
It then calls make_predictions_with_sensor_groups to determine the best sensor, its recall score,
and its F1 score. These values are added to the sensor importance scores list as a tuple. The function
then removes that best sensor from the list of strings before looping again until all groups have been evaluated.
Finally, it returns the populated list of tuples containing all sensors' scores.
"""
sensor_importance_scores = []
model_input = model_input.copy()
sensor_groups_strings = sensor_groups_strings.copy()
groups_len = len(sensor_groups_strings)
for i in range(groups_len):
important_cols = [col[0] for col in sensor_importance_scores]
with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std = \
make_predictions_with_sensor_groups(model_input,
groups_substrings=sensor_groups_strings, include_group=True,
with_cols=with_cols)
sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std ))
print(f"\nAdded sensor: {best_sensor}\n")
sensor_groups_strings.remove(best_sensor)
return sensor_importance_scores
# %%
# Method for sorting list of tuples into 3 lists
def sort_tuples_to_lists(list_of_tuples):
"""
sort_tuples_to_lists(list_of_tuples) is a method that takes in a list of tuples as an argument
and sorts them into three separate lists. The first list, xs, contains the first element
of each tuple. The second list, yrecall, contains the second element of each tuple rounded
to 4 decimal places. The third list, y_fscore, contains the third element of each tuple
rounded to 4 decimal places. The method returns all three lists.
"""
xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
for a_tuple in list_of_tuples:
xs.append(a_tuple[0])
y_recall.append(round(a_tuple[1], 4))
y_fscore.append(round(a_tuple[2], 4))
recall_std.append(round(a_tuple[3], 4))
fscore_std.append(round(a_tuple[4], 4))
return xs, y_recall, y_fscore, recall_std, fscore_std
def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
title="Sequential addition of features and its F1, and recall scores"):
"""
This function plots the sequential progress of feature addition scores using two subplots.
The first subplot is for recall scores and the second subplot is for F1-scores.
The parameters xs, yrecall, and yfscore are used to plot the data on the respective axes.
The title of the plot can be specified by the user using the parameter title.
The maximum recall index and maximum F1-score index are also plotted using a black dot.
The figure size is set to 18.5 inches in width and 10.5 inches in height,
and the x-axis labels are rotated by 90 degrees. Finally, the plot is displayed
using plt.show().
"""
fig, ax = plt.subplots(nrows=2, sharex=True)
ax[0].plot(xs, np.array(y_recall)+np.array(recall_std), linestyle=":", color='m') # Upper SD
ax[0].plot(xs, y_recall, color='red')
ax[0].plot(xs, np.array(y_recall)-np.array(recall_std), linestyle=":", color='m') # Lower SD
mrec_indx = np.argmax(y_recall)
ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
ax[0].legend(["Upper std", "Mean Recall", "Lower std"])
ax[1].plot(xs, np.array(y_fscore)+np.array(fscore_std), linestyle=":", color='c') # Upper SD
ax[1].plot(xs, y_fscore)
ax[1].plot(xs, np.array(y_fscore)-np.array(fscore_std), linestyle=":", color='c') # Lower SD
mfscore_indx = np.argmax(y_fscore)
ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])
fig.set_size_inches(18.5, 10.5)
ax[0].title.set_text('Recall scores')
ax[1].title.set_text('F1-scores')
plt.suptitle(title, fontsize=14)
plt.xticks(rotation=90)
plt.show()
# %%
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
"phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
# sensors_features_groups = ["phone_", "empatica_", "demo_"]
# %%
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)
# %% [markdown]
# ### Visualize sensors groups F1 and recall scores
print(sensor_groups_importance_scores)
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
title="Sequential addition of sensors and its F1, and recall scores")
# %%
# Take the most important feature group and investigate it feature-by-feature
best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
# best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
# xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(best_sensor_features_scores)
# %% [markdown]
# ### Visualize best sensor's F1 and recall scores
# print(best_sensor_features_scores)
# plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
# title="Best sensor addition it's features with F1 and recall scores")
# %%
# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
# It also saves the sequence of scores for all sensors' features in excel file
seq_columns = ["sensor_name", "feature_sequence", "recall", "f1_score"]
feature_sequence = pd.DataFrame(columns=seq_columns)
for i, sensor_group in enumerate(sensor_groups_importance_scores):
current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(current_sensor_features_scores)
feature_sequence = pd.concat([feature_sequence, pd.DataFrame({"sensor_name":sensor_group[0], "feature_sequence": [xs], "recall": [y_recall],
"f1_score": [y_fscore], "recall_std": [recall_std], "f1_std": [fscore_std]})])
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores")
feature_sequence.to_excel("all_sensors_sequential_addition_scores.xlsx", index=False)
# %%
# TODO: method that reads data from the excel file, specified above, and then the method,
# that selects only features that are max a thresh[%] below the max value (best for recall
# possibly for f1). This method should additionally take threshold parameter.
# %%

View File

@ -1,166 +0,0 @@
# -*- coding: utf-8 -*-
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
import os
import sys
import datetime
import math
import seaborn as sns
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
import participants.query_db
from features.esm import *
from features.esm_JCQ import *
from features.esm_SAM import *
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# %%
participants_inactive_usernames = participants.query_db.get_usernames(
collection_start=datetime.date.fromisoformat("2020-08-01")
)
df_esm_inactive = get_esm_data(participants_inactive_usernames)
# %%
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
# %% [markdown]
# Investigate stressfulness events
# %%
extracted_ers = df_esm_preprocessed.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
session_start_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].min().to_frame().rename(columns={'timestamp': 'session_start_timestamp'}) # questionnaire start timestamp
session_end_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
se_time = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
se_duration = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
# Make se_durations to the appropriate lengths
# Extracted 3 targets that will be transfered in the csv file to the cleaning script.
df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].columns
se_stressfulness_event_tg = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'appraisal_stressfulness_event'})
# All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count)
extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
.join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
.join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
.join(se_time, on=['device_id', 'esm_session'], how='left') \
.join(se_duration, on=['device_id', 'esm_session'], how='left') \
# Filter-out the sessions that are not useful. Because of the ambiguity this excludes:
# (1) straw event times that are marked as "0 - I don't remember"
# (2) straw event durations that are marked as "0 - I don't remember"
extracted_ers = extracted_ers[(~extracted_ers.se_time.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("Removed "))]
extracted_ers.reset_index(drop=True, inplace=True)
# Add default duration in case if participant answered that no stressful event occured
# Prepare data to fit the data structure in the CSV file ...
# Add the event time as the start of the questionnaire if no stress event occured
extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])
# Type could be an int (timestamp [ms]) which stays the same, and datetime str which is converted to timestamp in miliseconds
extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
extracted_ers['shift_direction'] = -1
""">>>>> begin section (could be optimized) <<<<<"""
# Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire
# is taken as end time of the segment. Else the user input duration is taken.
extracted_ers['temp_duration'] = extracted_ers['se_duration']
extracted_ers['se_duration'] = \
np.where(
extracted_ers['se_duration'].astype(str).str.startswith("1 - "),
extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'],
extracted_ers['se_duration']
)
# This converts the rows of timestamps in miliseconds and the rows with datetime... to timestamp in seconds.
extracted_ers['se_duration'] = \
extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else abs(pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60)
# Check whether min se_duration is at least the same duration as the ioi. Filter-out the rest.
""">>>>> end section <<<<<"""
# %% [markdown]
# Count negative values of duration
print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
print("Count negative durations (invalid se_time user input):", extracted_ers[extracted_ers['se_duration'] < 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
print("Count 0 durations:", extracted_ers[extracted_ers['se_duration'] == 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]
extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']]
ax = extracted_ers.hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)
hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna())
hist
bin_edges
extracted_ers = extracted_ers[extracted_ers['se_duration'] >= 0]
# %%
# bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos' ..... right=False
bins = [-100000000, -0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
extracted_ers['bins'], edges = pd.cut(extracted_ers.se_duration, bins=bins, labels=['neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
sns.displot(
data=extracted_ers.dropna(),
x="bins",
binwidth=0.1,
)
# %% [markdown]
extracted_ers[extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'] >= 0]
extracted_ers['se_time'].value_counts()
pd.set_option('display.max_rows', 100)
# Tukaj nas zanima, koliko so oddaljeni časi stresnega dogodka od konca vprašalnika.
extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()] # Remove no stress events
extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'])
print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
print("Count negative durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] < 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']])
print("Count 0 durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] == 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
extracted_ers[extracted_ers['diff_se_time_session_end'] < 0]['diff_se_time_session_end']
# extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)]
bins2 = [-100000, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
extracted_ers['bins2']
sns.displot(
data=extracted_ers.dropna(),
x="bins2",
binwidth=0.1,
)
extracted_ers.shape
extracted_ers.dropna().shape
print()
# %%
extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int)
print("duration-target (corr):", extracted_ers['se_duration'].corr(extracted_ers['appraisal_stressfulness_event_num']))
# %%
# Explore groupby participants?

View File

@ -1,49 +0,0 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
from machine_learning.cross_validation import CrossValidation
from machine_learning.preprocessing import Preprocessing
# %%
df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
df.set_index(index_columns, inplace=True)
cv = CrossValidation(data=df, cv_method="logo")
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
interval_feature_list, other_feature_list = [], []
print(df.columns.tolist())
for split in cv.get_splits():
train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
pre = Preprocessing(train_X, train_y, test_X, test_y)
pre.one_hot_encode_train_and_test_sets(categorical_columns)
train_X, train_y, test_X, test_y = pre.get_train_test_sets()
break
# %%

View File

@ -13,17 +13,20 @@
# name: straw2analysis
# ---
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
@ -36,32 +39,28 @@ nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
import machine_learning.helper
import machine_learning.labels
import machine_learning.model
# %% [markdown]
# # RAPIDS models
# %% [markdown]
# ## Set script's parameters
#
cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
# %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/stressfulness_event_nonstandardized/input_appraisal_stressfulness_event_mean.csv")
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input.set_index(index_columns, inplace=True)
model_input['target'].value_counts()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# bins = [-10, 0, 10] # bins for z-scored targets
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
# %% jupyter={"source_hidden": true}
# bins = [-10, -1, 1, 10] # bins for z-scored targets
bins = [0, 1, 4] # bins for stressfulness (1-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
model_input['target'].value_counts(), edges
# model_input = model_input[model_input['target'] != "medium"]
@ -69,34 +68,7 @@ model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x
model_input['target'].value_counts()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# UnderSampling
if undersampling:
no_stress = model_input[model_input['target'] == 0]
stress = model_input[model_input['target'] == 1]
no_stress = no_stress.sample(n=len(stress))
model_input = pd.concat([stress,no_stress], axis=0)
# model_input_new = pd.DataFrame(columns=model_input.columns)
# for pid in model_input["pid"].unique():
# stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
# no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
# if (len(stress) == 0):
# continue
# if (len(no_stress) == 0):
# continue
# model_input_new = pd.concat([model_input_new, stress], axis=0)
# no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
# # In case there are more stress samples than no_stress, take all instances of no_stress.
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
# model_input = model_input_new
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
if cv_method_str == 'half_logo':
if cv_method_str == 'halflogo':
model_input['pid_index'] = model_input.groupby('pid').cumcount()
model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
@ -108,7 +80,7 @@ else:
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
@ -128,8 +100,8 @@ numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
train_x = pd.concat([numerical_features, categorical_features], axis=1)
train_x.dtypes
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
# %% jupyter={"source_hidden": true}
cv_method = None # Defaults to 5 k-folds in cross_validate method
if cv_method_str == 'logo' or cv_method_str == 'half_logo':
cv_method = LeaveOneGroupOut()
cv_method.get_n_splits(
@ -138,16 +110,14 @@ if cv_method_str == 'logo' or cv_method_str == 'half_logo':
groups=data_groups,
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# %% [markdown]
# ### Baseline: Dummy Classifier (most frequent)
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
dummy_class = DummyClassifier(strategy="most_frequent")
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
dummy_classifier = cross_validate(
dummy_class,
X=imputer.fit_transform(train_x),
@ -156,36 +126,23 @@ dummy_classifier = cross_validate(
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
scoring=('accuracy', 'average_precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
print("Precision", np.mean(dummy_classifier['test_precision']))
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(dummy_classifier['test_accuracy']))
print("Precision", np.mean(dummy_classifier['test_average_precision']))
print("Recall", np.mean(dummy_classifier['test_recall']))
print("F1", np.mean(dummy_classifier['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown] nteract={"transient": {"deleting": false}}
# ### All models
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
# %%
final_scores.index.name = "metric"
final_scores = final_scores.set_index(["method", final_scores.index])
final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
# %% [markdown]
# ### Logistic Regression
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
logistic_regression = linear_model.LogisticRegression()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
log_reg_scores = cross_validate(
logistic_regression,
X=imputer.fit_transform(train_x),
@ -195,9 +152,8 @@ log_reg_scores = cross_validate(
n_jobs=-1,
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(log_reg_scores['test_accuracy']))
print("Precision", np.mean(log_reg_scores['test_precision']))
print("Recall", np.mean(log_reg_scores['test_recall']))
print("F1", np.mean(log_reg_scores['test_f1']))
@ -207,10 +163,10 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accurac
# %% [markdown]
# ### Support Vector Machine
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
svc = svm.SVC()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
svc_scores = cross_validate(
svc,
X=imputer.fit_transform(train_x),
@ -220,9 +176,8 @@ svc_scores = cross_validate(
n_jobs=-1,
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(svc_scores['test_accuracy']))
print("Precision", np.mean(svc_scores['test_precision']))
print("Recall", np.mean(svc_scores['test_recall']))
print("F1", np.mean(svc_scores['test_f1']))
@ -232,10 +187,10 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'],
# %% [markdown]
# ### Gaussian Naive Bayes
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
gaussian_nb = naive_bayes.GaussianNB()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
gaussian_nb_scores = cross_validate(
gaussian_nb,
X=imputer.fit_transform(train_x),
@ -246,9 +201,8 @@ gaussian_nb_scores = cross_validate(
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(gaussian_nb_scores['test_accuracy']))
print("Precision", np.mean(gaussian_nb_scores['test_precision']))
print("Recall", np.mean(gaussian_nb_scores['test_recall']))
print("F1", np.mean(gaussian_nb_scores['test_f1']))
@ -258,10 +212,10 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_acc
# %% [markdown]
# ### Stochastic Gradient Descent Classifier
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
sgdc = linear_model.SGDClassifier()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
sgdc_scores = cross_validate(
sgdc,
X=imputer.fit_transform(train_x),
@ -272,9 +226,8 @@ sgdc_scores = cross_validate(
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(sgdc_scores['test_accuracy']))
print("Precision", np.mean(sgdc_scores['test_precision']))
print("Recall", np.mean(sgdc_scores['test_recall']))
print("F1", np.mean(sgdc_scores['test_f1']))
@ -284,10 +237,10 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy']
# %% [markdown]
# ### K-nearest neighbors
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
knn = neighbors.KNeighborsClassifier()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
knn_scores = cross_validate(
knn,
X=imputer.fit_transform(train_x),
@ -298,9 +251,8 @@ knn_scores = cross_validate(
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(knn_scores['test_accuracy']))
print("Precision", np.mean(knn_scores['test_precision']))
print("Recall", np.mean(knn_scores['test_recall']))
print("F1", np.mean(knn_scores['test_f1']))
@ -310,10 +262,10 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'],
# %% [markdown]
# ### Decision Tree
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
dtree = tree.DecisionTreeClassifier()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
dtree_scores = cross_validate(
dtree,
X=imputer.fit_transform(train_x),
@ -324,9 +276,8 @@ dtree_scores = cross_validate(
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(dtree_scores['test_accuracy']))
print("Precision", np.mean(dtree_scores['test_precision']))
print("Recall", np.mean(dtree_scores['test_recall']))
print("F1", np.mean(dtree_scores['test_f1']))
@ -336,10 +287,10 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'
# %% [markdown]
# ### Random Forest Classifier
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
rfc = ensemble.RandomForestClassifier()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
rfc_scores = cross_validate(
rfc,
X=imputer.fit_transform(train_x),
@ -348,47 +299,23 @@ rfc_scores = cross_validate(
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1'),
return_estimator=True
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(rfc_scores['test_accuracy']))
print("Precision", np.mean(rfc_scores['test_precision']))
print("Recall", np.mean(rfc_scores['test_recall']))
print("F1", np.mean(rfc_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### Feature importance (RFC)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns))
for idx, estimator in enumerate(rfc_scores['estimator']):
feature_importances = pd.DataFrame(estimator.feature_importances_,
index = list(train_x.columns),
columns=['importance'])
# print("\nFeatures sorted by their score for estimator {}:".format(idx))
# print(feature_importances.sort_values('importance', ascending=False).head(10))
rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean()
pd.set_option('display.max_rows', 100)
print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
# %% [markdown]
# ### Gradient Boosting Classifier
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
gbc = ensemble.GradientBoostingClassifier()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
gbc_scores = cross_validate(
gbc,
X=imputer.fit_transform(train_x),
@ -399,9 +326,8 @@ gbc_scores = cross_validate(
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(gbc_scores['test_accuracy']))
print("Precision", np.mean(gbc_scores['test_precision']))
print("Recall", np.mean(gbc_scores['test_recall']))
print("F1", np.mean(gbc_scores['test_f1']))
@ -411,10 +337,10 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'],
# %% [markdown]
# ### LGBM Classifier
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
lgbm = LGBMClassifier()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
lgbm_scores = cross_validate(
lgbm,
X=imputer.fit_transform(train_x),
@ -425,9 +351,8 @@ lgbm_scores = cross_validate(
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(lgbm_scores['test_accuracy']))
print("Precision", np.mean(lgbm_scores['test_precision']))
print("Recall", np.mean(lgbm_scores['test_recall']))
print("F1", np.mean(lgbm_scores['test_f1']))
@ -437,10 +362,10 @@ print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy']
# %% [markdown]
# ### XGBoost Classifier
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
xgb_classifier = xg.sklearn.XGBClassifier()
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %% jupyter={"source_hidden": true}
xgb_classifier_scores = cross_validate(
xgb_classifier,
X=imputer.fit_transform(train_x),
@ -451,12 +376,10 @@ xgb_classifier_scores = cross_validate(
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(xgb_classifier_scores['test_accuracy']))
print("Precision", np.mean(xgb_classifier_scores['test_precision']))
print("Recall", np.mean(xgb_classifier_scores['test_recall']))
print("F1", np.mean(xgb_classifier_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))

View File

@ -31,6 +31,7 @@ from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyClassifier
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
from lightgbm import LGBMClassifier
import xgboost as xg
from sklearn.cluster import KMeans
@ -51,12 +52,12 @@ from machine_learning.classification_models import ClassificationModels
# %% [markdown]
# ## Set script's parameters
n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter)
cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
# %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
@ -65,7 +66,7 @@ model_input.columns[list(model_input.columns).index('age'):-1]
lime_cols = [col for col in model_input if col.startswith('limesurvey')]
lime_cols
lime_col = 'limesurvey_demand_control_ratio_quartile'
lime_col = 'limesurvey_demand_control_ratio'
clust_col = lime_col
model_input[clust_col].describe()
@ -74,10 +75,9 @@ model_input[clust_col].describe()
# %% jupyter={"source_hidden": true}
# Filter-out outlier rows by clust_col
#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
uniq = uniq.dropna()
plt.bar(uniq['pid'], uniq[clust_col])
# %% jupyter={"source_hidden": true}
@ -109,7 +109,7 @@ for k in range(n_clusters):
model_input_subset['target'].value_counts()
if cv_method_str == 'half_logo':
if cv_method_str == 'halflogo':
model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
@ -140,7 +140,7 @@ for k in range(n_clusters):
train_x = pd.concat([numerical_features, categorical_features], axis=1)
# Establish cv method
cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
cv_method = None # Defaults to 5 k-folds in cross_validate method
if cv_method_str == 'logo' or cv_method_str == 'half_logo':
cv_method = LeaveOneGroupOut()
cv_method.get_n_splits(

View File

@ -15,18 +15,26 @@
# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.dummy import DummyClassifier
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
from lightgbm import LGBMClassifier
import xgboost as xg
from sklearn.cluster import KMeans
from IPython.core.interactiveshell import InteractiveShell
@ -36,6 +44,8 @@ nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
import machine_learning.labels
import machine_learning.model
from machine_learning.classification_models import ClassificationModels
# %% [markdown]

View File

@ -120,7 +120,6 @@ sum(data_y.isna())
# %% [markdown]
# ### Baseline: Dummy Regression (mean)
# %%
dummy_regr = DummyRegressor(strategy="mean")
# %% jupyter={"source_hidden": true}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.5 MiB

View File

@ -1,121 +0,0 @@
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
class CrossValidation():
"""This code implements a CrossValidation class for creating cross validation splits.
"""
def __init__(self, data=None, cv_method='logo'):
"""This method initializes the cv_method argument and optionally prepares the data if supplied.
Args:
cv_method (str, optional): String of cross validation method; options are 'logo', 'half_logo' and '5kfold'.
Defaults to 'logo'.
data (DataFrame, optional): Pandas DataFrame with target, pid columns and other features as columns.
Defaults to None.
"""
self.initialize_cv_method(cv_method)
if data is not None:
self.prepare_data(data)
def prepare_data(self, data):
"""Prepares the data ready to be passed to the cross-validation algorithm, depending on the cv_method type.
For example, if cv_method is set to 'half_logo' new columns 'pid_index', 'pid_count', 'pid_half'
are added and used in the process.
Args:
data (_type_): Pandas DataFrame with target, pid columns and other features as columns.
"""
self.data = data
if self.cv_method == "logo":
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
elif self.cv_method == "half_logo":
data['pid_index'] = data.groupby('pid').cumcount()
data['pid_count'] = data.groupby('pid')['pid'].transform('count')
data["pid_index"] = (data['pid_index'] / data['pid_count'] + 1).round()
data["pid_half"] = data["pid"] + "_" + data["pid_index"].astype(int).astype(str)
data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
elif self.cv_method == "5kfold":
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
self.X, self.y, self.groups = data_X, data_y, data_groups
def initialize_cv_method(self, cv_method):
"""Initializes the given cv_method type. Depending on the type, the appropriate splitting technique is used.
Args:
cv_method (str): The type of cross-validation method to use; options are 'logo', 'half_logo' and '5kfold'.
Raises:
ValueError: If cv_method is not in the list of available methods, it raises an ValueError.
"""
self.cv_method = cv_method
if self.cv_method not in ["logo", "half_logo", "5kfold"]:
raise ValueError("Invalid cv_method input. Correct values are: 'logo', 'half_logo', '5kfold'")
if self.cv_method in ["logo", "half_logo"]:
self.cv = LeaveOneGroupOut()
elif self.cv_method == "5kfold":
self.cv = StratifiedKFold(n_splits=5, shuffle=True)
def get_splits(self):
"""Returns a generator object containing the cross-validation splits.
Raises:
ValueError: Raises ValueError if no data has been set.
"""
if not self.data.empty:
return self.cv.split(self.X, self.y, self.groups)
else:
raise ValueError("No data has been set. Use 'prepare_data(data)' method to set the data.")
def get_data(self):
"""data getter
Returns:
Pandas DataFrame: Returns the data from the class instance.
"""
return self.data
def get_x_y_groups(self):
"""X, y, and groups data getter
Returns:
Pandas DataFrame: Returns the data from the class instance.
"""
return self.X, self.y, self.groups
def get_train_test_sets(self, split):
"""Gets train and test sets, dependent on the split parameter. This method can be used in a specific splitting context,
where by index we can get train and test sets.
Args:
split (tuple of indices): It represents one iteration of the split generator (see get_splits method).
Returns:
tuple of Pandas DataFrames: This method returns train_X, train_y, test_X, test_y, with correctly indexed rows by split param.
"""
return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]

View File

@ -1,221 +0,0 @@
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Lasso
""" Feature selection pipeline: a methods that can be used in the wrapper metod alongside other wrapper contents (hyperparameter tuning etc.).
(1) Establish methods for each of the steps in feature selection protocol.
(2) Ensure that above methods are given only a part of data and use appropriate random seeds - to later simulate use case in production.
(3) Implement a method which gives graphical exploration of (1) (a) and (b) steps of the feature selection.
(4) Prepare a core method that can be fit into a wrapper (see sklearn wrapper methods) and integrates methods from (1)
"""
class FeatureSelection:
def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
pass # TODO....
def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
"""The method selects the best feature by testing the prediction on the feature set with or without the current feature.
The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat
feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
specified as a parameter.
Args:
df (DataFrame): Input data on which the predictions will be made.
features (list): List of features to select the best/worst from
method (str, optional): remove or add features. Defaults to "remove".
ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and metric. Defaults to "classification".
metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to [].
Raises:
ValueError: Raises if classification or regression metrics are not recognised if a specific ml_type is selected.
ValueError: If unknown ml_type is chosen.
Returns:
tuple: name of the best feature, best feature score, best feature score standard deviation.
"""
best_feature = None
if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
elif ml_type == "regression" and metric not in ['r2']:
raise ValueError("Regression metric not recognized. Please choose 'r2'")
for feat in features:
if method == "remove":
pred_features = [col for col in df.columns if feat != col] # All but feat
elif method == "add":
pred_features = [feat] + stored_features # Feat with stored features
X, y = df.drop(columns=['target', 'pid'])[pred_features], df['target']
if ml_type == "classification":
nb = GaussianNB()
model_cv = cross_validate(
nb,
X=X,
y=y,
cv=StratifiedKFold(n_splits=5, shuffle=True),
n_jobs=-1,
scoring=('accuracy', 'precision', 'recall', 'f1')
)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
if metric == "accuracy":
acc = np.mean(model_cv['test_accuracy'])
acc_std = np.std(model_cv['test_accuracy'])
if not best_feature or (acc > best_metric_score):
best_feature = feat
best_metric_score = acc
best_metric_score_std = acc_std
elif metric == "precision":
prec = np.mean(model_cv['test_precision'])
prec_std = np.std(model_cv['test_precision'])
if not best_feature or (prec > best_metric_score):
best_feature = feat
best_metric_score = prec
best_metric_score_std = prec_std
elif metric == "recall":
rec = np.mean(model_cv['test_recall'])
rec_std = np.std(model_cv['test_recall'])
if not best_feature or (rec > best_metric_score):
best_feature = feat
best_metric_score = rec
best_metric_score_std = rec_std
else:
f1 = np.mean(model_cv['test_f1'])
f1_std = np.std(model_cv['test_f1'])
if not best_feature or (f1 > best_metric_score):
best_feature = feat
best_metric_score = f1
best_metric_score_std = f1_std
elif ml_type == "regression":
lass = Lasso()
model_cv = cross_validate(
lass,
X=X,
y=y,
cv=StratifiedKFold(n_splits=5, shuffle=True),
n_jobs=-1,
scoring=('r2')
)
if metric == "r2":
r2 = np.mean(model_cv['test_r2'])
r2_std = np.std(model_cv['test_r2'])
if not best_feature or (r2 > best_metric_score):
best_feature = feat
best_metric_score = r2
best_metric_score_std = r2_std
else:
raise ValueError("ML type not yet implemented!")
return best_feature, best_metric_score, best_metric_score_std
def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
n_features = df.shape[1] - 2 # -2 beacause pid and target are not considered
if n_max > n_features:
n_max = n_features
if n_min > n_features:
raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
if n_max < n_min:
raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
features = df.columns.tolist()
features.remove("pid")
features.remove("target")
feature_importance = []
if method == "remove":
for i in reversed(range(n_features)):
best_feature, best_metric_score, best_metric_score_std = \
self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
features.remove(best_feature)
feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
# Selekcijski kriterij značilk v rangu max-min
# Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
# Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
# "Tipping point" značilka mora biti v rangu max-min
selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
selection_area.set_index(["i", "name"], inplace=True)
diffrences = selection_area.diff()
diffrences.dropna(how='any', inplace=True)
# Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo
cumulative_sumation = diffrences.cumsum()
tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
# Zelo konzervativna metoda, ki ob prvem neizboljšanjem rezultata preneha z iskanjem boljše alternative
tipping_feature_indx_2 = None
for indx, row in diffrences.iterrows():
if row["metric"] > 0:
tipping_feature_indx_2 = indx
else:
break
# Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score
tipping_feature_indx_3 = None
cum_sum_score = 0
i_worse = 0
# TODO: morda bi bilo smisleno združiti diff, cumsum in scores stolpce ...
for indx, row in selection_area.iterrows():
if row["metric"] > 0:
tipping_feature_indx_3 = indx
cum_sum_score += row["metric"]
i_worse = 0
else:
i_worse += 1
if i_worse == n_not_improve:
break
def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
pass
def vizualize_feature_selection_process():
pass
def execute_feature_selection_step():
pass

File diff suppressed because one or more lines are too long

View File

@ -1,12 +1,10 @@
from pathlib import Path
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor, DummyClassifier
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xg
from sklearn.dummy import DummyRegressor
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
@ -68,7 +66,7 @@ def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> P
def insert_row(df, row):
return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
def prepare_regression_model_input(input_csv):
def prepare_model_input(input_csv):
model_input = pd.read_csv(input_csv)
@ -78,9 +76,10 @@ def prepare_regression_model_input(input_csv):
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
categorical_feature_colnames = ["gender", "startlanguage", "limesurvey_demand_control_ratio_quartile"]
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
#TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
#additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
#TODO: check if mostcommonactivity is indeed a categorical features after aggregating
#categorical_feature_colnames += additional_categorical_features
categorical_features = data_x[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
# fillna with mode
@ -97,363 +96,172 @@ def prepare_regression_model_input(input_csv):
return train_x, data_y, data_groups
def run_all_regression_models(input_csv):
def run_all_models(input_csv):
# Prepare data
data_x, data_y, data_groups = prepare_regression_model_input(input_csv)
train_x, data_y, data_groups = prepare_model_input(input_csv)
# Prepare cross validation
logo = LeaveOneGroupOut()
logo.get_n_splits(
data_x,
train_x,
data_y,
groups=data_groups,
)
metrics = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']
test_metrics = ["test_" + metric for metric in metrics]
scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
scores = pd.DataFrame(columns=["method", "median", "max"])
# Validate models
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr_scores = cross_validate(
dummy_regr,
X=data_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
)
print("Dummy model:")
print("R^2: ", np.nanmedian(dummy_regr_scores['test_r2']))
scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "dummy"
scores = pd.concat([scores, scores_df])
lin_reg_rapids = linear_model.LinearRegression()
lin_reg_scores = cross_validate(
lin_reg_scores = cross_val_score(
lin_reg_rapids,
X=data_x,
X=train_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
scoring='r2'
)
print("Linear regression:")
print("R^2: ", np.nanmedian(lin_reg_scores['test_r2']))
scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "linear_reg"
scores = pd.concat([scores, scores_df])
print(np.median(lin_reg_scores))
scores = insert_row(scores, ["Linear regression",np.median(lin_reg_scores),np.max(lin_reg_scores)])
ridge_reg = linear_model.Ridge(alpha=.5)
ridge_reg_scores = cross_validate(
ridge_reg_scores = cross_val_score(
ridge_reg,
X=data_x,
X=train_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
scoring="r2"
)
print("Ridge regression")
scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "ridge_reg"
scores = pd.concat([scores, scores_df])
print(np.median(ridge_reg_scores))
scores = insert_row(scores, ["Ridge regression",np.median(ridge_reg_scores),np.max(ridge_reg_scores)])
lasso_reg = linear_model.Lasso(alpha=0.1)
lasso_reg_score = cross_validate(
lasso_reg_score = cross_val_score(
lasso_reg,
X=data_x,
X=train_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
scoring="r2"
)
print("Lasso regression")
scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "lasso_reg"
scores = pd.concat([scores, scores_df])
print(np.median(lasso_reg_score))
scores = insert_row(scores, ["Lasso regression",np.median(lasso_reg_score),np.max(lasso_reg_score)])
bayesian_ridge_reg = linear_model.BayesianRidge()
bayesian_ridge_reg_score = cross_validate(
bayesian_ridge_reg_score = cross_val_score(
bayesian_ridge_reg,
X=data_x,
X=train_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
scoring="r2"
)
print("Bayesian Ridge")
scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "bayesian_ridge"
scores = pd.concat([scores, scores_df])
print(np.median(bayesian_ridge_reg_score))
scores = insert_row(scores, ["Bayesian Ridge",np.median(bayesian_ridge_reg_score),np.max(bayesian_ridge_reg_score)])
ransac_reg = linear_model.RANSACRegressor()
ransac_reg_score = cross_validate(
ransac_reg_score = cross_val_score(
ransac_reg,
X=data_x,
X=train_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
scoring="r2"
)
print("RANSAC (outlier robust regression)")
scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "RANSAC"
scores = pd.concat([scores, scores_df])
print(np.median(ransac_reg_score))
scores = insert_row(scores, ["RANSAC",np.median(ransac_reg_score),np.max(ransac_reg_score)])
svr = svm.SVR()
svr_score = cross_validate(
svr_score = cross_val_score(
svr,
X=data_x,
X=train_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
scoring="r2"
)
print("Support vector regression")
scores_df = pd.DataFrame(svr_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "SVR"
scores = pd.concat([scores, scores_df])
print(np.median(svr_score))
scores = insert_row(scores, ["Support vector regression",np.median(svr_score),np.max(svr_score)])
kridge = kernel_ridge.KernelRidge()
kridge_score = cross_validate(
kridge_score = cross_val_score(
kridge,
X=data_x,
X=train_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
scoring="r2"
)
print("Kernel Ridge regression")
scores_df = pd.DataFrame(kridge_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "kernel_ridge"
scores = pd.concat([scores, scores_df])
print(np.median(kridge_score))
scores = insert_row(scores, ["Kernel Ridge regression",np.median(kridge_score),np.max(kridge_score)])
gpr = gaussian_process.GaussianProcessRegressor()
gpr_score = cross_validate(
gpr_score = cross_val_score(
gpr,
X=data_x,
X=train_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
scoring="r2"
)
print("Gaussian Process Regression")
scores_df = pd.DataFrame(gpr_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "gaussian_proc"
scores = pd.concat([scores, scores_df])
print(np.median(gpr_score))
scores = insert_row(scores, ["Gaussian Process Regression",np.median(gpr_score),np.max(gpr_score)])
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
rfr_score = cross_validate(
rfr_score = cross_val_score(
rfr,
X=data_x,
X=train_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
scoring="r2"
)
print("Random Forest Regression")
scores_df = pd.DataFrame(rfr_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "random_forest"
scores = pd.concat([scores, scores_df])
print(np.median(rfr_score))
scores = insert_row(scores, ["Random Forest Regression",np.median(rfr_score),np.max(rfr_score)])
xgb = XGBRegressor()
xgb_score = cross_validate(
xgb_score = cross_val_score(
xgb,
X=data_x,
X=train_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
scoring="r2"
)
print("XGBoost Regressor")
scores_df = pd.DataFrame(xgb_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "XGBoost"
scores = pd.concat([scores, scores_df])
print(np.median(xgb_score))
scores = insert_row(scores, ["XGBoost Regressor",np.median(xgb_score),np.max(xgb_score)])
ada = ensemble.AdaBoostRegressor()
ada_score = cross_validate(
ada_score = cross_val_score(
ada,
X=data_x,
X=train_x,
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring=metrics
scoring="r2"
)
print("ADA Boost Regressor")
scores_df = pd.DataFrame(ada_score)[test_metrics]
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
scores_df["method"] = "ADA_boost"
scores = pd.concat([scores, scores_df])
return scores
def run_all_classification_models(data_x, data_y, data_groups, cv_method):
metrics = ['accuracy', 'average_precision', 'recall', 'f1']
test_metrics = ["test_" + metric for metric in metrics]
scores = pd.DataFrame(columns=["method", "max", "mean"])
dummy_class = DummyClassifier(strategy="most_frequent")
dummy_score = cross_validate(
dummy_class,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=metrics
)
print("Dummy")
scores_df = pd.DataFrame(dummy_score)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "Dummy"
scores = pd.concat([scores, scores_df])
logistic_regression = linear_model.LogisticRegression()
log_reg_scores = cross_validate(
logistic_regression,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("Logistic regression")
scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "logistic_reg"
scores = pd.concat([scores, scores_df])
svc = svm.SVC()
svc_scores = cross_validate(
svc,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("Support Vector Machine")
scores_df = pd.DataFrame(svc_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "svc"
scores = pd.concat([scores, scores_df])
gaussian_nb = naive_bayes.GaussianNB()
gaussian_nb_scores = cross_validate(
gaussian_nb,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("Gaussian Naive Bayes")
scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "gaussian_naive_bayes"
scores = pd.concat([scores, scores_df])
sgdc = linear_model.SGDClassifier()
sgdc_scores = cross_validate(
sgdc,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("Stochastic Gradient Descent")
scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "stochastic_gradient_descent"
scores = pd.concat([scores, scores_df])
rfc = ensemble.RandomForestClassifier()
rfc_scores = cross_validate(
rfc,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("Random Forest")
scores_df = pd.DataFrame(rfc_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "random_forest"
scores = pd.concat([scores, scores_df])
xgb_classifier = XGBClassifier()
xgb_scores = cross_validate(
xgb_classifier,
X=data_x,
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=metrics
)
print("XGBoost")
scores_df = pd.DataFrame(xgb_scores)[test_metrics]
scores_df = scores_df.agg(['max', 'mean']).transpose()
scores_df["method"] = "xgboost"
scores = pd.concat([scores, scores_df])
print(np.median(ada_score))
scores = insert_row(scores, ["ADA Boost Regressor",np.median(ada_score),np.max(ada_score)])
return scores

View File

@ -1,126 +0,0 @@
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
class Preprocessing:
"""This class presents Preprocessing methods which can be used in context of an individual CV iteration or, simply, on whole data.
It's blind to the test data - e.g, it imputes the test data with train data mean.
This means, it somehow needs an access to the information about data split. In context
"""
def __init__(self, train_X, train_y, test_X, test_y):
self.train_X = train_X
self.train_y = train_y
self.test_X = test_X
self.test_y = test_y
def one_hot_encoder(self, categorical_features, numerical_features, mode):
"""
This code is an implementation of one-hot encoding. It takes in two data sets,
one with categorical features and one with numerical features and a mode parameter.
First it uses the fillna() function to fill in any missing values present in the
categorical data set with the mode value. Then it uses the apply () method to
convert each column of the data set into a category data type which is then
transformed using the pd.get_dummies() function. Finally it concatenates the
numerical data set and the transformed categorical data set using pd.concat() and
returns it.
Args:
categorical_features (DataFrame): DataFrame including only categorical columns.
numerical_features (_type_): DataFrame including only numerical columns.
mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
Returns:
DataFrame: Hot-One Encoded DataFrame.
"""
# Fill train set with mode
categorical_features = categorical_features.fillna(mode)
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
return pd.concat([numerical_features, categorical_features], axis=1)
def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
"""
This code is used to transform categorical data into numerical representations.
It first identifies the categorical columns, then copies them and saves them as
a new dataset. The missing data is filled with the mode (most frequent value in
the respective column). This new dataset is then subjected to one-hot encoding,
which is a process of transforming categorical data into machine interpretable
numerical form by converting categories into multiple binary outcome variables.
These encoded values are then concatenated to the numerical features prior to
being returned as the final dataset.
Args:
categorical_columns (list, optional): List of categorical columns in the dataset.
Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"].
"""
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
# For train set
train_X_categorical_features = self.train_X[categorical_columns].copy()
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
# For test set
test_X_categorical_features = self.test_X[categorical_columns].copy()
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
# TODO: TESTING
if groupby:
# Interval numerical features # TODO: How can we get and assign appropriate groupby means and assign them to correct columns?
# VVVVV ...... IN PROGRES ...... VVVVV
means = self.train_X[interval_feature_list].groupby(groupby_feature).mean()
self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + other_feature_list)]] = \
self.train_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
self.test_X[self.test_X.loc[:, ~self.test_X.columns.isin([groupby_feature] + other_feature_list)]] = \
self.test_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
# Other features
self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + interval_feature_list)]] = \
self.train_X[other_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.median()))
else:
# Interval numerical features
means = self.train_X[interval_feature_list].mean()
self.train_X[interval_feature_list].fillna(means, inplace=True)
self.test_X[interval_feature_list].fillna(means, inplace=True)
# Other features
medians = self.train_X[other_feature_list].median()
self.train_X[other_feature_list].fillna(medians, inplace=True)
self.test_X[other_feature_list].fillna(medians, inplace=True)
def get_train_test_sets(self):
"""Train and test sets getter
Returns:
tuple of Pandas DataFrames: Gets train test sets in traditional sklearn format.
"""
return self.train_X, self.train_y, self.test_X, self.test_y

View File

@ -1,51 +0,0 @@
library(conflicted)
library(yaml)
library(RPostgreSQL)
library(tidyverse)
conflicts_prefer(
dplyr::filter,
dplyr::lag
)
library(magrittr)
# read the password from file
credentials <- yaml.load_file("../rapids/credentials.yaml")
pw <- credentials$PSQL_STRAW$password
# load the PostgreSQL driver
drv <- RPostgres::Postgres()
# creates a connection to the postgres database
# note that "con" will be used later in each connection to the database
con <- RPostgres::dbConnect(drv,
dbname = "staw",
host = "eol.ijs.si", port = 5432,
user = "staw_db", password = pw
)
rm(pw, credentials) # removes the password
# check for the bluetooth table, an example
dbExistsTable(con, "app_categories")
df_app_categories <- tbl(con, "app_categories") %>%
collect()
head(df_app_categories)
table(df_app_categories$play_store_genre)
# Correct some mistakes
df_app_categories %<>% mutate(
play_store_genre = {
function(x) {
case_when(
x == "Education,Education" ~ "Education",
x == "EducationEducation" ~ "Education",
x == "not_found" ~ "System",
.default = x
)
}
}(play_store_genre)
)
dbDisconnect(con)

View File

@ -1,103 +0,0 @@
---
title: "Stressful event detection"
output: html_notebook
---
```{r chunk_options, include=FALSE}
knitr::opts_chunk$set(
comment = "#>", echo = FALSE, fig.width = 6
)
```
```{r libraries, include=FALSE}
library(knitr)
library(kableExtra)
library(stringr)
library(RColorBrewer)
library(magrittr)
library(tidyverse)
```
```{r fig_setup, include=FALSE}
accent <- RColorBrewer::brewer.pal(7, "Accent")
```
```{r read_data, include=FALSE}
podatki <- read_csv("E:/STRAWresults/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
podatki %<>% mutate(pid = as_factor(pid))
```
# Event descriptions
Participants were asked "Was there a particular event that created tension in you?" with the following options:
- 0 - No
- 1 - Yes, slightly
- 2 - Yes, moderately
- 3 - Yes, considerably
- 4 - Yes, extremely
If they answered anything but "No", they were also asked about the event's perceived threat (e.g. "Did this event make you feel anxious?") and challenge (e.g. "How eager are you to tackle this event?").
We only consider general "stressfulness" in this presentation.
Most of the time, nothing stressful happened:
```{r target_table}
kable(table(podatki$target), col.names = c("stressfulness", "frequency")) %>%
kable_styling(full_width = FALSE)
```
Most participants had somewhere between 0 and 10 stressful events.
```{r target_distribution}
podatki %>%
group_by(pid) %>%
summarise(no_of_events = sum(target > 0)) %>%
ggplot(aes(no_of_events)) +
geom_histogram(binwidth = 1, fill = accent[1]) +
coord_cartesian(expand = FALSE) +
labs(x = "Number of events per participant") +
theme_classic()
```
When a stressful event occurred, participants mostly perceived it as slightly to moderately stressful on average.
```{r mean_stressfulness_distribution}
podatki %>%
filter(target > 0) %>%
group_by(pid) %>%
summarise(mean_stressfulness = mean(target)) %>%
ggplot(aes(mean_stressfulness)) +
geom_histogram(binwidth = 0.1, fill = accent[1]) +
coord_cartesian(expand = FALSE) +
labs(x = "Mean stressfulness per participant") +
theme_classic()
```
# Problem description
We are trying to predict whether a stressful event occurred, i.e. stressfulness > 0, or not (stressfulness == 0).
First, set up a leave-one-subject-out validation and use original distribution of the class variable.
For this, the majority classifier has a mean accuracy of 0.85 (and median 0.90), while the F1-score, precision and recall are all 0.
We also have an option to validate the results differently, such as with "half-loso", i.e. leaving half of the subject's data in the training set and only use half for testing, or k-fold cross-validation.
Additionally, we can undersample the majority class to balance the dataset.
# Results
## Leave one subject out, original distribution
```{r event_detection}
scores <- read_csv("event_stressful_detection_loso.csv", col_types = "ffdd")
scores_wide <- scores %>%
select(!max) %>%
pivot_wider(names_from = metric,
names_sep = "_",
values_from = mean) %>%
rename_all(~str_replace(.,"^test_",""))
kable(scores_wide, digits = 2) %>%
column_spec(4, color = 'white', background = 'black') %>%
kable_styling(full_width = TRUE)
```

View File

@ -1,127 +0,0 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
import xgboost as xg
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from pathlib import Path
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
import machine_learning.labels
import machine_learning.model
from machine_learning.helper import run_all_classification_models
# %% [markdown]
# # RAPIDS models
# %% [markdown]
# ## Set script's parameters
#
# %%
cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
# %% jupyter={"source_hidden": true}
filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
model_input = pd.read_csv(filename)
# %% jupyter={"source_hidden": true}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input.set_index(index_columns, inplace=True)
model_input['target'].value_counts()
# %% jupyter={"source_hidden": true}
bins = [-10, -1, 1, 10] # bins for z-scored targets
# bins = [0, 1, 4] # bins for stressfulness (1-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'medium', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
model_input['target'].value_counts(), edges
model_input = model_input[model_input['target'] != "medium"]
model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
model_input['target'].value_counts()
if cv_method_str == 'halflogo':
model_input['pid_index'] = model_input.groupby('pid').cumcount()
model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
else:
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features
categorical_features = data_x[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
train_x = pd.concat([numerical_features, categorical_features], axis=1)
# %% jupyter={"source_hidden": true}
cv_method = None # Defaults to 5 k-folds in cross_validate method
if cv_method_str == 'logo' or cv_method_str == 'half_logo':
cv_method = LeaveOneGroupOut()
cv_method.get_n_splits(
train_x,
data_y,
groups=data_groups,
)
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# %%
final_scores = run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
# %%
final_scores.index.name = "metric"
final_scores = final_scores.set_index(["method", final_scores.index])
final_scores.to_csv("event_stressfulness_lmh_lh_scores.csv")

View File

@ -1,29 +0,0 @@
method,metric,max,mean
Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
Dummy,test_recall,0.0,0.0
Dummy,test_f1,0.0,0.0
logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
logistic_reg,test_f1,0.3909774436090226,0.318943511424051
svc,test_accuracy,0.8557046979865772,0.8548446932649828
svc,test_average_precision,0.44514416839823046,0.4068200938341621
svc,test_recall,0.0,0.0
svc,test_f1,0.0,0.0
gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
random_forest,test_recall,0.4069767441860465,0.35356856455493185
random_forest,test_f1,0.5691056910569107,0.5078402513053142
xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
xgboost,test_average_precision,0.7366643049075349,0.698622165966308
xgboost,test_recall,0.5287356321839081,0.44346431435445066
xgboost,test_f1,0.638888888888889,0.5633957169928393
1 method metric max mean
2 Dummy test_accuracy 0.8557046979865772 0.8548446932649828
3 Dummy test_average_precision 0.1457286432160804 0.14515530673501736
4 Dummy test_recall 0.0 0.0
5 Dummy test_f1 0.0 0.0
6 logistic_reg test_accuracy 0.8640939597315436 0.8504895843872606
7 logistic_reg test_average_precision 0.44363425265068757 0.37511495347389834
8 logistic_reg test_recall 0.3023255813953488 0.24266238973536486
9 logistic_reg test_f1 0.3909774436090226 0.318943511424051
10 svc test_accuracy 0.8557046979865772 0.8548446932649828
11 svc test_average_precision 0.44514416839823046 0.4068200938341621
12 svc test_recall 0.0 0.0
13 svc test_f1 0.0 0.0
14 gaussian_naive_bayes test_accuracy 0.7684563758389261 0.7479123806954234
15 gaussian_naive_bayes test_average_precision 0.2534828030085334 0.23379392278901853
16 gaussian_naive_bayes test_recall 0.42528735632183906 0.3924619085805935
17 gaussian_naive_bayes test_f1 0.34285714285714286 0.3107236284017699
18 stochastic_gradient_descent test_accuracy 0.8576214405360134 0.7773610783222601
19 stochastic_gradient_descent test_average_precision 0.3813093757959869 0.3617503752215592
20 stochastic_gradient_descent test_recall 0.686046511627907 0.2822507350975675
21 stochastic_gradient_descent test_f1 0.3652173913043478 0.21849107443075583
22 random_forest test_accuracy 0.9110738255033557 0.9011129472867694
23 random_forest test_average_precision 0.6998372262021191 0.6619275281099584
24 random_forest test_recall 0.4069767441860465 0.35356856455493185
25 random_forest test_f1 0.5691056910569107 0.5078402513053142
26 xgboost test_accuracy 0.9128978224455612 0.9007711937764886
27 xgboost test_average_precision 0.7366643049075349 0.698622165966308
28 xgboost test_recall 0.5287356321839081 0.44346431435445066
29 xgboost test_f1 0.638888888888889 0.5633957169928393

View File

@ -1,29 +0,0 @@
method,metric,max,mean
Dummy,test_accuracy,1.0,0.8524114578096439
Dummy,test_average_precision,0.7,0.14758854219035614
Dummy,test_recall,0.0,0.0
Dummy,test_f1,0.0,0.0
logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
logistic_reg,test_average_precision,1.0,0.44605167668563583
logistic_reg,test_recall,1.0,0.25353566685532386
logistic_reg,test_f1,0.823529411764706,0.27951926390778625
svc,test_accuracy,1.0,0.8524114578096439
svc,test_average_precision,0.9612401707068228,0.44179454944271934
svc,test_recall,0.0,0.0
svc,test_f1,0.0,0.0
gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
random_forest,test_accuracy,1.0,0.8722158105763481
random_forest,test_average_precision,1.0,0.49817066323226833
random_forest,test_recall,1.0,0.18161263127840668
random_forest,test_f1,1.0,0.2508096532365307
xgboost,test_accuracy,1.0,0.8812627400277729
xgboost,test_average_precision,1.0,0.5505695112208401
xgboost,test_recall,1.0,0.2896161238315027
xgboost,test_f1,0.9411764705882353,0.36887408735855665
1 method metric max mean
2 Dummy test_accuracy 1.0 0.8524114578096439
3 Dummy test_average_precision 0.7 0.14758854219035614
4 Dummy test_recall 0.0 0.0
5 Dummy test_f1 0.0 0.0
6 logistic_reg test_accuracy 0.9824561403508771 0.8445351955631311
7 logistic_reg test_average_precision 1.0 0.44605167668563583
8 logistic_reg test_recall 1.0 0.25353566685532386
9 logistic_reg test_f1 0.823529411764706 0.27951926390778625
10 svc test_accuracy 1.0 0.8524114578096439
11 svc test_average_precision 0.9612401707068228 0.44179454944271934
12 svc test_recall 0.0 0.0
13 svc test_f1 0.0 0.0
14 gaussian_naive_bayes test_accuracy 0.9 0.7491301746887129
15 gaussian_naive_bayes test_average_precision 0.9189430193277607 0.2833170327386991
16 gaussian_naive_bayes test_recall 1.0 0.3743761174081108
17 gaussian_naive_bayes test_f1 0.7000000000000001 0.2698456659235668
18 stochastic_gradient_descent test_accuracy 1.0 0.7926428596764739
19 stochastic_gradient_descent test_average_precision 1.0 0.4421948838597582
20 stochastic_gradient_descent test_recall 1.0 0.30156420704502945
21 stochastic_gradient_descent test_f1 0.8148148148148148 0.24088393234361388
22 random_forest test_accuracy 1.0 0.8722158105763481
23 random_forest test_average_precision 1.0 0.49817066323226833
24 random_forest test_recall 1.0 0.18161263127840668
25 random_forest test_f1 1.0 0.2508096532365307
26 xgboost test_accuracy 1.0 0.8812627400277729
27 xgboost test_average_precision 1.0 0.5505695112208401
28 xgboost test_recall 1.0 0.2896161238315027
29 xgboost test_f1 0.9411764705882353 0.36887408735855665

View File

@ -1,60 +0,0 @@
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: Python 3.10.8 ('straw2analysis')
# language: python
# name: python3
# ---
# %%
# %matplotlib inline
import datetime
import importlib
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor
import xgboost as xg
from pathlib import Path
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
import machine_learning.features_sensor
import machine_learning.labels
import machine_learning.model
import machine_learning.helper
# %% tags=["active-ipynb"]
# filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
# filename = Path('C:/Users/Primoz/VSCodeProjects/straw2analysis/data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv')
# %%
final_scores = machine_learning.helper.run_all_regression_models(filename)
# %%
final_scores.index.name = "metric"
final_scores = final_scores.set_index(["method", final_scores.index])
# %%
final_scores.to_csv("event_stressfulness_scores.csv")

View File

@ -1,17 +0,0 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
AutoAppendNewline: Yes
SpellingDictionary: en_GB

View File

@ -1,131 +0,0 @@
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
# %matplotlib inline
import yaml
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
import os
import importlib
import matplotlib.pyplot as plt
import sys
import numpy as np
import seaborn as sns
import pandas as pd
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
# %%
from machine_learning import pipeline, features_sensor, labels, model
# %%
importlib.reload(labels)
# %%
with open("./config/prox_comm_PANAS_features.yaml", "r") as file:
sensor_features_params = yaml.safe_load(file)
sensor_features = features_sensor.SensorFeatures(**sensor_features_params)
#sensor_features.set_sensor_data()
sensor_features.calculate_features(cached=True)
# %%
all_features = sensor_features.get_features("all","all")
# %%
with open("./config/prox_comm_PANAS_labels.yaml", "r") as file:
labels_params = yaml.safe_load(file)
labels_current = labels.Labels(**labels_params)
#labels_current.set_labels()
labels_current.aggregate_labels(cached=True)
# %%
model_validation = model.ModelValidation(
sensor_features.get_features("all", "all"),
labels_current.get_aggregated_labels(),
group_variable="participant_id",
cv_name="loso",
)
model_validation.model = linear_model.LinearRegression()
model_validation.set_cv_method()
# %%
model_loso_r2 = model_validation.cross_validate()
# %%
print(model_loso_r2)
print(np.mean(model_loso_r2))
# %%
model_loso_r2[model_loso_r2 > 0]
# %%
logo = LeaveOneGroupOut()
# %%
try_X = model_validation.X.reset_index().drop(["participant_id","date_lj"], axis=1)
try_y = model_validation.y.reset_index().drop(["participant_id","date_lj"], axis=1)
# %%
model_loso_mean_absolute_error = -1 * cross_val_score(
estimator=model_validation.model,
X=try_X,
y=try_y,
groups=model_validation.groups,
cv=logo.split(X=try_X, y=try_y, groups=model_validation.groups),
scoring='neg_mean_absolute_error'
)
# %%
model_loso_mean_absolute_error
# %%
np.median(model_loso_mean_absolute_error)
# %%
model_validation.model.fit(try_X, try_y)
# %%
Y_predicted = model_validation.model.predict(try_X)
# %%
try_y.rename(columns={"NA": "NA_true"}, inplace=True)
try_y["NA_predicted"] = Y_predicted
NA_long = pd.wide_to_long(
try_y.reset_index(),
i="index",
j="value",
stubnames="NA",
sep="_",
suffix=".+",
)
# %%
g1 = sns.displot(NA_long, x="NA", hue="value", binwidth=0.1, height=5, aspect=1.5)
sns.move_legend(g1, "upper left", bbox_to_anchor=(.55, .45))
g1.set_axis_labels("Daily mean", "Day count")
display(g1)
g1.savefig("prox_comm_PANAS_predictions.pdf")
# %%
from sklearn.metrics import mean_absolute_error
# %%
mean_absolute_error(try_y["NA_true"], try_y["NA_predicted"])
# %%
model_loso_mean_absolute_error

2
rapids

@ -1 +1 @@
Subproject commit 7b8538ce5152bb6e978cae37fcb7099941e95364
Subproject commit f78aa3e7b3567423b44045766b230cd60d557cb0