diff --git a/exploration/all_sensors_sequential_addition_scores.xlsx b/exploration/all_sensors_sequential_addition_scores.xlsx
new file mode 100644
index 0000000..a990393
Binary files /dev/null and b/exploration/all_sensors_sequential_addition_scores.xlsx differ
diff --git a/exploration/expl_features_analysis.py b/exploration/expl_features_analysis.py
new file mode 100644
index 0000000..056e54c
--- /dev/null
+++ b/exploration/expl_features_analysis.py
@@ -0,0 +1,318 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %matplotlib inline
+
+import os, sys, math
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+from sklearn.tree import DecisionTreeClassifier
+from sklearn import tree
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
+
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+def calc_entropy(column):
+    """
+    Calculate entropy given a pandas series, list, or numpy array
+    of non-negative integers (np.bincount requires them).
+    """
+    # Compute the counts of each unique value in the column
+    counts = np.bincount(column)
+    # Divide by the total column length to get a probability
+    probabilities = counts / len(column)
+
+    # Initialize the entropy to 0
+    entropy = 0
+    # Loop through the probabilities, and add each one to the total entropy
+    for prob in probabilities:
+        if prob > 0:
+            # use log from math and set base to 2
+            entropy += prob * math.log(prob, 2)
+
+    return -entropy
+
+
+def calc_information_gain(data, split_name, target_name):
+    """
+    Calculate information gain given a data set, column to split on, and target.
+    Assumes the split column is binary (exactly two unique values).
+    """
+    # Calculate the original entropy
+    original_entropy = calc_entropy(data[target_name])
+    # Find the unique values in the column
+    values = data[split_name].unique()
+
+    # Make two subsets of the data, based on the unique values
+    left_split = data[data[split_name] == values[0]]
+    right_split = data[data[split_name] == values[1]]
+
+    # Loop through the splits and calculate the subset entropies
+    to_subtract = 0
+    for subset in [left_split, right_split]:
+        prob = (subset.shape[0] / data.shape[0])
+        to_subtract += prob * calc_entropy(subset[target_name])
+
+    # Return information gain
+    return original_entropy - to_subtract
+
+
+def get_information_gains(data, target_name):
+    # Initialize an empty dictionary for information gains
+    information_gains = {}
+
+    # Iterate through each column name in our list
+    for col in list(data.columns):
+        # Find the information gain for the column
+        information_gain = calc_information_gain(data, col, target_name)
+        # Add the information gain to our dictionary using the column name as the key
+        information_gains[col] = information_gain
+
+    # Return the key with the highest value
+    # return max(information_gains, key=information_gains.get)
+
+    return information_gains
+
+
+def n_features_with_highest_info_gain(info_gain_dict, n=None):
+    """
+    Get the n features with the highest information gain.
+    """
+    if n is None:
+        n = len(info_gain_dict)
+    import heapq
+    n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
+    return {feature[0]: feature[1] for feature in n_largest}
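# %%
# A quick illustrative check of the helpers above on a toy frame (made-up
# values, not study data): a 3:1 target split has entropy ~0.811, and splitting
# on "feature" leaves a weighted entropy of 0.5, so the gain should be ~0.311.
toy = pd.DataFrame({"feature": [0, 0, 1, 1], "target": [0, 0, 1, 0]})
print(calc_entropy(toy["target"]))                      # ~0.811
print(calc_information_gain(toy, "feature", "target"))  # ~0.311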
nteract={"transient": {"deleting": false}} +index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] +model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns) + +categorical_feature_colnames = ["gender", "startlanguage"] +additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col] +categorical_feature_colnames += additional_categorical_features + +categorical_features = model_input[categorical_feature_colnames].copy() +mode_categorical_features = categorical_features.mode().iloc[0] + +# fillna with mode +categorical_features = categorical_features.fillna(mode_categorical_features) + +# one-hot encoding +categorical_features = categorical_features.apply(lambda col: col.astype("category")) +if not categorical_features.empty: + categorical_features = pd.get_dummies(categorical_features) + +numerical_features = model_input.drop(categorical_feature_colnames, axis=1) +model_input = pd.concat([numerical_features, categorical_features], axis=1) + +# Binarizacija targeta +bins = [-1, 0, 4] # bins for stressfulness (0-4) target +model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True) +print(model_input['target'].value_counts(), edges) + +# %% +info_gains = get_information_gains(model_input, 'target') + +# %% [markdown] +# Present the feature importance results + +# %% +print("Total columns:", len(info_gains)) +print(pd.Series(info_gains).value_counts()) + +n_features_with_highest_info_gain(info_gains, n=189) + +# %% +def compute_impurity(feature, impurity_criterion): + """ + This function calculates impurity of a feature. + Supported impurity criteria: 'entropy', 'gini' + input: feature (this needs to be a Pandas series) + output: feature impurity + """ + probs = feature.value_counts(normalize=True) + + if impurity_criterion == 'entropy': + impurity = -1 * np.sum(np.log2(probs) * probs) + elif impurity_criterion == 'gini': + impurity = 1 - np.sum(np.square(probs)) + else: + raise ValueError('Unknown impurity criterion') + + return impurity + + +def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False): + """ + This function calculates information gain for splitting on + a particular descriptive feature for a given dataset + and a given impurity criteria. 
+
+
+def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
+    """
+    This function calculates the information gain for splitting on
+    a particular descriptive feature for a given dataset
+    and a given impurity criterion.
+    Supported split criteria: 'entropy', 'gini'
+    """
+    if print_flag:
+        print('target feature:', target)
+        print('descriptive_feature:', descriptive_feature)
+        print('split criterion:', split_criterion)
+
+    target_entropy = compute_impurity(df[target], split_criterion)
+
+    # we define two lists below:
+    # entropy_list to store the entropy of each partition
+    # weight_list to store the relative number of observations in each partition
+    entropy_list = list()
+    weight_list = list()
+
+    # loop over each level of the descriptive feature
+    # to partition the dataset with respect to that level
+    # and compute the entropy and the weight of the level's partition
+    for level in df[descriptive_feature].unique():
+        df_feature_level = df[df[descriptive_feature] == level]
+        entropy_level = compute_impurity(df_feature_level[target], split_criterion)
+        entropy_list.append(round(entropy_level, 3))
+        weight_level = len(df_feature_level) / len(df)
+        weight_list.append(round(weight_level, 3))
+
+    # print('impurity of partitions:', entropy_list)
+    # print('weights of partitions:', weight_list)
+
+    feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
+
+    information_gain = target_entropy - feature_remaining_impurity
+
+    if print_flag:
+        print('impurity of partitions:', entropy_list)
+        print('weights of partitions:', weight_list)
+        print('remaining impurity:', feature_remaining_impurity)
+        print('information gain:', information_gain)
+        print('====================')
+
+    return information_gain
+
+
+def calc_information_gain_2(data, split_name, target_name, split_criterion):
+    """
+    Calculate information gain given a data set, column to split on, and target.
+    """
+    # Calculate the original impurity
+    original_impurity = compute_impurity(data[target_name], split_criterion)
+    # Find the unique values in the column
+    values = data[split_name].unique()
+
+    # Make two subsets of the data, based on the unique values
+    left_split = data[data[split_name] == values[0]]
+    right_split = data[data[split_name] == values[1]]
+
+    # Loop through the splits and calculate the subset impurities
+    to_subtract = 0
+    for subset in [left_split, right_split]:
+        prob = (subset.shape[0] / data.shape[0])
+        to_subtract += prob * compute_impurity(subset[target_name], split_criterion)
+
+    # Return information gain
+    return original_impurity - to_subtract
+
+
+def get_information_gains_2(data, target_name, split_criterion):
+    # Initialize an empty dictionary for information gains
+    information_gains = {}
+
+    # Iterate through each column name in our list
+    for feature in list(data.columns):
+        # Find the information gain for the column
+        information_gain = calc_information_gain_2(data, feature, target_name, split_criterion)
+        # Add the information gain to our dictionary using the column name as the key
+        information_gains[feature] = information_gain
+
+    # Return the key with the highest value
+    # return max(information_gains, key=information_gains.get)
+
+    return information_gains
+
+# %% [markdown]
+# Present the feature importance results from other methods
+
+# %%
+split_criterion = 'entropy'
+print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
+information_gains = get_information_gains_2(model_input, 'target', split_criterion)
+print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
+n_features_with_highest_info_gain(information_gains)
+
+# %%
+# Present the feature importance using a decision tree
+# (the criterion is set below: 'entropy' here, gini impurity being the sklearn default)
+split_criterion = 'entropy'
+print("Target impurity:", compute_impurity(model_input['target'], split_criterion)) + +X, y = model_input.drop(columns=['target', 'pid']), model_input['target'] +imputer = SimpleImputer(missing_values=np.nan, strategy='median') +X = imputer.fit_transform(X) +X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25) + + +clf = DecisionTreeClassifier(criterion=split_criterion) +clf.fit(X, y) + +feat_importance = clf.tree_.compute_feature_importances(normalize=False) +print("feat importance = ", feat_importance) +print("shape", feat_importance.shape) +tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist())) +info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp)) +info_gains_dict[info_gains_dict > 0] + +# %% +# Binarizacija vrednosti tree Information Gain-a +bins = [-0.1, 0, 0.1] # bins for target's correlations with features +cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True) +plt.title(f"Tree information gains by value ({split_criterion})") +cut_info_gains.value_counts().plot(kind='bar', color='purple') +plt.xticks(rotation=45, ha='right') +print(cut_info_gains.value_counts()) + + +pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20)) + +# %% +# Plot feature importance tree graph +plt.figure(figsize=(12,12)) +tree.plot_tree(clf, + feature_names = list(model_input.drop(columns=['target', 'pid']).columns), + class_names=True, + filled = True, fontsize=5, max_depth=3) + +plt.savefig('tree_high_dpi', dpi=800) + + +# %% [markdown] +# Present the feature importance by correlation with target + +corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int)))) +# corrs.sort_values(ascending=False) + +# Binarizacija vrednosti korelacij +bins = [0, 0.1, 0.2, 0.3] # bins for target's correlations with features +cut_corrs = pd.cut(corrs, bins=bins, labels=['very week (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True) +plt.title("Target's correlations with features") +cut_corrs.value_counts().plot(kind='bar') +plt.xticks(rotation=45, ha='right') +print(cut_corrs.value_counts()) +print(corrs[corrs > 0.1]) # or corrs < -0.1]) +# %% + +# %% diff --git a/exploration/expl_features_groups_analysis.py b/exploration/expl_features_groups_analysis.py new file mode 100644 index 0000000..3a8e375 --- /dev/null +++ b/exploration/expl_features_groups_analysis.py @@ -0,0 +1,328 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.13.0 +# kernelspec: +# display_name: straw2analysis +# language: python +# name: straw2analysis +# --- + +# %% jupyter={"source_hidden": false, "outputs_hidden": false} +# %matplotlib inline + +import os, sys, math + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.impute import SimpleImputer +from sklearn.naive_bayes import GaussianNB +from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold +from sklearn import metrics + +# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}} +index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] +model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns) + 
diff --git a/exploration/expl_features_groups_analysis.py b/exploration/expl_features_groups_analysis.py
new file mode 100644
index 0000000..3a8e375
--- /dev/null
+++ b/exploration/expl_features_groups_analysis.py
@@ -0,0 +1,328 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false}
+# %matplotlib inline
+
+import os, sys, math
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.impute import SimpleImputer
+from sklearn.naive_bayes import GaussianNB
+from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
+from sklearn import metrics
+
+# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
+
+categorical_feature_colnames = ["gender", "startlanguage"]
+additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
+categorical_feature_colnames += additional_categorical_features
+
+categorical_features = model_input[categorical_feature_colnames].copy()
+mode_categorical_features = categorical_features.mode().iloc[0]
+
+# fillna with mode
+categorical_features = categorical_features.fillna(mode_categorical_features)
+
+# one-hot encoding
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+
+numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
+model_input = pd.concat([numerical_features, categorical_features], axis=1)
+
+# Binarize the target
+bins = [-1, 0, 4]  # bins for stressfulness (0-4) target
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
+
+print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
+print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
+
+
+# %%
+# Add prefix to demographic features
+demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile',
+                 'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
+
+new_names = [(col, "demo_"+col) for col in demo_features]
+model_input.rename(columns=dict(new_names), inplace=True)
+
+demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio',
+                 'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M',
+                 'demo_startlanguage_nl', 'demo_startlanguage_sl']
+
+# %%
+# Get phone and non-phone columns
+import warnings
+
+def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
+    """
+    This function makes predictions with sensor groups.
+    It takes a dataframe (df), a list of group substrings (groups_substrings)
+    and an optional parameter include_group (default is True).
+    For each substring it selects the columns of the dataframe that contain
+    the substring (or, with include_group=False, do not contain it), excluding
+    the 'pid' and 'target' columns. It then holds out 20% of the data with a
+    stratified split, imputes missing values with column medians, and evaluates
+    a Gaussian Naive Bayes classifier with stratified 5-fold cross-validation.
+    Mean accuracy, precision, recall and F1 scores are printed for each
+    substring group, and the group with the best recall is returned.
+ + """ + + best_sensor = None + best_recall_score, best_f1_score = None, None + + for fgroup_substr in groups_substrings: + if fgroup_substr is None: + feature_group_cols = list(df.columns) + feature_group_cols.remove("pid") + feature_group_cols.remove("target") + else: + if include_group: + feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']] + else: + feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']] + + + X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target'] + X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2) + + imputer = SimpleImputer(missing_values=np.nan, strategy='median') + + nb = GaussianNB() + model_cv = cross_validate( + nb, + X=imputer.fit_transform(X), + y=y, + cv=StratifiedKFold(n_splits=5, shuffle=True), + n_jobs=-1, + scoring=('accuracy', 'precision', 'recall', 'f1') + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2) + + + if print_flag: + if include_group: + print("\nPrediction with", fgroup_substr) + else: + print("\nPrediction without", fgroup_substr) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.") + + acc = np.mean(model_cv['test_accuracy']) + acc_std = np.std(model_cv['test_accuracy']) + + prec = np.mean(model_cv['test_precision']) + prec_std = np.std(model_cv['test_precision']) + + rec = np.mean(model_cv['test_recall']) + rec_std = np.std(model_cv['test_recall']) + + f1 = np.mean(model_cv['test_f1']) + f1_std = np.std(model_cv['test_f1']) + + if print_flag: + print("************************************************") + print(f"Accuracy: {acc} (sd={acc_std})") + print(f"Precison: {prec} (sd={prec_std})") + print(f"Recall: {rec} (sd={rec_std})") + print(f"F1: {f1} (sd={f1_std})\n") + + if (not best_recall_score and not best_f1_score) or (rec > best_recall_score): + best_sensor = fgroup_substr + best_recall_score, best_f1_score = rec, f1 + best_recall_score_std, best_f1_score_std = rec_std, f1_std + + return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std + +# %% [markdown] +# ### sensor big feature groups (phone, empatica, demographical) +big_groups_substr = ["phone_", "empatica_", "demo_"] +make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False) + +# %% [markdown] +# ### Empatica sezor groups +# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True) +# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"] +# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False) + +# %% [markdown] +# ### Phone sensor groups +# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True) +# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", +# "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"] +# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False) + +# %% +# Write all the sensors (phone, empatica), seperate other (demographical) cols also + 
+sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                           "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
+                           "phone_locations_", "phone_messages", "phone_screen_"]  # , "phone_speech_"]
+
+# %%
+def find_sensor_group_features_importance(model_input, sensor_groups_strings):
+    """
+    This function finds the importance of sensor groups for a given model input.
+    It takes two parameters: model_input and sensor_groups_strings. It greedily
+    builds a list called sensor_importance_scores, populated with tuples of the
+    best sensor, its recall score, its F1 score, and the standard deviations of
+    both. In each iteration it keeps the columns of all sensors selected so far,
+    calls make_predictions_with_sensor_groups to determine the best remaining
+    sensor, appends that sensor's scores to the list, and removes the sensor
+    from the candidate list, looping until all groups have been evaluated.
+    Finally, it returns the populated list of tuples with all sensors' scores.
+    """
+    sensor_importance_scores = []
+    model_input = model_input.copy()
+    sensor_groups_strings = sensor_groups_strings.copy()
+    groups_len = len(sensor_groups_strings)
+    for i in range(groups_len):
+        important_cols = [col[0] for col in sensor_importance_scores]
+        with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
+
+
+        best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std = \
+            make_predictions_with_sensor_groups(model_input,
+                                                groups_substrings=sensor_groups_strings, include_group=True,
+                                                with_cols=with_cols)
+        sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std))
+        print(f"\nAdded sensor: {best_sensor}\n")
+        sensor_groups_strings.remove(best_sensor)
+
+    return sensor_importance_scores
+
+
+# %%
+# Method for unpacking a list of tuples into 5 lists
+def sort_tuples_to_lists(list_of_tuples):
+    """
+    sort_tuples_to_lists(list_of_tuples) takes a list of tuples and unpacks it
+    into five separate lists. The first list, xs, contains the first element of
+    each tuple. The lists y_recall and y_fscore contain the second and third
+    elements (recall and F1 score), and recall_std and fscore_std the fourth and
+    fifth (their standard deviations), each rounded to 4 decimal places.
+    The method returns all five lists.
+    """
+    xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
+    for a_tuple in list_of_tuples:
+        xs.append(a_tuple[0])
+        y_recall.append(round(a_tuple[1], 4))
+        y_fscore.append(round(a_tuple[2], 4))
+        recall_std.append(round(a_tuple[3], 4))
+        fscore_std.append(round(a_tuple[4], 4))
+    return xs, y_recall, y_fscore, recall_std, fscore_std
+
+def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+                                                        title="Sequential addition of features and their F1 and recall scores"):
+    """
+    This function plots the sequential progress of feature addition scores using two subplots.
+    The first subplot is for recall scores and the second subplot is for F1-scores.
+    The parameters xs, y_recall, and y_fscore are used to plot the data on the respective axes.
+    The title of the plot can be specified by the user using the parameter title.
+    The maximum recall index and maximum F1-score index are also marked with a black dot.
+    The figure size is set to 18.5 inches in width and 10.5 inches in height,
+    and the x-axis labels are rotated by 90 degrees. Finally, the plot is displayed
+    using plt.show().
+    """
+
+    fig, ax = plt.subplots(nrows=2, sharex=True)
+    ax[0].plot(xs, np.array(y_recall)+np.array(recall_std), linestyle=":", color='m')  # Upper SD
+    ax[0].plot(xs, y_recall, color='red')
+    ax[0].plot(xs, np.array(y_recall)-np.array(recall_std), linestyle=":", color='m')  # Lower SD
+    mrec_indx = np.argmax(y_recall)
+    ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
+    ax[0].legend(["Upper std", "Mean Recall", "Lower std"])
+
+    ax[1].plot(xs, np.array(y_fscore)+np.array(fscore_std), linestyle=":", color='c')  # Upper SD
+    ax[1].plot(xs, y_fscore)
+    ax[1].plot(xs, np.array(y_fscore)-np.array(fscore_std), linestyle=":", color='c')  # Lower SD
+    mfscore_indx = np.argmax(y_fscore)
+    ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
+    ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])
+
+    fig.set_size_inches(18.5, 10.5)
+
+    ax[0].title.set_text('Recall scores')
+    ax[1].title.set_text('F1-scores')
+    plt.suptitle(title, fontsize=14)
+    plt.xticks(rotation=90)
+    plt.show()
+
+# %%
+sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                           "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
+                           "phone_locations_", "phone_messages", "phone_screen_"]  # , "phone_speech_"]
+
+# sensors_features_groups = ["phone_", "empatica_", "demo_"]
+
+# %%
+# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
+sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
+xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)
+
+# %% [markdown]
+# ### Visualize sensor groups' F1 and recall scores
+
+# %%
+print(sensor_groups_importance_scores)
+plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+                                                    title="Sequential addition of sensors and their F1 and recall scores")
+
+# %%
+# Take the most important feature group and investigate it feature-by-feature
+best_sensor_group = sensor_groups_importance_scores[0][0]  # take the highest rated sensor group
+best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
+
+# best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
+
+# xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(best_sensor_features_scores)
+
+# %% [markdown]
+# ### Visualize the best sensor's F1 and recall scores
+# print(best_sensor_features_scores)
+# plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+#                                                     title="Sequential addition of the best sensor's features with F1 and recall scores")
+
+# %%
+# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature.
+# It also saves the sequence of scores for all sensors' features in an Excel file.
+seq_columns = ["sensor_name", "feature_sequence", "recall", "f1_score", "recall_std", "f1_std"]
+feature_sequence = pd.DataFrame(columns=seq_columns)
+for i, sensor_group in enumerate(sensor_groups_importance_scores):
+
+    current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
+    current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
+    xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(current_sensor_features_scores)
+    feature_sequence = pd.concat([feature_sequence, pd.DataFrame({"sensor_name": sensor_group[0], "feature_sequence": [xs], "recall": [y_recall],
+                                                                  "f1_score": [y_fscore], "recall_std": [recall_std], "f1_std": [fscore_std]})])
+
+    plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
+                                                        title=f"Sequential addition of features for {sensor_group[0]} and their F1 and recall scores")
+
+feature_sequence.to_excel("all_sensors_sequential_addition_scores.xlsx", index=False)
+
+# %%
+# TODO: add a method that reads the scores from the Excel file written above and
+# selects only the features whose score is at most thresh (%) below the maximum
+# score (best done for recall, possibly also for F1). The method should take the
+# threshold as an additional parameter.
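# %%
# A first sketch of the TODO above, assuming the Excel layout written by this
# script (one row per sensor; the "feature_sequence" and score columns hold
# lists, which pd.read_excel returns as strings). The helper name and the
# default threshold are illustrative, not part of the pipeline yet.
import ast

def select_features_within_threshold(excel_path, thresh=0.05, score_col="recall"):
    """Per sensor, keep the shortest feature prefix whose score is within
    thresh (relative) of that sensor's maximum score."""
    scores_df = pd.read_excel(excel_path)
    selected = {}
    for _, row in scores_df.iterrows():
        # list-valued cells come back from Excel as strings, so parse if needed
        features = ast.literal_eval(row["feature_sequence"]) if isinstance(row["feature_sequence"], str) else list(row["feature_sequence"])
        scores = ast.literal_eval(row[score_col]) if isinstance(row[score_col], str) else list(row[score_col])
        cutoff = max(scores) * (1 - thresh)
        n_keep = next(i for i, s in enumerate(scores) if s >= cutoff) + 1  # first index reaching the cutoff
        selected[row["sensor_name"]] = features[:n_keep]
    return selected

# select_features_within_threshold("all_sensors_sequential_addition_scores.xlsx", thresh=0.05)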
diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py
new file mode 100644
index 0000000..eeaa9b3
--- /dev/null
+++ b/exploration/ml_pipeline.py
@@ -0,0 +1,49 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.13.0
+#   kernelspec:
+#     display_name: straw2analysis
+#     language: python
+#     name: straw2analysis
+# ---
+
+# %%
+import sys, os
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+nb_dir = os.path.split(os.getcwd())[0]
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)
+
+from machine_learning.cross_validation import CrossValidation
+from machine_learning.preprocessing import Preprocessing
+
+# %%
+df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+df.set_index(index_columns, inplace=True)
+
+cv = CrossValidation(data=df, cv_method="logo")
+
+categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
+interval_feature_list, other_feature_list = [], []
+
+print(df.columns.tolist())
+
+for split in cv.get_splits():
+    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
+    pre = Preprocessing(train_X, train_y, test_X, test_y)
+    pre.one_hot_encode_train_and_test_sets(categorical_columns)
+    train_X, train_y, test_X, test_y = pre.get_train_test_sets()
+    break
+
+# %%
diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
index 3deae61..04087f0 100644
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@@ -46,9 +46,9 @@ import machine_learning.helper
 
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
-cv_method_str = '5kfold' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
-undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method)
+undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
 
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
@@ -72,20 +72,27 @@ model_input['target'].value_counts()
 
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # UnderSampling
 if undersampling:
-    model_input_new = pd.DataFrame(columns=model_input.columns)
-    for pid in model_input["pid"].unique():
-        stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
-        no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
-        if (len(stress) == 0):
-            continue
-        if (len(no_stress) == 0):
-            continue
-        model_input_new = pd.concat([model_input_new, stress], axis=0)
+    no_stress = model_input[model_input['target'] == 0]
+    stress = model_input[model_input['target'] == 1]
 
-        no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
-        # In case there are more stress samples than no_stress, take all instances of no_stress.
-        model_input_new = pd.concat([model_input_new, no_stress], axis=0)
-    model_input = model_input_new
+    no_stress = no_stress.sample(n=len(stress))
+    model_input = pd.concat([stress, no_stress], axis=0)
+
+# model_input_new = pd.DataFrame(columns=model_input.columns)
+# for pid in model_input["pid"].unique():
+#     stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
+#     no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
+#     if (len(stress) == 0):
+#         continue
+#     if (len(no_stress) == 0):
+#         continue
+#     model_input_new = pd.concat([model_input_new, stress], axis=0)
+
+#     no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
+#     # In case there are more stress samples than no_stress, take all instances of no_stress.
+#     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
+# model_input = model_input_new
 
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
@@ -170,7 +177,7 @@ final_scores = machine_learning.helper.run_all_classification_models(imputer.fit
 # %%
 final_scores.index.name = "metric"
 final_scores = final_scores.set_index(["method", final_scores.index])
-final_scores.to_csv("../presentation/event_stressful_detection_5fold.csv")
+final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
 
 # %% [markdown]
 # ### Logistic Regression
@@ -453,4 +460,3 @@ print("F1", np.mean(xgb_classifier_scores['test_f1']))
 
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
-# %% jupyter={"outputs_hidden": false, "source_hidden": false}
diff --git a/exploration/tree_high_dpi.png b/exploration/tree_high_dpi.png
new file mode 100644
index 0000000..319c498
Binary files /dev/null and b/exploration/tree_high_dpi.png differ
diff --git a/machine_learning/cross_validation.py b/machine_learning/cross_validation.py
new file mode 100644
index 0000000..e030a8f
--- /dev/null
+++ b/machine_learning/cross_validation.py
@@ -0,0 +1,121 @@
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
+
+class CrossValidation():
+    """This class implements the creation of cross-validation splits.
+ """ + + + def __init__(self, data=None, cv_method='logo'): + """This method initializes the cv_method argument and optionally prepares the data if supplied. + + Args: + cv_method (str, optional): String of cross validation method; options are 'logo', 'half_logo' and '5kfold'. + Defaults to 'logo'. + data (DataFrame, optional): Pandas DataFrame with target, pid columns and other features as columns. + Defaults to None. + """ + + self.initialize_cv_method(cv_method) + + if data is not None: + self.prepare_data(data) + + + def prepare_data(self, data): + """Prepares the data ready to be passed to the cross-validation algorithm, depending on the cv_method type. + For example, if cv_method is set to 'half_logo' new columns 'pid_index', 'pid_count', 'pid_half' + are added and used in the process. + + Args: + data (_type_): Pandas DataFrame with target, pid columns and other features as columns. + """ + self.data = data + if self.cv_method == "logo": + data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"] + + elif self.cv_method == "half_logo": + data['pid_index'] = data.groupby('pid').cumcount() + data['pid_count'] = data.groupby('pid')['pid'].transform('count') + + data["pid_index"] = (data['pid_index'] / data['pid_count'] + 1).round() + data["pid_half"] = data["pid"] + "_" + data["pid_index"].astype(int).astype(str) + + data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"] + + elif self.cv_method == "5kfold": + data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"] + + self.X, self.y, self.groups = data_X, data_y, data_groups + + + def initialize_cv_method(self, cv_method): + """Initializes the given cv_method type. Depending on the type, the appropriate splitting technique is used. + + Args: + cv_method (str): The type of cross-validation method to use; options are 'logo', 'half_logo' and '5kfold'. + + Raises: + ValueError: If cv_method is not in the list of available methods, it raises an ValueError. + """ + + self.cv_method = cv_method + if self.cv_method not in ["logo", "half_logo", "5kfold"]: + raise ValueError("Invalid cv_method input. Correct values are: 'logo', 'half_logo', '5kfold'") + + if self.cv_method in ["logo", "half_logo"]: + self.cv = LeaveOneGroupOut() + elif self.cv_method == "5kfold": + self.cv = StratifiedKFold(n_splits=5, shuffle=True) + + + def get_splits(self): + """Returns a generator object containing the cross-validation splits. + + Raises: + ValueError: Raises ValueError if no data has been set. + + """ + if not self.data.empty: + return self.cv.split(self.X, self.y, self.groups) + else: + raise ValueError("No data has been set. Use 'prepare_data(data)' method to set the data.") + + + def get_data(self): + """data getter + + Returns: + Pandas DataFrame: Returns the data from the class instance. + """ + return self.data + + + def get_x_y_groups(self): + """X, y, and groups data getter + + Returns: + Pandas DataFrame: Returns the data from the class instance. + """ + return self.X, self.y, self.groups + + + def get_train_test_sets(self, split): + """Gets train and test sets, dependent on the split parameter. This method can be used in a specific splitting context, + where by index we can get train and test sets. + + Args: + split (tuple of indices): It represents one iteration of the split generator (see get_splits method). 
diff --git a/machine_learning/feature_selection.py b/machine_learning/feature_selection.py
new file mode 100644
index 0000000..690712f
--- /dev/null
+++ b/machine_learning/feature_selection.py
@@ -0,0 +1,45 @@
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.feature_selection import SequentialFeatureSelector
+from sklearn.naive_bayes import GaussianNB
+
+
+""" Feature selection pipeline: methods that can be used in the wrapper method alongside other wrapper contents (hyperparameter tuning etc.).
+
+(1) Establish methods for each of the steps in the feature selection protocol:
+    (a) feature selection inside specific sensors (sklearn method): returns the most important features from all sensors
+    (b) feature selection between "tuned" sensors: returns filtered sensors, containing the most important features returned by (a)
+(2) Ensure that the above methods are given only a part of the data and use appropriate random seeds - to later simulate the use case in production.
+(3) Implement a method which gives a graphical exploration of the (1)(a) and (b) steps of the feature selection.
+(4) Prepare a core method that can be fit into a wrapper (see sklearn wrapper methods) and integrates the methods from (1).
+
+"""
+
+class FeatureSelection:
+
+    def __init__(self, X_train, X_test, y_train, y_test):  # TODO: what about leave-one-subject-out CV?
+        self.X_train = X_train
+        self.X_test = X_test
+        self.y_train = y_train
+        self.y_test = y_test
+
+    def within_sensors_feature_selection(self, scoring=None, tol=0.02):
+        # Greedy forward selection on the training set; stops once the score
+        # improvement drops below tol.
+        nb = GaussianNB()
+        sfs = SequentialFeatureSelector(nb, n_features_to_select='auto', tol=tol, scoring=scoring)  # Can set n_features_to_select to an absolute value -> then remove the tol parameter.
+        sfs.fit(self.X_train, self.y_train)
+        features_list = list(self.X_train.columns[sfs.get_support()])
+
+        return features_list
+
+    def between_sensors_feature_selection(self):
+        pass
+
+    def visualize_feature_selection_process(self):
+        pass
+
+    def execute_feature_selection_step(self):
+        pass
\ No newline at end of file
diff --git a/machine_learning/preprocessing.py b/machine_learning/preprocessing.py
new file mode 100644
index 0000000..a11558c
--- /dev/null
+++ b/machine_learning/preprocessing.py
@@ -0,0 +1,126 @@
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+class Preprocessing:
+    """This class provides preprocessing methods which can be used in the context of an individual CV iteration or, simply, on the whole data.
+    It is blind to the test data - e.g., it imputes the test data with the train-data mean -
+    which means it needs access to information about the data split.
+    """
+
+
+    def __init__(self, train_X, train_y, test_X, test_y):
+        self.train_X = train_X
+        self.train_y = train_y
+        self.test_X = test_X
+        self.test_y = test_y
+
+
+    def one_hot_encoder(self, categorical_features, numerical_features, mode):
+        """
+        This method implements one-hot encoding. It takes two data sets,
+        one with categorical features and one with numerical features, and a mode parameter.
+        First it uses fillna() to fill any missing values present in the
+        categorical data set with the mode value. Then it uses apply() to
+        convert each column of the data set into the category data type, which is then
+        transformed using the pd.get_dummies() function. Finally it concatenates the
Finally it concatenates the + numerical data set and the transformed categorical data set using pd.concat() and + returns it. + + Args: + categorical_features (DataFrame): DataFrame including only categorical columns. + numerical_features (_type_): DataFrame including only numerical columns. + mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results + + Returns: + DataFrame: Hot-One Encoded DataFrame. + """ + # Fill train set with mode + categorical_features = categorical_features.fillna(mode) + + # one-hot encoding + categorical_features = categorical_features.apply(lambda col: col.astype("category")) + if not categorical_features.empty: + categorical_features = pd.get_dummies(categorical_features) + + return pd.concat([numerical_features, categorical_features], axis=1) + + + def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]): + """ + This code is used to transform categorical data into numerical representations. + It first identifies the categorical columns, then copies them and saves them as + a new dataset. The missing data is filled with the mode (most frequent value in + the respective column). This new dataset is then subjected to one-hot encoding, + which is a process of transforming categorical data into machine interpretable + numerical form by converting categories into multiple binary outcome variables. + These encoded values are then concatenated to the numerical features prior to + being returned as the final dataset. + + Args: + categorical_columns (list, optional): List of categorical columns in the dataset. + Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"]. + + """ + categorical_columns = [col for col in self.train_X.columns if col in categorical_columns] + + # For train set + + train_X_categorical_features = self.train_X[categorical_columns].copy() + train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1) + mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0] + + self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features) + + # For test set + + test_X_categorical_features = self.test_X[categorical_columns].copy() + test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1) + + self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features) + + + def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"): + + # TODO: TESTING + + if groupby: + # Interval numerical features # TODO: How can we get and assign appropriate groupby means and assign them to correct columns? + + # VVVVV ...... IN PROGRES ...... 
+            means = self.train_X.groupby(groupby_feature)[interval_feature_list].transform("mean")
+            self.train_X[interval_feature_list] = self.train_X[interval_feature_list].fillna(means)
+
+            # The test set may contain unseen participants, so it is filled with its own group means
+            self.test_X[interval_feature_list] = self.test_X[interval_feature_list].fillna(
+                self.test_X.groupby(groupby_feature)[interval_feature_list].transform("mean"))
+
+            # Other features
+            medians = self.train_X.groupby(groupby_feature)[other_feature_list].transform("median")
+            self.train_X[other_feature_list] = self.train_X[other_feature_list].fillna(medians)
+            self.test_X[other_feature_list] = self.test_X[other_feature_list].fillna(
+                self.test_X.groupby(groupby_feature)[other_feature_list].transform("median"))
+
+        else:
+            # Interval numerical features
+            means = self.train_X[interval_feature_list].mean()
+            self.train_X[interval_feature_list] = self.train_X[interval_feature_list].fillna(means)
+            self.test_X[interval_feature_list] = self.test_X[interval_feature_list].fillna(means)
+
+            # Other features
+            medians = self.train_X[other_feature_list].median()
+            self.train_X[other_feature_list] = self.train_X[other_feature_list].fillna(medians)
+            self.test_X[other_feature_list] = self.test_X[other_feature_list].fillna(medians)
+
+
+    def get_train_test_sets(self):
+        """Train and test sets getter
+
+        Returns:
+            tuple of Pandas DataFrames: Gets the train and test sets in the traditional sklearn format.
+        """
+        return self.train_X, self.train_y, self.test_X, self.test_y
+
+
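A short usage sketch of Preprocessing inside one CV split (mirroring exploration/ml_pipeline.py; the interval/other feature names are placeholders, and the grouped variant needs the 'pid' column still present in train_X and test_X):

    pre = Preprocessing(train_X, train_y, test_X, test_y)
    pre.one_hot_encode_train_and_test_sets(["gender", "startlanguage"])
    pre.imputer(interval_feature_list=["phone_light_mean"], other_feature_list=["phone_calls_count"], groupby_feature="pid")
    train_X, train_y, test_X, test_y = pre.get_train_test_sets()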
diff --git a/presentation/event_stressful_detection_5fold.csv b/presentation/event_stressful_detection_5fold.csv
new file mode 100644
index 0000000..d005efe
--- /dev/null
+++ b/presentation/event_stressful_detection_5fold.csv
@@ -0,0 +1,29 @@
+method,metric,max,mean
+Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
+Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
+Dummy,test_recall,0.0,0.0
+Dummy,test_f1,0.0,0.0
+logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
+logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
+logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
+logistic_reg,test_f1,0.3909774436090226,0.318943511424051
+svc,test_accuracy,0.8557046979865772,0.8548446932649828
+svc,test_average_precision,0.44514416839823046,0.4068200938341621
+svc,test_recall,0.0,0.0
+svc,test_f1,0.0,0.0
+gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
+gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
+gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
+gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
+stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
+stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
+stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
+stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
+random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
+random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
+random_forest,test_recall,0.4069767441860465,0.35356856455493185
+random_forest,test_f1,0.5691056910569107,0.5078402513053142
+xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
+xgboost,test_average_precision,0.7366643049075349,0.698622165966308
+xgboost,test_recall,0.5287356321839081,0.44346431435445066
+xgboost,test_f1,0.638888888888889,0.5633957169928393
diff --git a/presentation/event_stressful_detection_logo.csv b/presentation/event_stressful_detection_logo.csv
new file mode 100644
index 0000000..6874e7f
--- /dev/null
+++ b/presentation/event_stressful_detection_logo.csv
@@ -0,0 +1,29 @@
+method,metric,max,mean
+Dummy,test_accuracy,1.0,0.8524114578096439
+Dummy,test_average_precision,0.7,0.14758854219035614
+Dummy,test_recall,0.0,0.0
+Dummy,test_f1,0.0,0.0
+logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
+logistic_reg,test_average_precision,1.0,0.44605167668563583
+logistic_reg,test_recall,1.0,0.25353566685532386
+logistic_reg,test_f1,0.823529411764706,0.27951926390778625
+svc,test_accuracy,1.0,0.8524114578096439
+svc,test_average_precision,0.9612401707068228,0.44179454944271934
+svc,test_recall,0.0,0.0
+svc,test_f1,0.0,0.0
+gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
+gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
+gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
+gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
+stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
+stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
+stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
+stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
+random_forest,test_accuracy,1.0,0.8722158105763481
+random_forest,test_average_precision,1.0,0.49817066323226833
+random_forest,test_recall,1.0,0.18161263127840668
+random_forest,test_f1,1.0,0.2508096532365307
+xgboost,test_accuracy,1.0,0.8812627400277729
+xgboost,test_average_precision,1.0,0.5505695112208401
+xgboost,test_recall,1.0,0.2896161238315027
+xgboost,test_f1,0.9411764705882353,0.36887408735855665