Merge remote-tracking branch 'origin/ml_pipeline' into ml_pipeline

2023-03-01 13:37:18 +01:00 · 2023-03-01 13:37:18 +01:00 · 633d029dc2
parent ec7cd09a09 8a532fa95a
commit 633d029dc2
11 changed files with 1068 additions and 17 deletions
--- a/exploration/all_sensors_sequential_addition_scores.xlsx
+++ b/exploration/all_sensors_sequential_addition_scores.xlsx
--- a/exploration/expl_features_analysis.py
+++ b/exploration/expl_features_analysis.py
@ -0,0 +1,318 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # %matplotlib inline
 import os, sys, math
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 from sklearn.tree import DecisionTreeClassifier
 from sklearn import tree
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 def calc_entropy(column):
    """
    Calculate entropy given a pandas series, list, or numpy array.
    """
    # Compute the counts of each unique value in the column
    counts = np.bincount(column)
    # Divide by the total column length to get a probability
    probabilities = counts / len(column)
    # Initialize the entropy to 0
    entropy = 0
    # Loop through the probabilities, and add each one to the total entropy
    for prob in probabilities:
        if prob > 0:
            # use log from math and set base to 2
            entropy += prob * math.log(prob, 2)
    return -entropy
 def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, column to split on, and target
    """
    # Calculate the original entropy
    original_entropy = calc_entropy(data[target_name])
    #Find the unique values in the column
    values = data[split_name].unique()
    # Make two subsets of the data, based on the unique values
    left_split = data[data[split_name] == values[0]]
    right_split = data[data[split_name] == values[1]]
    # Loop through the splits and calculate the subset entropies
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0]) 
        to_subtract += prob * calc_entropy(subset[target_name])
    # Return information gain
    return original_entropy - to_subtract
 def get_information_gains(data, target_name):
  #Intialize an empty dictionary for information gains
  information_gains = {}
  #Iterate through each column name in our list
  for col in list(data.columns):
    #Find the information gain for the column
    information_gain = calc_information_gain(data, col, target_name)
    #Add the information gain to our dictionary using the column name as the ekey                                         
    information_gains[col] = information_gain
  #Return the key with the highest value                                          
  #return max(information_gains, key=information_gains.get)
  return information_gains
 def n_features_with_highest_info_gain(info_gain_dict, n=None):
    """
    Get n-features that have highest information gain
    """
    if n is None:
        n = len(info_gain_dict)
    import heapq
    n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
    return {feature[0]: feature[1] for feature in n_largest}
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
 categorical_feature_colnames = ["gender", "startlanguage"]
 additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
 categorical_feature_colnames += additional_categorical_features
 categorical_features = model_input[categorical_feature_colnames].copy()
 mode_categorical_features = categorical_features.mode().iloc[0]
 # fillna with mode
 categorical_features = categorical_features.fillna(mode_categorical_features)
 # one-hot encoding
 categorical_features = categorical_features.apply(lambda col: col.astype("category"))
 if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
 numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
 model_input = pd.concat([numerical_features, categorical_features], axis=1)
 # Binarizacija targeta
 bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True) 
 print(model_input['target'].value_counts(), edges)
 # %%
 info_gains = get_information_gains(model_input, 'target')
 # %% [markdown]
 # Present the feature importance results
 # %%
 print("Total columns:", len(info_gains))
 print(pd.Series(info_gains).value_counts())
 n_features_with_highest_info_gain(info_gains, n=189)
 # %%
 def compute_impurity(feature, impurity_criterion):
    """
    This function calculates impurity of a feature.
    Supported impurity criteria: 'entropy', 'gini'
    input: feature (this needs to be a Pandas series)
    output: feature impurity
    """
    probs = feature.value_counts(normalize=True)
    if impurity_criterion == 'entropy':
        impurity = -1 * np.sum(np.log2(probs) * probs)
    elif impurity_criterion == 'gini':
        impurity = 1 - np.sum(np.square(probs))
    else:
        raise ValueError('Unknown impurity criterion')
    return impurity
 def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
    """
    This function calculates information gain for splitting on 
    a particular descriptive feature for a given dataset
    and a given impurity criteria.
    Supported split criterion: 'entropy', 'gini'
    """
    if print_flag:
        print('target feature:', target)
        print('descriptive_feature:', descriptive_feature)
        print('split criterion:', split_criterion)
    target_entropy = compute_impurity(df[target], split_criterion)
    # we define two lists below:
    # entropy_list to store the entropy of each partition
    # weight_list to store the relative number of observations in each partition
    entropy_list = list()
    weight_list = list()
    # loop over each level of the descriptive feature
    # to partition the dataset with respect to that level
    # and compute the entropy and the weight of the level's partition
    for level in df[descriptive_feature].unique():
        df_feature_level = df[df[descriptive_feature] == level]
        entropy_level = compute_impurity(df_feature_level[target], split_criterion)
        entropy_list.append(round(entropy_level, 3))
        weight_level = len(df_feature_level) / len(df)
        weight_list.append(round(weight_level, 3))
    # print('impurity of partitions:', entropy_list)
    # print('weights of partitions:', weight_list)
    feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
    information_gain = target_entropy - feature_remaining_impurity
    if print_flag:
        print('impurity of partitions:', entropy_list)
        print('weights of partitions:', weight_list)
        print('remaining impurity:', feature_remaining_impurity)
        print('information gain:', information_gain)
        print('====================')
    return information_gain
 def calc_information_gain_2(data, split_name, target_name, split_criterion):
    """
    Calculate information gain given a data set, column to split on, and target
    """
    # Calculate the original impurity
    original_impurity = compute_impurity(data[target_name], split_criterion)
    #Find the unique values in the column
    values = data[split_name].unique()
    # Make two subsets of the data, based on the unique values
    left_split = data[data[split_name] == values[0]]
    right_split = data[data[split_name] == values[1]]
    # Loop through the splits and calculate the subset impurities
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0]) 
        to_subtract += prob * compute_impurity(subset[target_name], split_criterion) 
    # Return information gain
    return original_impurity - to_subtract
 def get_information_gains_2(data, target_name, split_criterion):
  #Intialize an empty dictionary for information gains
  information_gains = {}
  #Iterate through each column name in our list
  for feature in list(data.columns):
    #Find the information gain for the column
    information_gain = calc_information_gain_2(model_input, target_name, feature, split_criterion)
    #Add the information gain to our dictionary using the column name as the ekey                                         
    information_gains[feature] = information_gain
  #Return the key with the highest value                                          
  #return max(information_gains, key=information_gains.get)
  return information_gains
 # %% [markdown]
 # Present the feature importance results from other methods
 # %%
 split_criterion = 'entropy'
 print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
 information_gains = get_information_gains_2(model_input, 'target', split_criterion)
 print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
 n_features_with_highest_info_gain(information_gains)
 # %%
 # Present the feature importance using a tree (that uses gini imputity measure)
 split_criterion = 'entropy'
 print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
 X, y  = model_input.drop(columns=['target', 'pid']), model_input['target']
 imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 X = imputer.fit_transform(X)
 X, _, y, _ =  train_test_split(X, y, random_state=19, test_size=0.25)
 clf = DecisionTreeClassifier(criterion=split_criterion)
 clf.fit(X, y)
 feat_importance = clf.tree_.compute_feature_importances(normalize=False)
 print("feat importance = ", feat_importance)
 print("shape", feat_importance.shape)
 tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
 info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
 info_gains_dict[info_gains_dict > 0]
 # %%
 # Binarizacija vrednosti tree Information Gain-a
 bins = [-0.1, 0, 0.1] # bins for target's correlations with features
 cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True) 
 plt.title(f"Tree information gains by value ({split_criterion})")
 cut_info_gains.value_counts().plot(kind='bar', color='purple')
 plt.xticks(rotation=45, ha='right')
 print(cut_info_gains.value_counts())
 pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))
 # %%
 # Plot feature importance tree graph 
 plt.figure(figsize=(12,12))
 tree.plot_tree(clf,
               feature_names = list(model_input.drop(columns=['target', 'pid']).columns), 
               class_names=True,
               filled = True, fontsize=5, max_depth=3)
 plt.savefig('tree_high_dpi', dpi=800)
 # %% [markdown]
 # Present the feature importance by correlation with target
 corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
 # corrs.sort_values(ascending=False)
 # Binarizacija vrednosti korelacij
 bins = [0, 0.1, 0.2, 0.3] # bins for target's correlations with features
 cut_corrs = pd.cut(corrs, bins=bins, labels=['very week (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True) 
 plt.title("Target's correlations with features")
 cut_corrs.value_counts().plot(kind='bar')
 plt.xticks(rotation=45, ha='right')
 print(cut_corrs.value_counts())
 print(corrs[corrs > 0.1]) # or corrs < -0.1])
 # %%
 # %%
--- a/exploration/expl_features_groups_analysis.py
+++ b/exploration/expl_features_groups_analysis.py
@ -0,0 +1,328 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # %matplotlib inline
 import os, sys, math
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 from sklearn.impute import SimpleImputer
 from sklearn.naive_bayes import GaussianNB  
 from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
 from sklearn import metrics 
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
 categorical_feature_colnames = ["gender", "startlanguage"]
 additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
 categorical_feature_colnames += additional_categorical_features
 categorical_features = model_input[categorical_feature_colnames].copy()
 mode_categorical_features = categorical_features.mode().iloc[0]
 # fillna with mode
 categorical_features = categorical_features.fillna(mode_categorical_features)
 # one-hot encoding
 categorical_features = categorical_features.apply(lambda col: col.astype("category"))
 if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
 numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
 model_input = pd.concat([numerical_features, categorical_features], axis=1)
 # Binarizacija targeta
 bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True) 
 print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
 print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
 # %%
 # Add prefix to demographical features
 demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile', 
                'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
 new_names = [(col, "demo_"+col) for col in demo_features]
 model_input.rename(columns=dict(new_names), inplace=True)
 demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio', 
                'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M', 
                'demo_startlanguage_nl', 'demo_startlanguage_sl']
 # %%
 # Get phone and non-phone columns
 import warnings
 def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
    """
    This function makes predictions with sensor groups. 
    It takes in a dataframe (df), a list of group substrings (groups_substrings) 
    and an optional parameter include_group (default is True). 
    It creates a list of columns in the dataframe that contain the group substrings, 
    while excluding the 'pid' and 'target' columns. It then splits the data into training 
    and test sets, using a test size of 0.25 for the first split and 0.2 for the second split. 
    A SimpleImputer is used to fill in missing values with median values. 
    A LogisticRegression is then used to fit the training set and make predictions 
    on the test set. Finally, accuracy, precision, recall and F1 scores are printed 
    for each substring group depending on whether or not include_group 
    is set to True or False.
    """
    best_sensor = None
    best_recall_score, best_f1_score = None, None
    for fgroup_substr in groups_substrings:
        if fgroup_substr is None:
            feature_group_cols = list(df.columns)
            feature_group_cols.remove("pid")
            feature_group_cols.remove("target")
        else: 
            if include_group:
                feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
            else:    
                feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
        X, y  = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
        X, _, y, _ =  train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        nb = GaussianNB()
        model_cv = cross_validate(
            nb,
            X=imputer.fit_transform(X),
            y=y,
            cv=StratifiedKFold(n_splits=5, shuffle=True),
            n_jobs=-1,
            scoring=('accuracy', 'precision', 'recall', 'f1')
        )
        X_train, X_test, y_train, y_test =  train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
        if print_flag:
            if include_group:
                print("\nPrediction with", fgroup_substr)
            else:
                print("\nPrediction without", fgroup_substr)
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
            acc = np.mean(model_cv['test_accuracy'])
            acc_std = np.std(model_cv['test_accuracy'])
            prec = np.mean(model_cv['test_precision'])
            prec_std = np.std(model_cv['test_precision'])
            rec = np.mean(model_cv['test_recall'])
            rec_std = np.std(model_cv['test_recall'])
            f1 = np.mean(model_cv['test_f1'])
            f1_std = np.std(model_cv['test_f1'])
        if print_flag:
            print("************************************************")
            print(f"Accuracy: {acc} (sd={acc_std})")
            print(f"Precison: {prec} (sd={prec_std})")
            print(f"Recall: {rec} (sd={rec_std})")
            print(f"F1: {f1} (sd={f1_std})\n")
        if (not best_recall_score and not best_f1_score) or (rec > best_recall_score):
            best_sensor = fgroup_substr
            best_recall_score, best_f1_score = rec, f1
            best_recall_score_std, best_f1_score_std = rec_std, f1_std
    return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std 
 # %% [markdown]
 # ### sensor big feature groups (phone, empatica, demographical)
 big_groups_substr = ["phone_", "empatica_", "demo_"]
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)
 # %% [markdown]
 # ### Empatica sezor groups
 # make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
 # e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
 # make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
 # %% [markdown]
 # ### Phone sensor groups
 # make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
 # phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", 
 #                 "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
 # make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
 # %%
 # Write all the sensors  (phone, empatica), seperate other (demographical) cols also
 sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
 # %%
 def find_sensor_group_features_importance(model_input, sensor_groups_strings):
    """
    This function finds the importance of sensor groups for a given model input. It takes two parameters: 
    model_input and sensor_groups_strings. It creates an empty list called sensor_importance_scores, 
    which will be populated with tuples containing the best sensor, its recall score, and its F1 score. 
    It then makes a copy of the model input and the sensor groups strings. It then loops through each group 
    in the list of strings, creating a list of important columns from the sensor importance scores list. 
    It then calls make_predictions_with_sensor_groups to determine the best sensor, its recall score, 
    and its F1 score. These values are added to the sensor importance scores list as a tuple. The function 
    then removes that best sensor from the list of strings before looping again until all groups have been evaluated. 
    Finally, it returns the populated list of tuples containing all sensors' scores. 
    """
    sensor_importance_scores = []
    model_input = model_input.copy()
    sensor_groups_strings = sensor_groups_strings.copy()
    groups_len = len(sensor_groups_strings)
    for i in range(groups_len):
        important_cols = [col[0] for col in sensor_importance_scores]
        with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
        best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std  = \
            make_predictions_with_sensor_groups(model_input, 
            groups_substrings=sensor_groups_strings, include_group=True, 
            with_cols=with_cols)
        sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std ))
        print(f"\nAdded sensor: {best_sensor}\n")
        sensor_groups_strings.remove(best_sensor)
    return sensor_importance_scores
 # %%
 # Method for sorting list of tuples into 3 lists
 def sort_tuples_to_lists(list_of_tuples):
    """
    sort_tuples_to_lists(list_of_tuples) is a method that takes in a list of tuples as an argument 
    and sorts them into three separate lists. The first list, xs, contains the first element 
    of each tuple. The second list, yrecall, contains the second element of each tuple rounded 
    to 4 decimal places. The third list, y_fscore, contains the third element of each tuple 
    rounded to 4 decimal places. The method returns all three lists. 
    """
    xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
    for a_tuple in list_of_tuples:
        xs.append(a_tuple[0])
        y_recall.append(round(a_tuple[1], 4))
        y_fscore.append(round(a_tuple[2], 4))
        recall_std.append(round(a_tuple[3], 4))
        fscore_std.append(round(a_tuple[4], 4))
    return xs, y_recall, y_fscore, recall_std, fscore_std
 def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
                                                        title="Sequential addition of features and its F1, and recall scores"):
    """
    This function plots the sequential progress of feature addition scores using two subplots. 
    The first subplot is for recall scores and the second subplot is for F1-scores. 
    The parameters xs, yrecall, and yfscore are used to plot the data on the respective axes. 
    The title of the plot can be specified by the user using the parameter title. 
    The maximum recall index and maximum F1-score index are also plotted using a black dot. 
    The figure size is set to 18.5 inches in width and 10.5 inches in height, 
    and the x-axis labels are rotated by 90 degrees. Finally, the plot is displayed 
    using plt.show().
    """
    fig, ax = plt.subplots(nrows=2, sharex=True)
    ax[0].plot(xs, np.array(y_recall)+np.array(recall_std), linestyle=":", color='m') # Upper SD
    ax[0].plot(xs, y_recall, color='red')
    ax[0].plot(xs, np.array(y_recall)-np.array(recall_std), linestyle=":", color='m') # Lower SD
    mrec_indx = np.argmax(y_recall)
    ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
    ax[0].legend(["Upper std", "Mean Recall", "Lower std"])
    ax[1].plot(xs, np.array(y_fscore)+np.array(fscore_std), linestyle=":", color='c') # Upper SD
    ax[1].plot(xs, y_fscore)
    ax[1].plot(xs, np.array(y_fscore)-np.array(fscore_std), linestyle=":", color='c') # Lower SD
    mfscore_indx = np.argmax(y_fscore)
    ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
    ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])
    fig.set_size_inches(18.5, 10.5)
    ax[0].title.set_text('Recall scores')
    ax[1].title.set_text('F1-scores')
    plt.suptitle(title, fontsize=14)
    plt.xticks(rotation=90)
    plt.show()
 # %%
 sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
 # sensors_features_groups = ["phone_", "empatica_", "demo_"]
 # %%
 # sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
 sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
 xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)
 # %% [markdown]
 # ### Visualize sensors groups F1 and recall scores
 print(sensor_groups_importance_scores)
 plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
                                                    title="Sequential addition of sensors and its F1, and recall scores")
 # %%
 # Take the most important feature group and investigate it feature-by-feature
 best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
 best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
 # best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
 # xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(best_sensor_features_scores)
 # %% [markdown]
 # ### Visualize best sensor's F1 and recall scores
 # print(best_sensor_features_scores)
 # plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
 #                                                     title="Best sensor addition it's features with F1 and recall scores")
 # %%
 # This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
 # It also saves the sequence of scores for all sensors' features in excel file
 seq_columns = ["sensor_name", "feature_sequence", "recall", "f1_score"]
 feature_sequence = pd.DataFrame(columns=seq_columns)
 for i, sensor_group in enumerate(sensor_groups_importance_scores):
    current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
    current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
    xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(current_sensor_features_scores)
    feature_sequence = pd.concat([feature_sequence, pd.DataFrame({"sensor_name":sensor_group[0], "feature_sequence": [xs], "recall": [y_recall], 
                                                             "f1_score": [y_fscore], "recall_std": [recall_std], "f1_std": [fscore_std]})])
    plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std, 
    title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores")
 feature_sequence.to_excel("all_sensors_sequential_addition_scores.xlsx", index=False)
 # %%
 # TODO: method that reads data from the excel file, specified above, and then the method,
 # that selects only features that are max a thresh[%] below the max value (best for recall
 # possibly for f1). This method should additionally take threshold parameter.
 # %%
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@ -0,0 +1,49 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% 
 import sys, os
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 from machine_learning.cross_validation import CrossValidation
 from machine_learning.preprocessing import Preprocessing
 # %% 
 df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 df.set_index(index_columns, inplace=True)
 cv = CrossValidation(data=df, cv_method="logo")
 categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
 interval_feature_list, other_feature_list = [], []
 print(df.columns.tolist())
 for split in cv.get_splits():
    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
    pre = Preprocessing(train_X, train_y, test_X, test_y)
    pre.one_hot_encode_train_and_test_sets(categorical_columns)
    train_X, train_y, test_X, test_y = pre.get_train_test_sets()
    break
 # %%
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@ -46,9 +46,9 @@ import machine_learning.helper
 #
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
-cv_method_str = '5kfold' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
+cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
-undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method)
+undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
@ -72,20 +72,27 @@ model_input['target'].value_counts()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # UnderSampling
 if undersampling:
-  model_input_new = pd.DataFrame(columns=model_input.columns)
+    no_stress = model_input[model_input['target'] == 0]
-  for pid in model_input["pid"].unique():
+    stress = model_input[model_input['target'] == 1]
    stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
    no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
    if (len(stress) == 0):
      continue
    if (len(no_stress) == 0):
      continue
    model_input_new = pd.concat([model_input_new, stress], axis=0)
-    no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
+    no_stress = no_stress.sample(n=len(stress))
-    # In case there are more stress samples than no_stress, take all instances of no_stress.
+    model_input = pd.concat([stress,no_stress], axis=0)
-    model_input_new = pd.concat([model_input_new, no_stress], axis=0)
+
-  model_input = model_input_new
+#   model_input_new = pd.DataFrame(columns=model_input.columns)
 #   for pid in model_input["pid"].unique():
 #     stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
 #     no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
 #     if (len(stress) == 0):
 #       continue
 #     if (len(no_stress) == 0):
 #       continue
 #     model_input_new = pd.concat([model_input_new, stress], axis=0)
 #     no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
 #     # In case there are more stress samples than no_stress, take all instances of no_stress.
 #     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
 #     model_input = model_input_new   
 #     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
@ -170,7 +177,7 @@ final_scores = machine_learning.helper.run_all_classification_models(imputer.fit
 # %%
 final_scores.index.name = "metric"
 final_scores = final_scores.set_index(["method", final_scores.index])
-final_scores.to_csv("../presentation/event_stressful_detection_5fold.csv")
+final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
 # %% [markdown]
 # ### Logistic Regression
@ -453,4 +460,3 @@ print("F1", np.mean(xgb_classifier_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% jupyter={"outputs_hidden": false, "source_hidden": false}
--- a/exploration/tree_high_dpi.png
+++ b/exploration/tree_high_dpi.png
--- a/machine_learning/cross_validation.py
+++ b/machine_learning/cross_validation.py
@ -0,0 +1,121 @@
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
 class CrossValidation():
    """This code implements a CrossValidation class for creating cross validation splits.
    """
    def __init__(self, data=None, cv_method='logo'):
        """This method initializes the cv_method argument and optionally prepares the data if supplied.
        Args:
            cv_method (str, optional): String of cross validation method; options are 'logo', 'half_logo' and '5kfold'. 
                Defaults to 'logo'.
            data (DataFrame, optional): Pandas DataFrame with target, pid columns and other features as columns. 
                Defaults to None.
        """
        self.initialize_cv_method(cv_method)
        if data is not None:
            self.prepare_data(data)
    def prepare_data(self, data):
        """Prepares the data ready to be passed to the cross-validation algorithm, depending on the cv_method type. 
            For example, if cv_method is set to 'half_logo' new columns 'pid_index', 'pid_count', 'pid_half' 
            are added and used in the process.
        Args:
            data (_type_): Pandas DataFrame with target, pid columns and other features as columns.
        """
        self.data = data
        if self.cv_method == "logo":
            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
        elif self.cv_method == "half_logo":
            data['pid_index'] = data.groupby('pid').cumcount()
            data['pid_count'] = data.groupby('pid')['pid'].transform('count')
            data["pid_index"] = (data['pid_index'] / data['pid_count'] + 1).round()
            data["pid_half"] = data["pid"] + "_" +  data["pid_index"].astype(int).astype(str)
            data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
        elif self.cv_method == "5kfold":
            data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
        self.X, self.y, self.groups = data_X, data_y, data_groups
    def initialize_cv_method(self, cv_method):
        """Initializes the given cv_method type. Depending on the type, the appropriate splitting technique is used.
        Args:
            cv_method (str): The type of cross-validation method to use; options are 'logo', 'half_logo' and '5kfold'.
        Raises:
            ValueError: If cv_method is not in the list of available methods, it raises an ValueError.
        """
        self.cv_method = cv_method
        if self.cv_method not in ["logo", "half_logo", "5kfold"]:
            raise ValueError("Invalid cv_method input. Correct values are: 'logo', 'half_logo', '5kfold'")
        if self.cv_method in ["logo", "half_logo"]:
            self.cv = LeaveOneGroupOut()
        elif self.cv_method == "5kfold":
            self.cv = StratifiedKFold(n_splits=5, shuffle=True)
    def get_splits(self):
        """Returns a generator object containing the cross-validation splits. 
        Raises:
            ValueError: Raises ValueError if no data has been set.
        """
        if not self.data.empty:
            return self.cv.split(self.X, self.y, self.groups)
        else: 
            raise ValueError("No data has been set. Use 'prepare_data(data)' method to set the data.")
    def get_data(self):
        """data getter
        Returns:
            Pandas DataFrame: Returns the data from the class instance.
        """
        return self.data
    def get_x_y_groups(self):
        """X, y, and groups data getter
        Returns:
            Pandas DataFrame: Returns the data from the class instance.
        """
        return self.X, self.y, self.groups
    def get_train_test_sets(self, split):
        """Gets train and test sets, dependent on the split parameter. This method can be used in a specific splitting context,
        where by index we can get train and test sets.
        Args:
            split (tuple of indices): It represents one iteration of the split generator (see get_splits method). 
        Returns:
            tuple of Pandas DataFrames: This method returns train_X, train_y, test_X, test_y, with correctly indexed rows by split param.
        """
        return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@ -0,0 +1,45 @@
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 from sklearn.feature_selection import SequentialFeatureSelector
 from sklearn.naive_bayes import GaussianNB
 """ Feature selection pipeline: a methods that can be used in the wrapper metod alongside other wrapper contents (hyperparameter tuning etc.).
 (1) Establish methods for each of the steps in feature selection protocol: 
    (a) feature selection inside specific sensors (sklearn method): returns most important features from all sensors 
    (b) feature selection between "tuned" sensors: returns filtered sensors, containing most important features retured with (a)
 (2) Ensure that above methods are given only a part of data and use appropriate random seeds - to later simulate use case in production. 
 (3) Implement a method which gives graphical exploration of (1) (a) and (b) steps of the feature selection.
 (4) Prepare a core method that can be fit into a wrapper (see sklearn wrapper methods) and integrates methods from (1)
 """
 class FeatureSelection:
    def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
        pass
    def within_sensors_feature_selection(estimator, scoring, tol):
        features_list = []
        nb = GaussianNB()
        sfs = SequentialFeatureSelector(nb, n_features_to_select='auto', tol=0.02) # Can set n_features to an absolute value -> then remove tol parameter.
        return features_list
    def between_sensors_feature_selection():
        pass
    def vizualize_feature_selection_process():
        pass
    def execute_feature_selection_step():
        pass
--- a/machine_learning/preprocessing.py
+++ b/machine_learning/preprocessing.py
@ -0,0 +1,126 @@
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 class Preprocessing:
    """This class presents Preprocessing methods which can be used in context of an individual CV iteration or, simply, on whole data. 
       It's blind to the test data - e.g, it imputes the test data with train data mean. 
       This means, it somehow needs an access to the information about data split. In context 
    """
    def __init__(self, train_X, train_y, test_X, test_y):
        self.train_X = train_X
        self.train_y = train_y
        self.test_X = test_X
        self.test_y = test_y
    def one_hot_encoder(self, categorical_features, numerical_features, mode):
        """
        This code is an implementation of one-hot encoding. It takes in two data sets, 
        one with categorical features and one with numerical features and a mode parameter. 
        First it uses the fillna() function to fill in any missing values present in the 
        categorical data set with the mode value. Then it uses the apply () method to 
        convert each column of the data set into a category data type which is then 
        transformed using the pd.get_dummies() function. Finally it concatenates the 
        numerical data set and the transformed categorical data set using pd.concat() and 
        returns it.
        Args:
            categorical_features (DataFrame): DataFrame including only categorical columns.
            numerical_features (_type_): DataFrame including only numerical columns.
            mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
        Returns:
            DataFrame: Hot-One Encoded DataFrame.
        """
        # Fill train set with mode
        categorical_features = categorical_features.fillna(mode)
        # one-hot encoding
        categorical_features = categorical_features.apply(lambda col: col.astype("category"))
        if not categorical_features.empty:
            categorical_features = pd.get_dummies(categorical_features)
        return pd.concat([numerical_features, categorical_features], axis=1)
    def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
        """
        This code is used to transform categorical data into numerical representations. 
        It first identifies the categorical columns, then copies them and saves them as 
        a new dataset. The missing data is filled with the mode (most frequent value in 
        the respective column). This new dataset is then subjected to one-hot encoding, 
        which is a process of transforming categorical data into machine interpretable 
        numerical form by converting categories into multiple binary outcome variables. 
        These encoded values are then concatenated to the numerical features prior to 
        being returned as the final dataset.
        Args:
            categorical_columns (list, optional): List of categorical columns in the dataset. 
                Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"].
        """
        categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
        # For train set
        train_X_categorical_features = self.train_X[categorical_columns].copy()
        train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
        mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
        self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
        # For test set
        test_X_categorical_features = self.test_X[categorical_columns].copy()
        test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
        self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
    def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
        # TODO: TESTING
        if groupby:
            # Interval numerical features # TODO: How can we get and assign appropriate groupby means and assign them to correct columns?
            # VVVVV ......  IN PROGRES ...... VVVVV
            means = self.train_X[interval_feature_list].groupby(groupby_feature).mean() 
            self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + other_feature_list)]] = \
                self.train_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
            self.test_X[self.test_X.loc[:, ~self.test_X.columns.isin([groupby_feature] + other_feature_list)]] = \
                self.test_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
            # Other features
            self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + interval_feature_list)]] = \
                self.train_X[other_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.median()))
        else:
            # Interval numerical features
            means = self.train_X[interval_feature_list].mean()
            self.train_X[interval_feature_list].fillna(means, inplace=True)
            self.test_X[interval_feature_list].fillna(means, inplace=True)
            # Other features
            medians = self.train_X[other_feature_list].median()
            self.train_X[other_feature_list].fillna(medians, inplace=True)
            self.test_X[other_feature_list].fillna(medians, inplace=True)
    def get_train_test_sets(self):
        """Train and test sets getter
        Returns:
            tuple of Pandas DataFrames: Gets train test sets in traditional sklearn format.
        """
        return self.train_X, self.train_y, self.test_X, self.test_y
--- a/presentation/event_stressful_detection_5fold.csv
+++ b/presentation/event_stressful_detection_5fold.csv
@ -0,0 +1,29 @@
 method,metric,max,mean
 Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
 Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
 Dummy,test_recall,0.0,0.0
 Dummy,test_f1,0.0,0.0
 logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
 logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
 logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
 logistic_reg,test_f1,0.3909774436090226,0.318943511424051
 svc,test_accuracy,0.8557046979865772,0.8548446932649828
 svc,test_average_precision,0.44514416839823046,0.4068200938341621
 svc,test_recall,0.0,0.0
 svc,test_f1,0.0,0.0
 gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
 gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
 gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
 gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
 stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
 stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
 stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
 stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
 random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
 random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
 random_forest,test_recall,0.4069767441860465,0.35356856455493185
 random_forest,test_f1,0.5691056910569107,0.5078402513053142
 xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
 xgboost,test_average_precision,0.7366643049075349,0.698622165966308
 xgboost,test_recall,0.5287356321839081,0.44346431435445066
 xgboost,test_f1,0.638888888888889,0.5633957169928393
--- a/presentation/event_stressful_detection_logo.csv
+++ b/presentation/event_stressful_detection_logo.csv
@ -0,0 +1,29 @@
 method,metric,max,mean
 Dummy,test_accuracy,1.0,0.8524114578096439
 Dummy,test_average_precision,0.7,0.14758854219035614
 Dummy,test_recall,0.0,0.0
 Dummy,test_f1,0.0,0.0
 logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
 logistic_reg,test_average_precision,1.0,0.44605167668563583
 logistic_reg,test_recall,1.0,0.25353566685532386
 logistic_reg,test_f1,0.823529411764706,0.27951926390778625
 svc,test_accuracy,1.0,0.8524114578096439
 svc,test_average_precision,0.9612401707068228,0.44179454944271934
 svc,test_recall,0.0,0.0
 svc,test_f1,0.0,0.0
 gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
 gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
 gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
 gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
 stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
 stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
 stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
 stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
 random_forest,test_accuracy,1.0,0.8722158105763481
 random_forest,test_average_precision,1.0,0.49817066323226833
 random_forest,test_recall,1.0,0.18161263127840668
 random_forest,test_f1,1.0,0.2508096532365307
 xgboost,test_accuracy,1.0,0.8812627400277729
 xgboost,test_average_precision,1.0,0.5505695112208401
 xgboost,test_recall,1.0,0.2896161238315027
 xgboost,test_f1,0.9411764705882353,0.36887408735855665