diff --git a/exploration/expl_features_analysis.py b/exploration/expl_features_analysis.py index a498185..55f5358 100644 --- a/exploration/expl_features_analysis.py +++ b/exploration/expl_features_analysis.py @@ -21,6 +21,12 @@ import os, sys, math import numpy as np import matplotlib.pyplot as plt import pandas as pd +import seaborn as sns + +from sklearn.tree import DecisionTreeClassifier +from sklearn import tree +from sklearn.impute import SimpleImputer + # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}} def calc_entropy(column): @@ -87,15 +93,14 @@ def n_features_with_highest_info_gain(info_gain_dict, n=50): Get n-features that have highest information gain """ import heapq - return heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1]) + n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1]) + return {feature[0]: feature[1] for feature in n_largest} # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}} index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns) - -# %% categorical_feature_colnames = ["gender", "startlanguage"] additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col] categorical_feature_colnames += additional_categorical_features @@ -114,12 +119,161 @@ if not categorical_features.empty: numerical_features = model_input.drop(categorical_feature_colnames, axis=1) model_input = pd.concat([numerical_features, categorical_features], axis=1) +# Binarizacija targeta +bins = [-1, 0, 4] # bins for stressfulness (0-4) target +model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True) 
print(model_input['target'].value_counts(), edges)

# %%
info_gains = get_information_gains(model_input, 'target')

# %% [markdown]
# Present the feature importance results

# %%
print("Total columns:", len(info_gains))
print(pd.Series(info_gains).value_counts())

n_features_with_highest_info_gain(info_gains, n=189)

# %%
def compute_impurity(feature, impurity_criterion):
    """
    Calculate the impurity of a feature from its relative value frequencies.

    Supported impurity criteria: 'entropy', 'gini'
    input: feature (this needs to be a Pandas series), impurity_criterion (str)
    output: feature impurity
    raises: ValueError for any other impurity criterion
    """
    probs = feature.value_counts(normalize=True)
    # A categorical series also reports categories that never occur (probability 0).
    # They contribute nothing to either criterion, but log2(0) would evaluate to -inf,
    # so they are dropped up front.
    probs = probs[probs > 0]

    if impurity_criterion == 'entropy':
        impurity = -1 * np.sum(np.log2(probs) * probs)
    elif impurity_criterion == 'gini':
        impurity = 1 - np.sum(np.square(probs))
    else:
        raise ValueError('Unknown impurity criterion')

    return impurity


def _partition_impurities(df, target, descriptive_feature, split_criterion):
    """Return (impurities, weights) of the target over every level of one feature."""
    n_rows = len(df)
    impurities = []
    weights = []
    # sort=False keeps the levels in order of first appearance; observed=True avoids
    # empty groups for unused categories. Rows whose feature value is missing are
    # left out by groupby and therefore carry no weight.
    partitions = df.groupby(descriptive_feature, sort=False, observed=True)[target]
    for _, partition_target in partitions:
        impurities.append(compute_impurity(partition_target, split_criterion))
        weights.append(len(partition_target) / n_rows)
    return impurities, weights


def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
    """
    Calculate the information gain for splitting a dataset on one descriptive
    feature, for a given impurity criterion.

    Supported split criterion: 'entropy', 'gini'
    input: df (DataFrame), target (column name), descriptive_feature (column name),
           split_criterion (str), print_flag (print intermediate values when True)
    output: target impurity minus the weighted impurity of the partitions
    """
    if print_flag:
        print('target feature:', target)
        print('descriptive_feature:', descriptive_feature)
        print('split criterion:', split_criterion)

    target_entropy = compute_impurity(df[target], split_criterion)

    # impurity of the target inside each partition and the relative size of each partition
    entropy_list, weight_list = _partition_impurities(df, target, descriptive_feature, split_criterion)

    # The gain is computed from the unrounded values; rounding is applied for display only.
    feature_remaining_impurity = float(np.dot(entropy_list, weight_list))

    information_gain = target_entropy - feature_remaining_impurity

    if print_flag:
        print('impurity of partitions:', [round(value, 3) for value in entropy_list])
        print('weights of partitions:', [round(value, 3) for value in weight_list])
        print('remaining impurity:', feature_remaining_impurity)
        print('information gain:', information_gain)
        print('====================')

    return information_gain


def calc_information_gain_2(data, split_name, target_name, split_criterion):
    """
    Calculate information gain given a data set, column to split on, and target.

    Every distinct value of the split column forms its own subset, so columns with
    one, two or more distinct values are all handled (a constant column yields a
    gain of 0).
    """
    # Impurity of the target before splitting
    original_impurity = compute_impurity(data[target_name], split_criterion)

    # Weighted impurity of the target inside the subsets
    impurities, weights = _partition_impurities(data, target_name, split_name, split_criterion)
    to_subtract = float(np.dot(impurities, weights))

    # Return information gain
    return original_impurity - to_subtract


def get_information_gains_2(data, target_name, split_criterion):
    """
    Calculate the information gain of every column of `data` with respect to the target.

    input: data (DataFrame), target_name (column name), split_criterion ('entropy' or 'gini')
    output: dict mapping column name -> information gain; the target column itself
            is skipped because splitting on it trivially reproduces the full target impurity
    """
    return {
        feature: calc_information_gain_2(data, feature, target_name, split_criterion)
        for feature in data.columns
        if feature != target_name
    }

# %% [markdown]
# Present the feature importance results from other methods

# %%
split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
information_gains = get_information_gains_2(model_input, 'target', split_criterion)
print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
n_features_with_highest_info_gain(information_gains, n=19)

# %%

X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)

clf = DecisionTreeClassifier()
clf.fit(X, y)

feat_importance = clf.tree_.compute_feature_importances(normalize=False)
print("feat importance = " + str(feat_importance))

plt.figure(figsize=(12,12))
# NOTE(review): the median imputer drops columns that contain only NaN; if any exist,
# the feature names below no longer line up with the fitted tree - confirm none do.
tree.plot_tree(clf,
               feature_names = list(model_input.drop(columns=['target', 'pid']).columns),
               class_names=True,
               filled = True, fontsize=2, max_depth=10)

plt.savefig('tree_high_dpi', dpi=800)
# %%
print(model_input['target'])
corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
list(corrs.sort_values(ascending=False).index)

# %%

# ======================================================================
# New file added by this patch: exploration/expl_features_groups_analysis.py
# ======================================================================
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline

import os, sys, math

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)

categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]

# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)

# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# Binarize the target: values in (-1, 0] become class 0, values in (0, 4] become class 1
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)

print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)


# %%

# Evaluate predictions on feature subsets selected by column-name substring

def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
    """
    Fit and score a random forest on feature subsets chosen by column-name substring.

    For each substring the feature columns are the columns whose name contains it
    (include_group=True) or the columns whose name does not contain it
    (include_group=False). The 'pid' and 'target' columns are never used as
    features; 'target' is the label.

    The rows are split twice: the first split (test_size=0.25, random_state=19)
    sets a quarter of the rows aside, which this function does not use; the second
    (test_size=0.2, random_state=2) divides the rest into train and test sets.
    Missing values are replaced by per-column medians learned from the training
    rows only, a RandomForestClassifier (random_state=0) is fitted, and accuracy,
    precision, recall and F1 on the test rows are printed.

    input: df (DataFrame holding 'pid', 'target' and the feature columns),
           groups_substrings (list of substrings; a single string is treated as
           a one-element list),
           include_group (keep matching columns when True, drop them when False)
    output: None, the scores are printed
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(groups_substrings, str):
        groups_substrings = [groups_substrings]

    keep_matching = bool(include_group)
    candidate_cols = [col for col in df.columns if col not in ('pid', 'target')]
    target = df['target']

    for fgroup_substr in groups_substrings:
        feature_group_cols = [col for col in candidate_cols if (fgroup_substr in col) == keep_matching]

        if not feature_group_cols:
            # Nothing to train on; report it instead of failing inside scikit-learn.
            print("\nNo feature columns selected for", fgroup_substr, "- skipping")
            continue

        X = df[feature_group_cols]

        # The second part of this first split is set aside and not used here.
        X, _, y, _ = train_test_split(X, target, random_state=19, test_size=0.25)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)

        # Learn the medians from the training rows only, so that no information
        # from the test rows leaks into the model inputs.
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)

        rfc = RandomForestClassifier(random_state=0)
        rfc.fit(X_train, y_train)
        y_pred = rfc.predict(X_test)

        if include_group:
            print("\nPrediction with", fgroup_substr)
        else:
            print("\nPrediction without", fgroup_substr)

        print("************************************************")
        print("Accuracy", metrics.accuracy_score(y_test, y_pred))
        print("Precision", metrics.precision_score(y_test, y_pred))
        print("Recall", metrics.recall_score(y_test, y_pred))
        print("F1", metrics.f1_score(y_test, y_pred), "\n")
# %%
model_input
groups_substr = ["_", "phone_", "empatica_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

# %%
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=["_"], include_group=True)
groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

# %%
# NOTE(review): this cell repeats the previous one verbatim - confirm whether it is intentional.
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=["_"], include_group=True)
groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

# %%
# Collect every cumulative '_'-terminated prefix of every column name
# (e.g. 'a_b_c' contributes 'a_', 'a_b_' and 'a_b_c_'), in order of first appearance.
# A dict serves as an ordered set, so the membership test does not rescan a list.
seen_prefixes = {}

for column in model_input.columns:
    prefix = ''
    for part in column.split('_'):
        prefix += part + '_'
        seen_prefixes.setdefault(prefix, None)

feature_column_groups = list(seen_prefixes)

# Print every collected prefix (no filtering by group size is applied)
print(feature_column_groups)
# %%
# Write all the sensors (phone, empatica), separate other (demographic) cols also

# ======================================================================
# Remaining entries of this patch (no code for this notebook):
#   - exploration/ml_pipeline_classification.py: removes the trailing empty
#     '# %% jupyter={"outputs_hidden": false, "source_hidden": false}' cell marker
#   - exploration/tree_high_dpi.png: new binary file
# ======================================================================