Prepare scripts for feature importance analysis.
parent d263b32564
commit 6a98c8cdcf
@@ -21,6 +21,12 @@ import os, sys, math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.impute import SimpleImputer

# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
def calc_entropy(column):
@@ -87,15 +93,14 @@ def n_features_with_highest_info_gain(info_gain_dict, n=50):
    Get the n features with the highest information gain.
    """
    import heapq
-    return heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
+    n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
+    return {feature[0]: feature[1] for feature in n_largest}

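# %%
# A quick toy-data check of the helper above (illustrative values only): it returns a
# dict of the n largest information gains, keyed by feature name.
toy_gains = {"f1": 0.10, "f2": 0.42, "f3": 0.05}
n_features_with_highest_info_gain(toy_gains, n=2)  # {'f2': 0.42, 'f1': 0.10}
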
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)


# %%
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features

@@ -114,12 +119,161 @@ if not categorical_features.empty:
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# Binarize the target: values <= 0 become class 0, values 1-4 become class 1
bins = [-1, 0, 4]  # bins for the stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
print(model_input['target'].value_counts(), edges)

# %%
info_gains = get_information_gains(model_input, 'target')
selected_features = n_features_with_highest_info_gain(info_gains, n=150)
selected_features

# TODO: binarize the target

# %% [markdown]
# Present the feature importance results

# %%
print("Total columns:", len(info_gains))
print(pd.Series(info_gains).value_counts())

n_features_with_highest_info_gain(info_gains, n=189)


# %%
def compute_impurity(feature, impurity_criterion):
    """
    This function calculates the impurity of a feature.
    Supported impurity criteria: 'entropy', 'gini'
    input: feature (this needs to be a Pandas series)
    output: feature impurity
    """
    probs = feature.value_counts(normalize=True)

    if impurity_criterion == 'entropy':
        impurity = -1 * np.sum(np.log2(probs) * probs)
    elif impurity_criterion == 'gini':
        impurity = 1 - np.sum(np.square(probs))
    else:
        raise ValueError('Unknown impurity criterion')

    return impurity

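# Toy sanity check of compute_impurity (illustrative values): for a series with class
# probabilities p = [0.5, 0.5], entropy = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0
# and gini = 1 - (0.5**2 + 0.5**2) = 0.5.
toy_col = pd.Series([0, 0, 1, 1])
print(compute_impurity(toy_col, 'entropy'))  # 1.0
print(compute_impurity(toy_col, 'gini'))     # 0.5
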
def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
    """
    This function calculates the information gain for splitting on
    a particular descriptive feature for a given dataset
    and a given impurity criterion.
    Supported split criteria: 'entropy', 'gini'
    """
    if print_flag:
        print('target feature:', target)
        print('descriptive_feature:', descriptive_feature)
        print('split criterion:', split_criterion)

    target_entropy = compute_impurity(df[target], split_criterion)

    # we define two lists below:
    # entropy_list to store the entropy of each partition
    # weight_list to store the relative number of observations in each partition
    entropy_list = list()
    weight_list = list()

    # loop over each level of the descriptive feature
    # to partition the dataset with respect to that level
    # and compute the entropy and the weight of the level's partition
    for level in df[descriptive_feature].unique():
        df_feature_level = df[df[descriptive_feature] == level]
        entropy_level = compute_impurity(df_feature_level[target], split_criterion)
        entropy_list.append(round(entropy_level, 3))
        weight_level = len(df_feature_level) / len(df)
        weight_list.append(round(weight_level, 3))

    # print('impurity of partitions:', entropy_list)
    # print('weights of partitions:', weight_list)

    # remaining impurity is the weighted sum of the partition impurities
    feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))

    information_gain = target_entropy - feature_remaining_impurity

    if print_flag:
        print('impurity of partitions:', entropy_list)
        print('weights of partitions:', weight_list)
        print('remaining impurity:', feature_remaining_impurity)
        print('information gain:', information_gain)
        print('====================')

    return information_gain

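# Toy example for comp_feature_information_gain (illustrative data): a feature that
# perfectly separates a balanced binary target recovers the full target entropy (1.0)
# as its information gain.
toy_df = pd.DataFrame({"feat": ["a", "a", "b", "b"], "target": [0, 0, 1, 1]})
print(comp_feature_information_gain(toy_df, "target", "feat", "entropy"))  # 1.0
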
def calc_information_gain_2(data, split_name, target_name, split_criterion):
    """
    Calculate information gain given a data set, a column to split on, and a target.
    Note: this assumes the split column is binary (only its first two unique values are used).
    """
    # Calculate the original impurity
    original_impurity = compute_impurity(data[target_name], split_criterion)

    # Find the unique values in the column
    values = data[split_name].unique()

    # Make two subsets of the data, based on the unique values
    left_split = data[data[split_name] == values[0]]
    right_split = data[data[split_name] == values[1]]

    # Loop through the splits and calculate the subset impurities
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0])
        to_subtract += prob * compute_impurity(subset[target_name], split_criterion)

    # Return information gain
    return original_impurity - to_subtract

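# Illustrative check of calc_information_gain_2 on a two-valued (binary) column,
# which is the case it is written for (it only uses the first two unique values).
toy_bin = pd.DataFrame({"feat": [0, 0, 1, 1], "target": [0, 1, 1, 1]})
print(calc_information_gain_2(toy_bin, "feat", "target", "entropy"))  # ~0.31
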
def get_information_gains_2(data, target_name, split_criterion):
    # Initialize an empty dictionary for information gains
    information_gains = {}

    # Iterate through each column name in our list
    for feature in list(data.columns):
        # Find the information gain for the column
        information_gain = calc_information_gain_2(data, feature, target_name, split_criterion)
        # Add the information gain to our dictionary using the column name as the key
        information_gains[feature] = information_gain

    # Return the key with the highest value
    # return max(information_gains, key=information_gains.get)

    return information_gains

# %% [markdown]
# Present the feature importance results from other methods

# %%
split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
information_gains = get_information_gains_2(model_input, 'target', split_criterion)
print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
n_features_with_highest_info_gain(information_gains, n=19)

# %%
X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)

clf = DecisionTreeClassifier()
clf.fit(X, y)

feat_importance = clf.tree_.compute_feature_importances(normalize=False)
print("feat importance = " + str(feat_importance))

plt.figure(figsize=(12, 12))
tree.plot_tree(clf,
               feature_names=list(model_input.drop(columns=['target', 'pid']).columns),
               class_names=True,
               filled=True, fontsize=2, max_depth=10)

plt.savefig('tree_high_dpi', dpi=800)
# %%
print(model_input['target'])
# Absolute correlation of each feature with the binarized target
corrs = abs(model_input.drop(columns=["target", "pid"]).apply(lambda x: x.corr(model_input.target.astype(int))))
list(corrs.sort_values(ascending=False).index)

# %%
@@ -0,0 +1,149 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline

import os, sys, math

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)

categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]

# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)

# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# Binarize the target: values <= 0 become class 0, values 1-4 become class 1
bins = [-1, 0, 4]  # bins for the stressfulness (0-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)

print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)

# %%

# Get phone and non-phone columns

def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
    """
    This function makes predictions with sensor groups.
    It takes in a dataframe (df), a list of group substrings (groups_substrings)
    and an optional parameter include_group (default is True).

    For each substring, it builds the list of feature columns that contain (or,
    if include_group is False, do not contain) that substring, excluding the
    'pid' and 'target' columns. It then splits the data into training and test
    sets, using a test size of 0.25 for the first split and 0.2 for the second split.
    A SimpleImputer is used to fill in missing values with median values.
    A RandomForestClassifier is then fit on the training set and used to make
    predictions on the test set. Finally, accuracy, precision, recall and F1
    scores are printed for each substring group.
    """
    for fgroup_substr in groups_substrings:
        if include_group:
            feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
        else:
            feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]

        X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target']
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        X = imputer.fit_transform(X)

        # First split discards 25% of the data; second split holds out 20% for testing
        X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)

        rfc = RandomForestClassifier(random_state=0)
        rfc.fit(X_train, y_train)
        y_pred = rfc.predict(X_test)

        if include_group:
            print("\nPrediction with", fgroup_substr)
        else:
            print("\nPrediction without", fgroup_substr)

        print("************************************************")
        print("Accuracy", metrics.accuracy_score(y_test, y_pred))
        print("Precision", metrics.precision_score(y_test, y_pred))
        print("Recall", metrics.recall_score(y_test, y_pred))
        print("F1", metrics.f1_score(y_test, y_pred), "\n")
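
# %%
# Quick look at which columns a substring group actually selects (illustrative;
# same filtering rule as in make_predictions_with_sensor_groups, assuming phone_*
# feature columns are present in model_input).
phone_cols = [col for col in model_input.columns if "phone_" in col and col not in ['pid', 'target']]
print(len(phone_cols), phone_cols[:5])
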
# %%
model_input
groups_substr = ["_", "phone_", "empatica_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

# %%
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

# %%
# Create an empty list to store the feature column groups
feature_column_groups = []

# Iterate through each column in model_input
for column in model_input.columns:

    # Split the column name by '_'
    split_column = column.split('_')

    # Create a variable to store the prefix of the current column
    prefix = ''

    # Iterate through each part of the split column name
    for part in split_column:

        # Add the part to the prefix variable
        prefix += part + '_'

        # Check if the prefix is already in our feature column groups list
        if prefix not in feature_column_groups:

            # If not, add it to our list of feature column groups
            feature_column_groups.append(prefix)

# Print out all possible feature column groups
print(feature_column_groups)

# %%
# Write out all the sensors (phone, empatica); also separate the other (demographic) cols
@@ -460,4 +460,3 @@ print("F1", np.mean(xgb_classifier_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))

# %% jupyter={"outputs_hidden": false, "source_hidden": false}
Binary file not shown. (Image: 2.5 MiB)