From 85e572fca0518391da26834413fee30027405150 Mon Sep 17 00:00:00 2001
From: Primoz
Date: Mon, 23 Jan 2023 16:32:07 +0100
Subject: [PATCH] Expand analysis of the features (individually and by sensor
 groups).

---
 exploration/expl_features_analysis.py        | 55 ++++++++++++---
 exploration/expl_features_groups_analysis.py | 74 +++++++++++---------
 2 files changed, 86 insertions(+), 43 deletions(-)

diff --git a/exploration/expl_features_analysis.py b/exploration/expl_features_analysis.py
index 55f5358..056e54c 100644
--- a/exploration/expl_features_analysis.py
+++ b/exploration/expl_features_analysis.py
@@ -26,6 +26,7 @@ import seaborn as sns
 from sklearn.tree import DecisionTreeClassifier
 from sklearn import tree
 from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
 
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
@@ -88,10 +89,12 @@ def get_information_gains(data, target_name):
 
     return information_gains
 
-def n_features_with_highest_info_gain(info_gain_dict, n=50):
+def n_features_with_highest_info_gain(info_gain_dict, n=None):
     """
     Get the n features with the highest information gain
     """
+    if n is None:
+        n = len(info_gain_dict)
     import heapq
     n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
     return {feature[0]: feature[1] for feature in n_largest}
@@ -250,30 +253,66 @@ split_criterion = 'entropy'
 print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
 information_gains = get_information_gains_2(model_input, 'target', split_criterion)
 print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
-n_features_with_highest_info_gain(information_gains, n=19)
+n_features_with_highest_info_gain(information_gains)
 
 # %%
+# Present the feature importance using a decision tree (with the selected impurity criterion)
+split_criterion = 'entropy'
+print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
 X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
 imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 X = imputer.fit_transform(X)
+X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
 
-clf = DecisionTreeClassifier()
+
+clf = DecisionTreeClassifier(criterion=split_criterion)
 clf.fit(X, y)
 
 feat_importance = clf.tree_.compute_feature_importances(normalize=False)
-print("feat importance = " + str(feat_importance))
+print("feat importance = ", feat_importance)
+print("shape", feat_importance.shape)
+tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
+info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
+info_gains_dict[info_gains_dict > 0]
 
+# %%
+# Binarize the tree information gain values
+bins = [-0.1, 0, 0.1]  # bins for the tree's information gain values
+cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True)
+plt.title(f"Tree information gains by value ({split_criterion})")
+cut_info_gains.value_counts().plot(kind='bar', color='purple')
+plt.xticks(rotation=45, ha='right')
+print(cut_info_gains.value_counts())
+
+
+pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))
+
+# %%
+# Plot the feature importance tree graph
 plt.figure(figsize=(12,12))
 tree.plot_tree(clf,
                feature_names = list(model_input.drop(columns=['target', 'pid']).columns),
                class_names=True,
-               filled = True, fontsize=2, max_depth=10)
+               filled = True, fontsize=5, max_depth=3)
 plt.savefig('tree_high_dpi', dpi=800)
-# %%
-print(model_input['target'])
 
+
+# %% [markdown]
+# Present the feature importance by correlation with the target
+
 corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
-list(corrs.sort_values(ascending=False).index)
+# corrs.sort_values(ascending=False)
+
+# Binarize the correlation values
+bins = [0, 0.1, 0.2, 0.3]  # bins for target's correlations with features
+cut_corrs = pd.cut(corrs, bins=bins, labels=['very weak (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True)
+plt.title("Target's correlations with features")
+cut_corrs.value_counts().plot(kind='bar')
+plt.xticks(rotation=45, ha='right')
+print(cut_corrs.value_counts())
+print(corrs[corrs > 0.1])  # corrs holds absolute values, so this also covers corr < -0.1
+
+# %%
 
 # %%
 
diff --git a/exploration/expl_features_groups_analysis.py b/exploration/expl_features_groups_analysis.py
index 3cf0d6a..bc46533 100644
--- a/exploration/expl_features_groups_analysis.py
+++ b/exploration/expl_features_groups_analysis.py
@@ -58,7 +58,18 @@ print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
 
 # %%
+# Add a prefix to the demographic features
+demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile',
+                 'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
+new_names = [(col, "demo_"+col) for col in demo_features]
+model_input.rename(columns=dict(new_names), inplace=True)
+
+demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio',
+                 'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M',
+                 'demo_startlanguage_nl', 'demo_startlanguage_sl']
+
+# %%
 # Get phone and non-phone columns
 
 def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
@@ -77,10 +88,16 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
     """
     for fgroup_substr in groups_substrings:
-        if include_group:
-            feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
-        else:
-            feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
+        if fgroup_substr is None:
+            feature_group_cols = list(df.columns)
+            feature_group_cols.remove("pid")
+            feature_group_cols.remove("target")
+        else:
+            if include_group:
+                feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
+            else:
+                feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
+
         X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target']
 
         imputer = SimpleImputer(missing_values=np.nan, strategy='median')
@@ -103,47 +120,34 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
         print("Precision", metrics.precision_score(y_test, y_pred))
         print("Recall", metrics.recall_score(y_test, y_pred))
         print("F1", metrics.f1_score(y_test, y_pred), "\n")
-# %%
-model_input
 
-groups_substr = ["_", "phone_", "empatica_"]
+
+# %% [markdown]
+# ### Big sensor feature groups (phone, empatica, demographic)
+groups_substr = [None, "phone_", "empatica_", "demo_"]
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
 
-# %%
+# %% [markdown]
+# ### Empatica sensor groups
 make_predictions_with_sensor_groups(model_input.copy(),
                                     groups_substrings="_", include_group=True)
-groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
+e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
+make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
 
-# %%
+# %% [markdown]
+# ### Phone sensor groups
 make_predictions_with_sensor_groups(model_input.copy(),
                                     groups_substrings="_", include_group=True)
-groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
+phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_",
+                 "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
+make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
 
 # %%
-# Create an empty list to store the feature column groups
-feature_column_groups = []
+# Write all the sensors (phone, empatica); separate out the other (demographic) cols also
 
-# Iterate through each column in model_input
-for column in model_input.columns:
+sensors_features_groups = ["_", "phone_", "empatica_", "empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                           "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", "phone_light_",
+                           "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
 
-    # Split the column name by '_'
-    split_column = column.split('_')
 
-    # Create a variable to store the prefix of the current column
-    prefix = ''
+print([col for col in model_input.columns if "phone_" not in col and "empatica_" not in col])
 
-    # Iterate through each part of the split column name
-    for part in split_column:
 
-        # Add the part to the prefix variable
-        prefix += part + '_'
-
-        # Check if the prefix is already in our feature column groups list
-        if prefix not in feature_column_groups:
-
-            # If not, add it to our list of feature columns groups
-            feature_column_groups.append(prefix)
-
-# Print out all possible feature columns groups that contain more than one entry in a columns list
-print(feature_column_groups)
 # %%
-# Write all the sensors (phone, empatica), seperate other (demographical) cols also
\ No newline at end of file
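
The bodies of compute_impurity and get_information_gains_2 sit above the hunks shown here, so the patch never displays them. A minimal sketch of the contract such helpers typically satisfy, assuming entropy over the label distribution and gain measured as the impurity drop after partitioning on a binned feature; the equal-width binning, the bins parameter, and the toy frame below are assumptions, not code from this repository:

import numpy as np
import pandas as pd

def compute_impurity(series, criterion="entropy"):
    # Impurity of the label distribution: Shannon entropy or Gini.
    p = series.value_counts(normalize=True)
    if criterion == "entropy":
        return float(-np.sum(p * np.log2(p)))
    return float(1 - np.sum(p ** 2))

def information_gain(df, feature, target="target", bins=5, criterion="entropy"):
    # Discretize the feature, then subtract the weighted impurity of the
    # resulting partitions from the parent impurity.
    parent = compute_impurity(df[target], criterion)
    binned = pd.cut(df[feature], bins=bins)
    child = sum((len(part) / len(df)) * compute_impurity(part[target], criterion)
                for _, part in df.groupby(binned, observed=True))
    return parent - child

toy = pd.DataFrame({"x": np.linspace(0, 1, 40), "target": [0] * 20 + [1] * 20})
print(information_gain(toy, "x"))  # 0.8 bits: x separates the target almost cleanly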
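clf.tree_.compute_feature_importances(normalize=False), used in the first file, returns each feature's total impurity decrease; sklearn's clf.feature_importances_ attribute is the same vector scaled to sum to one, so both rank features identically and n_features_with_highest_info_gain can consume either. A self-contained check on synthetic data (the dataset is invented for illustration):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=6, random_state=19)
clf = DecisionTreeClassifier(criterion="entropy", random_state=19).fit(X, y)

raw = clf.tree_.compute_feature_importances(normalize=False)  # total impurity decrease
assert np.allclose(raw / raw.sum(), clf.feature_importances_)  # same ranking, different scale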
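Both new plotting cells follow the same pattern: bucket a numeric Series with pd.cut and bar-plot the counts per band. The pattern in isolation, with the cutoffs from the patch and an invented Series of absolute correlations:

import pandas as pd

corrs = pd.Series({"f1": 0.05, "f2": 0.15, "f3": 0.25, "f4": 0.12})  # invented |corr| values
bins = [0, 0.1, 0.2, 0.3]
labels = ['very weak (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)']
cut_corrs = pd.cut(corrs, bins=bins, labels=labels, right=True)
print(cut_corrs.value_counts())  # number of features per correlation band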
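make_predictions_with_sensor_groups drops (with include_group=False) or keeps every column whose name contains a substring, then imputes, splits, fits a classifier, and prints metrics; the new None entry stands for the all-features baseline. The classifier itself is defined in lines this diff does not show, so the self-contained sketch below substitutes a DecisionTreeClassifier, and the toy columns are invented stand-ins:

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(19)
df = pd.DataFrame({
    "pid": np.arange(200),
    "phone_calls_count": rng.normal(size=200),
    "empatica_temperature_mean": rng.normal(size=200),
    "demo_age": rng.integers(20, 60, size=200).astype(float),
})
df["target"] = (df["phone_calls_count"] + rng.normal(scale=0.5, size=200) > 0).astype(int)

for substr in [None, "phone_", "empatica_", "demo_"]:
    if substr is None:  # None means "keep every feature"
        cols = [c for c in df.columns if c not in ("pid", "target")]
    else:               # drop the named group, keep the rest
        cols = [c for c in df.columns if substr not in c and c not in ("pid", "target")]
    X = SimpleImputer(strategy="median").fit_transform(df[cols])
    X_tr, X_te, y_tr, y_te = train_test_split(X, df["target"], random_state=19)
    y_pred = DecisionTreeClassifier(random_state=19).fit(X_tr, y_tr).predict(X_te)
    print(substr, "F1:", round(metrics.f1_score(y_te, y_pred), 3))

Dropping the phone_ group should visibly hurt F1 on this toy data, since the target is built from phone_calls_count; that mirrors how the patch compares each sensor group's contribution against the all-features baseline.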