Expand analysis of the features (individually and by sensor groups).

ml_pipeline
Primoz 2023-01-23 16:32:07 +01:00
parent 6a98c8cdcf
commit 85e572fca0
2 changed files with 86 additions and 43 deletions

View File

@@ -26,6 +26,7 @@ import seaborn as sns
 from sklearn.tree import DecisionTreeClassifier
 from sklearn import tree
 from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split

 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
@@ -88,10 +89,12 @@ def get_information_gains(data, target_name):
     return information_gains

-def n_features_with_highest_info_gain(info_gain_dict, n=50):
+def n_features_with_highest_info_gain(info_gain_dict, n=None):
     """
     Get the n features with the highest information gain
     """
+    if n is None:
+        n = len(info_gain_dict)
     import heapq
     n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
     return {feature[0]: feature[1] for feature in n_largest}
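For reference, heapq.nlargest over dict.items() returns the n (key, value) pairs with the largest values, which is what the helper above relies on; a tiny sketch with a hypothetical gains dict:

    import heapq

    gains = {"f_a": 0.02, "f_b": 0.31, "f_c": 0.12}  # hypothetical feature -> info gain
    top2 = heapq.nlargest(2, gains.items(), key=lambda i: i[1])
    print(dict(top2))  # {'f_b': 0.31, 'f_c': 0.12}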
@@ -250,30 +253,66 @@ split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion)) print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
information_gains = get_information_gains_2(model_input, 'target', split_criterion) information_gains = get_information_gains_2(model_input, 'target', split_criterion)
print(pd.Series(information_gains).value_counts().sort_index(ascending=False)) print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
n_features_with_highest_info_gain(information_gains, n=19) n_features_with_highest_info_gain(information_gains)
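compute_impurity and get_information_gains_2 are defined outside this hunk; as a rough guide to what they compute, here is a minimal sketch of entropy impurity and per-feature information gain (the names match the calls above, but the repo's exact implementation may differ):

    import numpy as np

    def compute_impurity(series, criterion='entropy'):
        # Probability of each class, then Shannon entropy (or Gini)
        p = series.value_counts(normalize=True)
        if criterion == 'entropy':
            return -(p * np.log2(p)).sum()
        return 1 - (p ** 2).sum()  # 'gini'

    def information_gain(df, feature, target='target', criterion='entropy'):
        # IG = H(target) - sum_v P(feature = v) * H(target | feature = v)
        parent = compute_impurity(df[target], criterion)
        children = sum((len(sub) / len(df)) * compute_impurity(sub[target], criterion)
                       for _, sub in df.groupby(feature))
        return parent - children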
 # %%
+# Present the feature importance using a tree (with the chosen impurity criterion)
+split_criterion = 'entropy'
+print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
 X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
 imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 X = imputer.fit_transform(X)
-clf = DecisionTreeClassifier()
+X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
+clf = DecisionTreeClassifier(criterion=split_criterion)
 clf.fit(X, y)
 feat_importance = clf.tree_.compute_feature_importances(normalize=False)
-print("feat importance = " + str(feat_importance))
+print("feat importance = ", feat_importance)
+print("shape", feat_importance.shape)
+tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
+info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
+info_gains_dict[info_gains_dict > 0]
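Note that clf.tree_.compute_feature_importances(normalize=False) is the unnormalized counterpart of the public clf.feature_importances_ attribute, which is scaled to sum to 1; a quick consistency check (valid once the tree has made at least one split):

    import numpy as np

    raw = clf.tree_.compute_feature_importances(normalize=False)
    assert np.allclose(clf.feature_importances_, raw / raw.sum())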
+# %%
+# Binarize the tree information gain values
+bins = [-0.1, 0, 0.1]  # bins for the tree's information gain values
+cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True)
+plt.title(f"Tree information gains by value ({split_criterion})")
+cut_info_gains.value_counts().plot(kind='bar', color='purple')
+plt.xticks(rotation=45, ha='right')
+print(cut_info_gains.value_counts())
+pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))
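Since right=True, pd.cut builds right-closed intervals here, i.e. (-0.1, 0] maps to 'IG=0' and (0, 0.1] to 'IG>0' (gains above 0.1 would fall outside the bins and become NaN); a quick check:

    import pandas as pd

    s = pd.Series([0.0, 0.001, 0.05])
    print(pd.cut(s, bins=[-0.1, 0, 0.1], labels=['IG=0', 'IG>0'], right=True).tolist())
    # ['IG=0', 'IG>0', 'IG>0']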
+# %%
+# Plot the decision tree graph
 plt.figure(figsize=(12, 12))
 tree.plot_tree(clf,
                feature_names=list(model_input.drop(columns=['target', 'pid']).columns),
                class_names=True,
-               filled=True, fontsize=2, max_depth=10)
+               filled=True, fontsize=5, max_depth=3)
 plt.savefig('tree_high_dpi', dpi=800)
+# %%
+print(model_input['target'])
+
+# %% [markdown]
+# Present the feature importance by correlation with the target
 corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
-list(corrs.sort_values(ascending=False).index)
+# corrs.sort_values(ascending=False)
+
+# Binarize the correlation values
+bins = [0, 0.1, 0.2, 0.3]  # bins for target's correlations with features
+cut_corrs = pd.cut(corrs, bins=bins, labels=['very weak (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True)
+plt.title("Target's correlations with features")
+cut_corrs.value_counts().plot(kind='bar')
+plt.xticks(rotation=45, ha='right')
+print(cut_corrs.value_counts())
+print(corrs[corrs > 0.1])  # or corrs < -0.1
+
+# %%

 # %%
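Given the binary precision/recall metrics used later, target appears to be binary, so Pearson's r against the 0/1 target is the point-biserial correlation; an equivalent scipy check (assuming scipy is available; 'some_feature' is a placeholder column name):

    from scipy import stats

    sub = model_input[['some_feature', 'target']].dropna()
    r_pb, p = stats.pointbiserialr(sub['target'].astype(int), sub['some_feature'])
    # r_pb matches sub['some_feature'].corr(sub['target'].astype(int)) up to float error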

View File

@@ -58,7 +58,18 @@ print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(incl
 # %%
+# Add a prefix to the demographic features
+demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile',
+                 'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
+new_names = [(col, "demo_" + col) for col in demo_features]
+model_input.rename(columns=dict(new_names), inplace=True)
+demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio',
+                 'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M',
+                 'demo_startlanguage_nl', 'demo_startlanguage_sl']
+
+# %%
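The rename-with-dict pattern is used because DataFrame.add_prefix would prefix every column, not just the demographic ones; an equivalent one-liner over the original, unprefixed demo_features list:

    model_input = model_input.rename(columns={c: "demo_" + c for c in demo_features})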
 # Get phone and non-phone columns

 def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
@@ -77,11 +88,17 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
""" """
for fgroup_substr in groups_substrings: for fgroup_substr in groups_substrings:
if fgroup_substr is None:
feature_group_cols = list(df.columns)
feature_group_cols.remove("pid")
feature_group_cols.remove("target")
else:
if include_group: if include_group:
feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']] feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
else: else:
feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']] feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target'] X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target']
imputer = SimpleImputer(missing_values=np.nan, strategy='median') imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X) X = imputer.fit_transform(X)
@@ -103,47 +120,34 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
print("Precision", metrics.precision_score(y_test, y_pred)) print("Precision", metrics.precision_score(y_test, y_pred))
print("Recall", metrics.recall_score(y_test, y_pred)) print("Recall", metrics.recall_score(y_test, y_pred))
print("F1", metrics.f1_score(y_test, y_pred), "\n") print("F1", metrics.f1_score(y_test, y_pred), "\n")
-# %%
-model_input
-groups_substr = ["_", "phone_", "empatica_"]
+# %% [markdown]
+# ### Big sensor feature groups (phone, empatica, demographic)
+groups_substr = [None, "phone_", "empatica_", "demo_"]
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

-# %%
+# %% [markdown]
+# ### Empatica sensor groups
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
-groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
+e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
+make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)

-# %%
+# %% [markdown]
+# ### Phone sensor groups
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
-groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
+phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_",
+                 "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
+make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
-# %%
-# Create an empty list to store the feature column groups
-feature_column_groups = []
-# Iterate through each column in model_input
-for column in model_input.columns:
-    # Split the column name by '_'
-    split_column = column.split('_')
-    # Create a variable to store the prefix of the current column
-    prefix = ''
-    # Iterate through each part of the split column name
-    for part in split_column:
-        # Add the part to the prefix variable
-        prefix += part + '_'
-        # Check if the prefix is already in our feature column groups list
-        if prefix not in feature_column_groups:
-            # If not, add it to our list of feature column groups
-            feature_column_groups.append(prefix)
-# Print out all feature column prefixes found across the columns
-print(feature_column_groups)
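The removed prefix-collection loop can be written more compactly while preserving its behavior (every cumulative '_'-terminated prefix of every column, deduplicated in first-seen order); a sketch:

    from itertools import accumulate

    prefixes = (p + '_' for col in model_input.columns
                for p in accumulate(col.split('_'), lambda a, b: a + '_' + b))
    feature_column_groups = list(dict.fromkeys(prefixes))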
 # %%
 # Write out all the sensor groups (phone, empatica); keep the other (demographic) cols separate
+sensors_features_groups = ["_", "phone_", "empatica_", "empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                           "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", "phone_light_",
+                           "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
+print([col for col in model_input.columns if "phone_" not in col and "empatica_" not in col])
+
+# %%