Expand analysis of the features (individually and by sensor groups).

ml_pipeline
Primoz 2023-01-23 16:32:07 +01:00
parent 6a98c8cdcf
commit 85e572fca0
2 changed files with 86 additions and 43 deletions

View File

@@ -26,6 +26,7 @@ import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
@@ -88,10 +89,12 @@ def get_information_gains(data, target_name):
return information_gains
def n_features_with_highest_info_gain(info_gain_dict, n=50):
def n_features_with_highest_info_gain(info_gain_dict, n=None):
"""
Get the n features with the highest information gain (all of them when n is None)
"""
if n is None:
n = len(info_gain_dict)
import heapq
n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
return {feature[0]: feature[1] for feature in n_largest}
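A quick usage sketch of the helper above (the toy dictionary is hypothetical; with the new default n=None every feature is returned, sorted by gain):

toy_gains = {"phone_screen_count": 0.12, "empatica_temperature_mean": 0.05, "demo_age": 0.31}
# Top-2 features by information gain
print(n_features_with_highest_info_gain(toy_gains, n=2))
# {'demo_age': 0.31, 'phone_screen_count': 0.12}
# n=None (the new default) returns all features, ordered by gain
print(n_features_with_highest_info_gain(toy_gains))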
@@ -250,30 +253,66 @@ split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
information_gains = get_information_gains_2(model_input, 'target', split_criterion)
print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
n_features_with_highest_info_gain(information_gains, n=19)
n_features_with_highest_info_gain(information_gains)
# %%
# Present the feature importance using a tree (built with the selected impurity criterion)
split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)
X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
clf = DecisionTreeClassifier()
clf = DecisionTreeClassifier(criterion=split_criterion)
clf.fit(X, y)
feat_importance = clf.tree_.compute_feature_importances(normalize=False)
print("feat importance = " + str(feat_importance))
print("feat importance = ", feat_importance)
print("shape", feat_importance.shape)
tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
info_gains_dict[info_gains_dict > 0]
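For reference, scikit-learn also exposes the normalized counterpart of these raw importances directly on the estimator; a minimal sketch using the clf fitted above:

# feature_importances_ is compute_feature_importances(normalize=True):
# the raw importances rescaled to sum to 1 (when the tree made any split)
norm_imp = clf.feature_importances_
print(norm_imp.sum())     # ~1.0
print(norm_imp.argmax())  # index of the most informative feature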
# %%
# Binarization of the tree information gain values
bins = [-0.1, 0, 0.1] # bins for the tree information gain values
cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True)
plt.title(f"Tree information gains by value ({split_criterion})")
cut_info_gains.value_counts().plot(kind='bar', color='purple')
plt.xticks(rotation=45, ha='right')
print(cut_info_gains.value_counts())
pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))
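To make the binning concrete, a small self-contained pd.cut sketch with the same bin edges (toy values):

import pandas as pd
toy = pd.Series({"feat_a": 0.0, "feat_b": 0.04, "feat_c": 0.0})
# right=True puts 0.0 into (-0.1, 0] -> 'IG=0' and 0.04 into (0, 0.1] -> 'IG>0'
print(pd.cut(toy, bins=[-0.1, 0, 0.1], labels=['IG=0', 'IG>0'], right=True).value_counts())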
# %%
# Plot the fitted decision tree graph
plt.figure(figsize=(12,12))
tree.plot_tree(clf,
feature_names = list(model_input.drop(columns=['target', 'pid']).columns),
class_names=True,
filled = True, fontsize=2, max_depth=10)
filled = True, fontsize=5, max_depth=3)
plt.savefig('tree_high_dpi', dpi=800)
# %%
print(model_input['target'])
# %% [markdown]
# Present the feature importance by correlation with target
corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
list(corrs.sort_values(ascending=False).index)
# corrs.sort_values(ascending=False)
# Binarization of the correlation values
bins = [0, 0.1, 0.2, 0.3] # bins for target's correlations with features
cut_corrs = pd.cut(corrs, bins=bins, labels=['very weak (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True)
plt.title("Target's correlations with features")
cut_corrs.value_counts().plot(kind='bar')
plt.xticks(rotation=45, ha='right')
print(cut_corrs.value_counts())
print(corrs[corrs > 0.1]) # or corrs < -0.1
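The per-feature score above is pandas' Pearson correlation of each numeric column with the 0/1 target (a point-biserial correlation); a minimal reproduction on toy data (hypothetical values):

import pandas as pd
toy_target = pd.Series([0, 1, 0, 1, 1])
toy_feature = pd.Series([1.0, 2.0, 1.5, 2.5, 2.2])
# Absolute correlation, as in the corrs computation above
print(abs(toy_feature.corr(toy_target.astype(int))))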
# %%

View File

@@ -58,7 +58,18 @@ print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(incl
# %%
# Add a prefix to the demographic features
demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile',
'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
new_names = [(col, "demo_"+col) for col in demo_features]
model_input.rename(columns=dict(new_names), inplace=True)
demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio',
'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M',
'demo_startlanguage_nl', 'demo_startlanguage_sl']
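As a quick check, the same prefixing pattern in isolation (a minimal sketch on a toy frame with hypothetical values):

import pandas as pd
df = pd.DataFrame({"age": [30, 41], "gender_F": [1, 0], "target": [0, 1]})
demo = ["age", "gender_F"]
# Same pattern as above: prepend "demo_" to the demographic columns only
df = df.rename(columns={col: "demo_" + col for col in demo})
print(df.columns.tolist())  # ['demo_age', 'demo_gender_F', 'target']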
# %%
# Get phone and non-phone columns
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
@@ -77,10 +88,16 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
"""
for fgroup_substr in groups_substrings:
if include_group:
feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
else:
feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
if fgroup_substr is None:
feature_group_cols = list(df.columns)
feature_group_cols.remove("pid")
feature_group_cols.remove("target")
else:
if include_group:
feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
else:
feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target']
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
@@ -103,47 +120,34 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
print("Precision", metrics.precision_score(y_test, y_pred))
print("Recall", metrics.recall_score(y_test, y_pred))
print("F1", metrics.f1_score(y_test, y_pred), "\n")
# %%
model_input
groups_substr = ["_", "phone_", "empatica_"]
# %% [markdown]
# ### Big sensor feature groups (phone, empatica, demographic)
groups_substr = [None, "phone_", "empatica_", "demo_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
# %%
# %% [markdown]
# ### Empatica sensor groups
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
# %%
# %% [markdown]
# ### Phone sensor groups
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_",
"phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
# %%
# Write out all the sensor feature groups (phone, empatica); separate other (demographic) cols also
sensors_features_groups = ["_", "phone_", "empatica_", "empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
                           "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", "phone_light_",
                           "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
# %%
# Create an empty list to store the feature column groups
feature_column_groups = []
# Iterate through each column in model_input
for column in model_input.columns:
    # Split the column name by '_'
    split_column = column.split('_')
    # Create a variable to store the prefix of the current column
    prefix = ''
    # Iterate through each part of the split column name
    for part in split_column:
        # Add the part to the prefix variable
        prefix += part + '_'
        # Check if the prefix is already in our feature column groups list
        if prefix not in feature_column_groups:
            # If not, add it to our list of feature column groups
            feature_column_groups.append(prefix)
# Print out all feature column group prefixes discovered in the columns
print(feature_column_groups)
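Run on a couple of toy column names, the prefix walk above produces (hypothetical columns):

toy_columns = ["phone_calls_count", "empatica_temperature_mean"]
groups = []
for column in toy_columns:
    prefix = ''
    for part in column.split('_'):
        prefix += part + '_'
        if prefix not in groups:
            groups.append(prefix)
print(groups)
# ['phone_', 'phone_calls_', 'phone_calls_count_', 'empatica_', 'empatica_temperature_', 'empatica_temperature_mean_']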