Expand analysis of the features (individually and by sensor groups).

ml_pipeline
Primoz 2023-01-23 16:32:07 +01:00
parent 6a98c8cdcf
commit 85e572fca0
2 changed files with 86 additions and 43 deletions

View File

@@ -26,6 +26,7 @@ import seaborn as sns
 from sklearn.tree import DecisionTreeClassifier
 from sklearn import tree
 from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split

 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
@@ -88,10 +89,12 @@ def get_information_gains(data, target_name):
     return information_gains

-def n_features_with_highest_info_gain(info_gain_dict, n=50):
+def n_features_with_highest_info_gain(info_gain_dict, n=None):
     """
     Get the n features with the highest information gain
     """
+    if n is None:
+        n = len(info_gain_dict)
     import heapq
     n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
     return {feature[0]: feature[1] for feature in n_largest}
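For reference, heapq.nlargest over dict.items() returns the n (key, value) pairs with the largest values, which is what the helper above relies on; a tiny sketch with a hypothetical gains dict:

    import heapq

    gains = {"f_a": 0.02, "f_b": 0.31, "f_c": 0.12}  # hypothetical feature -> info gain
    top2 = heapq.nlargest(2, gains.items(), key=lambda i: i[1])
    print(dict(top2))  # {'f_b': 0.31, 'f_c': 0.12}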
@@ -250,30 +253,66 @@ split_criterion = 'entropy'
print("Target impurity:", compute_impurity(model_input['target'], split_criterion)) print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
information_gains = get_information_gains_2(model_input, 'target', split_criterion) information_gains = get_information_gains_2(model_input, 'target', split_criterion)
print(pd.Series(information_gains).value_counts().sort_index(ascending=False)) print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
n_features_with_highest_info_gain(information_gains, n=19) n_features_with_highest_info_gain(information_gains)
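compute_impurity and get_information_gains_2 are defined outside this hunk; as a rough guide to what they compute, here is a minimal sketch of entropy impurity and per-feature information gain (the names match the calls above, but the repo's exact implementation may differ):

    import numpy as np

    def compute_impurity(series, criterion='entropy'):
        # Probability of each class, then Shannon entropy (or Gini)
        p = series.value_counts(normalize=True)
        if criterion == 'entropy':
            return -(p * np.log2(p)).sum()
        return 1 - (p ** 2).sum()  # 'gini'

    def information_gain(df, feature, target='target', criterion='entropy'):
        # IG = H(target) - sum_v P(feature = v) * H(target | feature = v)
        parent = compute_impurity(df[target], criterion)
        children = sum((len(sub) / len(df)) * compute_impurity(sub[target], criterion)
                       for _, sub in df.groupby(feature))
        return parent - children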
 # %%
+# Present the feature importance using a tree (with the chosen impurity criterion)
+split_criterion = 'entropy'
+print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
 X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
 imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 X = imputer.fit_transform(X)
-clf = DecisionTreeClassifier()
+X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
+clf = DecisionTreeClassifier(criterion=split_criterion)
 clf.fit(X, y)
 feat_importance = clf.tree_.compute_feature_importances(normalize=False)
-print("feat importance = " + str(feat_importance))
+print("feat importance = ", feat_importance)
+print("shape", feat_importance.shape)
+tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
+info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
+info_gains_dict[info_gains_dict > 0]
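Note that clf.tree_.compute_feature_importances(normalize=False) is the unnormalized counterpart of the public clf.feature_importances_ attribute, which is scaled to sum to 1; a quick consistency check (valid once the tree has made at least one split):

    import numpy as np

    raw = clf.tree_.compute_feature_importances(normalize=False)
    assert np.allclose(clf.feature_importances_, raw / raw.sum())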
+# %%
+# Binarize the tree information gain values
+bins = [-0.1, 0, 0.1]  # bins for the tree's information gain values
+cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True)
+plt.title(f"Tree information gains by value ({split_criterion})")
+cut_info_gains.value_counts().plot(kind='bar', color='purple')
+plt.xticks(rotation=45, ha='right')
+print(cut_info_gains.value_counts())
+pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))
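Since right=True, pd.cut builds right-closed intervals here, i.e. (-0.1, 0] maps to 'IG=0' and (0, 0.1] to 'IG>0' (gains above 0.1 would fall outside the bins and become NaN); a quick check:

    import pandas as pd

    s = pd.Series([0.0, 0.001, 0.05])
    print(pd.cut(s, bins=[-0.1, 0, 0.1], labels=['IG=0', 'IG>0'], right=True).tolist())
    # ['IG=0', 'IG>0', 'IG>0']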
+# %%
+# Plot the decision tree graph
 plt.figure(figsize=(12, 12))
 tree.plot_tree(clf,
                feature_names=list(model_input.drop(columns=['target', 'pid']).columns),
                class_names=True,
-               filled=True, fontsize=2, max_depth=10)
+               filled=True, fontsize=5, max_depth=3)
 plt.savefig('tree_high_dpi', dpi=800)
+# %%
+print(model_input['target'])
+
+# %% [markdown]
+# Present the feature importance by correlation with the target
 corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
-list(corrs.sort_values(ascending=False).index)
+# corrs.sort_values(ascending=False)
+
+# Binarize the correlation values
+bins = [0, 0.1, 0.2, 0.3]  # bins for target's correlations with features
+cut_corrs = pd.cut(corrs, bins=bins, labels=['very weak (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True)
+plt.title("Target's correlations with features")
+cut_corrs.value_counts().plot(kind='bar')
+plt.xticks(rotation=45, ha='right')
+print(cut_corrs.value_counts())
+print(corrs[corrs > 0.1])  # or corrs < -0.1
+
+# %%

 # %%
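Given the binary precision/recall metrics used later, target appears to be binary, so Pearson's r against the 0/1 target is the point-biserial correlation; an equivalent scipy check (assuming scipy is available; 'some_feature' is a placeholder column name):

    from scipy import stats

    sub = model_input[['some_feature', 'target']].dropna()
    r_pb, p = stats.pointbiserialr(sub['target'].astype(int), sub['some_feature'])
    # r_pb matches sub['some_feature'].corr(sub['target'].astype(int)) up to float error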

View File

@@ -58,7 +58,18 @@ print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(incl
 # %%
+# Add a prefix to the demographic features
+demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile',
+                 'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
+new_names = [(col, "demo_" + col) for col in demo_features]
+model_input.rename(columns=dict(new_names), inplace=True)
+demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio',
+                 'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M',
+                 'demo_startlanguage_nl', 'demo_startlanguage_sl']
+
+# %%
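The rename-with-dict pattern is used because DataFrame.add_prefix would prefix every column, not just the demographic ones; an equivalent one-liner over the original, unprefixed demo_features list:

    model_input = model_input.rename(columns={c: "demo_" + c for c in demo_features})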
 # Get phone and non-phone columns

 def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
@@ -77,11 +88,17 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
""" """
for fgroup_substr in groups_substrings: for fgroup_substr in groups_substrings:
if fgroup_substr is None:
feature_group_cols = list(df.columns)
feature_group_cols.remove("pid")
feature_group_cols.remove("target")
else:
if include_group: if include_group:
feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']] feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
else: else:
feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']] feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target'] X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target']
imputer = SimpleImputer(missing_values=np.nan, strategy='median') imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X) X = imputer.fit_transform(X)
@@ -103,47 +120,34 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
print("Precision", metrics.precision_score(y_test, y_pred)) print("Precision", metrics.precision_score(y_test, y_pred))
print("Recall", metrics.recall_score(y_test, y_pred)) print("Recall", metrics.recall_score(y_test, y_pred))
print("F1", metrics.f1_score(y_test, y_pred), "\n") print("F1", metrics.f1_score(y_test, y_pred), "\n")
-# %%
-model_input
-groups_substr = ["_", "phone_", "empatica_"]
+# %% [markdown]
+# ### Big sensor feature groups (phone, empatica, demographic)
+groups_substr = [None, "phone_", "empatica_", "demo_"]
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

-# %%
+# %% [markdown]
+# ### Empatica sensor groups
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
-groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
+e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
+make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)

-# %%
+# %% [markdown]
+# ### Phone sensor groups
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
-groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
+phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_",
+                 "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
+make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
-# %%
-# Create an empty list to store the feature column groups
-feature_column_groups = []
-# Iterate through each column in model_input
-for column in model_input.columns:
-    # Split the column name by '_'
-    split_column = column.split('_')
-    # Create a variable to store the prefix of the current column
-    prefix = ''
-    # Iterate through each part of the split column name
-    for part in split_column:
-        # Add the part to the prefix variable
-        prefix += part + '_'
-        # Check if the prefix is already in our feature column groups list
-        if prefix not in feature_column_groups:
-            # If not, add it to our list of feature column groups
-            feature_column_groups.append(prefix)
-# Print out all feature column prefixes found across the columns
-print(feature_column_groups)
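The removed prefix-collection loop can be written more compactly while preserving its behavior (every cumulative '_'-terminated prefix of every column, deduplicated in first-seen order); a sketch:

    from itertools import accumulate

    prefixes = (p + '_' for col in model_input.columns
                for p in accumulate(col.split('_'), lambda a, b: a + '_' + b))
    feature_column_groups = list(dict.fromkeys(prefixes))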
 # %%
 # Write out all the sensor groups (phone, empatica); keep the other (demographic) cols separate
+sensors_features_groups = ["_", "phone_", "empatica_", "empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                           "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", "phone_light_",
+                           "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
+print([col for col in model_input.columns if "phone_" not in col and "empatica_" not in col])
+
+# %%