Expand analysis of the features (individually and by sensor groups).
parent 6a98c8cdcf
commit 85e572fca0
@@ -26,6 +26,7 @@ import seaborn as sns
 from sklearn.tree import DecisionTreeClassifier
 from sklearn import tree
 from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split


 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
@@ -88,10 +89,12 @@ def get_information_gains(data, target_name):

     return information_gains

-def n_features_with_highest_info_gain(info_gain_dict, n=50):
+def n_features_with_highest_info_gain(info_gain_dict, n=None):
     """
     Get the n features with the highest information gain
     """
+    if n is None:
+        n = len(info_gain_dict)
     import heapq
     n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
     return {feature[0]: feature[1] for feature in n_largest}
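Note on this hunk: with the new default n=None the helper returns every feature ranked by information gain, instead of an arbitrary top 50. A minimal usage sketch (toy dictionary; the feature names are made up, not from the commit):

import heapq

def n_features_with_highest_info_gain(info_gain_dict, n=None):
    if n is None:
        n = len(info_gain_dict)
    n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
    return {feature[0]: feature[1] for feature in n_largest}

gains = {"phone_light_mean": 0.12, "demo_age": 0.05, "empatica_temperature_mean": 0.0}
print(n_features_with_highest_info_gain(gains))       # all three, highest first
print(n_features_with_highest_info_gain(gains, n=1))  # {'phone_light_mean': 0.12}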
@@ -250,30 +253,66 @@ split_criterion = 'entropy'
 print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
 information_gains = get_information_gains_2(model_input, 'target', split_criterion)
 print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
-n_features_with_highest_info_gain(information_gains, n=19)
+n_features_with_highest_info_gain(information_gains)

 # %%
+# Present the feature importance using a tree (that uses the entropy impurity measure)
+split_criterion = 'entropy'
+print("Target impurity:", compute_impurity(model_input['target'], split_criterion))

 X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
 imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 X = imputer.fit_transform(X)
+X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)

-clf = DecisionTreeClassifier()
+clf = DecisionTreeClassifier(criterion=split_criterion)
 clf.fit(X, y)

 feat_importance = clf.tree_.compute_feature_importances(normalize=False)
-print("feat importance = " + str(feat_importance))
+print("feat importance = ", feat_importance)
+print("shape", feat_importance.shape)
+tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
+info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
+info_gains_dict[info_gains_dict > 0]

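Note on compute_feature_importances(normalize=False): it returns, per feature, the impurity decrease weighted by the share of samples reaching the splitting node, summed over that feature's splits; with criterion='entropy' this is an information-gain-style score. A minimal sketch on toy data (names hypothetical, not from the commit):

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(19)
toy = pd.DataFrame({"feat_a": rng.normal(size=100), "feat_b": rng.normal(size=100)})
toy_target = (toy["feat_a"] > 0).astype(int)  # feat_a alone determines the target

clf_toy = DecisionTreeClassifier(criterion="entropy", random_state=19)
clf_toy.fit(toy, toy_target)

# Unnormalized, weighted impurity decreases, keyed by column name as the commit does.
imp = clf_toy.tree_.compute_feature_importances(normalize=False)
print(dict(zip(toy.columns, imp)))  # feat_a should carry essentially all importance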
+# %%
+# Binarization of the tree information gain values
+bins = [-0.1, 0, 0.1]  # bins for the tree-based information gains
+cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True)
+plt.title(f"Tree information gains by value ({split_criterion})")
+cut_info_gains.value_counts().plot(kind='bar', color='purple')
+plt.xticks(rotation=45, ha='right')
+print(cut_info_gains.value_counts())


+pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))

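Note on the bin edges: with right=True, pd.cut assigns a gain of exactly 0 to the (-0.1, 0] bin ('IG=0') and any positive gain up to 0.1 to (0, 0.1] ('IG>0'). A quick check:

import pandas as pd

s = pd.Series([0.0, 0.003, 0.07])
print(pd.cut(s, bins=[-0.1, 0, 0.1], labels=['IG=0', 'IG>0'], right=True))
# 0.0 -> IG=0; 0.003 and 0.07 -> IG>0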
+# %%
+# Plot feature importance tree graph
 plt.figure(figsize=(12,12))
 tree.plot_tree(clf,
                feature_names = list(model_input.drop(columns=['target', 'pid']).columns),
                class_names=True,
-               filled = True, fontsize=2, max_depth=10)
+               filled = True, fontsize=5, max_depth=3)

 plt.savefig('tree_high_dpi', dpi=800)
-# %%
-print(model_input['target'])
+# %% [markdown]
+# Present the feature importance by correlation with target

 corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
-list(corrs.sort_values(ascending=False).index)
+# corrs.sort_values(ascending=False)

+# Binarization of the correlation values
+bins = [0, 0.1, 0.2, 0.3]  # bins for target's correlations with features
+cut_corrs = pd.cut(corrs, bins=bins, labels=['very weak (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True)
+plt.title("Target's correlations with features")
+cut_corrs.value_counts().plot(kind='bar')
+plt.xticks(rotation=45, ha='right')
+print(cut_corrs.value_counts())
+print(corrs[corrs > 0.1])  # or corrs < -0.1

+# %%

 # %%

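Note on the correlation scan: each feature column is correlated with the binary target via Series.corr (Pearson by default) and the absolute value is kept, so sign is discarded before binning. A minimal sketch on a toy frame (column names hypothetical):

import pandas as pd

toy = pd.DataFrame({
    "target": [0, 1, 0, 1, 1, 0],
    "pid": [1, 2, 3, 4, 5, 6],
    "phone_light_mean": [0.1, 0.9, 0.2, 0.8, 0.7, 0.3],
    "empatica_temperature_mean": [36.5, 36.6, 36.4, 36.7, 36.5, 36.6],
})
corrs = abs(toy.drop(columns=["target", "pid"]).apply(lambda x: x.corr(toy["target"].astype(int))))
print(corrs.sort_values(ascending=False))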
@@ -58,7 +58,18 @@ print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(incl


 # %%
+# Add prefix to demographic features
+demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile',
+                 'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
+
+new_names = [(col, "demo_"+col) for col in demo_features]
+model_input.rename(columns=dict(new_names), inplace=True)
+
+demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio',
+                 'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M',
+                 'demo_startlanguage_nl', 'demo_startlanguage_sl']

+# %%
 # Get phone and non-phone columns

 def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
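Note on the renaming step: it builds an {old: new} mapping and renames in place, so every demographic column later matches the "demo_" substring used for group selection. A toy sketch:

import pandas as pd

toy = pd.DataFrame({"age": [30], "gender_F": [1], "phone_light_mean": [0.2]})
demo = ["age", "gender_F"]
toy = toy.rename(columns={c: "demo_" + c for c in demo})
print(list(toy.columns))  # ['demo_age', 'demo_gender_F', 'phone_light_mean']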
@@ -77,11 +88,17 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru

     """
     for fgroup_substr in groups_substrings:
-        if include_group:
-            feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
-        else:
-            feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
+        if fgroup_substr is None:
+            feature_group_cols = list(df.columns)
+            feature_group_cols.remove("pid")
+            feature_group_cols.remove("target")
+        else:
+            if include_group:
+                feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
+            else:
+                feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]

         X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target']
         imputer = SimpleImputer(missing_values=np.nan, strategy='median')
         X = imputer.fit_transform(X)
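Note on the new None branch: passing None selects every feature (minus 'pid'/'target'), giving a full-feature baseline, while a substring with include_group=False drops just that group; together the later cells form a leave-one-group-out comparison. A sketch of the selection rule on toy column names (the helper is illustrative, not from the commit):

cols = ["pid", "target", "phone_light_mean", "empatica_temperature_mean", "demo_age"]

def select(cols, substr, include_group=True):
    if substr is None:
        return [c for c in cols if c not in ("pid", "target")]
    if include_group:
        return [c for c in cols if substr in c and c not in ("pid", "target")]
    return [c for c in cols if substr not in c and c not in ("pid", "target")]

print(select(cols, None))             # all features
print(select(cols, "phone_"))         # only the phone group
print(select(cols, "phone_", False))  # everything but the phone group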
@@ -103,47 +120,34 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru
         print("Precision", metrics.precision_score(y_test, y_pred))
         print("Recall", metrics.recall_score(y_test, y_pred))
         print("F1", metrics.f1_score(y_test, y_pred), "\n")

-# %%
-model_input
-groups_substr = ["_", "phone_", "empatica_"]
+# %% [markdown]
+# ### Sensor big feature groups (phone, empatica, demographic)
+groups_substr = [None, "phone_", "empatica_", "demo_"]
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

-# %%
+# %% [markdown]
+# ### Empatica sensor groups
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
-groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
+e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
+make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)

-# %%
+# %% [markdown]
+# ### Phone sensor groups
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
-groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
-make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)
+phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_",
+                 "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
+make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)

-# %%
-# Create an empty list to store the feature column groups
-feature_column_groups = []
-
-# Iterate through each column in model_input
-for column in model_input.columns:
-
-    # Split the column name by '_'
-    split_column = column.split('_')
-
-    # Create a variable to store the prefix of the current column
-    prefix = ''
-
-    # Iterate through each part of the split column name
-    for part in split_column:
-
-        # Add the part to the prefix variable
-        prefix += part + '_'
-
-        # Check if the prefix is already in our feature column groups list
-        if prefix not in feature_column_groups:
-
-            # If not, add it to our list of feature column groups
-            feature_column_groups.append(prefix)
-
-# Print out all feature column prefixes found in the columns list
-print(feature_column_groups)
-
 # %%
 # Write all the sensors (phone, empatica); separate other (demographic) cols also

+sensors_features_groups = ["_", "phone_", "empatica_", "empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
+                           "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", "phone_light_",
+                           "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
+
+print([col for col in model_input.columns if "phone_" not in col and "empatica_" not in col])

+# %%
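Note on the last print: the filter needs 'and' rather than 'or' (an 'or' over the two negated substring tests matches every column); with 'and' it keeps only columns belonging to neither sensor family, i.e. the demographic/other columns. A toy check:

cols = ["demo_age", "phone_light_mean", "empatica_temperature_mean", "target"]
print([c for c in cols if "phone_" not in c and "empatica_" not in c])
# ['demo_age', 'target']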