diff --git a/exploration/expl_features_analysis.py b/exploration/expl_features_analysis.py index 55f5358..056e54c 100644 --- a/exploration/expl_features_analysis.py +++ b/exploration/expl_features_analysis.py @@ -26,6 +26,7 @@ import seaborn as sns from sklearn.tree import DecisionTreeClassifier from sklearn import tree from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}} @@ -88,10 +89,12 @@ def get_information_gains(data, target_name): return information_gains -def n_features_with_highest_info_gain(info_gain_dict, n=50): +def n_features_with_highest_info_gain(info_gain_dict, n=None): """ Get n-features that have highest information gain """ + if n is None: + n = len(info_gain_dict) import heapq n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1]) return {feature[0]: feature[1] for feature in n_largest} @@ -250,30 +253,66 @@ split_criterion = 'entropy' print("Target impurity:", compute_impurity(model_input['target'], split_criterion)) information_gains = get_information_gains_2(model_input, 'target', split_criterion) print(pd.Series(information_gains).value_counts().sort_index(ascending=False)) -n_features_with_highest_info_gain(information_gains, n=19) +n_features_with_highest_info_gain(information_gains) # %% +# Present the feature importance using a decision tree (split criterion set below: entropy impurity) +split_criterion = 'entropy' +print("Target impurity:", compute_impurity(model_input['target'], split_criterion)) X, y = model_input.drop(columns=['target', 'pid']), model_input['target'] imputer = SimpleImputer(missing_values=np.nan, strategy='median') X = imputer.fit_transform(X) +X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25) -clf = DecisionTreeClassifier() + +clf = DecisionTreeClassifier(criterion=split_criterion) clf.fit(X, y) feat_importance = 
clf.tree_.compute_feature_importances(normalize=False) -print("feat importance = " + str(feat_importance)) +print("feat importance = ", feat_importance) +print("shape", feat_importance.shape) +tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist())) +info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp)) +info_gains_dict[info_gains_dict > 0] +# %% +# Binarize the tree information gain values (zero vs. positive) +bins = [-0.1, 0, np.inf] # (-0.1, 0] -> 'IG=0', (0, inf] -> 'IG>0'; open upper edge so gains above 0.1 are not turned into NaN +cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True) +plt.title(f"Tree information gains by value ({split_criterion})") +cut_info_gains.value_counts().plot(kind='bar', color='purple') +plt.xticks(rotation=45, ha='right') +print(cut_info_gains.value_counts()) + + +pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20)) + +# %% +# Plot feature importance tree graph plt.figure(figsize=(12,12)) tree.plot_tree(clf, feature_names = list(model_input.drop(columns=['target', 'pid']).columns), class_names=True, - filled = True, fontsize=2, max_depth=10) + filled = True, fontsize=5, max_depth=3) plt.savefig('tree_high_dpi', dpi=800) -# %% -print(model_input['target']) + + +# %% [markdown] +# Present the feature importance by correlation with target + corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int)))) -list(corrs.sort_values(ascending=False).index) +# corrs.sort_values(ascending=False) + +# Binarize the absolute correlation values +bins = [0, 0.1, 0.2, 0.3] # bins for target's correlations with features; values above 0.3 fall outside the bins and are not counted +cut_corrs = pd.cut(corrs, bins=bins, labels=['very weak (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True, include_lowest=True) +plt.title("Target's correlations with features") +cut_corrs.value_counts().plot(kind='bar') +plt.xticks(rotation=45, ha='right') +print(cut_corrs.value_counts()) +print(corrs[corrs > 0.1]) # corrs holds absolute values, so this covers both signs +# %% # %% diff 
--git a/exploration/expl_features_groups_analysis.py b/exploration/expl_features_groups_analysis.py index 3cf0d6a..bc46533 100644 --- a/exploration/expl_features_groups_analysis.py +++ b/exploration/expl_features_groups_analysis.py @@ -58,7 +58,18 @@ print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(incl # %% +# Add the "demo_" prefix to the demographic feature columns +demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile', + 'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl'] +new_names = [(col, "demo_"+col) for col in demo_features] +model_input.rename(columns=dict(new_names), inplace=True) + +demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio', + 'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M', # NOTE(review): 'target' is listed among the demo_ columns - confirm this is intended + 'demo_startlanguage_nl', 'demo_startlanguage_sl'] + +# %% # Get phone and non-phone columns def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True): @@ -77,10 +88,16 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru """ for fgroup_substr in groups_substrings: - if include_group: - feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']] - else: - feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']] + if fgroup_substr is None: # None selects every column except pid/target, whatever include_group says + feature_group_cols = list(df.columns) + feature_group_cols.remove("pid") + feature_group_cols.remove("target") + else: + if include_group: + feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']] + else: + feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']] + X, y = df.drop(columns=['target', 'pid'])[feature_group_cols], df['target'] imputer = 
SimpleImputer(missing_values=np.nan, strategy='median') @@ -103,47 +120,34 @@ def make_predictions_with_sensor_groups(df, groups_substrings, include_group=Tru print("Precision", metrics.precision_score(y_test, y_pred)) print("Recall", metrics.recall_score(y_test, y_pred)) print("F1", metrics.f1_score(y_test, y_pred), "\n") -# %% -model_input -groups_substr = ["_", "phone_", "empatica_"] + +# %% [markdown] +# ### Sensor big feature groups (phone, empatica, demographic) +groups_substr = [None, "phone_", "empatica_", "demo_"] make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False) -# %% +# %% [markdown] +# ### Empatica sensor groups make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True) -groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"] -make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False) +e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"] +make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False) -# %% +# %% [markdown] +# ### Phone sensor groups make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True) -groups_substr = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"] -make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False) +phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", + "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"] +make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False) # %% -# Create an empty list to 
store the feature column groups -feature_column_groups = [] +# Write all the sensors (phone, empatica), separate other (demographic) cols also -# Iterate through each column in model_input -for column in model_input.columns: +sensors_features_groups = ["_", "phone_", "empatica_", "empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_", + "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", "phone_data_yield_", "phone_light_", + "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"] - # Split the column name by '_' - split_column = column.split('_') - # Create a variable to store the prefix of the current column - prefix = '' - # Iterate through each part of the split column name - for part in split_column: +print([col for col in model_input.columns if "phone_" not in col and "empatica_" not in col]) # columns that belong to neither sensor family - # Add the part to the prefix variable - prefix += part + '_' - - # Check if the prefix is already in our feature column groups list - if prefix not in feature_column_groups: - - # If not, add it to our list of feature columns groups - feature_column_groups.append(prefix) - -# Print out all possible feature columns groups that contain more than one entry in a columns list -print(feature_column_groups) # %% -# Write all the sensors (phone, empatica), seperate other (demographical) cols also \ No newline at end of file