Merge remote-tracking branch 'origin/ml_pipeline' into ml_pipeline
commit
633d029dc2
Binary file not shown.
|
@ -0,0 +1,318 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
# %matplotlib inline
|
||||
|
||||
import os, sys, math
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn import tree
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
def calc_entropy(column):
|
||||
"""
|
||||
Calculate entropy given a pandas series, list, or numpy array.
|
||||
"""
|
||||
# Compute the counts of each unique value in the column
|
||||
counts = np.bincount(column)
|
||||
# Divide by the total column length to get a probability
|
||||
probabilities = counts / len(column)
|
||||
|
||||
# Initialize the entropy to 0
|
||||
entropy = 0
|
||||
# Loop through the probabilities, and add each one to the total entropy
|
||||
for prob in probabilities:
|
||||
if prob > 0:
|
||||
# use log from math and set base to 2
|
||||
entropy += prob * math.log(prob, 2)
|
||||
|
||||
return -entropy
|
||||
|
||||
|
||||
def calc_information_gain(data, split_name, target_name):
|
||||
"""
|
||||
Calculate information gain given a data set, column to split on, and target
|
||||
"""
|
||||
# Calculate the original entropy
|
||||
original_entropy = calc_entropy(data[target_name])
|
||||
#Find the unique values in the column
|
||||
values = data[split_name].unique()
|
||||
|
||||
# Make two subsets of the data, based on the unique values
|
||||
left_split = data[data[split_name] == values[0]]
|
||||
right_split = data[data[split_name] == values[1]]
|
||||
|
||||
# Loop through the splits and calculate the subset entropies
|
||||
to_subtract = 0
|
||||
for subset in [left_split, right_split]:
|
||||
prob = (subset.shape[0] / data.shape[0])
|
||||
to_subtract += prob * calc_entropy(subset[target_name])
|
||||
|
||||
# Return information gain
|
||||
return original_entropy - to_subtract
|
||||
|
||||
|
||||
def get_information_gains(data, target_name):
|
||||
#Intialize an empty dictionary for information gains
|
||||
information_gains = {}
|
||||
|
||||
#Iterate through each column name in our list
|
||||
for col in list(data.columns):
|
||||
#Find the information gain for the column
|
||||
information_gain = calc_information_gain(data, col, target_name)
|
||||
#Add the information gain to our dictionary using the column name as the ekey
|
||||
information_gains[col] = information_gain
|
||||
|
||||
#Return the key with the highest value
|
||||
#return max(information_gains, key=information_gains.get)
|
||||
|
||||
return information_gains
|
||||
|
||||
def n_features_with_highest_info_gain(info_gain_dict, n=None):
|
||||
"""
|
||||
Get n-features that have highest information gain
|
||||
"""
|
||||
if n is None:
|
||||
n = len(info_gain_dict)
|
||||
import heapq
|
||||
n_largest = heapq.nlargest(n, info_gain_dict.items(), key=lambda i: i[1])
|
||||
return {feature[0]: feature[1] for feature in n_largest}
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
|
||||
|
||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
categorical_features = model_input[categorical_feature_colnames].copy()
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
|
||||
model_input = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
|
||||
# Binarizacija targeta
|
||||
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
|
||||
print(model_input['target'].value_counts(), edges)
|
||||
|
||||
# %%
|
||||
info_gains = get_information_gains(model_input, 'target')
|
||||
|
||||
# %% [markdown]
|
||||
# Present the feature importance results
|
||||
|
||||
# %%
|
||||
print("Total columns:", len(info_gains))
|
||||
print(pd.Series(info_gains).value_counts())
|
||||
|
||||
n_features_with_highest_info_gain(info_gains, n=189)
|
||||
|
||||
# %%
|
||||
def compute_impurity(feature, impurity_criterion):
|
||||
"""
|
||||
This function calculates impurity of a feature.
|
||||
Supported impurity criteria: 'entropy', 'gini'
|
||||
input: feature (this needs to be a Pandas series)
|
||||
output: feature impurity
|
||||
"""
|
||||
probs = feature.value_counts(normalize=True)
|
||||
|
||||
if impurity_criterion == 'entropy':
|
||||
impurity = -1 * np.sum(np.log2(probs) * probs)
|
||||
elif impurity_criterion == 'gini':
|
||||
impurity = 1 - np.sum(np.square(probs))
|
||||
else:
|
||||
raise ValueError('Unknown impurity criterion')
|
||||
|
||||
return impurity
|
||||
|
||||
|
||||
def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
|
||||
"""
|
||||
This function calculates information gain for splitting on
|
||||
a particular descriptive feature for a given dataset
|
||||
and a given impurity criteria.
|
||||
Supported split criterion: 'entropy', 'gini'
|
||||
"""
|
||||
if print_flag:
|
||||
print('target feature:', target)
|
||||
print('descriptive_feature:', descriptive_feature)
|
||||
print('split criterion:', split_criterion)
|
||||
|
||||
target_entropy = compute_impurity(df[target], split_criterion)
|
||||
|
||||
# we define two lists below:
|
||||
# entropy_list to store the entropy of each partition
|
||||
# weight_list to store the relative number of observations in each partition
|
||||
entropy_list = list()
|
||||
weight_list = list()
|
||||
|
||||
# loop over each level of the descriptive feature
|
||||
# to partition the dataset with respect to that level
|
||||
# and compute the entropy and the weight of the level's partition
|
||||
for level in df[descriptive_feature].unique():
|
||||
df_feature_level = df[df[descriptive_feature] == level]
|
||||
entropy_level = compute_impurity(df_feature_level[target], split_criterion)
|
||||
entropy_list.append(round(entropy_level, 3))
|
||||
weight_level = len(df_feature_level) / len(df)
|
||||
weight_list.append(round(weight_level, 3))
|
||||
|
||||
# print('impurity of partitions:', entropy_list)
|
||||
# print('weights of partitions:', weight_list)
|
||||
|
||||
feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
|
||||
|
||||
information_gain = target_entropy - feature_remaining_impurity
|
||||
|
||||
if print_flag:
|
||||
print('impurity of partitions:', entropy_list)
|
||||
print('weights of partitions:', weight_list)
|
||||
print('remaining impurity:', feature_remaining_impurity)
|
||||
print('information gain:', information_gain)
|
||||
print('====================')
|
||||
|
||||
return information_gain
|
||||
|
||||
|
||||
def calc_information_gain_2(data, split_name, target_name, split_criterion):
|
||||
"""
|
||||
Calculate information gain given a data set, column to split on, and target
|
||||
"""
|
||||
# Calculate the original impurity
|
||||
original_impurity = compute_impurity(data[target_name], split_criterion)
|
||||
#Find the unique values in the column
|
||||
values = data[split_name].unique()
|
||||
|
||||
# Make two subsets of the data, based on the unique values
|
||||
left_split = data[data[split_name] == values[0]]
|
||||
right_split = data[data[split_name] == values[1]]
|
||||
|
||||
# Loop through the splits and calculate the subset impurities
|
||||
to_subtract = 0
|
||||
for subset in [left_split, right_split]:
|
||||
prob = (subset.shape[0] / data.shape[0])
|
||||
to_subtract += prob * compute_impurity(subset[target_name], split_criterion)
|
||||
|
||||
# Return information gain
|
||||
return original_impurity - to_subtract
|
||||
|
||||
|
||||
def get_information_gains_2(data, target_name, split_criterion):
|
||||
#Intialize an empty dictionary for information gains
|
||||
information_gains = {}
|
||||
|
||||
#Iterate through each column name in our list
|
||||
for feature in list(data.columns):
|
||||
#Find the information gain for the column
|
||||
information_gain = calc_information_gain_2(model_input, target_name, feature, split_criterion)
|
||||
#Add the information gain to our dictionary using the column name as the ekey
|
||||
information_gains[feature] = information_gain
|
||||
|
||||
#Return the key with the highest value
|
||||
#return max(information_gains, key=information_gains.get)
|
||||
|
||||
return information_gains
|
||||
|
||||
# %% [markdown]
|
||||
# Present the feature importance results from other methods
|
||||
|
||||
# %%
|
||||
split_criterion = 'entropy'
|
||||
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
|
||||
information_gains = get_information_gains_2(model_input, 'target', split_criterion)
|
||||
print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
|
||||
n_features_with_highest_info_gain(information_gains)
|
||||
|
||||
# %%
|
||||
# Present the feature importance using a tree (that uses gini imputity measure)
|
||||
split_criterion = 'entropy'
|
||||
print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
|
||||
|
||||
X, y = model_input.drop(columns=['target', 'pid']), model_input['target']
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||
X = imputer.fit_transform(X)
|
||||
X, _, y, _ = train_test_split(X, y, random_state=19, test_size=0.25)
|
||||
|
||||
|
||||
clf = DecisionTreeClassifier(criterion=split_criterion)
|
||||
clf.fit(X, y)
|
||||
|
||||
feat_importance = clf.tree_.compute_feature_importances(normalize=False)
|
||||
print("feat importance = ", feat_importance)
|
||||
print("shape", feat_importance.shape)
|
||||
tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
|
||||
info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
|
||||
info_gains_dict[info_gains_dict > 0]
|
||||
|
||||
# %%
|
||||
# Binarizacija vrednosti tree Information Gain-a
|
||||
bins = [-0.1, 0, 0.1] # bins for target's correlations with features
|
||||
cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True)
|
||||
plt.title(f"Tree information gains by value ({split_criterion})")
|
||||
cut_info_gains.value_counts().plot(kind='bar', color='purple')
|
||||
plt.xticks(rotation=45, ha='right')
|
||||
print(cut_info_gains.value_counts())
|
||||
|
||||
|
||||
pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))
|
||||
|
||||
# %%
|
||||
# Plot feature importance tree graph
|
||||
plt.figure(figsize=(12,12))
|
||||
tree.plot_tree(clf,
|
||||
feature_names = list(model_input.drop(columns=['target', 'pid']).columns),
|
||||
class_names=True,
|
||||
filled = True, fontsize=5, max_depth=3)
|
||||
|
||||
plt.savefig('tree_high_dpi', dpi=800)
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
# Present the feature importance by correlation with target
|
||||
|
||||
corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
|
||||
# corrs.sort_values(ascending=False)
|
||||
|
||||
# Binarizacija vrednosti korelacij
|
||||
bins = [0, 0.1, 0.2, 0.3] # bins for target's correlations with features
|
||||
cut_corrs = pd.cut(corrs, bins=bins, labels=['very week (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True)
|
||||
plt.title("Target's correlations with features")
|
||||
cut_corrs.value_counts().plot(kind='bar')
|
||||
plt.xticks(rotation=45, ha='right')
|
||||
print(cut_corrs.value_counts())
|
||||
print(corrs[corrs > 0.1]) # or corrs < -0.1])
|
||||
# %%
|
||||
|
||||
# %%
|
|
@ -0,0 +1,328 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
# %matplotlib inline
|
||||
|
||||
import os, sys, math
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
|
||||
from sklearn import metrics
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
|
||||
|
||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||
additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
categorical_features = model_input[categorical_feature_colnames].copy()
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
|
||||
model_input = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
|
||||
# Binarizacija targeta
|
||||
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True)
|
||||
|
||||
print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
|
||||
print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
|
||||
|
||||
|
||||
# %%
|
||||
# Add prefix to demographical features
|
||||
demo_features = ['age', 'limesurvey_demand', 'limesurvey_control', 'limesurvey_demand_control_ratio', 'limesurvey_demand_control_ratio_quartile',
|
||||
'gender_F', 'gender_M', 'startlanguage_nl', 'startlanguage_sl']
|
||||
|
||||
new_names = [(col, "demo_"+col) for col in demo_features]
|
||||
model_input.rename(columns=dict(new_names), inplace=True)
|
||||
|
||||
demo_features = ['demo_age', 'demo_limesurvey_demand', 'demo_limesurvey_control', 'demo_limesurvey_demand_control_ratio',
|
||||
'demo_limesurvey_demand_control_ratio_quartile', 'target', 'demo_gender_F', 'demo_gender_M',
|
||||
'demo_startlanguage_nl', 'demo_startlanguage_sl']
|
||||
|
||||
# %%
|
||||
# Get phone and non-phone columns
|
||||
import warnings
|
||||
|
||||
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
|
||||
"""
|
||||
This function makes predictions with sensor groups.
|
||||
It takes in a dataframe (df), a list of group substrings (groups_substrings)
|
||||
and an optional parameter include_group (default is True).
|
||||
It creates a list of columns in the dataframe that contain the group substrings,
|
||||
while excluding the 'pid' and 'target' columns. It then splits the data into training
|
||||
and test sets, using a test size of 0.25 for the first split and 0.2 for the second split.
|
||||
A SimpleImputer is used to fill in missing values with median values.
|
||||
A LogisticRegression is then used to fit the training set and make predictions
|
||||
on the test set. Finally, accuracy, precision, recall and F1 scores are printed
|
||||
for each substring group depending on whether or not include_group
|
||||
is set to True or False.
|
||||
|
||||
"""
|
||||
|
||||
best_sensor = None
|
||||
best_recall_score, best_f1_score = None, None
|
||||
|
||||
for fgroup_substr in groups_substrings:
|
||||
if fgroup_substr is None:
|
||||
feature_group_cols = list(df.columns)
|
||||
feature_group_cols.remove("pid")
|
||||
feature_group_cols.remove("target")
|
||||
else:
|
||||
if include_group:
|
||||
feature_group_cols = [col for col in df.columns if fgroup_substr in col and col not in ['pid', 'target']]
|
||||
else:
|
||||
feature_group_cols = [col for col in df.columns if fgroup_substr not in col and col not in ['pid', 'target']]
|
||||
|
||||
|
||||
X, y = df.drop(columns=['target', 'pid'])[feature_group_cols+with_cols], df['target']
|
||||
X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
|
||||
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||
|
||||
nb = GaussianNB()
|
||||
model_cv = cross_validate(
|
||||
nb,
|
||||
X=imputer.fit_transform(X),
|
||||
y=y,
|
||||
cv=StratifiedKFold(n_splits=5, shuffle=True),
|
||||
n_jobs=-1,
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)
|
||||
|
||||
|
||||
if print_flag:
|
||||
if include_group:
|
||||
print("\nPrediction with", fgroup_substr)
|
||||
else:
|
||||
print("\nPrediction without", fgroup_substr)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
|
||||
|
||||
acc = np.mean(model_cv['test_accuracy'])
|
||||
acc_std = np.std(model_cv['test_accuracy'])
|
||||
|
||||
prec = np.mean(model_cv['test_precision'])
|
||||
prec_std = np.std(model_cv['test_precision'])
|
||||
|
||||
rec = np.mean(model_cv['test_recall'])
|
||||
rec_std = np.std(model_cv['test_recall'])
|
||||
|
||||
f1 = np.mean(model_cv['test_f1'])
|
||||
f1_std = np.std(model_cv['test_f1'])
|
||||
|
||||
if print_flag:
|
||||
print("************************************************")
|
||||
print(f"Accuracy: {acc} (sd={acc_std})")
|
||||
print(f"Precison: {prec} (sd={prec_std})")
|
||||
print(f"Recall: {rec} (sd={rec_std})")
|
||||
print(f"F1: {f1} (sd={f1_std})\n")
|
||||
|
||||
if (not best_recall_score and not best_f1_score) or (rec > best_recall_score):
|
||||
best_sensor = fgroup_substr
|
||||
best_recall_score, best_f1_score = rec, f1
|
||||
best_recall_score_std, best_f1_score_std = rec_std, f1_std
|
||||
|
||||
return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std
|
||||
|
||||
# %% [markdown]
|
||||
# ### sensor big feature groups (phone, empatica, demographical)
|
||||
big_groups_substr = ["phone_", "empatica_", "demo_"]
|
||||
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Empatica sezor groups
|
||||
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
|
||||
# e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
|
||||
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Phone sensor groups
|
||||
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
|
||||
# phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_",
|
||||
# "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
|
||||
# make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
|
||||
|
||||
# %%
|
||||
# Write all the sensors (phone, empatica), seperate other (demographical) cols also
|
||||
|
||||
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
|
||||
"phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
|
||||
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
|
||||
# %%
|
||||
def find_sensor_group_features_importance(model_input, sensor_groups_strings):
|
||||
"""
|
||||
This function finds the importance of sensor groups for a given model input. It takes two parameters:
|
||||
model_input and sensor_groups_strings. It creates an empty list called sensor_importance_scores,
|
||||
which will be populated with tuples containing the best sensor, its recall score, and its F1 score.
|
||||
It then makes a copy of the model input and the sensor groups strings. It then loops through each group
|
||||
in the list of strings, creating a list of important columns from the sensor importance scores list.
|
||||
It then calls make_predictions_with_sensor_groups to determine the best sensor, its recall score,
|
||||
and its F1 score. These values are added to the sensor importance scores list as a tuple. The function
|
||||
then removes that best sensor from the list of strings before looping again until all groups have been evaluated.
|
||||
Finally, it returns the populated list of tuples containing all sensors' scores.
|
||||
"""
|
||||
sensor_importance_scores = []
|
||||
model_input = model_input.copy()
|
||||
sensor_groups_strings = sensor_groups_strings.copy()
|
||||
groups_len = len(sensor_groups_strings)
|
||||
for i in range(groups_len):
|
||||
important_cols = [col[0] for col in sensor_importance_scores]
|
||||
with_cols = [col for col in model_input.columns if any(col.startswith(y) for y in important_cols)]
|
||||
|
||||
|
||||
best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std = \
|
||||
make_predictions_with_sensor_groups(model_input,
|
||||
groups_substrings=sensor_groups_strings, include_group=True,
|
||||
with_cols=with_cols)
|
||||
sensor_importance_scores.append((best_sensor, best_recall_score, best_f1_sore, best_recall_score_std, best_f1_score_std ))
|
||||
print(f"\nAdded sensor: {best_sensor}\n")
|
||||
sensor_groups_strings.remove(best_sensor)
|
||||
|
||||
return sensor_importance_scores
|
||||
|
||||
|
||||
# %%
|
||||
# Method for sorting list of tuples into 3 lists
|
||||
def sort_tuples_to_lists(list_of_tuples):
|
||||
"""
|
||||
sort_tuples_to_lists(list_of_tuples) is a method that takes in a list of tuples as an argument
|
||||
and sorts them into three separate lists. The first list, xs, contains the first element
|
||||
of each tuple. The second list, yrecall, contains the second element of each tuple rounded
|
||||
to 4 decimal places. The third list, y_fscore, contains the third element of each tuple
|
||||
rounded to 4 decimal places. The method returns all three lists.
|
||||
"""
|
||||
xs, y_recall, y_fscore, recall_std, fscore_std = [], [], [], [], []
|
||||
for a_tuple in list_of_tuples:
|
||||
xs.append(a_tuple[0])
|
||||
y_recall.append(round(a_tuple[1], 4))
|
||||
y_fscore.append(round(a_tuple[2], 4))
|
||||
recall_std.append(round(a_tuple[3], 4))
|
||||
fscore_std.append(round(a_tuple[4], 4))
|
||||
return xs, y_recall, y_fscore, recall_std, fscore_std
|
||||
|
||||
def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
|
||||
title="Sequential addition of features and its F1, and recall scores"):
|
||||
"""
|
||||
This function plots the sequential progress of feature addition scores using two subplots.
|
||||
The first subplot is for recall scores and the second subplot is for F1-scores.
|
||||
The parameters xs, yrecall, and yfscore are used to plot the data on the respective axes.
|
||||
The title of the plot can be specified by the user using the parameter title.
|
||||
The maximum recall index and maximum F1-score index are also plotted using a black dot.
|
||||
The figure size is set to 18.5 inches in width and 10.5 inches in height,
|
||||
and the x-axis labels are rotated by 90 degrees. Finally, the plot is displayed
|
||||
using plt.show().
|
||||
"""
|
||||
|
||||
fig, ax = plt.subplots(nrows=2, sharex=True)
|
||||
ax[0].plot(xs, np.array(y_recall)+np.array(recall_std), linestyle=":", color='m') # Upper SD
|
||||
ax[0].plot(xs, y_recall, color='red')
|
||||
ax[0].plot(xs, np.array(y_recall)-np.array(recall_std), linestyle=":", color='m') # Lower SD
|
||||
mrec_indx = np.argmax(y_recall)
|
||||
ax[0].plot(xs[mrec_indx], y_recall[mrec_indx], "-o", color='black')
|
||||
ax[0].legend(["Upper std", "Mean Recall", "Lower std"])
|
||||
|
||||
ax[1].plot(xs, np.array(y_fscore)+np.array(fscore_std), linestyle=":", color='c') # Upper SD
|
||||
ax[1].plot(xs, y_fscore)
|
||||
ax[1].plot(xs, np.array(y_fscore)-np.array(fscore_std), linestyle=":", color='c') # Lower SD
|
||||
mfscore_indx = np.argmax(y_fscore)
|
||||
ax[1].plot(xs[mfscore_indx], y_fscore[mfscore_indx], "-o", color='black')
|
||||
ax[1].legend(["Upper std", "Mean F1-score", "Lower std"])
|
||||
|
||||
fig.set_size_inches(18.5, 10.5)
|
||||
|
||||
ax[0].title.set_text('Recall scores')
|
||||
ax[1].title.set_text('F1-scores')
|
||||
plt.suptitle(title, fontsize=14)
|
||||
plt.xticks(rotation=90)
|
||||
plt.show()
|
||||
|
||||
# %%
|
||||
sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
|
||||
"phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
|
||||
"phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
|
||||
|
||||
# sensors_features_groups = ["phone_", "empatica_", "demo_"]
|
||||
|
||||
# %%
|
||||
# sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
|
||||
sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
|
||||
xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Visualize sensors groups F1 and recall scores
|
||||
print(sensor_groups_importance_scores)
|
||||
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
|
||||
title="Sequential addition of sensors and its F1, and recall scores")
|
||||
|
||||
# %%
|
||||
# Take the most important feature group and investigate it feature-by-feature
|
||||
best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
|
||||
best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
|
||||
|
||||
# best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
|
||||
|
||||
# xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(best_sensor_features_scores)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Visualize best sensor's F1 and recall scores
|
||||
# print(best_sensor_features_scores)
|
||||
# plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
|
||||
# title="Best sensor addition it's features with F1 and recall scores")
|
||||
|
||||
# %%
|
||||
# This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
|
||||
# It also saves the sequence of scores for all sensors' features in excel file
|
||||
seq_columns = ["sensor_name", "feature_sequence", "recall", "f1_score"]
|
||||
feature_sequence = pd.DataFrame(columns=seq_columns)
|
||||
for i, sensor_group in enumerate(sensor_groups_importance_scores):
|
||||
|
||||
current_sensor_features = [col for col in model_input if col.startswith(sensor_group[0])]
|
||||
current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
|
||||
xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(current_sensor_features_scores)
|
||||
feature_sequence = pd.concat([feature_sequence, pd.DataFrame({"sensor_name":sensor_group[0], "feature_sequence": [xs], "recall": [y_recall],
|
||||
"f1_score": [y_fscore], "recall_std": [recall_std], "f1_std": [fscore_std]})])
|
||||
|
||||
plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
|
||||
title=f"Sequential addition of features for {sensor_group[0]} and its F1, and recall scores")
|
||||
|
||||
feature_sequence.to_excel("all_sensors_sequential_addition_scores.xlsx", index=False)
|
||||
|
||||
# %%
|
||||
# TODO: method that reads data from the excel file, specified above, and then the method,
|
||||
# that selects only features that are max a thresh[%] below the max value (best for recall
|
||||
# possibly for f1). This method should additionally take threshold parameter.
|
||||
|
||||
# %%
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %%
|
||||
import sys, os
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
from machine_learning.cross_validation import CrossValidation
|
||||
from machine_learning.preprocessing import Preprocessing
|
||||
|
||||
# %%
|
||||
df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
df.set_index(index_columns, inplace=True)
|
||||
|
||||
cv = CrossValidation(data=df, cv_method="logo")
|
||||
|
||||
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
|
||||
interval_feature_list, other_feature_list = [], []
|
||||
|
||||
print(df.columns.tolist())
|
||||
|
||||
for split in cv.get_splits():
|
||||
train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
|
||||
pre = Preprocessing(train_X, train_y, test_X, test_y)
|
||||
pre.one_hot_encode_train_and_test_sets(categorical_columns)
|
||||
train_X, train_y, test_X, test_y = pre.get_train_test_sets()
|
||||
break
|
||||
|
||||
# %%
|
|
@ -46,9 +46,9 @@ import machine_learning.helper
|
|||
#
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
cv_method_str = '5kfold' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
||||
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
||||
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
undersampling = True # (bool) If True this will train and test data on balanced dataset (using undersampling method)
|
||||
undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
|
||||
|
@ -72,20 +72,27 @@ model_input['target'].value_counts()
|
|||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
# UnderSampling
|
||||
if undersampling:
|
||||
model_input_new = pd.DataFrame(columns=model_input.columns)
|
||||
for pid in model_input["pid"].unique():
|
||||
stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
|
||||
no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
|
||||
if (len(stress) == 0):
|
||||
continue
|
||||
if (len(no_stress) == 0):
|
||||
continue
|
||||
model_input_new = pd.concat([model_input_new, stress], axis=0)
|
||||
no_stress = model_input[model_input['target'] == 0]
|
||||
stress = model_input[model_input['target'] == 1]
|
||||
|
||||
no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
|
||||
# In case there are more stress samples than no_stress, take all instances of no_stress.
|
||||
model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
||||
model_input = model_input_new
|
||||
no_stress = no_stress.sample(n=len(stress))
|
||||
model_input = pd.concat([stress,no_stress], axis=0)
|
||||
|
||||
# model_input_new = pd.DataFrame(columns=model_input.columns)
|
||||
# for pid in model_input["pid"].unique():
|
||||
# stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
|
||||
# no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
|
||||
# if (len(stress) == 0):
|
||||
# continue
|
||||
# if (len(no_stress) == 0):
|
||||
# continue
|
||||
# model_input_new = pd.concat([model_input_new, stress], axis=0)
|
||||
|
||||
# no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
|
||||
# # In case there are more stress samples than no_stress, take all instances of no_stress.
|
||||
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
||||
# model_input = model_input_new
|
||||
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
|
@ -170,7 +177,7 @@ final_scores = machine_learning.helper.run_all_classification_models(imputer.fit
|
|||
# %%
|
||||
final_scores.index.name = "metric"
|
||||
final_scores = final_scores.set_index(["method", final_scores.index])
|
||||
final_scores.to_csv("../presentation/event_stressful_detection_5fold.csv")
|
||||
final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Logistic Regression
|
||||
|
@ -453,4 +460,3 @@ print("F1", np.mean(xgb_classifier_scores['test_f1']))
|
|||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 2.5 MiB |
|
@ -0,0 +1,121 @@
|
|||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
|
||||
|
||||
class CrossValidation():
|
||||
"""This code implements a CrossValidation class for creating cross validation splits.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self, data=None, cv_method='logo'):
|
||||
"""This method initializes the cv_method argument and optionally prepares the data if supplied.
|
||||
|
||||
Args:
|
||||
cv_method (str, optional): String of cross validation method; options are 'logo', 'half_logo' and '5kfold'.
|
||||
Defaults to 'logo'.
|
||||
data (DataFrame, optional): Pandas DataFrame with target, pid columns and other features as columns.
|
||||
Defaults to None.
|
||||
"""
|
||||
|
||||
self.initialize_cv_method(cv_method)
|
||||
|
||||
if data is not None:
|
||||
self.prepare_data(data)
|
||||
|
||||
|
||||
def prepare_data(self, data):
|
||||
"""Prepares the data ready to be passed to the cross-validation algorithm, depending on the cv_method type.
|
||||
For example, if cv_method is set to 'half_logo' new columns 'pid_index', 'pid_count', 'pid_half'
|
||||
are added and used in the process.
|
||||
|
||||
Args:
|
||||
data (_type_): Pandas DataFrame with target, pid columns and other features as columns.
|
||||
"""
|
||||
self.data = data
|
||||
if self.cv_method == "logo":
|
||||
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
|
||||
|
||||
elif self.cv_method == "half_logo":
|
||||
data['pid_index'] = data.groupby('pid').cumcount()
|
||||
data['pid_count'] = data.groupby('pid')['pid'].transform('count')
|
||||
|
||||
data["pid_index"] = (data['pid_index'] / data['pid_count'] + 1).round()
|
||||
data["pid_half"] = data["pid"] + "_" + data["pid_index"].astype(int).astype(str)
|
||||
|
||||
data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
|
||||
|
||||
elif self.cv_method == "5kfold":
|
||||
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
|
||||
|
||||
self.X, self.y, self.groups = data_X, data_y, data_groups
|
||||
|
||||
|
||||
def initialize_cv_method(self, cv_method):
|
||||
"""Initializes the given cv_method type. Depending on the type, the appropriate splitting technique is used.
|
||||
|
||||
Args:
|
||||
cv_method (str): The type of cross-validation method to use; options are 'logo', 'half_logo' and '5kfold'.
|
||||
|
||||
Raises:
|
||||
ValueError: If cv_method is not in the list of available methods, it raises an ValueError.
|
||||
"""
|
||||
|
||||
self.cv_method = cv_method
|
||||
if self.cv_method not in ["logo", "half_logo", "5kfold"]:
|
||||
raise ValueError("Invalid cv_method input. Correct values are: 'logo', 'half_logo', '5kfold'")
|
||||
|
||||
if self.cv_method in ["logo", "half_logo"]:
|
||||
self.cv = LeaveOneGroupOut()
|
||||
elif self.cv_method == "5kfold":
|
||||
self.cv = StratifiedKFold(n_splits=5, shuffle=True)
|
||||
|
||||
|
||||
def get_splits(self):
|
||||
"""Returns a generator object containing the cross-validation splits.
|
||||
|
||||
Raises:
|
||||
ValueError: Raises ValueError if no data has been set.
|
||||
|
||||
"""
|
||||
if not self.data.empty:
|
||||
return self.cv.split(self.X, self.y, self.groups)
|
||||
else:
|
||||
raise ValueError("No data has been set. Use 'prepare_data(data)' method to set the data.")
|
||||
|
||||
|
||||
def get_data(self):
|
||||
"""data getter
|
||||
|
||||
Returns:
|
||||
Pandas DataFrame: Returns the data from the class instance.
|
||||
"""
|
||||
return self.data
|
||||
|
||||
|
||||
def get_x_y_groups(self):
|
||||
"""X, y, and groups data getter
|
||||
|
||||
Returns:
|
||||
Pandas DataFrame: Returns the data from the class instance.
|
||||
"""
|
||||
return self.X, self.y, self.groups
|
||||
|
||||
|
||||
def get_train_test_sets(self, split):
|
||||
"""Gets train and test sets, dependent on the split parameter. This method can be used in a specific splitting context,
|
||||
where by index we can get train and test sets.
|
||||
|
||||
Args:
|
||||
split (tuple of indices): It represents one iteration of the split generator (see get_splits method).
|
||||
|
||||
Returns:
|
||||
tuple of Pandas DataFrames: This method returns train_X, train_y, test_X, test_y, with correctly indexed rows by split param.
|
||||
"""
|
||||
return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
|
||||
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
from sklearn.feature_selection import SequentialFeatureSelector
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
|
||||
|
||||
""" Feature selection pipeline: a methods that can be used in the wrapper metod alongside other wrapper contents (hyperparameter tuning etc.).
|
||||
|
||||
(1) Establish methods for each of the steps in feature selection protocol:
|
||||
(a) feature selection inside specific sensors (sklearn method): returns most important features from all sensors
|
||||
(b) feature selection between "tuned" sensors: returns filtered sensors, containing most important features retured with (a)
|
||||
(2) Ensure that above methods are given only a part of data and use appropriate random seeds - to later simulate use case in production.
|
||||
(3) Implement a method which gives graphical exploration of (1) (a) and (b) steps of the feature selection.
|
||||
(4) Prepare a core method that can be fit into a wrapper (see sklearn wrapper methods) and integrates methods from (1)
|
||||
|
||||
"""
|
||||
|
||||
class FeatureSelection:
|
||||
|
||||
def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
|
||||
pass
|
||||
|
||||
|
||||
def within_sensors_feature_selection(estimator, scoring, tol):
|
||||
features_list = []
|
||||
|
||||
nb = GaussianNB()
|
||||
sfs = SequentialFeatureSelector(nb, n_features_to_select='auto', tol=0.02) # Can set n_features to an absolute value -> then remove tol parameter.
|
||||
|
||||
|
||||
return features_list
|
||||
|
||||
def between_sensors_feature_selection():
|
||||
pass
|
||||
|
||||
def vizualize_feature_selection_process():
|
||||
pass
|
||||
|
||||
def execute_feature_selection_step():
|
||||
pass
|
|
@ -0,0 +1,126 @@
|
|||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
class Preprocessing:
|
||||
"""This class presents Preprocessing methods which can be used in context of an individual CV iteration or, simply, on whole data.
|
||||
It's blind to the test data - e.g, it imputes the test data with train data mean.
|
||||
This means, it somehow needs an access to the information about data split. In context
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self, train_X, train_y, test_X, test_y):
|
||||
self.train_X = train_X
|
||||
self.train_y = train_y
|
||||
self.test_X = test_X
|
||||
self.test_y = test_y
|
||||
|
||||
|
||||
def one_hot_encoder(self, categorical_features, numerical_features, mode):
|
||||
"""
|
||||
This code is an implementation of one-hot encoding. It takes in two data sets,
|
||||
one with categorical features and one with numerical features and a mode parameter.
|
||||
First it uses the fillna() function to fill in any missing values present in the
|
||||
categorical data set with the mode value. Then it uses the apply () method to
|
||||
convert each column of the data set into a category data type which is then
|
||||
transformed using the pd.get_dummies() function. Finally it concatenates the
|
||||
numerical data set and the transformed categorical data set using pd.concat() and
|
||||
returns it.
|
||||
|
||||
Args:
|
||||
categorical_features (DataFrame): DataFrame including only categorical columns.
|
||||
numerical_features (_type_): DataFrame including only numerical columns.
|
||||
mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
|
||||
|
||||
Returns:
|
||||
DataFrame: Hot-One Encoded DataFrame.
|
||||
"""
|
||||
# Fill train set with mode
|
||||
categorical_features = categorical_features.fillna(mode)
|
||||
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
return pd.concat([numerical_features, categorical_features], axis=1)
|
||||
|
||||
|
||||
def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
|
||||
"""
|
||||
This code is used to transform categorical data into numerical representations.
|
||||
It first identifies the categorical columns, then copies them and saves them as
|
||||
a new dataset. The missing data is filled with the mode (most frequent value in
|
||||
the respective column). This new dataset is then subjected to one-hot encoding,
|
||||
which is a process of transforming categorical data into machine interpretable
|
||||
numerical form by converting categories into multiple binary outcome variables.
|
||||
These encoded values are then concatenated to the numerical features prior to
|
||||
being returned as the final dataset.
|
||||
|
||||
Args:
|
||||
categorical_columns (list, optional): List of categorical columns in the dataset.
|
||||
Defaults to ["gender", "startlanguage", "mostcommonactivity", "homelabel"].
|
||||
|
||||
"""
|
||||
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
|
||||
|
||||
# For train set
|
||||
|
||||
train_X_categorical_features = self.train_X[categorical_columns].copy()
|
||||
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
|
||||
mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
|
||||
|
||||
self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
|
||||
|
||||
# For test set
|
||||
|
||||
test_X_categorical_features = self.test_X[categorical_columns].copy()
|
||||
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
|
||||
|
||||
self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
|
||||
|
||||
|
||||
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
|
||||
|
||||
# TODO: TESTING
|
||||
|
||||
if groupby:
|
||||
# Interval numerical features # TODO: How can we get and assign appropriate groupby means and assign them to correct columns?
|
||||
|
||||
# VVVVV ...... IN PROGRES ...... VVVVV
|
||||
means = self.train_X[interval_feature_list].groupby(groupby_feature).mean()
|
||||
self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + other_feature_list)]] = \
|
||||
self.train_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
|
||||
|
||||
self.test_X[self.test_X.loc[:, ~self.test_X.columns.isin([groupby_feature] + other_feature_list)]] = \
|
||||
self.test_X[interval_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.mean()))
|
||||
|
||||
# Other features
|
||||
self.train_X[self.train_X.loc[:, ~self.train_X.columns.isin([groupby_feature] + interval_feature_list)]] = \
|
||||
self.train_X[other_feature_list].groupby(groupby_feature).apply(lambda x: x.fillna(x.median()))
|
||||
|
||||
else:
|
||||
# Interval numerical features
|
||||
means = self.train_X[interval_feature_list].mean()
|
||||
self.train_X[interval_feature_list].fillna(means, inplace=True)
|
||||
self.test_X[interval_feature_list].fillna(means, inplace=True)
|
||||
|
||||
# Other features
|
||||
medians = self.train_X[other_feature_list].median()
|
||||
self.train_X[other_feature_list].fillna(medians, inplace=True)
|
||||
self.test_X[other_feature_list].fillna(medians, inplace=True)
|
||||
|
||||
|
||||
def get_train_test_sets(self):
|
||||
"""Train and test sets getter
|
||||
|
||||
Returns:
|
||||
tuple of Pandas DataFrames: Gets train test sets in traditional sklearn format.
|
||||
"""
|
||||
return self.train_X, self.train_y, self.test_X, self.test_y
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
method,metric,max,mean
|
||||
Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
|
||||
Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
|
||||
Dummy,test_recall,0.0,0.0
|
||||
Dummy,test_f1,0.0,0.0
|
||||
logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
|
||||
logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
|
||||
logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
|
||||
logistic_reg,test_f1,0.3909774436090226,0.318943511424051
|
||||
svc,test_accuracy,0.8557046979865772,0.8548446932649828
|
||||
svc,test_average_precision,0.44514416839823046,0.4068200938341621
|
||||
svc,test_recall,0.0,0.0
|
||||
svc,test_f1,0.0,0.0
|
||||
gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
|
||||
gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
|
||||
gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
|
||||
gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
|
||||
stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
|
||||
stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
|
||||
stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
|
||||
stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
|
||||
random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
|
||||
random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
|
||||
random_forest,test_recall,0.4069767441860465,0.35356856455493185
|
||||
random_forest,test_f1,0.5691056910569107,0.5078402513053142
|
||||
xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
|
||||
xgboost,test_average_precision,0.7366643049075349,0.698622165966308
|
||||
xgboost,test_recall,0.5287356321839081,0.44346431435445066
|
||||
xgboost,test_f1,0.638888888888889,0.5633957169928393
|
|
|
@ -0,0 +1,29 @@
|
|||
method,metric,max,mean
|
||||
Dummy,test_accuracy,1.0,0.8524114578096439
|
||||
Dummy,test_average_precision,0.7,0.14758854219035614
|
||||
Dummy,test_recall,0.0,0.0
|
||||
Dummy,test_f1,0.0,0.0
|
||||
logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
|
||||
logistic_reg,test_average_precision,1.0,0.44605167668563583
|
||||
logistic_reg,test_recall,1.0,0.25353566685532386
|
||||
logistic_reg,test_f1,0.823529411764706,0.27951926390778625
|
||||
svc,test_accuracy,1.0,0.8524114578096439
|
||||
svc,test_average_precision,0.9612401707068228,0.44179454944271934
|
||||
svc,test_recall,0.0,0.0
|
||||
svc,test_f1,0.0,0.0
|
||||
gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
|
||||
gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
|
||||
gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
|
||||
gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
|
||||
stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
|
||||
stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
|
||||
stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
|
||||
stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
|
||||
random_forest,test_accuracy,1.0,0.8722158105763481
|
||||
random_forest,test_average_precision,1.0,0.49817066323226833
|
||||
random_forest,test_recall,1.0,0.18161263127840668
|
||||
random_forest,test_f1,1.0,0.2508096532365307
|
||||
xgboost,test_accuracy,1.0,0.8812627400277729
|
||||
xgboost,test_average_precision,1.0,0.5505695112208401
|
||||
xgboost,test_recall,1.0,0.2896161238315027
|
||||
xgboost,test_f1,0.9411764705882353,0.36887408735855665
|
|
Loading…
Reference in New Issue