Finish classification presentation.

ml_pipeline
junos 2022-12-08 10:02:13 +01:00
parent 852e17afbe
commit 2bb95657d8
1 changed files with 18 additions and 276 deletions

View File

@ -30,28 +30,34 @@ from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from sklearn.dummy import DummyClassifier from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
import xgboost as xg import xgboost as xg
from IPython.core.interactiveshell import InteractiveShell from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" InteractiveShell.ast_node_interactivity = "all"
from pathlib import Path
nb_dir = os.path.split(os.getcwd())[0] nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path: if nb_dir not in sys.path:
sys.path.append(nb_dir) sys.path.append(nb_dir)
import machine_learning.labels import machine_learning.labels
import machine_learning.model import machine_learning.model
from machine_learning.helper import run_all_classification_models
# %% [markdown] # %% [markdown]
# # RAPIDS models # # RAPIDS models
# %% [markdown] # %% [markdown]
# ## Set script's parameters # ## Set script's parameters
#
# %%
cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/stressfulness_event_nonstandardized/input_appraisal_stressfulness_event_mean.csv") filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
model_input = pd.read_csv(filename)
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
@ -59,11 +65,11 @@ model_input.set_index(index_columns, inplace=True)
model_input['target'].value_counts() model_input['target'].value_counts()
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
# bins = [-10, -1, 1, 10] # bins for z-scored targets bins = [-10, -1, 1, 10] # bins for z-scored targets
bins = [0, 1, 4] # bins for stressfulness (1-4) target # bins = [0, 1, 4] # bins for stressfulness (1-4) target
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high'] model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'medium', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
model_input['target'].value_counts(), edges model_input['target'].value_counts(), edges
# model_input = model_input[model_input['target'] != "medium"] model_input = model_input[model_input['target'] != "medium"]
model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1) model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
model_input['target'].value_counts() model_input['target'].value_counts()
@ -98,7 +104,6 @@ if not categorical_features.empty:
numerical_features = data_x.drop(categorical_feature_colnames, axis=1) numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
train_x = pd.concat([numerical_features, categorical_features], axis=1) train_x = pd.concat([numerical_features, categorical_features], axis=1)
train_x.dtypes
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
cv_method = None # Defaults to 5 k-folds in cross_validate method cv_method = None # Defaults to 5 k-folds in cross_validate method
@ -113,273 +118,10 @@ if cv_method_str == 'logo' or cv_method_str == 'half_logo':
# %% jupyter={"source_hidden": true} # %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy='median') imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# %% [markdown] # %%
# ### Baseline: Dummy Classifier (most frequent) final_scores = run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
dummy_class = DummyClassifier(strategy="most_frequent")
# %% jupyter={"source_hidden": true} # %%
dummy_classifier = cross_validate( final_scores.index.name = "metric"
dummy_class, final_scores = final_scores.set_index(["method", final_scores.index])
X=imputer.fit_transform(train_x), final_scores.to_csv("event_stressfulness_lmh_lh_scores.csv")
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=('accuracy', 'average_precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(dummy_classifier['test_accuracy']))
print("Precision", np.mean(dummy_classifier['test_average_precision']))
print("Recall", np.mean(dummy_classifier['test_recall']))
print("F1", np.mean(dummy_classifier['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### Logistic Regression
# %% jupyter={"source_hidden": true}
logistic_regression = linear_model.LogisticRegression()
# %% jupyter={"source_hidden": true}
log_reg_scores = cross_validate(
logistic_regression,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(log_reg_scores['test_accuracy']))
print("Precision", np.mean(log_reg_scores['test_precision']))
print("Recall", np.mean(log_reg_scores['test_recall']))
print("F1", np.mean(log_reg_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### Support Vector Machine
# %% jupyter={"source_hidden": true}
svc = svm.SVC()
# %% jupyter={"source_hidden": true}
svc_scores = cross_validate(
svc,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(svc_scores['test_accuracy']))
print("Precision", np.mean(svc_scores['test_precision']))
print("Recall", np.mean(svc_scores['test_recall']))
print("F1", np.mean(svc_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### Gaussian Naive Bayes
# %% jupyter={"source_hidden": true}
gaussian_nb = naive_bayes.GaussianNB()
# %% jupyter={"source_hidden": true}
gaussian_nb_scores = cross_validate(
gaussian_nb,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(gaussian_nb_scores['test_accuracy']))
print("Precision", np.mean(gaussian_nb_scores['test_precision']))
print("Recall", np.mean(gaussian_nb_scores['test_recall']))
print("F1", np.mean(gaussian_nb_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### Stochastic Gradient Descent Classifier
# %% jupyter={"source_hidden": true}
sgdc = linear_model.SGDClassifier()
# %% jupyter={"source_hidden": true}
sgdc_scores = cross_validate(
sgdc,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(sgdc_scores['test_accuracy']))
print("Precision", np.mean(sgdc_scores['test_precision']))
print("Recall", np.mean(sgdc_scores['test_recall']))
print("F1", np.mean(sgdc_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### K-nearest neighbors
# %% jupyter={"source_hidden": true}
knn = neighbors.KNeighborsClassifier()
# %% jupyter={"source_hidden": true}
knn_scores = cross_validate(
knn,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(knn_scores['test_accuracy']))
print("Precision", np.mean(knn_scores['test_precision']))
print("Recall", np.mean(knn_scores['test_recall']))
print("F1", np.mean(knn_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### Decision Tree
# %% jupyter={"source_hidden": true}
dtree = tree.DecisionTreeClassifier()
# %% jupyter={"source_hidden": true}
dtree_scores = cross_validate(
dtree,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(dtree_scores['test_accuracy']))
print("Precision", np.mean(dtree_scores['test_precision']))
print("Recall", np.mean(dtree_scores['test_recall']))
print("F1", np.mean(dtree_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### Random Forest Classifier
# %% jupyter={"source_hidden": true}
rfc = ensemble.RandomForestClassifier()
# %% jupyter={"source_hidden": true}
rfc_scores = cross_validate(
rfc,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(rfc_scores['test_accuracy']))
print("Precision", np.mean(rfc_scores['test_precision']))
print("Recall", np.mean(rfc_scores['test_recall']))
print("F1", np.mean(rfc_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### Gradient Boosting Classifier
# %% jupyter={"source_hidden": true}
gbc = ensemble.GradientBoostingClassifier()
# %% jupyter={"source_hidden": true}
gbc_scores = cross_validate(
gbc,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(gbc_scores['test_accuracy']))
print("Precision", np.mean(gbc_scores['test_precision']))
print("Recall", np.mean(gbc_scores['test_recall']))
print("F1", np.mean(gbc_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### LGBM Classifier
# %% jupyter={"source_hidden": true}
lgbm = LGBMClassifier()
# %% jupyter={"source_hidden": true}
lgbm_scores = cross_validate(
lgbm,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(lgbm_scores['test_accuracy']))
print("Precision", np.mean(lgbm_scores['test_precision']))
print("Recall", np.mean(lgbm_scores['test_recall']))
print("F1", np.mean(lgbm_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n_sl)[:n_sl]))
# %% [markdown]
# ### XGBoost Classifier
# %% jupyter={"source_hidden": true}
xgb_classifier = xg.sklearn.XGBClassifier()
# %% jupyter={"source_hidden": true}
xgb_classifier_scores = cross_validate(
xgb_classifier,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=cv_method,
n_jobs=-1,
error_score='raise',
scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
print("Acc", np.mean(xgb_classifier_scores['test_accuracy']))
print("Precision", np.mean(xgb_classifier_scores['test_precision']))
print("Recall", np.mean(xgb_classifier_scores['test_recall']))
print("F1", np.mean(xgb_classifier_scores['test_f1']))
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))