Add classification-with-clustering ML pipeline script.

ml_pipeline
Primoz 2022-11-24 09:24:13 +01:00
parent 7afef5582f
commit ddde80b421
2 changed files with 178 additions and 7 deletions

@@ -51,13 +51,6 @@ import machine_learning.model
# %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
lime_cols = [col for col in model_input if col.startswith('limesurvey_demand')]
model_input['limesurvey_demand_control_ratio'].describe()
lime_cols
# TODO: use lime_cols to create clusters (e.g. k=5), which are later validated separately with the models. The procedure below will need to be repeated three times.
# Consider whether anything can be wrapped in a for loop (e.g., the models in a list)
# %% jupyter={"source_hidden": true}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input.set_index(index_columns, inplace=True)

@@ -0,0 +1,178 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
import xgboost as xg
from sklearn.cluster import KMeans
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
import machine_learning.labels
import machine_learning.model
# %% [markdown]
# # RAPIDS models
# %% [markdown]
# ## PANAS negative affect
# %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
lime_cols = [col for col in model_input if col.startswith('limesurvey_demand')]
lime_col = 'limesurvey_demand_control_ratio'
model_input[lime_col].describe()
# %% jupyter={"source_hidden": true}
# Filter out outlier rows by lime_col (keep |z-score| < 3)
model_input = model_input[(np.abs(stats.zscore(model_input[lime_col])) < 3)]
uniq = model_input[[lime_col, 'pid']].drop_duplicates().reset_index(drop=True)
plt.bar(uniq['pid'], uniq[lime_col])
# %% jupyter={"source_hidden": true}
# Get clusters by lime_col and merge the cluster labels into the main df
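# KMeans is fit on effectively one value per participant (the deduplicated lime_col), so the clusters band participants by their demand/control ratio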
km = KMeans(n_clusters=5).fit_predict(uniq.set_index('pid'))
np.unique(km, return_counts=True)
uniq['cluster'] = km
uniq
model_input = model_input.merge(uniq[['pid', 'cluster']])
# %% jupyter={"source_hidden": true}
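# Move segment identifiers into the index so they stay available for reference but are excluded from the feature matrix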
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_input.set_index(index_columns, inplace=True)
# %% jupyter={"source_hidden": true}
for k in range(5):
    model_input_subset = model_input[model_input["cluster"] == k].copy()
    bins = [-10, -1, 1, 10]  # bins for z-scored targets
    model_input_subset.loc[:, 'target'] = \
        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False)
    model_input_subset['target'].value_counts()
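    # Keep only the tails of the target distribution: drop 'medium' and binarise low -> 0, high -> 1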
    model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
    model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
    model_input_subset['target'].value_counts()
    cv_method_str = 'logo'  # logo, half_logo, 5kfold
    if cv_method_str == 'half_logo':
        model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
        model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
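        # pid_index / pid_count lies in [0, 1); adding 1 and rounding assigns the first half of each participant's rows to 1 and the second half to 2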
model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
model_input_subset["pid_half"] = model_input_subset["pid"] + "_" + model_input_subset["pid_index"].astype(int).astype(str)
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"]
else:
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]
    # Treat categorical features
    categorical_feature_colnames = ["gender", "startlanguage"]
    additional_categorical_features = []  # [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
    categorical_feature_colnames += additional_categorical_features
    categorical_features = data_x[categorical_feature_colnames].copy()
    mode_categorical_features = categorical_features.mode().iloc[0]
    # fillna with mode
    categorical_features = categorical_features.fillna(mode_categorical_features)
    # one-hot encoding
    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)
    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
    train_x = pd.concat([numerical_features, categorical_features], axis=1)
    # Establish cv method
    cv_method = None  # None defaults to 5-fold CV inside cross_validate
    if cv_method_str in ('logo', 'half_logo'):
        cv_method = LeaveOneGroupOut()
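        # LeaveOneGroupOut holds out all rows of one group (a participant, or half of a participant's data) per fold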
        cv_method.get_n_splits(
            train_x,
            data_y,
            groups=data_groups,
        )
    n = 3
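    # Impute remaining missing feature values with per-column medians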
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    # Create dict with classification ml models
    cmodels = {
        'dummy_classifier': DummyClassifier(strategy="most_frequent"),
        'logistic_regression': linear_model.LogisticRegression(),
        'support_vector_machine': svm.SVC(),
        'gaussian_naive_bayes': naive_bayes.GaussianNB(),
        'stochastic_gradient_descent_classifier': linear_model.SGDClassifier(),
        'knn': neighbors.KNeighborsClassifier(),
        'decision_tree': tree.DecisionTreeClassifier(),
        'random_forest_classifier': ensemble.RandomForestClassifier(),
        'gradient_boosting_classifier': ensemble.GradientBoostingClassifier(),
        'lgbm_classifier': LGBMClassifier(),
        'XGBoost_classifier': xg.sklearn.XGBClassifier()
    }
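    # Evaluate every classifier with the same folds so the scores are directly comparable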
    for model_title, model in cmodels.items():
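        # NB: the imputer is fit on the full feature matrix before splitting, so imputation statistics leak across CV folds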
        classifier = cross_validate(
            model,
            X=imputer.fit_transform(train_x),
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
            error_score='raise',
            scoring=('accuracy', 'average_precision', 'recall', 'f1')
        )
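        # The median across folds is reported below; a few degenerate folds skew the mean more than the median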
print("\n-------------------------------------\n")
print("Current cluster:", k, end="\n")
print("Current model:", model_title, end="\n")
print("Acc", np.median(classifier['test_accuracy']))
print("Precision", np.median(classifier['test_average_precision']))
print("Recall", np.median(classifier['test_recall']))
print("F1", np.median(classifier['test_f1']))
print("Largest 5 ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n)[:n])[::-1])
print("Smallest 5 ACC:", np.sort(np.partition(classifier['test_accuracy'], n)[:n]))
# %%
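# %% [markdown]
# The number of clusters above (k=5) is assumed rather than validated. A minimal sketch, reusing the `uniq` frame and `lime_col` defined earlier, of how the choice could be sanity-checked with silhouette scores:
# %% jupyter={"source_hidden": true}
from sklearn.metrics import silhouette_score

ratios = uniq.set_index('pid')[[lime_col]]  # one demand/control ratio per participant
for k_candidate in range(2, 8):
    labels = KMeans(n_clusters=k_candidate, random_state=42).fit_predict(ratios)
    # Higher silhouette indicates more coherent, better-separated clusters
    print(k_candidate, silhouette_score(ratios, labels))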