stress_at_work_analysis/exploration/ml_pipeline_classification_...

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats

from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
import xgboost as xg

from sklearn.cluster import KMeans

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Put the parent of the current working directory on the import path
# (presumably so that the `machine_learning` imports below resolve when the
# notebook is started from its own sub-folder -- confirm against repo layout).
nb_dir = os.path.dirname(os.getcwd())
already_importable = nb_dir in sys.path
if not already_importable:
    sys.path.append(nb_dir)

import machine_learning.labels
import machine_learning.model

# %% [markdown]
# # RAPIDS models

# %% [markdown]
# ## Set script's parameters

# %%
# The parameters live in their own code cell: Jupytext assigns every line up
# to the next cell marker to the current cell, so without the `# %%` marker
# above these assignments would become markdown text in the notebook and
# would never be executed there.
n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter)
cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs

# %% jupyter={"source_hidden": true}
# Load the model input table and pick the column used to cluster on.
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]

# Exploratory: column with the largest variance. numeric_only=True restricts
# the computation to numeric columns; pandas >= 2.0 otherwise raises on
# non-numeric columns (such as the string `pid`) instead of skipping them.
clust_col = model_input.set_index(index_columns).var(numeric_only=True).idxmax() # age is a col with the highest variance

# Exploratory display: column names from 'age' up to (excluding) the last one
model_input.columns[list(model_input.columns).index('age'):-1]

# Exploratory display: all columns whose name starts with 'limesurvey'
lime_cols = [col for col in model_input if col.startswith('limesurvey')]
lime_cols
lime_col = 'limesurvey_demand_control_ratio'
# The clustering column actually used; replaces the variance-based pick
clust_col = lime_col

model_input[clust_col].describe()


# %% jupyter={"source_hidden": true}

# Drop rows whose clust_col value is 3 or more standard deviations away from
# the column mean (rows with an undefined z-score are dropped as well).
clust_zscores = stats.zscore(model_input[clust_col])
within_three_sd = np.abs(clust_zscores) < 3
model_input = model_input[within_three_sd]

# One row per distinct (clust_col, pid) pair, shown as one bar per pid
uniq = (
    model_input[[clust_col, 'pid']]
    .drop_duplicates()
    .reset_index(drop=True)
)
plt.bar(uniq['pid'], uniq[clust_col])

# %% jupyter={"source_hidden": true}
# Cluster the rows of `uniq` on clust_col (indexed by pid) and attach the
# resulting cluster label to every row of the main data frame.
# NOTE(review): no random_state is passed to KMeans, so cluster assignments
# can differ between runs.
pid_features = uniq.set_index('pid')
kmeans = KMeans(n_clusters=n_clusters)
km = kmeans.fit_predict(pid_features)
np.unique(km, return_counts=True)  # cluster sizes
uniq['cluster'] = km
uniq

# merge() joins on the columns shared by both frames
pid_to_cluster = uniq[['pid', 'cluster']]
model_input = model_input.merge(pid_to_cluster)

# %% jupyter={"source_hidden": true}
# Move the segment identifier columns into the index so that they are not
# treated as features later on.
model_input = model_input.set_index(index_columns)

# %% jupyter={"source_hidden": true}
# Registry of the classification models to evaluate.
# Every entry holds the estimator under 'model' and, under 'metrics', four
# running sums that are later reported as Acc, Precision, Recall and F1.
classifier_instances = {
    'dummy_classifier': DummyClassifier(strategy="most_frequent"),
    'logistic_regression': linear_model.LogisticRegression(),
    'support_vector_machine': svm.SVC(),
    'gaussian_naive_bayes': naive_bayes.GaussianNB(),
    'stochastic_gradient_descent_classifier': linear_model.SGDClassifier(),
    'knn': neighbors.KNeighborsClassifier(),
    'decision_tree': tree.DecisionTreeClassifier(),
    'random_forest_classifier': ensemble.RandomForestClassifier(),
    'gradient_boosting_classifier': ensemble.GradientBoostingClassifier(),
    'lgbm_classifier': LGBMClassifier(),
    'XGBoost_classifier': xg.sklearn.XGBClassifier(),
}

# A fresh metrics list is created per model so the sums are never shared
cmodels = {
    title: {'model': estimator, 'metrics': [0, 0, 0, 0]}
    for title, estimator in classifier_instances.items()
}

# %% jupyter={"source_hidden": true}
# Evaluate every classifier in `cmodels` separately within each cluster.
#
# Per cluster: binarise the z-scored target (low -> 0, high -> 1, "medium"
# rows dropped), build the feature matrix (numerical columns plus one-hot
# encoded categorical columns, median-imputed), cross-validate each model and
# add the fold-averaged scores to that model's running sums in `cmodels`.

# Start from zero so that re-running this cell does not double-count.
for entry in cmodels.values():
    entry['metrics'] = [0, 0, 0, 0]

# Order of the running sums in cmodels[...]['metrics']
score_keys = ('test_accuracy', 'test_precision', 'test_recall', 'test_f1')

for k in range(n_clusters):
    model_input_subset = model_input[model_input["cluster"] == k]

    # Open-ended outer bins: every non-missing target gets a level, so extreme
    # values are not turned into NaN by pd.cut.
    bins = [-np.inf, -1, 1, np.inf] # bins for z-scored targets
    target_level = pd.cut(
        model_input_subset['target'],
        bins=bins,
        labels=['low', 'medium', 'high'],
        right=False,
    )

    # Keep only the two extreme levels. Rows with a missing target have a NaN
    # level and are dropped here instead of being silently labelled "high".
    is_extreme = target_level.isin(['low', 'high']).to_numpy()
    model_input_subset = model_input_subset[is_extreme].copy()
    model_input_subset['target'] = (target_level[is_extreme] == 'high').astype(int).to_numpy()

    print("Target counts:", model_input_subset['target'].value_counts().to_dict())

    if cv_method_str == 'halflogo':
        # Split each pid's rows (in their current order) into a first and a
        # second half; every half is its own CV group ("<pid>_1", "<pid>_2").
        # The helper series are kept out of the data frame so that none of
        # them can end up among the features.
        by_pid = model_input_subset.groupby('pid')
        row_position = by_pid.cumcount()
        rows_per_pid = by_pid['pid'].transform('count')
        half = (row_position / rows_per_pid + 1).round().astype(int)
        data_groups = model_input_subset['pid'] + "_" + half.astype(str)
    else:
        data_groups = model_input_subset['pid']

    data_x = model_input_subset.drop(["target", "pid"], axis=1)
    data_y = model_input_subset["target"]

    # Treat categorical features
    categorical_feature_colnames = ["gender", "startlanguage"]
    additional_categorical_features = [] #[col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
    categorical_feature_colnames += additional_categorical_features

    categorical_features = data_x[categorical_feature_colnames].copy()
    mode_categorical_features = categorical_features.mode().iloc[0]

    # fillna with mode
    categorical_features = categorical_features.fillna(mode_categorical_features)

    # one-hot encoding
    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)

    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
    train_x = pd.concat([numerical_features, categorical_features], axis=1)

    # Establish cv method
    cv_method = None # Defaults to 5 k-folds in cross_validate method
    if cv_method_str in ('logo', 'halflogo'):
        cv_method = LeaveOneGroupOut()

    # Impute once per cluster; the result does not depend on the model.
    # NOTE(review): the imputer is fitted on all rows of the cluster before
    # cross-validation, so the medians also see the held-out folds. Moving the
    # imputer into an sklearn Pipeline would avoid that.
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    imputed_x = imputer.fit_transform(train_x)

    for model_title, model in cmodels.items():

        classifier = cross_validate(
            model['model'],
            X=imputed_x,
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
            error_score='raise',
            scoring=('accuracy', 'precision', 'recall', 'f1')
        )

        print("\n-------------------------------------\n")
        print("Current cluster:", k)
        print("Current model:", model_title)
        print("Acc", np.mean(classifier['test_accuracy']))
        print("Precision", np.mean(classifier['test_precision']))
        print("Recall", np.mean(classifier['test_recall']))
        print("F1", np.mean(classifier['test_f1']))

        # A full sort also works when n_sl is not smaller than the number of folds
        fold_accuracies = np.sort(classifier['test_accuracy'])
        print(f"Largest {n_sl} ACC:", fold_accuracies[::-1][:n_sl])
        print(f"Smallest {n_sl} ACC:", fold_accuracies[:n_sl])

        for slot, score_key in enumerate(score_keys):
            model['metrics'][slot] += np.mean(classifier[score_key])

# %% jupyter={"source_hidden": true}
# Report, for each model, the four accumulated metric sums divided by the
# number of clusters.
metric_labels = ("Acc", "Precision", "Recall", "F1")
for model_title, model in cmodels.items():
    print("\n************************************\n")
    print("Current model:", model_title, end="\n")
    for metric_label, metric_sum in zip(metric_labels, model['metrics']):
        print(metric_label, metric_sum/n_clusters)
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00			`# ---`
			`# jupyter:`
			`# jupytext:`
			`# formats: ipynb,py:percent`
			`# text_representation:`
			`# extension: .py`
			`# format_name: percent`
			`# format_version: '1.3'`
			`# jupytext_version: 1.13.0`
			`# kernelspec:`
			`# display_name: straw2analysis`
			`# language: python`
			`# name: straw2analysis`
			`# ---`

			`# %% jupyter={"source_hidden": true}`
			`# %matplotlib inline`
			`import datetime`
			`import importlib`
			`import os`
			`import sys`

			`import numpy as np`
			`import matplotlib.pyplot as plt`
			`import pandas as pd`
			`import seaborn as sns`
			`from scipy import stats`

			`from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble`
			`from sklearn.model_selection import LeaveOneGroupOut, cross_validate`
			`from sklearn.dummy import DummyClassifier`
			`from sklearn.impute import SimpleImputer`
			`from lightgbm import LGBMClassifier`
			`import xgboost as xg`

			`from sklearn.cluster import KMeans`

			`from IPython.core.interactiveshell import InteractiveShell`
			`InteractiveShell.ast_node_interactivity = "all"`

			`nb_dir = os.path.split(os.getcwd())[0]`
			`if nb_dir not in sys.path:`
			`sys.path.append(nb_dir)`

			`import machine_learning.labels`
			`import machine_learning.model`

			`# %% [markdown]`
			`# # RAPIDS models`

			`# %% [markdown]`
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`# ## Set script's parameters`
			`n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter)`
			`cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)`
			`n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs`
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00
			`# %% jupyter={"source_hidden": true}`
			`model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")`
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]`

			`clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance`

			`model_input.columns[list(model_input.columns).index('age'):-1]`
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`lime_cols = [col for col in model_input if col.startswith('limesurvey')]`
			`lime_cols`
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00			`lime_col = 'limesurvey_demand_control_ratio'`
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`clust_col = lime_col`

			`model_input[clust_col].describe()`

Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00
			`# %% jupyter={"source_hidden": true}`

Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`# Filter-out outlier rows by clust_col`
			`model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]`
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)`
			`plt.bar(uniq['pid'], uniq[clust_col])`
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00
			`# %% jupyter={"source_hidden": true}`
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`# Get clusters by cluster col & and merge the clusters to main df`
			`km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))`
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00			`np.unique(km, return_counts=True)`
			`uniq['cluster'] = km`
			`uniq`

			`model_input = model_input.merge(uniq[['pid', 'cluster']])`

			`# %% jupyter={"source_hidden": true}`
			`model_input.set_index(index_columns, inplace=True)`

			`# %% jupyter={"source_hidden": true}`
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`# Create dict with classification ml models`
			`cmodels = {`
			`'dummy_classifier': {`
			`'model': DummyClassifier(strategy="most_frequent"),`
			`'metrics': [0, 0, 0, 0]`
			`},`
			`'logistic_regression': {`
			`'model': linear_model.LogisticRegression(),`
			`'metrics': [0, 0, 0, 0]`
			`},`
			`'support_vector_machine': {`
			`'model': svm.SVC(),`
			`'metrics': [0, 0, 0, 0]`
			`},`
			`'gaussian_naive_bayes': {`
			`'model': naive_bayes.GaussianNB(),`
			`'metrics': [0, 0, 0, 0]`
			`},`
			`'stochastic_gradient_descent_classifier': {`
			`'model': linear_model.SGDClassifier(),`
			`'metrics': [0, 0, 0, 0]`
			`},`
			`'knn': {`
			`'model': neighbors.KNeighborsClassifier(),`
			`'metrics': [0, 0, 0, 0]`
			`},`
			`'decision_tree': {`
			`'model': tree.DecisionTreeClassifier(),`
			`'metrics': [0, 0, 0, 0]`
			`},`
			`'random_forest_classifier': {`
			`'model': ensemble.RandomForestClassifier(),`
			`'metrics': [0, 0, 0, 0]`
			`},`
			`'gradient_boosting_classifier': {`
			`'model': ensemble.GradientBoostingClassifier(),`
			`'metrics': [0, 0, 0, 0]`
			`},`
			`'lgbm_classifier': {`
			`'model': LGBMClassifier(),`
			`'metrics': [0, 0, 0, 0]`
			`},`
			`'XGBoost_classifier': {`
			`'model': xg.sklearn.XGBClassifier(),`
			`'metrics': [0, 0, 0, 0]`
			`}`
			`}`
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`# %% jupyter={"source_hidden": true}`
			`for k in range(n_clusters):`
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00			`model_input_subset = model_input[model_input["cluster"] == k].copy()`
			`bins = [-10, -1, 1, 10] # bins for z-scored targets`
			`model_input_subset.loc[:, 'target'] = \`
			`pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']`
			`model_input_subset['target'].value_counts()`
			`model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]`
			`model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)`

			`model_input_subset['target'].value_counts()`

			`if cv_method_str == 'halflogo':`
			`model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()`
			`model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')`

			`model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()`
			`model_input_subset["pid_half"] = model_input_subset["pid"] + "_" + model_input_subset["pid_index"].astype(int).astype(str)`

			`data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"]`
			`else:`
			`data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]`

			`# Treat categorical features`
			`categorical_feature_colnames = ["gender", "startlanguage"]`
			`additional_categorical_features = [] #[col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]`
			`categorical_feature_colnames += additional_categorical_features`

			`categorical_features = data_x[categorical_feature_colnames].copy()`
			`mode_categorical_features = categorical_features.mode().iloc[0]`

			`# fillna with mode`
			`categorical_features = categorical_features.fillna(mode_categorical_features)`

			`# one-hot encoding`
			`categorical_features = categorical_features.apply(lambda col: col.astype("category"))`
			`if not categorical_features.empty:`
			`categorical_features = pd.get_dummies(categorical_features)`

			`numerical_features = data_x.drop(categorical_feature_colnames, axis=1)`
			`train_x = pd.concat([numerical_features, categorical_features], axis=1)`

			`# Establish cv method`
			`cv_method = None # Defaults to 5 k-folds in cross_validate method`
			`if cv_method_str == 'logo' or cv_method_str == 'half_logo':`
			`cv_method = LeaveOneGroupOut()`
			`cv_method.get_n_splits(`
			`train_x,`
			`data_y,`
			`groups=data_groups,`
			`)`

			`imputer = SimpleImputer(missing_values=np.nan, strategy='median')`

			`for model_title, model in cmodels.items():`
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00			`classifier = cross_validate(`
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`model['model'],`
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00			`X=imputer.fit_transform(train_x),`
			`y=data_y,`
			`groups=data_groups,`
			`cv=cv_method,`
			`n_jobs=-1,`
			`error_score='raise',`
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`scoring=('accuracy', 'precision', 'recall', 'f1')`
Add classification with clustering ml pipeline script. 2022-11-24 09:24:13 +01:00			`)`

			`print("\n-------------------------------------\n")`
			`print("Current cluster:", k, end="\n")`
			`print("Current model:", model_title, end="\n")`
Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements. 2022-11-24 16:12:20 +01:00			`print("Acc", np.mean(classifier['test_accuracy']))`
			`print("Precision", np.mean(classifier['test_precision']))`
			`print("Recall", np.mean(classifier['test_recall']))`
			`print("F1", np.mean(classifier['test_f1']))`
			`print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1])`
			`print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl]))`

			`cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])`
			`cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])`
			`cmodels[model_title]['metrics'][2] += np.mean(classifier['test_accuracy'])`
			`cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])`

			`# %% jupyter={"source_hidden": true}`
			`# Get overall results`
			`for model_title, model in cmodels.items():`
			`print("\n************************************\n")`
			`print("Current model:", model_title, end="\n")`
			`print("Acc", model['metrics'][0]/n_clusters)`
			`print("Precision", model['metrics'][1]/n_clusters)`
			`print("Recall", model['metrics'][2]/n_clusters)`
			`print("F1", model['metrics'][3]/n_clusters)`