# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats

from sklearn.model_selection import LeaveOneGroupOut, cross_validate, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.dummy import DummyClassifier
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
from lightgbm import LGBMClassifier
import xgboost as xg
from sklearn.cluster import KMeans

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import machine_learning.labels
import machine_learning.model
from machine_learning.classification_models import ClassificationModels

# %% [markdown]
# # RAPIDS models

# %% [markdown]
# ## Useful method


# %%
def treat_categorical_features(input_set):
    categorical_feature_colnames = ["gender", "startlanguage"]
    additional_categorical_features = [
        col
        for col in input_set.columns
        if "mostcommonactivity" in col or "homelabel" in col
    ]
    categorical_feature_colnames += additional_categorical_features

    categorical_features = input_set[categorical_feature_colnames].copy()
    mode_categorical_features = categorical_features.mode().iloc[0]

    # Fill NaNs with the column mode
    categorical_features = categorical_features.fillna(mode_categorical_features)

    # One-hot encoding
    categorical_features = categorical_features.apply(
        lambda col: col.astype("category")
    )
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)

    numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
    return pd.concat([numerical_features, categorical_features], axis=1)


# %% [markdown]
# ## Set script's parameters

# %%
n_clusters = 3  # number of clusters (could be regarded as a hyperparameter)
n_sl = 3  # number of largest/smallest accuracies (of particular CV) outputs

# %% jupyter={"source_hidden": true}
model_input = pd.read_csv(
    "../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
)
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]

clust_col = model_input.set_index(index_columns).var().idxmax()
# 'age' is the column with the highest variance

model_input.columns[list(model_input.columns).index('age'):-1]

lime_cols = [col for col in model_input if col.startswith('limesurvey')]
lime_cols
lime_col = 'limesurvey_demand_control_ratio'
# Cluster on the limesurvey demand/control ratio instead of the
# highest-variance column
clust_col = lime_col

model_input[clust_col].describe()

# %% jupyter={"source_hidden": true}
# Filter out outlier rows by clust_col
model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]

uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
plt.bar(uniq['pid'], uniq[clust_col])

# %% jupyter={"source_hidden": true}
# Get clusters by the cluster column and merge them into the main df
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
np.unique(km, return_counts=True)
uniq['cluster'] = km
uniq

model_input = model_input.merge(uniq[['pid', 'cluster']])
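# %% [markdown]
# `n_clusters` is fixed at 3 above. As a quick, purely illustrative sanity check
# of that choice (not part of the original pipeline), one could compare a few
# candidate cluster counts on the same one-column per-participant input using
# silhouette scores:

# %% jupyter={"source_hidden": true}
# Hedged sketch: score candidate k values on the per-participant clust_col values.
from sklearn.metrics import silhouette_score

cluster_input = uniq.set_index('pid')[[clust_col]]
for k_candidate in range(2, 6):
    candidate_labels = KMeans(n_clusters=k_candidate, random_state=42).fit_predict(
        cluster_input
    )
    print(k_candidate, silhouette_score(cluster_input, candidate_labels))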
jupyter={"source_hidden": true} model_input.set_index(index_columns, inplace=True) # %% jupyter={"source_hidden": true} # Create dict with classification ml models cm = ClassificationModels() cmodels = cm.get_cmodels() # %% jupyter={"source_hidden": true} for k in range(n_clusters): model_input_subset = model_input[model_input["cluster"] == k].copy() # Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, seperated by z-score of 0. model_input_subset['numerical_target'] = model_input_subset['target'] bins = [-10, 0, 10] # bins for z-scored targets model_input_subset.loc[:, 'target'] = \ pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True) p15 = np.percentile(model_input_subset['numerical_target'], 15) p85 = np.percentile(model_input_subset['numerical_target'], 85) # Treat categorical features model_input_subset = treat_categorical_features(model_input_subset) # Split to train, validate, and test subsets train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1) test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1) train_set['target'].value_counts() test_set['target'].value_counts() train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"] validate_x, test_x, validate_y, test_y = \ train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42) # Impute missing values imputer = SimpleImputer(missing_values=np.nan, strategy='median') train_x = imputer.fit_transform(train_x) validate_x = imputer.fit_transform(validate_x) test_x = imputer.fit_transform(test_x) for model_title, model in cmodels.items(): model['model'].fit(train_x, train_y) y_pred = model['model'].predict(validate_x) acc = accuracy_score(validate_y, y_pred) prec = precision_score(validate_y, y_pred) rec = recall_score(validate_y, y_pred) f1 = f1_score(validate_y, y_pred) print("\n-------------------------------------\n") print("Current cluster:", k, end="\n") print("Current model:", model_title, end="\n") print("Acc", acc) print("Precision", prec) print("Recall", rec) print("F1", f1) cmodels[model_title]['metrics'][0] += acc cmodels[model_title]['metrics'][1] += prec cmodels[model_title]['metrics'][2] += rec cmodels[model_title]['metrics'][3] += f1 # %% jupyter={"source_hidden": true} # Get overall results cm.get_total_models_scores(n_clusters=n_clusters)