# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats

from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans

from lightgbm import LGBMClassifier
import xgboost as xg

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import machine_learning.labels
import machine_learning.model

# %% [markdown]
# # RAPIDS models

# %% [markdown]
# ## JCQ job demand

# %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
lime_cols = [col for col in model_input if col.startswith('limesurvey_demand')]
lime_col = 'limesurvey_demand_control_ratio'
model_input[lime_col].describe()

# %% jupyter={"source_hidden": true}
# Filter out outlier rows by lime_col (keep rows with |z-score| < 3)
model_input = model_input[(np.abs(stats.zscore(model_input[lime_col])) < 3)]

uniq = model_input[[lime_col, 'pid']].drop_duplicates().reset_index(drop=True)
plt.bar(uniq['pid'], uniq[lime_col])

# %% jupyter={"source_hidden": true}
# Get clusters by lime_col and merge the cluster labels back into the main df
km = KMeans(n_clusters=5).fit_predict(uniq.set_index('pid'))
np.unique(km, return_counts=True)
uniq['cluster'] = km
uniq

model_input = model_input.merge(uniq[['pid', 'cluster']])

# %% jupyter={"source_hidden": true}
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
model_input.set_index(index_columns, inplace=True)

# %% jupyter={"source_hidden": true}
for k in range(5):
    model_input_subset = model_input[model_input["cluster"] == k].copy()

    # Bin the z-scored target into low/medium/high, drop the medium class,
    # and binarise the remainder (low -> 0, high -> 1)
    bins = [-10, -1, 1, 10]  # bins for z-scored targets
    model_input_subset.loc[:, 'target'] = \
        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False)
    model_input_subset['target'].value_counts()
    model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
    model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)

    model_input_subset['target'].value_counts()

    cv_method_str = 'logo'  # logo, halflogo, 5kfold
    if cv_method_str == 'halflogo':
        # Split each participant's rows into two halves so that leave-one-group-out
        # leaves out half a participant at a time
        model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
        model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')

        model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
        model_input_subset["pid_half"] = model_input_subset["pid"] + "_" + model_input_subset["pid_index"].astype(int).astype(str)

        data_x = model_input_subset.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1)
        data_y = model_input_subset["target"]
        data_groups = model_input_subset["pid_half"]
    else:
        data_x = model_input_subset.drop(["target", "pid"], axis=1)
        data_y = model_input_subset["target"]
        data_groups = model_input_subset["pid"]

    # Treat categorical features: impute missing values with the mode, then one-hot encode
    categorical_feature_colnames = ["gender", "startlanguage"]
    additional_categorical_features = []  # [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
    categorical_feature_colnames += additional_categorical_features

    categorical_features = data_x[categorical_feature_colnames].copy()
    mode_categorical_features = categorical_features.mode().iloc[0]

    # fillna with mode
    categorical_features = categorical_features.fillna(mode_categorical_features)

    # one-hot encoding
    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)

    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
    train_x = pd.concat([numerical_features, categorical_features], axis=1)

    # Establish cv method
    cv_method = None  # Defaults to 5-fold CV in cross_validate
    if cv_method_str == 'logo' or cv_method_str == 'halflogo':
        cv_method = LeaveOneGroupOut()
        cv_method.get_n_splits(
            train_x,
            data_y,
            groups=data_groups,
        )

    n = 3  # number of best/worst fold accuracies to report per model
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')

    # Create dict with classification ML models
    cmodels = {
        'dummy_classifier': DummyClassifier(strategy="most_frequent"),
        'logistic_regression': linear_model.LogisticRegression(),
        'support_vector_machine': svm.SVC(),
        'gaussian_naive_bayes': naive_bayes.GaussianNB(),
        'stochastic_gradient_descent_classifier': linear_model.SGDClassifier(),
        'knn': neighbors.KNeighborsClassifier(),
        'decision_tree': tree.DecisionTreeClassifier(),
        'random_forest_classifier': ensemble.RandomForestClassifier(),
        'gradient_boosting_classifier': ensemble.GradientBoostingClassifier(),
        'lgbm_classifier': LGBMClassifier(),
        'XGBoost_classifier': xg.sklearn.XGBClassifier(),
    }

    for model_title, model in cmodels.items():
        classifier = cross_validate(
            model,
            X=imputer.fit_transform(train_x),
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
            error_score='raise',
            scoring=('accuracy', 'average_precision', 'recall', 'f1'),
        )

        print("\n-------------------------------------\n")
        print("Current cluster:", k)
        print("Current model:", model_title)
        print("Acc", np.median(classifier['test_accuracy']))
        print("Precision", np.median(classifier['test_average_precision']))
        print("Recall", np.median(classifier['test_recall']))
        print("F1", np.median(classifier['test_f1']))
        print(f"Largest {n} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n)[:n])[::-1])
        print(f"Smallest {n} ACC:", np.sort(np.partition(classifier['test_accuracy'], n)[:n]))

# %%