# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Make the parent of the working directory importable so that the
# project-local `machine_learning` package below can be found.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import machine_learning.features_sensor
import machine_learning.labels
import machine_learning.model

# %% [markdown]
# # RAPIDS models

# %% [markdown]
# ## PANAS negative affect

# %% jupyter={"source_hidden": true}
# Alternative input: non-standardized data, before thorough cleaning.
# model_input = pd.read_csv("../data/input_PANAS_NA.csv")
# Standardized data, before thorough cleaning.
model_input = pd.read_csv("../data/z_input_PANAS_NA.csv")

# %% [markdown]
# ### NaNs before dropping cols and rows

# %% jupyter={"source_hidden": true}
sns.set(rc={"figure.figsize": (16, 8)})
sns.heatmap(model_input.sort_values("pid").set_index("pid").isna(), cbar=False)

# %% jupyter={"source_hidden": true}
# Columns that contain no values at all.
nan_cols = list(model_input.loc[:, model_input.isna().all()].columns)
nan_cols

# %% jupyter={"source_hidden": true}
# Remove the all-NaN columns, then every row that has no target value.
model_input.dropna(axis=1, how="all", inplace=True)
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)

# %% [markdown]
# ### NaNs after dropping NaN cols and rows where target is NaN

# %% jupyter={"source_hidden": true}
sns.set(rc={"figure.figsize": (16, 8)})
sns.heatmap(model_input.sort_values("pid").set_index("pid").isna(), cbar=False)

# %% jupyter={"source_hidden": true}
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
model_input.set_index(index_columns, inplace=True)

# Cross-validation scheme:
#   "logo"      - leave one group out, one group per participant (pid)
#   "half_logo" - leave one group out, two groups per participant (pid_half)
#   other value - scikit-learn's default 5-fold split (groups are not used)
cv_method = "5kfold"

if cv_method == "logo":
    data_x = model_input.drop(["target", "pid"], axis=1)
    data_y = model_input["target"]
    data_groups = model_input["pid"]
else:
    # Position of each row within its participant and the participant's row
    # count; their ratio lies in [0, 1), so adding 1 and rounding labels rows
    # 1 or 2, i.e. roughly the first and the second half of each participant.
    model_input["pid_index"] = model_input.groupby("pid").cumcount()
    model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
    model_input["pid_index"] = (
        model_input["pid_index"] / model_input["pid_count"] + 1
    ).round()
    model_input["pid_half"] = (
        model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
    )

    # Every helper column (including pid_count) must be removed, otherwise it
    # would be used as a predictor.
    helper_columns = ["pid_index", "pid_count", "pid_half"]
    data_x = model_input.drop(["target", "pid"] + helper_columns, axis=1)
    data_y = model_input["target"]
    data_groups = model_input["pid_half"]

# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]

# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()

# %% jupyter={"source_hidden": true}
mode_categorical_features = categorical_features.mode().iloc[0]

# %% jupyter={"source_hidden": true}
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)

# %% jupyter={"source_hidden": true}
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

# %% jupyter={"source_hidden": true}
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)

# %% jupyter={"source_hidden": true}
train_x = pd.concat([numerical_features, categorical_features], axis=1)

# %% jupyter={"source_hidden": true}
train_x.dtypes

# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
)

# Defaults to 5 k folds in cross_validate method
if cv_method != "logo" and cv_method != "half_logo":
    logo = None

# %% jupyter={"source_hidden": true}
sum(data_y.isna())

# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")


# %% jupyter={"source_hidden": true}
def cross_validated_r2(estimator):
    """Return the per-fold R^2 scores of `estimator` on the notebook data.

    The mean imputer is chained in front of the estimator, so it is fitted on
    the training part of every split only; held-out rows never contribute to
    the imputed means. Uses the notebook globals `train_x`, `data_y`,
    `data_groups`, `logo` and `imputer`.
    """
    model = make_pipeline(imputer, estimator)
    return cross_val_score(
        model,
        X=train_x,
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring="r2",
    )


# %% [markdown]
# ### Linear Regression

# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()

# %% jupyter={"source_hidden": true}
lin_reg_scores = cross_validated_r2(lin_reg_rapids)
np.median(lin_reg_scores)

# %% [markdown]
# ### Ridge regression

# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=.5)

# %% tags=[] jupyter={"source_hidden": true}
ridge_reg_scores = cross_validated_r2(ridge_reg)
np.median(ridge_reg_scores)

# %% [markdown]
# ### Lasso

# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)

# %% jupyter={"source_hidden": true}
lasso_reg_score = cross_validated_r2(lasso_reg)
np.median(lasso_reg_score)

# %% [markdown]
# ### Bayesian Ridge

# %% jupyter={"source_hidden": true}
bayesian_ridge_reg = linear_model.BayesianRidge()

# %% jupyter={"source_hidden": true}
bayesian_ridge_reg_score = cross_validated_r2(bayesian_ridge_reg)
np.median(bayesian_ridge_reg_score)

# %% [markdown]
# ### RANSAC (outlier robust regression)

# %% jupyter={"source_hidden": true}
ransac_reg = linear_model.RANSACRegressor()

# %% jupyter={"source_hidden": true}
np.median(cross_validated_r2(ransac_reg))

# %% [markdown]
# ### Support vector regression

# %% jupyter={"source_hidden": true}
svr = svm.SVR()

# %% jupyter={"source_hidden": true}
np.median(cross_validated_r2(svr))

# %% [markdown]
# ### Kernel Ridge regression

# %% jupyter={"source_hidden": true}
kridge = kernel_ridge.KernelRidge()

# %% jupyter={"source_hidden": true}
np.median(cross_validated_r2(kridge))

# %% [markdown]
# ### Gaussian Process Regression

# %% jupyter={"source_hidden": true}
gpr = gaussian_process.GaussianProcessRegressor()

# %% jupyter={"source_hidden": true}
np.median(cross_validated_r2(gpr))

# %%