Add 30 min features data and modify script.
parent
9f7fa0c8e0
commit
1d8dcf8b21
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,332 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# %matplotlib inline
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import yaml
|
||||
from pyprojroot import here
|
||||
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.dummy import DummyRegressor
|
||||
import xgboost as xg
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
import machine_learning.features_sensor
|
||||
import machine_learning.labels
|
||||
import machine_learning.model
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# ## PANAS negative affect
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input = pd.read_csv("../data/daily_18_hours_all_targets/input_PANAS_negative_affect_mean.csv")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
#if "pid" in model_input.columns:
|
||||
# index_columns.append("pid")
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
|
||||
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
train_x.dtypes
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
logo = LeaveOneGroupOut()
|
||||
logo.get_n_splits(
|
||||
train_x,
|
||||
data_y,
|
||||
groups=data_groups,
|
||||
)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
sum(data_y.isna())
|
||||
|
||||
# %% [markdown]
|
||||
# ### Baseline: Dummy Regression (mean)
|
||||
dummy_regr = DummyRegressor(strategy="mean")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lin_reg_scores = cross_validate(
|
||||
dummy_regr,
|
||||
X=train_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(lin_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Linear Regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lin_reg_rapids = linear_model.LinearRegression()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lin_reg_scores = cross_validate(
|
||||
lin_reg_rapids,
|
||||
X=train_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(lin_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### XGBRegressor Linear Regression
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_reg_scores = cross_validate(
|
||||
xgb_r,
|
||||
X=train_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(xgb_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(xgb_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(xgb_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(xgb_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### XGBRegressor Pseudo Huber Error Regression
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_psuedo_huber_reg_scores = cross_validate(
|
||||
xgb_psuedo_huber_r,
|
||||
X=train_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(xgb_psuedo_huber_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Ridge regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
ridge_reg = linear_model.Ridge(alpha=.5)
|
||||
|
||||
# %% tags=[] jupyter={"source_hidden": true}
|
||||
ridge_reg_scores = cross_validate(
|
||||
ridge_reg,
|
||||
X=train_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(ridge_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(ridge_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(ridge_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(ridge_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Lasso
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lasso_reg = linear_model.Lasso(alpha=0.1)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lasso_reg_score = cross_validate(
|
||||
lasso_reg,
|
||||
X=train_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(lasso_reg_score['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(lasso_reg_score['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(lasso_reg_score['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(lasso_reg_score['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Bayesian Ridge
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
bayesian_ridge_reg = linear_model.BayesianRidge()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
bayesian_ridge_reg_score = cross_validate(
|
||||
bayesian_ridge_reg,
|
||||
X=train_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### RANSAC (outlier robust regression)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
ransac_reg = linear_model.RANSACRegressor()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
ransac_reg_scores = cross_validate(
|
||||
ransac_reg,
|
||||
X=train_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(ransac_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(ransac_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(ransac_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(ransac_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Support vector regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
svr = svm.SVR()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
svr_scores = cross_validate(
|
||||
svr,
|
||||
X=train_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(svr_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(svr_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(svr_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(svr_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Kernel Ridge regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
kridge = kernel_ridge.KernelRidge()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
kridge_scores = cross_validate(
|
||||
kridge,
|
||||
X=train_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(kridge_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(kridge_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(kridge_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(kridge_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Gaussian Process Regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
gpr = gaussian_process.GaussianProcessRegressor()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
|
||||
gpr_scores = cross_validate(
|
||||
gpr,
|
||||
X=train_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(gpr_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(gpr_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(gpr_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(gpr_scores['test_r2']))
|
||||
|
||||
# %%
|
|
@ -50,7 +50,7 @@ import machine_learning.model
|
|||
# ## PANAS negative affect
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input = pd.read_csv("../data/daily_18_hours_all_targets/input_PANAS_negative_affect_mean.csv")
|
||||
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_PANAS_negative_affect_mean.csv")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
|
@ -0,0 +1 @@
|
|||
Subproject commit a620def209e9f043f852097d0318cb45bde74467
|
Loading…
Reference in New Issue