Merge branch 'ml_pipeline' into rapids

# Conflicts:
#	exploration/ex_ml_pipeline.py
rapids
junos 2022-08-31 16:20:11 +02:00
commit 8b8d626cf0
6 changed files with 3815 additions and 81 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,272 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
import machine_learning.features_sensor
import machine_learning.labels
import machine_learning.model
# %% [markdown]
# # RAPIDS models
# %% [markdown]
# ## PANAS negative affect
# %% jupyter={"source_hidden": true}
# model_input = pd.read_csv("../data/input_PANAS_NA.csv") # Nestandardizirani podatki
model_input = pd.read_csv("../data/z_input_PANAS_NA.csv") # Standardizirani podatki
# %% [markdown]
# ### NaNs before dropping cols and rows
# %% jupyter={"source_hidden": true}
sns.set(rc={"figure.figsize":(16, 8)})
sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
# %% jupyter={"source_hidden": true}
nan_cols = list(model_input.loc[:, model_input.isna().all()].columns)
nan_cols
# %% jupyter={"source_hidden": true}
model_input.dropna(axis=1, how="all", inplace=True)
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
# %% [markdown]
# ### NaNs after dropping NaN cols and rows where target is NaN
# %% jupyter={"source_hidden": true}
sns.set(rc={"figure.figsize":(16, 8)})
sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
# %% jupyter={"source_hidden": true}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
#if "pid" in model_input.columns:
# index_columns.append("pid")
model_input.set_index(index_columns, inplace=True)
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]
# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()
# %% jupyter={"source_hidden": true}
mode_categorical_features = categorical_features.mode().iloc[0]
# %% jupyter={"source_hidden": true}
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# %% jupyter={"source_hidden": true}
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
# %% jupyter={"source_hidden": true}
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
# %% jupyter={"source_hidden": true}
train_x = pd.concat([numerical_features, categorical_features], axis=1)
# %% jupyter={"source_hidden": true}
train_x.dtypes
# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
train_x,
data_y,
groups=data_groups,
)
# %% jupyter={"source_hidden": true}
sum(data_y.isna())
# %% [markdown]
# ### Linear Regression
# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# %% jupyter={"source_hidden": true}
lin_reg_scores = cross_val_score(
lin_reg_rapids,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring='r2'
)
lin_reg_scores
np.median(lin_reg_scores)
# %% [markdown]
# ### Ridge regression
# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=.5)
# %% tags=[] jupyter={"source_hidden": true}
ridge_reg_scores = cross_val_score(
ridge_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
np.median(ridge_reg_scores)
# %% [markdown]
# ### Lasso
# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)
# %% jupyter={"source_hidden": true}
lasso_reg_score = cross_val_score(
lasso_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
np.median(lasso_reg_score)
# %% [markdown]
# ### Bayesian Ridge
# %% jupyter={"source_hidden": true}
bayesian_ridge_reg = linear_model.BayesianRidge()
# %% jupyter={"source_hidden": true}
bayesian_ridge_reg_score = cross_val_score(
bayesian_ridge_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
np.median(bayesian_ridge_reg_score)
# %% [markdown]
# ### RANSAC (outlier robust regression)
# %% jupyter={"source_hidden": true}
ransac_reg = linear_model.RANSACRegressor()
# %% jupyter={"source_hidden": true}
np.median(
cross_val_score(
ransac_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
)
# %% [markdown]
# ### Support vector regression
# %% jupyter={"source_hidden": true}
svr = svm.SVR()
# %% jupyter={"source_hidden": true}
np.median(
cross_val_score(
svr,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
)
# %% [markdown]
# ### Kernel Ridge regression
# %% jupyter={"source_hidden": true}
kridge = kernel_ridge.KernelRidge()
# %% jupyter={"source_hidden": true}
np.median(
cross_val_score(
kridge,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
)
# %% [markdown]
# ### Gaussian Process Regression
# %% jupyter={"source_hidden": true}
gpr = gaussian_process.GaussianProcessRegressor()
# %% jupyter={"source_hidden": true}
np.median(
cross_val_score(
gpr,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
)
# %%

File diff suppressed because it is too large Load Diff

View File

@ -7,7 +7,7 @@
# extension: .py # extension: .py
# format_name: percent # format_name: percent
# format_version: '1.3' # format_version: '1.3'
# jupytext_version: 1.11.2 # jupytext_version: 1.13.0
# kernelspec: # kernelspec:
# display_name: straw2analysis # display_name: straw2analysis
# language: python # language: python
@ -17,6 +17,7 @@
# %% # %%
import os import os
import sys import sys
import datetime
import seaborn as sns import seaborn as sns
@ -26,6 +27,7 @@ if nb_dir not in sys.path:
import participants.query_db import participants.query_db
from features.esm import * from features.esm import *
from features.esm_JCQ import * from features.esm_JCQ import *
from features.esm_SAM import *
# %% # %%
participants_inactive_usernames = participants.query_db.get_usernames( participants_inactive_usernames = participants.query_db.get_usernames(
@ -99,6 +101,12 @@ df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
# %% [markdown] # %% [markdown]
# # Stress appraisal measure # # Stress appraisal measure
# %%
df_SAM_all = extract_stressful_events(df_esm_inactive)
# %%
df_SAM_all.head()
# %% # %%
df_esm_SAM = df_esm_preprocessed[ df_esm_SAM = df_esm_preprocessed[
(df_esm_preprocessed["questionnaire_id"] >= 87) (df_esm_preprocessed["questionnaire_id"] >= 87)

View File

@ -6,7 +6,7 @@
# extension: .py # extension: .py
# format_name: percent # format_name: percent
# format_version: '1.3' # format_version: '1.3'
# jupytext_version: 1.12.0 # jupytext_version: 1.13.0
# kernelspec: # kernelspec:
# display_name: straw2analysis # display_name: straw2analysis
# language: python # language: python
@ -14,25 +14,7 @@
# --- # ---
# %% # %%
# %matplotlib inline SAVE_FIGS = False
import datetime
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
import participants.query_db
from features.esm import *
# %%
SAVE_FIGS = True
FIG_HEIGHT = 5 FIG_HEIGHT = 5
FIG_ASPECT = 1.7 FIG_ASPECT = 1.7
FIG_COLOUR = "#28827C" FIG_COLOUR = "#28827C"