Compare commits

..

No commits in common. "8b8d626cf0091d42856604ec90a37a24638f585e" and "6295cc8e9140ffdf4c7bc80669cd12350df75b48" have entirely different histories.

8 changed files with 82 additions and 3818 deletions

2
.gitignore vendored
View File

@ -7,5 +7,3 @@ __pycache__/
/statistical_analysis/*.ipynb /statistical_analysis/*.ipynb
/machine_learning/intermediate_results/ /machine_learning/intermediate_results/
/data/features/ /data/features/
/data/baseline/
/data/*input*.csv

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,272 +0,0 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %% jupyter={"source_hidden": true}
# %matplotlib inline
import datetime
import importlib
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
import machine_learning.features_sensor
import machine_learning.labels
import machine_learning.model
# %% [markdown]
# # RAPIDS models
# %% [markdown]
# ## PANAS negative affect
# %% jupyter={"source_hidden": true}
# model_input = pd.read_csv("../data/input_PANAS_NA.csv") # Nestandardizirani podatki
model_input = pd.read_csv("../data/z_input_PANAS_NA.csv") # Standardizirani podatki
# %% [markdown]
# ### NaNs before dropping cols and rows
# %% jupyter={"source_hidden": true}
sns.set(rc={"figure.figsize":(16, 8)})
sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
# %% jupyter={"source_hidden": true}
nan_cols = list(model_input.loc[:, model_input.isna().all()].columns)
nan_cols
# %% jupyter={"source_hidden": true}
model_input.dropna(axis=1, how="all", inplace=True)
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
# %% [markdown]
# ### NaNs after dropping NaN cols and rows where target is NaN
# %% jupyter={"source_hidden": true}
sns.set(rc={"figure.figsize":(16, 8)})
sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
# %% jupyter={"source_hidden": true}
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
#if "pid" in model_input.columns:
# index_columns.append("pid")
model_input.set_index(index_columns, inplace=True)
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
# %% jupyter={"source_hidden": true}
categorical_feature_colnames = ["gender", "startlanguage"]
# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()
# %% jupyter={"source_hidden": true}
mode_categorical_features = categorical_features.mode().iloc[0]
# %% jupyter={"source_hidden": true}
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)
# %% jupyter={"source_hidden": true}
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
categorical_features = pd.get_dummies(categorical_features)
# %% jupyter={"source_hidden": true}
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
# %% jupyter={"source_hidden": true}
train_x = pd.concat([numerical_features, categorical_features], axis=1)
# %% jupyter={"source_hidden": true}
train_x.dtypes
# %% jupyter={"source_hidden": true}
logo = LeaveOneGroupOut()
logo.get_n_splits(
train_x,
data_y,
groups=data_groups,
)
# %% jupyter={"source_hidden": true}
sum(data_y.isna())
# %% [markdown]
# ### Linear Regression
# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# %% jupyter={"source_hidden": true}
lin_reg_scores = cross_val_score(
lin_reg_rapids,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring='r2'
)
lin_reg_scores
np.median(lin_reg_scores)
# %% [markdown]
# ### Ridge regression
# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=.5)
# %% tags=[] jupyter={"source_hidden": true}
ridge_reg_scores = cross_val_score(
ridge_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
np.median(ridge_reg_scores)
# %% [markdown]
# ### Lasso
# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)
# %% jupyter={"source_hidden": true}
lasso_reg_score = cross_val_score(
lasso_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
np.median(lasso_reg_score)
# %% [markdown]
# ### Bayesian Ridge
# %% jupyter={"source_hidden": true}
bayesian_ridge_reg = linear_model.BayesianRidge()
# %% jupyter={"source_hidden": true}
bayesian_ridge_reg_score = cross_val_score(
bayesian_ridge_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
np.median(bayesian_ridge_reg_score)
# %% [markdown]
# ### RANSAC (outlier robust regression)
# %% jupyter={"source_hidden": true}
ransac_reg = linear_model.RANSACRegressor()
# %% jupyter={"source_hidden": true}
np.median(
cross_val_score(
ransac_reg,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
)
# %% [markdown]
# ### Support vector regression
# %% jupyter={"source_hidden": true}
svr = svm.SVR()
# %% jupyter={"source_hidden": true}
np.median(
cross_val_score(
svr,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
)
# %% [markdown]
# ### Kernel Ridge regression
# %% jupyter={"source_hidden": true}
kridge = kernel_ridge.KernelRidge()
# %% jupyter={"source_hidden": true}
np.median(
cross_val_score(
kridge,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
)
# %% [markdown]
# ### Gaussian Process Regression
# %% jupyter={"source_hidden": true}
gpr = gaussian_process.GaussianProcessRegressor()
# %% jupyter={"source_hidden": true}
np.median(
cross_val_score(
gpr,
X=imputer.fit_transform(train_x),
y=data_y,
groups=data_groups,
cv=logo,
n_jobs=-1,
scoring="r2"
)
)
# %%

File diff suppressed because it is too large Load Diff

View File

@ -7,7 +7,7 @@
# extension: .py # extension: .py
# format_name: percent # format_name: percent
# format_version: '1.3' # format_version: '1.3'
# jupytext_version: 1.13.0 # jupytext_version: 1.11.2
# kernelspec: # kernelspec:
# display_name: straw2analysis # display_name: straw2analysis
# language: python # language: python
@ -17,7 +17,6 @@
# %% # %%
import os import os
import sys import sys
import datetime
import seaborn as sns import seaborn as sns
@ -27,7 +26,6 @@ if nb_dir not in sys.path:
import participants.query_db import participants.query_db
from features.esm import * from features.esm import *
from features.esm_JCQ import * from features.esm_JCQ import *
from features.esm_SAM import *
# %% # %%
participants_inactive_usernames = participants.query_db.get_usernames( participants_inactive_usernames = participants.query_db.get_usernames(
@ -101,12 +99,6 @@ df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
# %% [markdown] # %% [markdown]
# # Stress appraisal measure # # Stress appraisal measure
# %%
df_SAM_all = extract_stressful_events(df_esm_inactive)
# %%
df_SAM_all.head()
# %% # %%
df_esm_SAM = df_esm_preprocessed[ df_esm_SAM = df_esm_preprocessed[
(df_esm_preprocessed["questionnaire_id"] >= 87) (df_esm_preprocessed["questionnaire_id"] >= 87)

2
rapids

@ -1 +1 @@
Subproject commit f78aa3e7b3567423b44045766b230cd60d557cb0 Subproject commit bf9c764c97f076f4af288f7afa1a32931996b2db

View File

@ -6,7 +6,7 @@
# extension: .py # extension: .py
# format_name: percent # format_name: percent
# format_version: '1.3' # format_version: '1.3'
# jupytext_version: 1.13.0 # jupytext_version: 1.12.0
# kernelspec: # kernelspec:
# display_name: straw2analysis # display_name: straw2analysis
# language: python # language: python
@ -14,7 +14,25 @@
# --- # ---
# %% # %%
SAVE_FIGS = False # %matplotlib inline
import datetime
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
sys.path.append(nb_dir)
import participants.query_db
from features.esm import *
# %%
SAVE_FIGS = True
FIG_HEIGHT = 5 FIG_HEIGHT = 5
FIG_ASPECT = 1.7 FIG_ASPECT = 1.7
FIG_COLOUR = "#28827C" FIG_COLOUR = "#28827C"