Compare commits
63 Commits
e33a49c9fc
...
12f2c927fa
Author | SHA1 | Date |
---|---|---|
junos | 12f2c927fa | |
junos | 71e1fcf8ca | |
Primoz | cf0e4f89be | |
Primoz | 7504aa34cf | |
Primoz | 9a218c8e2a | |
Primoz | 98f78d72fc | |
Primoz | 218b684514 | |
Primoz | ddde80b421 | |
Primoz | 7afef5582f | |
Primoz | 183758cd37 | |
Primoz | 40029a8205 | |
Primoz | ae0f54ecc2 | |
Primoz | 8defb271c9 | |
junos | ae2d7a038d | |
junos | 389198b17f | |
junos | c462d55096 | |
junos | a5c09a292f | |
junos | 848416bf6a | |
Primoz | b59798df26 | |
Primoz | 87ebb9f296 | |
Primoz | 1d8dcf8b21 | |
Primoz | 9f7fa0c8e0 | |
Primoz | cdff4da930 | |
Primoz | ad5f50babe | |
Primoz | 466cd3dc23 | |
junos | 2d2f0b916f | |
junos | 1dcd060211 | |
junos | 8b8d626cf0 | |
junos | 75782a53c2 | |
Primoz | 27b2282ee0 | |
junos | a8fd96d2f1 | |
junos | 79fd5298be | |
junos | ff006c0834 | |
junos | 6295cc8e91 | |
junos | 360ec7de4b | |
junos | e177b15058 | |
junos | 832eb6137e | |
junos | 702b091d73 | |
junos | 257a044227 | |
junos | ae358f1e24 | |
junos | 4dee4b6fc1 | |
junos | ed3483ace4 | |
junos | de10269d36 | |
junos | 7e8e922d71 | |
junos | c4aacfffe1 | |
junos | 2e1e771b3d | |
junos | 54c712b1cd | |
junos | 375cb10c1e | |
junos | 689535e444 | |
junos | f5c17aa7ce | |
junos | 7d0355d095 | |
junos | 6eca98962f | |
junos | 0e6a18a660 | |
junos | cf3801b120 | |
junos | 578f5ed5f3 | |
junos | 6f9b513ba0 | |
junos | c2b27aef45 | |
junos | 4bb01ba871 | |
junos | 080dc1d1fe | |
junos | 4e87664fd6 | |
junos | c0ea17dbe3 | |
junos | 1d65a31d8c | |
junos | d4ba701202 |
|
@ -6,3 +6,8 @@ __pycache__/
|
|||
/config/*.ipynb
|
||||
/statistical_analysis/*.ipynb
|
||||
/machine_learning/intermediate_results/
|
||||
/data/features/
|
||||
/data/baseline/
|
||||
/data/*input*.csv
|
||||
/data/daily*
|
||||
/data/intradaily*
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
[submodule "rapids"]
|
||||
path = rapids
|
||||
url = https://repo.ijs.si/junoslukan/rapids.git
|
||||
branch = master
|
|
@ -4,4 +4,17 @@
|
|||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
<component name="RMarkdownSettings">
|
||||
<option name="renderProfiles">
|
||||
<map>
|
||||
<entry key="file://$PROJECT_DIR$/rapids/src/visualization/merge_heatmap_sensors_per_minute_per_time_segment.Rmd">
|
||||
<value>
|
||||
<RMarkdownRenderProfile>
|
||||
<option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/rapids/src/visualization" />
|
||||
</RMarkdownRenderProfile>
|
||||
</value>
|
||||
</entry>
|
||||
</map>
|
||||
</option>
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="SmkProjectSettings" sdk="Python 3.10 (snakemake)" enabled="true" />
|
||||
</project>
|
|
@ -2,5 +2,6 @@
|
|||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
<mapping directory="$PROJECT_DIR$/rapids" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
128
README.md
128
README.md
|
@ -32,4 +32,130 @@ To install:
|
|||
|
||||
```
|
||||
DB_PASSWORD=database-password
|
||||
```
|
||||
```
|
||||
|
||||
# RAPIDS
|
||||
|
||||
To install RAPIDS, follow the [instructions on their webpage](https://www.rapids.science/1.6/setup/installation/).
|
||||
|
||||
Here, I include additional information related to the installation and specific to the STRAW2analysis project.
|
||||
The installation was tested on Windows using Ubuntu 20.04 on Windows Subsystem for Linux ([WSL2](https://docs.microsoft.com/en-us/windows/wsl/install)).
|
||||
|
||||
## Custom configuration
|
||||
### Credentials
|
||||
|
||||
As mentioned under [Database in RAPIDS documentation](https://www.rapids.science/1.6/snippets/database/), a `credentials.yaml` file is needed to connect to a database.
|
||||
It should contain:
|
||||
|
||||
```yaml
|
||||
PSQL_STRAW:
|
||||
database: staw
|
||||
host: 212.235.208.113
|
||||
password: password
|
||||
port: 5432
|
||||
user: staw_db
|
||||
```
|
||||
|
||||
where `password` needs to be specified as well.
|
||||
|
||||
## Possible installation issues
|
||||
### Missing dependencies for RPostgres
|
||||
|
||||
When installing the `RPostgres` R package (used to connect to the PostgreSQL database), an error might occur:
|
||||
|
||||
```text
|
||||
------------------------- ANTICONF ERROR ---------------------------
|
||||
Configuration failed because libpq was not found. Try installing:
|
||||
* deb: libpq-dev (Debian, Ubuntu, etc)
|
||||
* rpm: postgresql-devel (Fedora, EPEL)
|
||||
* rpm: postgreql8-devel, psstgresql92-devel, postgresql93-devel, or postgresql94-devel (Amazon Linux)
|
||||
* csw: postgresql_dev (Solaris)
|
||||
* brew: libpq (OSX)
|
||||
If libpq is already installed, check that either:
|
||||
(i) 'pkg-config' is in your PATH AND PKG_CONFIG_PATH contains a libpq.pc file; or
|
||||
(ii) 'pg_config' is in your PATH.
|
||||
If neither can detect , you can set INCLUDE_DIR
|
||||
and LIB_DIR manually via:
|
||||
R CMD INSTALL --configure-vars='INCLUDE_DIR=... LIB_DIR=...'
|
||||
--------------------------[ ERROR MESSAGE ]----------------------------
|
||||
<stdin>:1:10: fatal error: libpq-fe.h: No such file or directory
|
||||
compilation terminated.
|
||||
```
|
||||
|
||||
The library requires `libpq` for compiling from source, so install accordingly.
|
||||
|
||||
### Timezone environment variable for tidyverse (relevant for WSL2)
|
||||
|
||||
One of the R packages, `tidyverse`, might need access to the `TZ` environment variable during the installation.
|
||||
On Ubuntu 20.04 on WSL2 this triggers the following error:
|
||||
|
||||
```text
|
||||
> install.packages('tidyverse')
|
||||
|
||||
ERROR: configuration failed for package ‘xml2’
|
||||
System has not been booted with systemd as init system (PID 1). Can't operate.
|
||||
Failed to create bus connection: Host is down
|
||||
Warning in system("timedatectl", intern = TRUE) :
|
||||
running command 'timedatectl' had status 1
|
||||
Error in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]) :
|
||||
namespace ‘xml2’ 1.3.1 is already loaded, but >= 1.3.2 is required
|
||||
Calls: <Anonymous> ... namespaceImportFrom -> asNamespace -> loadNamespace
|
||||
Execution halted
|
||||
ERROR: lazy loading failed for package ‘tidyverse’
|
||||
```
|
||||
|
||||
This happens because WSL2 does not use the `timedatectl` service, which provides this variable.
|
||||
|
||||
```bash
|
||||
~$ timedatectl
|
||||
System has not been booted with systemd as init system (PID 1). Can't operate.
|
||||
Failed to create bus connection: Host is down
|
||||
```
|
||||
|
||||
and later
|
||||
|
||||
```bash
|
||||
Warning message:
|
||||
In system("timedatectl", intern = TRUE) :
|
||||
running command 'timedatectl' had status 1
|
||||
Execution halted
|
||||
```
|
||||
|
||||
This can be amended by setting the environment variable manually before attempting to install `tidyverse`:
|
||||
|
||||
```bash
|
||||
export TZ='Europe/Ljubljana'
|
||||
```
|
||||
|
||||
## Possible runtime issues
|
||||
### Unix end of line characters
|
||||
|
||||
Upon running rapids, an error might occur:
|
||||
|
||||
```bash
|
||||
/usr/bin/env: ‘python3\r’: No such file or directory
|
||||
```
|
||||
|
||||
This is due to Windows style end of line characters.
|
||||
To amend this, I added a `.gitattributes` file to force `git` to check out `rapids` using Unix EOL characters.
|
||||
If this still fails, `dos2unix` can be used to change them.
|
||||
|
||||
### System has not been booted with systemd as init system (PID 1)
|
||||
|
||||
See [the installation issue above](#timezone-environment-variable-for-tidyverse-relevant-for-wsl2).
|
||||
|
||||
## Update RAPIDS
|
||||
|
||||
To update RAPIDS, first pull and merge [origin](https://github.com/carissalow/rapids), such as with:
|
||||
|
||||
```commandline
|
||||
git fetch --progress "origin" refs/heads/master
|
||||
git merge --no-ff origin/master
|
||||
```
|
||||
|
||||
Next, update the conda and R virtual environments.
|
||||
|
||||
```bash
|
||||
R -e 'renv::restore(repos = c(CRAN = "https://packagemanager.rstudio.com/all/__linux__/focal/latest"))'
|
||||
```
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,323 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %%
|
||||
import os, sys
|
||||
import importlib
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# import plotly.graph_objects as go
|
||||
from importlib import util
|
||||
from pathlib import Path
|
||||
import yaml
|
||||
|
||||
# %%
|
||||
phone_data_yield = pd.read_csv(
|
||||
"../rapids/data/interim/p011/phone_yielded_timestamps_with_datetime.csv",
|
||||
parse_dates=["local_date_time"],
|
||||
)
|
||||
time_segments_labels = pd.read_csv(
|
||||
"../rapids/data/interim/time_segments/p011_time_segments_labels.csv"
|
||||
)
|
||||
|
||||
# %%
|
||||
# Strip the "_RR<digits>SS" suffix from the segment names.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the data unchanged.
phone_data_yield["assigned_segments"] = phone_data_yield[
    "assigned_segments"
].str.replace(r"_RR\d+SS#", "#", regex=True)
time_segments_labels["label"] = time_segments_labels["label"].str.replace(
    r"_RR\d+SS$", "", regex=True
)
|
||||
|
||||
|
||||
# %% tags=[]
|
||||
def filter_data_by_segment(data, time_segment):
    """Keep the rows assigned to ``time_segment`` and split out the segment bounds.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``assigned_segments`` string column whose entries contain
        items of the form ``[<label>#<datetime>,<datetime>;<13 digits>,<13 digits>]``.
        The frame passed in is left unmodified.
    time_segment : str
        Segment label to look for. It is interpolated verbatim into a regular
        expression, so regex metacharacters in the label are interpreted as such.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``assigned_segments`` and with two added columns:
        ``local_segment`` (``<label>#<datetime>,<datetime>``) and
        ``timestamps_segment`` (``<13 digits>,<13 digits>``). Rows that have no
        segment assigned, or that do not match ``time_segment``, are dropped.
    """
    # Work on a copy so that the caller's frame keeps its rows and columns.
    data = data.dropna(subset=["assigned_segments"]).copy()
    if data.empty:
        data["local_segment"] = data["timestamps_segment"] = None
        return data

    # Raw strings: the patterns are unchanged, but "\-", "\/" and "\[" are no
    # longer invalid escape sequences in ordinary string literals.
    datetime_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
    timestamps_regex = r"[0-9]{13}"
    segment_regex = r"\[({}#{},{};{},{})\]".format(
        time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
    )
    # A single capture group with expand=False yields a Series.
    data["local_segment"] = data["assigned_segments"].str.extract(
        segment_regex, expand=False
    )
    data = data.drop(columns=["assigned_segments"])
    data = data.dropna(subset=["local_segment"])
    if data.empty:
        # There are no rows belonging to time_segment after dropping NA.
        data["timestamps_segment"] = None
    else:
        # "<label>#<start>,<end>;<start_ts>,<end_ts>" -> two columns at the ";".
        data[["local_segment", "timestamps_segment"]] = data["local_segment"].str.split(
            pat=";", n=1, expand=True
        )

    # Chunk episodes.
    # NOTE(review): chunk_episodes is not defined in this notebook; it must be
    # provided elsewhere before episode data (start/end timestamps) is passed in.
    if (
        (not data.empty)
        and ("start_timestamp" in data.columns)
        and ("end_timestamp" in data.columns)
    ):
        data = chunk_episodes(data)

    return data
|
||||
|
||||
|
||||
# %% tags=[]
|
||||
time_segment = "daily"
|
||||
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
|
||||
|
||||
# %%
|
||||
phone_data_yield.tail()
|
||||
|
||||
# %%
|
||||
phone_data_yield_per_segment.tail()
|
||||
|
||||
|
||||
# %%
|
||||
def getDataForPlot(phone_data_yield_per_segment):
    """Build a (segment start x minute) matrix of the ``sensor`` column.

    Parameters
    ----------
    phone_data_yield_per_segment : pandas.DataFrame
        Must contain ``timestamps_segment`` ("<start_ms>,<end_ms>" strings),
        ``local_segment`` ("<label>#<start datetime>,<end datetime>" strings),
        ``local_date``, ``local_hour``, ``local_minute``, ``sensor`` and a
        datetime ``local_date_time`` column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per segment start datetime (used as the index) and one column
        per minute offset ``0..length``; minutes without data hold 0.
    """
    # Copy first so the helper column "length" does not leak into the caller's frame.
    yield_data = phone_data_yield_per_segment.copy()

    # Length (in whole minutes) of each segment instance, from its ms bounds.
    yield_data["length"] = (
        yield_data["timestamps_segment"]
        .str.split(",")
        .apply(lambda bounds: int((int(bounds[1]) - int(bounds[0])) / (1000 * 60)))
    )

    # One row per segment and minute, keeping the maximum of "sensor" and
    # "local_date_time" within that minute.
    per_minute = (
        yield_data.groupby(
            ["local_segment", "length", "local_date", "local_hour", "local_minute"]
        )[["sensor", "local_date_time"]]
        .max()
        .reset_index()
    )

    # Local start datetime of the segment, parsed out of "local_segment".
    per_minute["local_segment_start_datetimes"] = pd.to_datetime(
        per_minute["local_segment"].apply(lambda seg: seg.split("#")[1].split(",")[0])
    )
    # Whole minutes elapsed since the local start datetime of the segment.
    per_minute["minutes_after_segment_start"] = (
        (per_minute["local_date_time"] - per_minute["local_segment_start_datetimes"])
        / pd.Timedelta(minutes=1)
    ).astype("int")

    # Full (segment start, minute) index used to impute missing minutes with 0.
    index_names = ["local_segment_start_datetimes", "minutes_after_segment_start"]
    segment_lengths = per_minute[
        ["local_segment_start_datetimes", "length"]
    ].drop_duplicates(keep="first")
    full_index = pd.MultiIndex.from_tuples(
        [
            (start, minute)
            for start, length in segment_lengths.itertuples(index=False, name=None)
            for minute in range(length + 1)
        ],
        names=index_names,
    )

    # Only "sensor" is needed from here on. De-duplicate the index columns first:
    # reindex() raises on duplicate labels, which happens when two rows map to
    # the same (segment start, minute) pair.
    per_minute = (
        per_minute[index_names + ["sensor"]]
        .drop_duplicates(subset=index_names, keep="first")
        .set_index(index_names)
        .reindex(full_index)
        .reset_index()
        .fillna(0)
    )

    # Transpose per segment start so minutes become columns, then discard the
    # extra index level added by the transpose.
    matrix = per_minute.groupby("local_segment_start_datetimes")[
        ["minutes_after_segment_start", "sensor"]
    ].apply(lambda grp: grp.set_index("minutes_after_segment_start").transpose())
    matrix.index = matrix.index.get_level_values("local_segment_start_datetimes")
    return matrix
|
||||
|
||||
|
||||
# %%
|
||||
data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
|
||||
|
||||
# %%
|
||||
# calculate the length (in minute) of per segment instance
|
||||
phone_data_yield_per_segment["length"] = (
|
||||
phone_data_yield_per_segment["timestamps_segment"]
|
||||
.str.split(",")
|
||||
.apply(lambda x: int((int(x[1]) - int(x[0])) / (1000 * 60)))
|
||||
)
|
||||
|
||||
# %%
|
||||
phone_data_yield_per_segment.tail()
|
||||
|
||||
# %%
|
||||
# calculate the number of sensors logged at least one row of data per minute.
|
||||
phone_data_yield_per_segment = (
|
||||
phone_data_yield_per_segment.groupby(
|
||||
["local_segment", "length", "local_date", "local_hour", "local_minute"]
|
||||
)[["sensor", "local_date_time"]]
|
||||
.max()
|
||||
.reset_index()
|
||||
)
|
||||
|
||||
# %%
|
||||
# extract local start datetime of the segment from "local_segment" column
|
||||
phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(
|
||||
phone_data_yield_per_segment["local_segment"].apply(
|
||||
lambda x: x.split("#")[1].split(",")[0]
|
||||
)
|
||||
)
|
||||
|
||||
# %%
|
||||
# calculate the number of minutes after local start datetime of the segment
|
||||
phone_data_yield_per_segment["minutes_after_segment_start"] = (
|
||||
(
|
||||
phone_data_yield_per_segment["local_date_time"]
|
||||
- phone_data_yield_per_segment["local_segment_start_datetimes"]
|
||||
)
|
||||
/ pd.Timedelta(minutes=1)
|
||||
).astype("int")
|
||||
|
||||
# %%
|
||||
columns_for_full_index = phone_data_yield_per_segment[
|
||||
["local_segment_start_datetimes", "length"]
|
||||
].drop_duplicates(keep="first")
|
||||
columns_for_full_index = columns_for_full_index.apply(
|
||||
lambda row: [
|
||||
[row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)
|
||||
],
|
||||
axis=1,
|
||||
)
|
||||
|
||||
# %%
|
||||
full_index = []
|
||||
for columns in columns_for_full_index:
|
||||
full_index = full_index + columns
|
||||
full_index = pd.MultiIndex.from_tuples(
|
||||
full_index, names=("local_segment_start_datetimes", "minutes_after_segment_start")
|
||||
)
|
||||
|
||||
# %%
|
||||
phone_data_yield_per_segment.tail()
|
||||
|
||||
# %% [markdown]
|
||||
# # A workaround
|
||||
|
||||
# %%
|
||||
phone_data_yield_per_segment["local_segment_start_datetimes", "minutes_after_segment_start"] = phone_data_yield_per_segment[
|
||||
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
||||
].drop_duplicates(keep="first")
|
||||
|
||||
# %%
|
||||
phone_data_yield_per_segment.set_index(
|
||||
["local_segment_start_datetimes", "minutes_after_segment_start"],
|
||||
verify_integrity=True,
|
||||
).reindex(full_index)
|
||||
|
||||
# %%
|
||||
phone_data_yield_per_segment.head()
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
# # Retry
|
||||
|
||||
# %%
|
||||
def getDataForPlot(phone_data_yield_per_segment):
    """Build a (segment start x minute) matrix of the ``sensor`` column.

    Rows that share the same segment start and minute offset are de-duplicated
    (first one kept) before the missing minutes are imputed with 0.

    Parameters
    ----------
    phone_data_yield_per_segment : pandas.DataFrame
        Must contain ``timestamps_segment`` ("<start_ms>,<end_ms>" strings),
        ``local_segment`` ("<label>#<start datetime>,<end datetime>" strings),
        ``local_date``, ``local_hour``, ``local_minute``, ``sensor`` and a
        datetime ``local_date_time`` column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per segment start datetime (used as the index) and one column
        per minute offset ``0..length``; minutes without data hold 0.
    """
    index_names = ["local_segment_start_datetimes", "minutes_after_segment_start"]

    # Copy so that adding "length" does not modify the caller's frame.
    frame = phone_data_yield_per_segment.copy()

    # Calculate the length (in minutes) of each segment instance.
    frame["length"] = (
        frame["timestamps_segment"]
        .str.split(",")
        .apply(lambda pair: int((int(pair[1]) - int(pair[0])) / (1000 * 60)))
    )

    # Collapse to one row per segment and minute (maximum of "sensor" and
    # "local_date_time" within the minute).
    frame = (
        frame.groupby(
            ["local_segment", "length", "local_date", "local_hour", "local_minute"]
        )[["sensor", "local_date_time"]]
        .max()
        .reset_index()
    )

    # Extract the local start datetime of the segment from "local_segment".
    frame["local_segment_start_datetimes"] = pd.to_datetime(
        frame["local_segment"].apply(lambda seg: seg.split("#")[1].split(",")[0])
    )

    # Number of whole minutes after the local start datetime of the segment.
    elapsed = frame["local_date_time"] - frame["local_segment_start_datetimes"]
    frame["minutes_after_segment_start"] = (elapsed / pd.Timedelta(minutes=1)).astype("int")

    # Every (segment start, minute) pair that should exist; built in one pass
    # instead of repeatedly concatenating lists.
    unique_segments = frame[["local_segment_start_datetimes", "length"]].drop_duplicates(
        keep="first"
    )
    full_index = pd.MultiIndex.from_tuples(
        [
            (start, minute)
            for start, length in unique_segments.itertuples(index=False, name=None)
            for minute in range(length + 1)
        ],
        names=index_names,
    )

    # Impute missing minutes with 0. Restricting to the columns used below keeps
    # fillna(0) from overwriting unrelated datetime/string columns.
    frame = (
        frame[index_names + ["sensor"]]
        .drop_duplicates(subset=index_names, keep="first")
        .set_index(index_names)
        .reindex(full_index)
        .reset_index()
        .fillna(0)
    )

    # Transpose the frame per segment start and discard the useless index layer.
    result = frame.groupby("local_segment_start_datetimes")[
        ["minutes_after_segment_start", "sensor"]
    ].apply(lambda grp: grp.set_index("minutes_after_segment_start").transpose())
    result.index = result.index.get_level_values("local_segment_start_datetimes")
    return result
|
||||
|
||||
|
||||
# %%
|
||||
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
|
||||
|
||||
# %%
|
||||
data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
|
||||
|
||||
# %%
|
File diff suppressed because it is too large
Load Diff
|
@ -6,7 +6,7 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.11.4
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
|
@ -74,3 +74,29 @@ rows_os_manufacturer = df_category_not_found["package_name"].str.contains(
|
|||
# %%
|
||||
with pd.option_context("display.max_rows", None, "display.max_columns", None):
|
||||
display(df_category_not_found.loc[~rows_os_manufacturer])
|
||||
|
||||
# %% [markdown]
|
||||
# # Export categories
|
||||
|
||||
# %% [markdown]
|
||||
# Rename all of "not_found" to "system" or "other".
|
||||
|
||||
# %%
|
||||
df_app_categories_to_export = df_app_categories.copy()
|
||||
rows_os_manufacturer_full = (df_app_categories_to_export["package_name"].str.contains(
|
||||
"|".join(manufacturers + custom_rom + other), case=False
|
||||
)) & (df_app_categories_to_export["play_store_genre"] == "not_found")
|
||||
df_app_categories_to_export.loc[rows_os_manufacturer_full, "play_store_genre"] = "System"
|
||||
|
||||
# %%
|
||||
rows_not_found = (df_app_categories_to_export["play_store_genre"] == "not_found")
|
||||
df_app_categories_to_export.loc[rows_not_found, "play_store_genre"] = "Other"
|
||||
|
||||
# %%
|
||||
df_app_categories_to_export["play_store_genre"].value_counts()
|
||||
|
||||
# %%
|
||||
df_app_categories_to_export.rename(columns={"play_store_genre": "genre"},inplace=True)
|
||||
df_app_categories_to_export.to_csv("../data/app_categories.csv", columns=["package_hash","genre"],index=False)
|
||||
|
||||
# %%
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.11.2
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
|
@ -17,6 +17,7 @@
|
|||
# %%
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
import seaborn as sns
|
||||
|
||||
|
@ -26,6 +27,7 @@ if nb_dir not in sys.path:
|
|||
import participants.query_db
|
||||
from features.esm import *
|
||||
from features.esm_JCQ import *
|
||||
from features.esm_SAM import *
|
||||
|
||||
# %%
|
||||
participants_inactive_usernames = participants.query_db.get_usernames(
|
||||
|
@ -99,6 +101,12 @@ df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
|
|||
# %% [markdown]
|
||||
# # Stress appraisal measure
|
||||
|
||||
# %%
|
||||
df_SAM_all = extract_stressful_events(df_esm_inactive)
|
||||
|
||||
# %%
|
||||
df_SAM_all.head()
|
||||
|
||||
# %%
|
||||
df_esm_SAM = df_esm_preprocessed[
|
||||
(df_esm_preprocessed["questionnaire_id"] >= 87)
|
||||
|
|
|
@ -0,0 +1,385 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# %matplotlib inline
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
|
||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
|
||||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn.impute import SimpleImputer
|
||||
|
||||
from lightgbm import LGBMClassifier
|
||||
import xgboost as xg
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
import machine_learning.labels
|
||||
import machine_learning.model
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
# ## Set script's parameters

# %%
# The assignments live in their own code cell so that they are executed when
# this file is opened as a notebook (uncommented lines placed directly under a
# markdown marker are not run as code).
cv_method_str = 'logo'  # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 1  # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input = pd.read_csv("../data/stressfulness_event_nonstandardized/input_appraisal_stressfulness_event_mean.csv")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
model_input['target'].value_counts()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# bins = [-10, -1, 1, 10] # bins for z-scored targets
|
||||
bins = [0, 1, 4] # bins for stressfulness (1-4) target
|
||||
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
|
||||
model_input['target'].value_counts(), edges
|
||||
# model_input = model_input[model_input['target'] != "medium"]
|
||||
model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||
|
||||
model_input['target'].value_counts()
|
||||
|
||||
if cv_method_str == 'halflogo':
    # Position of each row within its participant, and the participant's row count.
    model_input['pid_index'] = model_input.groupby('pid').cumcount()
    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')

    # Relative position in [0, 1) shifted by 1 and rounded: 1 for the first part
    # of a participant's rows, 2 for the second part.
    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
    model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)

    # Drop every helper column, including pid_count, so that none of them
    # ends up among the model features.
    data_x, data_y, data_groups = (
        model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1),
        model_input["target"],
        model_input["pid_half"],
    )
else:
    data_x, data_y, data_groups = (
        model_input.drop(["target", "pid"], axis=1),
        model_input["target"],
        model_input["pid"],
    )
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
train_x.dtypes
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
cv_method = None  # Defaults to 5 k-folds in cross_validate method
# 'halflogo' is the spelling used by the parameter comment and by the group
# construction; 'half_logo' is still accepted for backward compatibility.
if cv_method_str in ('logo', 'halflogo', 'half_logo'):
    cv_method = LeaveOneGroupOut()
    cv_method.get_n_splits(
        train_x,
        data_y,
        groups=data_groups,
    )
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||
|
||||
# %% [markdown]
# ### Baseline: Dummy Classifier (most frequent)

# %% jupyter={"source_hidden": true}
# Kept in its own code cell so that the estimator is created when this file is
# opened as a notebook (uncommented lines under a markdown marker are not run).
dummy_class = DummyClassifier(strategy="most_frequent")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
dummy_classifier = cross_validate(
|
||||
dummy_class,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'average_precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": true}
|
||||
print("Acc", np.mean(dummy_classifier['test_accuracy']))
|
||||
print("Precision", np.mean(dummy_classifier['test_average_precision']))
|
||||
print("Recall", np.mean(dummy_classifier['test_recall']))
|
||||
print("F1", np.mean(dummy_classifier['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Logistic Regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
logistic_regression = linear_model.LogisticRegression()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
log_reg_scores = cross_validate(
|
||||
logistic_regression,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": true}
|
||||
print("Acc", np.mean(log_reg_scores['test_accuracy']))
|
||||
print("Precision", np.mean(log_reg_scores['test_precision']))
|
||||
print("Recall", np.mean(log_reg_scores['test_recall']))
|
||||
print("F1", np.mean(log_reg_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Support Vector Machine
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
svc = svm.SVC()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
svc_scores = cross_validate(
|
||||
svc,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": true}
|
||||
print("Acc", np.mean(svc_scores['test_accuracy']))
|
||||
print("Precision", np.mean(svc_scores['test_precision']))
|
||||
print("Recall", np.mean(svc_scores['test_recall']))
|
||||
print("F1", np.mean(svc_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Gaussian Naive Bayes
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
gaussian_nb = naive_bayes.GaussianNB()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
gaussian_nb_scores = cross_validate(
|
||||
gaussian_nb,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": true}
|
||||
print("Acc", np.mean(gaussian_nb_scores['test_accuracy']))
|
||||
print("Precision", np.mean(gaussian_nb_scores['test_precision']))
|
||||
print("Recall", np.mean(gaussian_nb_scores['test_recall']))
|
||||
print("F1", np.mean(gaussian_nb_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Stochastic Gradient Descent Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": true}
sgdc = linear_model.SGDClassifier()

# %% jupyter={"source_hidden": true}
# Cross-validate on the imputed feature matrix; error_score='raise' makes a
# failing fold abort instead of being recorded as NaN.
sgdc_scores = cross_validate(
    sgdc,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
# Mean of every metric over the CV folds.
print("Acc", np.mean(sgdc_scores['test_accuracy']))
print("Precision", np.mean(sgdc_scores['test_precision']))
print("Recall", np.mean(sgdc_scores['test_recall']))
print("F1", np.mean(sgdc_scores['test_f1']))
# Sort-and-slice instead of np.partition(..., n_sl): partition raises when
# n_sl is >= the number of folds, slicing simply returns what is available.
print(f"Largest {n_sl} ACC:", np.sort(sgdc_scores['test_accuracy'])[::-1][:n_sl])
print(f"Smallest {n_sl} ACC:", np.sort(sgdc_scores['test_accuracy'])[:n_sl])
|
||||
|
||||
# %% [markdown]
|
||||
# ### K-nearest neighbors
|
||||
|
||||
# %% jupyter={"source_hidden": true}
knn = neighbors.KNeighborsClassifier()

# %% jupyter={"source_hidden": true}
# Cross-validate on the imputed feature matrix; error_score='raise' makes a
# failing fold abort instead of being recorded as NaN.
knn_scores = cross_validate(
    knn,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
# Mean of every metric over the CV folds.
print("Acc", np.mean(knn_scores['test_accuracy']))
print("Precision", np.mean(knn_scores['test_precision']))
print("Recall", np.mean(knn_scores['test_recall']))
print("F1", np.mean(knn_scores['test_f1']))
# Sort-and-slice instead of np.partition(..., n_sl): partition raises when
# n_sl is >= the number of folds, slicing simply returns what is available.
print(f"Largest {n_sl} ACC:", np.sort(knn_scores['test_accuracy'])[::-1][:n_sl])
print(f"Smallest {n_sl} ACC:", np.sort(knn_scores['test_accuracy'])[:n_sl])
|
||||
|
||||
# %% [markdown]
|
||||
# ### Decision Tree
|
||||
|
||||
# %% jupyter={"source_hidden": true}
dtree = tree.DecisionTreeClassifier()

# %% jupyter={"source_hidden": true}
# Cross-validate on the imputed feature matrix; error_score='raise' makes a
# failing fold abort instead of being recorded as NaN.
dtree_scores = cross_validate(
    dtree,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
# Mean of every metric over the CV folds.
print("Acc", np.mean(dtree_scores['test_accuracy']))
print("Precision", np.mean(dtree_scores['test_precision']))
print("Recall", np.mean(dtree_scores['test_recall']))
print("F1", np.mean(dtree_scores['test_f1']))
# Sort-and-slice instead of np.partition(..., n_sl): partition raises when
# n_sl is >= the number of folds, slicing simply returns what is available.
print(f"Largest {n_sl} ACC:", np.sort(dtree_scores['test_accuracy'])[::-1][:n_sl])
print(f"Smallest {n_sl} ACC:", np.sort(dtree_scores['test_accuracy'])[:n_sl])
|
||||
|
||||
# %% [markdown]
|
||||
# ### Random Forest Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": true}
rfc = ensemble.RandomForestClassifier()

# %% jupyter={"source_hidden": true}
# Cross-validate on the imputed feature matrix; error_score='raise' makes a
# failing fold abort instead of being recorded as NaN.
rfc_scores = cross_validate(
    rfc,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
# Mean of every metric over the CV folds.
print("Acc", np.mean(rfc_scores['test_accuracy']))
print("Precision", np.mean(rfc_scores['test_precision']))
print("Recall", np.mean(rfc_scores['test_recall']))
print("F1", np.mean(rfc_scores['test_f1']))
# Sort-and-slice instead of np.partition(..., n_sl): partition raises when
# n_sl is >= the number of folds, slicing simply returns what is available.
print(f"Largest {n_sl} ACC:", np.sort(rfc_scores['test_accuracy'])[::-1][:n_sl])
print(f"Smallest {n_sl} ACC:", np.sort(rfc_scores['test_accuracy'])[:n_sl])
|
||||
|
||||
# %% [markdown]
|
||||
# ### Gradient Boosting Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": true}
gbc = ensemble.GradientBoostingClassifier()

# %% jupyter={"source_hidden": true}
# Cross-validate on the imputed feature matrix; error_score='raise' makes a
# failing fold abort instead of being recorded as NaN.
gbc_scores = cross_validate(
    gbc,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
# Mean of every metric over the CV folds.
print("Acc", np.mean(gbc_scores['test_accuracy']))
print("Precision", np.mean(gbc_scores['test_precision']))
print("Recall", np.mean(gbc_scores['test_recall']))
print("F1", np.mean(gbc_scores['test_f1']))
# Sort-and-slice instead of np.partition(..., n_sl): partition raises when
# n_sl is >= the number of folds, slicing simply returns what is available.
print(f"Largest {n_sl} ACC:", np.sort(gbc_scores['test_accuracy'])[::-1][:n_sl])
print(f"Smallest {n_sl} ACC:", np.sort(gbc_scores['test_accuracy'])[:n_sl])
|
||||
|
||||
# %% [markdown]
|
||||
# ### LGBM Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": true}
lgbm = LGBMClassifier()

# %% jupyter={"source_hidden": true}
# Cross-validate on the imputed feature matrix; error_score='raise' makes a
# failing fold abort instead of being recorded as NaN.
lgbm_scores = cross_validate(
    lgbm,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
# Mean of every metric over the CV folds.
print("Acc", np.mean(lgbm_scores['test_accuracy']))
print("Precision", np.mean(lgbm_scores['test_precision']))
print("Recall", np.mean(lgbm_scores['test_recall']))
print("F1", np.mean(lgbm_scores['test_f1']))
# Sort-and-slice instead of np.partition(..., n_sl): partition raises when
# n_sl is >= the number of folds, slicing simply returns what is available.
print(f"Largest {n_sl} ACC:", np.sort(lgbm_scores['test_accuracy'])[::-1][:n_sl])
print(f"Smallest {n_sl} ACC:", np.sort(lgbm_scores['test_accuracy'])[:n_sl])
|
||||
|
||||
# %% [markdown]
|
||||
# ### XGBoost Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": true}
xgb_classifier = xg.sklearn.XGBClassifier()

# %% jupyter={"source_hidden": true}
# Cross-validate on the imputed feature matrix; error_score='raise' makes a
# failing fold abort instead of being recorded as NaN.
xgb_classifier_scores = cross_validate(
    xgb_classifier,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
)
# %% jupyter={"source_hidden": true}
# Mean of every metric over the CV folds.
print("Acc", np.mean(xgb_classifier_scores['test_accuracy']))
print("Precision", np.mean(xgb_classifier_scores['test_precision']))
print("Recall", np.mean(xgb_classifier_scores['test_recall']))
print("F1", np.mean(xgb_classifier_scores['test_f1']))
# Sort-and-slice instead of np.partition(..., n_sl): partition raises when
# n_sl is >= the number of folds, slicing simply returns what is available.
print(f"Largest {n_sl} ACC:", np.sort(xgb_classifier_scores['test_accuracy'])[::-1][:n_sl])
print(f"Smallest {n_sl} ACC:", np.sort(xgb_classifier_scores['test_accuracy'])[:n_sl])
|
|
@ -0,0 +1,184 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# %matplotlib inline
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
|
||||
from sklearn.impute import SimpleImputer
|
||||
|
||||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||
from lightgbm import LGBMClassifier
|
||||
import xgboost as xg
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
import machine_learning.labels
|
||||
import machine_learning.model
|
||||
from machine_learning.classification_models import ClassificationModels
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# ## Set script's parameters
|
||||
# %%
# Script parameters. They live in their own code cell: code placed directly
# under a `# %% [markdown]` marker belongs to the markdown cell.
n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter)
cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# Load the model input and choose the column used to cluster participants.
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]

# Exploratory only (overridden below). numeric_only=True keeps the variance
# computation from failing on non-numeric columns with recent pandas.
clust_col = model_input.set_index(index_columns).var(numeric_only=True).idxmax() # age is a col with the highest variance

# Echo the columns from 'age' up to (excluding) the last one.
model_input.columns[list(model_input.columns).index('age'):-1]

lime_cols = [col for col in model_input if col.startswith('limesurvey')]
lime_cols
lime_col = 'limesurvey_demand_control_ratio'
clust_col = lime_col  # the column actually used for clustering

model_input[clust_col].describe()
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": true}

# Filter-out outlier rows by clust_col: keep rows whose |z-score| is below 3.
model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]

# One row per distinct (clust_col, pid) combination, plotted as a bar per pid.
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
plt.bar(uniq['pid'], uniq[clust_col])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# Get clusters by cluster col and merge the clusters to main df
# NOTE(review): KMeans is created without random_state, so the cluster
# assignment may differ between runs - confirm whether that is intended.
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
np.unique(km, return_counts=True)  # cluster sizes
uniq['cluster'] = km
uniq

# merge() is called without `on`, so it joins on the columns shared by both
# frames (presumably only 'pid' - verify model_input has no 'cluster' column).
model_input = model_input.merge(uniq[['pid', 'cluster']])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# Move the segment-identifying columns into the index so they are not used as features.
model_input.set_index(index_columns, inplace=True)

# %% jupyter={"source_hidden": true}
# Create dict with classification ml models
cm = ClassificationModels()
cmodels = cm.get_cmodels()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# For every cluster: build a binary low/high target, prepare the features,
# cross-validate every classifier and accumulate its mean metrics in cmodels.
# Bare expressions are kept so the notebook can echo them.
for k in range(n_clusters):
    model_input_subset = model_input[model_input["cluster"] == k].copy()

    # Discretise the z-scored target into low / medium / high.
    bins = [-10, -1, 1, 10] # bins for z-scored targets
    target_class = pd.cut(
        model_input_subset['target'], bins=bins, labels=['low', 'medium', 'high'], right=False
    )
    target_class.value_counts()

    # Keep only the two extreme classes. Rows whose target is missing or
    # outside the bins are NaN after pd.cut and are dropped here as well;
    # mapping "everything that is not low" to 1 would label them as high.
    is_extreme = target_class.isin(['low', 'high']).to_numpy()
    model_input_subset = model_input_subset[is_extreme].copy()
    model_input_subset['target'] = (target_class[is_extreme] == 'high').astype(int).to_numpy()

    model_input_subset['target'].value_counts()

    # Both spellings are accepted: the group construction and the CV
    # selection used to test for different strings ('halflogo' vs 'half_logo').
    use_half_logo = cv_method_str in ('halflogo', 'half_logo')
    if use_half_logo:
        # Split every participant's rows into a first and a second half and
        # use "<pid>_<half>" as the group label.
        pid_index = model_input_subset.groupby('pid').cumcount()
        pid_count = model_input_subset.groupby('pid')['pid'].transform('count')
        pid_half_no = (pid_index / pid_count + 1).round().astype(int).astype(str)
        data_groups = model_input_subset["pid"] + "_" + pid_half_no
    else:
        data_groups = model_input_subset["pid"]

    # The helper series above are never added to the frame, so no bookkeeping
    # column (pid_index / pid_count / pid_half) can end up among the features.
    data_x = model_input_subset.drop(["target", "pid"], axis=1)
    data_y = model_input_subset["target"]

    # Treat categorical features
    categorical_feature_colnames = ["gender", "startlanguage"]
    additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
    categorical_feature_colnames += additional_categorical_features

    categorical_features = data_x[categorical_feature_colnames].copy()
    mode_categorical_features = categorical_features.mode().iloc[0]

    # fillna with mode
    categorical_features = categorical_features.fillna(mode_categorical_features)

    # one-hot encoding
    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)

    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
    train_x = pd.concat([numerical_features, categorical_features], axis=1)

    # Establish cv method
    cv_method = None # Defaults to 5 k-folds in cross_validate method
    if cv_method_str == 'logo' or use_half_logo:
        cv_method = LeaveOneGroupOut()
        cv_method.get_n_splits(
            train_x,
            data_y,
            groups=data_groups,
        )

    # The imputed matrix is identical for every model, so compute it once.
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    imputed_x = imputer.fit_transform(train_x)

    for model_title, model in cmodels.items():

        classifier = cross_validate(
            model['model'],
            X=imputed_x,
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
            error_score='raise',
            scoring=('accuracy', 'precision', 'recall', 'f1')
        )

        print("\n-------------------------------------\n")
        print("Current cluster:", k, end="\n")
        print("Current model:", model_title, end="\n")
        print("Acc", np.mean(classifier['test_accuracy']))
        print("Precision", np.mean(classifier['test_precision']))
        print("Recall", np.mean(classifier['test_recall']))
        print("F1", np.mean(classifier['test_f1']))
        # Sort-and-slice: np.partition(..., n_sl) raises when n_sl is >= the
        # number of folds, which can happen for clusters with few groups.
        fold_acc_sorted = np.sort(classifier['test_accuracy'])
        print(f"Largest {n_sl} ACC:", fold_acc_sorted[::-1][:n_sl])
        print(f"Smallest {n_sl} ACC:", fold_acc_sorted[:n_sl])

        # Running sums over clusters: accuracy, precision, recall, F1.
        cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
        cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
        cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall'])
        cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# Get overall results from the metrics accumulated per model over all clusters
# (presumably averaged over n_clusters - verify in ClassificationModels).
cm.get_total_models_scores(n_clusters=n_clusters)
|
|
@ -0,0 +1,181 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# %matplotlib inline
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, train_test_split
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
||||
|
||||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||
from lightgbm import LGBMClassifier
|
||||
import xgboost as xg
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
import machine_learning.labels
|
||||
import machine_learning.model
|
||||
from machine_learning.classification_models import ClassificationModels
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# # Useful method
|
||||
# %%
def treat_categorical_features(input_set):
    """Return *input_set* with its categorical columns one-hot encoded.

    The categorical columns are "gender", "startlanguage" and every column
    whose name contains "mostcommonactivity" or "homelabel". Missing values
    in them are filled with the column mode before encoding.

    Args:
        input_set: DataFrame containing at least the "gender" and
            "startlanguage" columns.

    Returns:
        A new DataFrame: the remaining columns first, followed by the dummy
        columns produced for the categorical ones.

    Raises:
        KeyError: if "gender" or "startlanguage" is missing.
    """
    categorical_colnames = ["gender", "startlanguage"] + [
        name
        for name in input_set.columns
        if "mostcommonactivity" in name or "homelabel" in name
    ]

    categorical_part = input_set[categorical_colnames].copy()

    # fillna with mode (first mode row when there are ties)
    categorical_part = categorical_part.fillna(categorical_part.mode().iloc[0])

    # one-hot encoding
    categorical_part = categorical_part.apply(lambda col: col.astype("category"))
    if not categorical_part.empty:
        categorical_part = pd.get_dummies(categorical_part)

    remaining_part = input_set.drop(categorical_colnames, axis=1)

    return pd.concat([remaining_part, categorical_part], axis=1)
|
||||
|
||||
# %% [markdown]
|
||||
# ## Set script's parameters
|
||||
# %%
# Script parameters. They live in their own code cell: code placed directly
# under a `# %% [markdown]` marker belongs to the markdown cell.
n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# Load the model input and choose the column used to cluster participants.
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]

# Exploratory only (overridden below). numeric_only=True keeps the variance
# computation from failing on non-numeric columns with recent pandas.
clust_col = model_input.set_index(index_columns).var(numeric_only=True).idxmax() # age is a col with the highest variance

# Echo the columns from 'age' up to (excluding) the last one.
model_input.columns[list(model_input.columns).index('age'):-1]

lime_cols = [col for col in model_input if col.startswith('limesurvey')]
lime_cols
lime_col = 'limesurvey_demand_control_ratio'
clust_col = lime_col  # the column actually used for clustering

model_input[clust_col].describe()
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": true}

# Filter-out outlier rows by clust_col: keep rows whose |z-score| is below 3.
model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]

# One row per distinct (clust_col, pid) combination, plotted as a bar per pid.
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
plt.bar(uniq['pid'], uniq[clust_col])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# Get clusters by cluster col and merge the clusters to main df
# NOTE(review): KMeans is created without random_state, so the cluster
# assignment may differ between runs - confirm whether that is intended.
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
np.unique(km, return_counts=True)  # cluster sizes
uniq['cluster'] = km
uniq

# merge() is called without `on`, so it joins on the columns shared by both
# frames (presumably only 'pid' - verify model_input has no 'cluster' column).
model_input = model_input.merge(uniq[['pid', 'cluster']])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# Move the segment-identifying columns into the index so they are not used as features.
model_input.set_index(index_columns, inplace=True)

# %% jupyter={"source_hidden": true}
# Create dict with classification ml models
cm = ClassificationModels()
cmodels = cm.get_cmodels()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# For every cluster: binarise the target, split into train / validate / test,
# fit every classifier on the training part, score it on the validation part
# and accumulate the metrics in cmodels.
# Bare expressions are kept so the notebook can echo them.
for k in range(n_clusters):
    model_input_subset = model_input[model_input["cluster"] == k].copy()

    # Rows at or below the 15th percentile and at or above the 85th percentile
    # of the numerical target form the held-out set -> the rest is the
    # training set. Only two classes, separated by z-score of 0.
    model_input_subset['numerical_target'] = model_input_subset['target']
    bins = [-10, 0, 10] # bins for z-scored targets
    model_input_subset.loc[:, 'target'] = \
        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)

    p15 = np.percentile(model_input_subset['numerical_target'], 15)
    p85 = np.percentile(model_input_subset['numerical_target'], 85)

    # Treat categorical features
    model_input_subset = treat_categorical_features(model_input_subset)

    # Split to train, validate, and test subsets
    train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
    test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)

    train_set['target'].value_counts()
    test_set['target'].value_counts()

    train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]

    # The held-out rows are halved into a validation and a test part.
    validate_x, test_x, validate_y, test_y = \
        train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)

    # Impute missing values. The medians are learned on the training set only
    # and then applied to the other subsets: re-fitting on validation/test
    # data would use held-out statistics and may drop a different set of
    # all-NaN columns, giving matrices with mismatching widths.
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')

    train_x = imputer.fit_transform(train_x)
    validate_x = imputer.transform(validate_x)
    test_x = imputer.transform(test_x)

    for model_title, model in cmodels.items():
        model['model'].fit(train_x, train_y)
        y_pred = model['model'].predict(validate_x)

        acc = accuracy_score(validate_y, y_pred)
        prec = precision_score(validate_y, y_pred)
        rec = recall_score(validate_y, y_pred)
        f1 = f1_score(validate_y, y_pred)

        print("\n-------------------------------------\n")
        print("Current cluster:", k, end="\n")
        print("Current model:", model_title, end="\n")
        print("Acc", acc)
        print("Precision", prec)
        print("Recall", rec)
        print("F1", f1)

        # Running sums over clusters: accuracy, precision, recall, F1.
        cmodels[model_title]['metrics'][0] += acc
        cmodels[model_title]['metrics'][1] += prec
        cmodels[model_title]['metrics'][2] += rec
        cmodels[model_title]['metrics'][3] += f1
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# Get overall results from the metrics accumulated per model over all clusters
# (presumably averaged over n_clusters - verify in ClassificationModels).
cm.get_total_models_scores(n_clusters=n_clusters)
|
|
@ -0,0 +1,355 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# %matplotlib inline
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import yaml
|
||||
from pyprojroot import here
|
||||
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.dummy import DummyRegressor
|
||||
import xgboost as xg
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
import machine_learning.features_sensor
|
||||
import machine_learning.labels
|
||||
import machine_learning.model
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# ## JCQ job demand (mean)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")

# %% jupyter={"source_hidden": true}
# Index by segment and derive features, target and CV groups.
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
#if "pid" in model_input.columns:
#    index_columns.append("pid")
model_input.set_index(index_columns, inplace=True)

cv_method = 'half_logo' # logo, half_logo, 5kfold
if cv_method == 'logo':
    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
else:
    # Split every participant's rows into a first and a second half and use
    # "<pid>_<half>" as the group label.
    model_input['pid_index'] = model_input.groupby('pid').cumcount()
    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')

    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
    model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)

    # Drop every helper column, including pid_count: it is bookkeeping for the
    # split above and must not be used as a feature.
    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# Categorical columns: two fixed ones plus every column whose name contains
# "mostcommonactivity" or "homelabel".
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
categorical_feature_colnames += additional_categorical_features

# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()

# %% jupyter={"source_hidden": true}
# First mode row: one most-frequent value per categorical column.
mode_categorical_features = categorical_features.mode().iloc[0]

# %% jupyter={"source_hidden": true}
# fillna with mode
categorical_features = categorical_features.fillna(mode_categorical_features)

# %% jupyter={"source_hidden": true}
# one-hot encoding
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

# %% jupyter={"source_hidden": true}
# Everything that is not categorical is passed through unchanged.
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)

# %% jupyter={"source_hidden": true}
# Final feature matrix: pass-through columns followed by the dummy columns.
train_x = pd.concat([numerical_features, categorical_features], axis=1)

# %% jupyter={"source_hidden": true}
train_x.dtypes
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# Leave-one-group-out splitter; the bare call echoes the number of splits.
logo = LeaveOneGroupOut()
logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
)

# Defaults to 5 k folds in cross_validate method
# (cv=None is passed for any cv_method other than 'logo' / 'half_logo').
if cv_method != 'logo' and cv_method != 'half_logo':
    logo = None

# %% jupyter={"source_hidden": true}
# Number of missing target values.
sum(data_y.isna())
|
||||
|
||||
# %% [markdown]
|
||||
# ### Baseline: Dummy Regression (mean)
|
||||
# %% jupyter={"source_hidden": true}
# Baseline that always predicts the mean of the training targets. Defined in
# its own code cell: code placed directly under a `# %% [markdown]` marker
# belongs to the markdown cell.
dummy_regr = DummyRegressor(strategy="mean")

# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# %% jupyter={"source_hidden": true}
# Cross-validate the baseline on the mean-imputed features.
dummy_regressor = cross_validate(
    dummy_regr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
# Median of every score over the CV folds.
print("Negative Mean Squared Error", np.median(dummy_regressor['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(dummy_regressor['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(dummy_regressor['test_neg_root_mean_squared_error']))
print("R2", np.median(dummy_regressor['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Linear Regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# %% jupyter={"source_hidden": true}
# Cross-validated ordinary least squares on the mean-imputed features.
lin_reg_scores = cross_validate(
    lin_reg_rapids,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        'r2',
        'neg_mean_squared_error',
        'neg_mean_absolute_error',
        'neg_root_mean_squared_error',
    ),
)
# Report the median of every score over the CV folds.
for metric_label, metric_key in (
    ("Negative Mean Squared Error", 'test_neg_mean_squared_error'),
    ("Negative Mean Absolute Error", 'test_neg_mean_absolute_error'),
    ("Negative Root Mean Squared Error", 'test_neg_root_mean_squared_error'),
    ("R2", 'test_r2'),
):
    print(metric_label, np.median(lin_reg_scores[metric_key]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### XGBRegressor Linear Regression
|
||||
# %% jupyter={"source_hidden": true}
xgb_r = xg.XGBRegressor(n_estimators=10, objective='reg:squarederror')
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# %% jupyter={"source_hidden": true}
# Cross-validated XGBoost regressor (squared-error objective) on the
# mean-imputed features.
xgb_reg_scores = cross_validate(
    xgb_r,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        'r2',
        'neg_mean_squared_error',
        'neg_mean_absolute_error',
        'neg_root_mean_squared_error',
    ),
)
# Report the median of every score over the CV folds.
for metric_label, metric_key in (
    ("Negative Mean Squared Error", 'test_neg_mean_squared_error'),
    ("Negative Mean Absolute Error", 'test_neg_mean_absolute_error'),
    ("Negative Root Mean Squared Error", 'test_neg_root_mean_squared_error'),
    ("R2", 'test_r2'),
):
    print(metric_label, np.median(xgb_reg_scores[metric_key]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### XGBRegressor Pseudo Huber Error Regression
|
||||
# %% jupyter={"source_hidden": true}
xgb_psuedo_huber_r = xg.XGBRegressor(n_estimators=10, objective='reg:pseudohubererror')
# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# %% jupyter={"source_hidden": true}
# Cross-validated XGBoost regressor (pseudo-Huber objective) on the
# mean-imputed features.
xgb_psuedo_huber_reg_scores = cross_validate(
    xgb_psuedo_huber_r,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        'r2',
        'neg_mean_squared_error',
        'neg_mean_absolute_error',
        'neg_root_mean_squared_error',
    ),
)
# Report the median of every score over the CV folds.
for metric_label, metric_key in (
    ("Negative Mean Squared Error", 'test_neg_mean_squared_error'),
    ("Negative Mean Absolute Error", 'test_neg_mean_absolute_error'),
    ("Negative Root Mean Squared Error", 'test_neg_root_mean_squared_error'),
    ("R2", 'test_r2'),
):
    print(metric_label, np.median(xgb_psuedo_huber_reg_scores[metric_key]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Ridge regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=.5)

# %% tags=[] jupyter={"source_hidden": true}
# Cross-validated ridge regression; reuses the `imputer` defined in an
# earlier cell.
ridge_reg_scores = cross_validate(
    ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        'r2',
        'neg_mean_squared_error',
        'neg_mean_absolute_error',
        'neg_root_mean_squared_error',
    ),
)
# Report the median of every score over the CV folds.
for metric_label, metric_key in (
    ("Negative Mean Squared Error", 'test_neg_mean_squared_error'),
    ("Negative Mean Absolute Error", 'test_neg_mean_absolute_error'),
    ("Negative Root Mean Squared Error", 'test_neg_root_mean_squared_error'),
    ("R2", 'test_r2'),
):
    print(metric_label, np.median(ridge_reg_scores[metric_key]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Lasso
|
||||
|
||||
# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)

# %% jupyter={"source_hidden": true}
# Cross-validated lasso regression; reuses the `imputer` defined in an
# earlier cell.
lasso_reg_score = cross_validate(
    lasso_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=(
        'r2',
        'neg_mean_squared_error',
        'neg_mean_absolute_error',
        'neg_root_mean_squared_error',
    ),
)
# Report the median of every score over the CV folds.
for metric_label, metric_key in (
    ("Negative Mean Squared Error", 'test_neg_mean_squared_error'),
    ("Negative Mean Absolute Error", 'test_neg_mean_absolute_error'),
    ("Negative Root Mean Squared Error", 'test_neg_root_mean_squared_error'),
    ("R2", 'test_r2'),
):
    print(metric_label, np.median(lasso_reg_score[metric_key]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Bayesian Ridge
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
bayesian_ridge_reg = linear_model.BayesianRidge()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
bayesian_ridge_reg_score = cross_validate(
|
||||
bayesian_ridge_reg,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### RANSAC (outlier robust regression)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
ransac_reg = linear_model.RANSACRegressor()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
ransac_reg_scores = cross_validate(
|
||||
ransac_reg,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(ransac_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(ransac_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(ransac_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(ransac_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Support vector regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
svr = svm.SVR()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
svr_scores = cross_validate(
|
||||
svr,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(svr_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(svr_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(svr_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(svr_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Kernel Ridge regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
kridge = kernel_ridge.KernelRidge()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
kridge_scores = cross_validate(
|
||||
kridge,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(kridge_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(kridge_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(kridge_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(kridge_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Gaussian Process Regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
gpr = gaussian_process.GaussianProcessRegressor()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
|
||||
gpr_scores = cross_validate(
|
||||
gpr,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(gpr_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(gpr_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(gpr_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(gpr_scores['test_r2']))
|
||||
|
||||
# %%
|
|
@ -0,0 +1,358 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# %matplotlib inline
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import yaml
|
||||
from pyprojroot import here
|
||||
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
|
||||
from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut, cross_val_score, cross_validate
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.dummy import DummyRegressor
|
||||
import xgboost as xg
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
import machine_learning.features_sensor
|
||||
import machine_learning.labels
|
||||
import machine_learning.model
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# ## Appraisal of event stressfulness
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input = pd.read_csv("../data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
|
||||
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]

model_input.set_index(index_columns, inplace=True)

# 'logo' groups rows by participant; any other value (here 'half_logo')
# splits every participant's rows into two halves and groups by half.
cv_method = 'half_logo'
if cv_method == 'logo':
    data_x = model_input.drop(["target", "pid"], axis=1)
    data_y = model_input["target"]
    data_groups = model_input["pid"]
else:
    # Position of each row within its participant and the participant's row count.
    model_input['pid_index'] = model_input.groupby('pid').cumcount()
    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')

    # pid_index / pid_count lies in [0, 1); adding 1 and rounding maps the
    # first half of a participant's rows to 1 and the second half to 2.
    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
    model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)

    # Drop every helper column, including pid_count, so that none of them
    # ends up among the model features.
    helper_columns = ["target", "pid", "pid_index", "pid_count", "pid_half"]
    data_x = model_input.drop(helper_columns, axis=1)
    data_y = model_input["target"]
    data_groups = model_input["pid_half"]
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Columns treated as categorical: two fixed ones plus every column whose
# name mentions "mostcommonactivity" or "homelabel".
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [
    name
    for name in data_x.columns
    if "mostcommonactivity" in name or "homelabel" in name
]
categorical_feature_colnames += additional_categorical_features

# %% jupyter={"source_hidden": true}
categorical_features = data_x[categorical_feature_colnames].copy()

# %% jupyter={"source_hidden": true}
# The first row of DataFrame.mode() holds one most frequent value per column.
mode_categorical_features = categorical_features.mode().iloc[0]

# %% jupyter={"source_hidden": true}
# Replace missing categorical values with the column mode.
categorical_features = categorical_features.fillna(mode_categorical_features)

# %% jupyter={"source_hidden": true}
# One-hot encode the categorical columns (skipped when there is nothing to encode).
categorical_features = categorical_features.apply(lambda column: column.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

# %% jupyter={"source_hidden": true}
# Everything that is not categorical is kept as a numerical feature.
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)

# %% jupyter={"source_hidden": true}
train_x = pd.concat([numerical_features, categorical_features], axis=1)

# %% jupyter={"source_hidden": true}
train_x.dtypes
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Leave-one-group-out splitter; the expression below shows the number of
# splits it yields for the chosen grouping.
logo = LeaveOneGroupOut()
logo.get_n_splits(train_x, data_y, groups=data_groups)

# For any other cv_method, pass cv=None so that cross_validate falls back
# to its default of 5 folds.
if cv_method not in ('logo', 'half_logo'):
    logo = None

# %% jupyter={"source_hidden": true}
# Number of missing target values.
sum(data_y.isna())
|
||||
|
||||
# %% [markdown]
# ### Baseline: Dummy Regression (mean)

# %% jupyter={"source_hidden": true}
dummy_regr = DummyRegressor(strategy="mean")

# %% jupyter={"source_hidden": true}
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# %% jupyter={"source_hidden": true}
CV_SCORING = (
    'r2',
    'neg_mean_squared_error',
    'neg_mean_absolute_error',
    'neg_root_mean_squared_error',
)


def report_cv_scores(estimator):
    """
    Cross-validate an estimator and print the median of every test score.

    The data (``train_x``, ``data_y``, ``data_groups``), the splitter
    (``logo``) and the ``imputer`` are read from the notebook's global
    namespace at call time, so re-running an earlier cell is picked up here.

    Parameters
    ----------
    estimator:
        The (unfitted) regressor to pass to ``cross_validate``.

    Returns
    -------
    scores: dict
        The dictionary returned by ``cross_validate``.
    """
    # NOTE(review): the imputer is fitted on all rows before the data is
    # split, so the held-out groups contribute to the imputed means.
    # Wrapping imputer and estimator in a Pipeline would avoid that.
    scores = cross_validate(
        estimator,
        X=imputer.fit_transform(train_x),
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring=CV_SCORING,
    )
    # nanmedian: NaN fold scores are ignored instead of turning the whole
    # summary into NaN.
    print("Negative Mean Squared Error", np.nanmedian(scores['test_neg_mean_squared_error']))
    print("Negative Mean Absolute Error", np.nanmedian(scores['test_neg_mean_absolute_error']))
    print("Negative Root Mean Squared Error", np.nanmedian(scores['test_neg_root_mean_squared_error']))
    print("R2", np.nanmedian(scores['test_r2']))
    return scores


# %% jupyter={"source_hidden": true}
# Kept under its own name so the baseline is not overwritten by the
# linear regression scores below.
dummy_regr_scores = report_cv_scores(dummy_regr)

# %% [markdown]
# ### Linear Regression

# %% jupyter={"source_hidden": true}
lin_reg_rapids = linear_model.LinearRegression()

# %% jupyter={"source_hidden": true}
lin_reg_scores = report_cv_scores(lin_reg_rapids)

# %% [markdown]
# ### XGBRegressor Linear Regression

# %% jupyter={"source_hidden": true}
xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=10)

# %% jupyter={"source_hidden": true}
xgb_reg_scores = report_cv_scores(xgb_r)

# %% [markdown]
# ### XGBRegressor Pseudo Huber Error Regression

# %% jupyter={"source_hidden": true}
xgb_psuedo_huber_r = xg.XGBRegressor(objective='reg:pseudohubererror', n_estimators=10)

# %% jupyter={"source_hidden": true}
xgb_psuedo_huber_reg_scores = report_cv_scores(xgb_psuedo_huber_r)

# %% [markdown]
# ### Ridge regression

# %% jupyter={"source_hidden": true}
ridge_reg = linear_model.Ridge(alpha=.5)

# %% tags=[] jupyter={"source_hidden": true}
ridge_reg_scores = report_cv_scores(ridge_reg)

# %% [markdown]
# ### Lasso

# %% jupyter={"source_hidden": true}
lasso_reg = linear_model.Lasso(alpha=0.1)

# %% jupyter={"source_hidden": true}
lasso_reg_score = report_cv_scores(lasso_reg)

# %% [markdown]
# ### Bayesian Ridge

# %% jupyter={"source_hidden": true}
bayesian_ridge_reg = linear_model.BayesianRidge()

# %% jupyter={"source_hidden": true}
bayesian_ridge_reg_score = report_cv_scores(bayesian_ridge_reg)

# %% [markdown]
# ### RANSAC (outlier robust regression)

# %% jupyter={"source_hidden": true}
ransac_reg = linear_model.RANSACRegressor()

# %% jupyter={"source_hidden": true}
ransac_reg_scores = report_cv_scores(ransac_reg)

# %% [markdown]
# ### Support vector regression

# %% jupyter={"source_hidden": true}
svr = svm.SVR()

# %% jupyter={"source_hidden": true}
svr_scores = report_cv_scores(svr)

# %% [markdown]
# ### Kernel Ridge regression

# %% jupyter={"source_hidden": true}
kridge = kernel_ridge.KernelRidge()

# %% jupyter={"source_hidden": true}
kridge_scores = report_cv_scores(kridge)

# %% [markdown]
# ### Gaussian Process Regression

# %% jupyter={"source_hidden": true}
gpr = gaussian_process.GaussianProcessRegressor()

# %% jupyter={"source_hidden": true}
gpr_scores = report_cv_scores(gpr)

# %%
|
|
@ -0,0 +1,30 @@
|
|||
from collections.abc import Collection
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from config.models import Participant, Timezone
|
||||
from setup import db_engine, session
|
||||
|
||||
|
||||
def get_timezone_data(usernames: Collection) -> pd.DataFrame:
    """
    Read the timezone records of the given participants and return them in a dataframe.

    Parameters
    ----------
    usernames: Collection
        A list of usernames to put into the WHERE condition.

    Returns
    -------
    df_timezone: pd.DataFrame
        A dataframe of timezone data, together with the participant's username.
    """
    # Join Timezone to Participant on the participant id and keep only the
    # requested usernames.
    query_timezone = (
        session.query(Timezone, Participant.username)
        .filter(Participant.id == Timezone.participant_id)
        .filter(Participant.username.in_(usernames))
    )
    with db_engine.connect() as connection:
        df_timezone = pd.read_sql(query_timezone.statement, connection)
    return df_timezone
|
|
@ -0,0 +1,205 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
|
||||
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<!-- Generated by graphviz version 2.43.0 (0)
|
||||
-->
|
||||
<!-- Title: snakemake_dag Pages: 1 -->
|
||||
<svg width="548pt" height="625pt"
|
||||
viewBox="0.00 0.00 548.00 625.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 621)">
|
||||
<title>snakemake_dag</title>
|
||||
<polygon fill="white" stroke="transparent" points="-4,4 -4,-621 544,-621 544,4 -4,4"/>
|
||||
<!-- 0 -->
|
||||
<g id="node1" class="node">
|
||||
<title>0</title>
|
||||
<path fill="none" stroke="#565bd8" stroke-width="2" d="M202,-36C202,-36 172,-36 172,-36 166,-36 160,-30 160,-24 160,-24 160,-12 160,-12 160,-6 166,0 172,0 172,0 202,0 202,0 208,0 214,-6 214,-12 214,-12 214,-24 214,-24 214,-30 208,-36 202,-36"/>
|
||||
<text text-anchor="middle" x="187" y="-15.5" font-family="sans" font-size="10.00">all</text>
|
||||
</g>
|
||||
<!-- 1 -->
|
||||
<g id="node2" class="node">
|
||||
<title>1</title>
|
||||
<path fill="none" stroke="#56d8a9" stroke-width="2" d="M100,-617C100,-617 12,-617 12,-617 6,-617 0,-611 0,-605 0,-605 0,-588 0,-588 0,-582 6,-576 12,-576 12,-576 100,-576 100,-576 106,-576 112,-582 112,-588 112,-588 112,-605 112,-605 112,-611 106,-617 100,-617"/>
|
||||
<text text-anchor="middle" x="56" y="-605" font-family="sans" font-size="10.00">pull_phone_data</text>
|
||||
<text text-anchor="middle" x="56" y="-594" font-family="sans" font-size="10.00">pid: nokia_0000003</text>
|
||||
<text text-anchor="middle" x="56" y="-583" font-family="sans" font-size="10.00">sensor: calls</text>
|
||||
</g>
|
||||
<!-- 1->0 -->
|
||||
<g id="edge1" class="edge">
|
||||
<title>1->0</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M47.83,-575.78C37.21,-548.32 20,-496.76 20,-451 20,-451 20,-451 20,-161 20,-114.96 38.83,-102.85 73,-72 95.21,-51.94 126.33,-38.17 150.45,-29.7"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="151.61,-33 159.97,-26.5 149.38,-26.37 151.61,-33"/>
|
||||
</g>
|
||||
<!-- 2 -->
|
||||
<g id="node3" class="node">
|
||||
<title>2</title>
|
||||
<path fill="none" stroke="#56d863" stroke-width="2" d="M124,-540C124,-540 60,-540 60,-540 54,-540 48,-534 48,-528 48,-528 48,-516 48,-516 48,-510 54,-504 60,-504 60,-504 124,-504 124,-504 130,-504 136,-510 136,-516 136,-516 136,-528 136,-528 136,-534 130,-540 124,-540"/>
|
||||
<text text-anchor="middle" x="92" y="-519.5" font-family="sans" font-size="10.00">calls_episodes</text>
|
||||
</g>
|
||||
<!-- 1->2 -->
|
||||
<g id="edge9" class="edge">
|
||||
<title>1->2</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M65.84,-575.69C69.87,-567.56 74.6,-558.03 78.92,-549.33"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="82.09,-550.83 83.4,-540.32 75.82,-547.72 82.09,-550.83"/>
|
||||
</g>
|
||||
<!-- 2->0 -->
|
||||
<g id="edge2" class="edge">
|
||||
<title>2->0</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M85.12,-503.83C75.18,-477.44 58,-425.14 58,-379 58,-379 58,-379 58,-161 58,-105.34 112.96,-61.84 151.14,-38.34"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="153.16,-41.21 159.96,-33.08 149.58,-35.2 153.16,-41.21"/>
|
||||
</g>
|
||||
<!-- 3 -->
|
||||
<g id="node4" class="node">
|
||||
<title>3</title>
|
||||
<path fill="none" stroke="#d8a456" stroke-width="2" d="M187.5,-468C187.5,-468 98.5,-468 98.5,-468 92.5,-468 86.5,-462 86.5,-456 86.5,-456 86.5,-444 86.5,-444 86.5,-438 92.5,-432 98.5,-432 98.5,-432 187.5,-432 187.5,-432 193.5,-432 199.5,-438 199.5,-444 199.5,-444 199.5,-456 199.5,-456 199.5,-462 193.5,-468 187.5,-468"/>
|
||||
<text text-anchor="middle" x="143" y="-453" font-family="sans" font-size="10.00">resample_episodes</text>
|
||||
<text text-anchor="middle" x="143" y="-442" font-family="sans" font-size="10.00">sensor: phone_calls</text>
|
||||
</g>
|
||||
<!-- 2->3 -->
|
||||
<g id="edge10" class="edge">
|
||||
<title>2->3</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M104.61,-503.7C110.6,-495.47 117.88,-485.48 124.48,-476.42"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="127.48,-478.25 130.54,-468.1 121.82,-474.13 127.48,-478.25"/>
|
||||
</g>
|
||||
<!-- 3->0 -->
|
||||
<g id="edge3" class="edge">
|
||||
<title>3->0</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M140.43,-432C136.64,-405.4 130,-352.3 130,-307 130,-307 130,-307 130,-161 130,-117.8 153,-72.19 169.78,-44.66"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="172.83,-46.37 175.19,-36.04 166.91,-42.65 172.83,-46.37"/>
|
||||
</g>
|
||||
<!-- 4 -->
|
||||
<g id="node5" class="node">
|
||||
<title>4</title>
|
||||
<path fill="none" stroke="#56d8d8" stroke-width="2" d="M357.5,-396C357.5,-396 194.5,-396 194.5,-396 188.5,-396 182.5,-390 182.5,-384 182.5,-384 182.5,-372 182.5,-372 182.5,-366 188.5,-360 194.5,-360 194.5,-360 357.5,-360 357.5,-360 363.5,-360 369.5,-366 369.5,-372 369.5,-372 369.5,-384 369.5,-384 369.5,-390 363.5,-396 357.5,-396"/>
|
||||
<text text-anchor="middle" x="276" y="-375.5" font-family="sans" font-size="10.00">resample_episodes_with_datetime</text>
|
||||
</g>
|
||||
<!-- 3->4 -->
|
||||
<g id="edge11" class="edge">
|
||||
<title>3->4</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M175.54,-431.88C193.25,-422.55 215.35,-410.92 234.32,-400.94"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="236.12,-403.94 243.34,-396.19 232.86,-397.75 236.12,-403.94"/>
|
||||
</g>
|
||||
<!-- 4->0 -->
|
||||
<g id="edge4" class="edge">
|
||||
<title>4->0</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M250.68,-359.83C218.76,-335.92 168,-289.36 168,-235 168,-235 168,-235 168,-161 168,-120.86 175.55,-74.9 181.13,-46.4"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="184.61,-46.88 183.16,-36.39 177.74,-45.5 184.61,-46.88"/>
|
||||
</g>
|
||||
<!-- 8 -->
|
||||
<g id="node9" class="node">
|
||||
<title>8</title>
|
||||
<path fill="none" stroke="#68d856" stroke-width="2" d="M353.5,-324C353.5,-324 248.5,-324 248.5,-324 242.5,-324 236.5,-318 236.5,-312 236.5,-312 236.5,-300 236.5,-300 236.5,-294 242.5,-288 248.5,-288 248.5,-288 353.5,-288 353.5,-288 359.5,-288 365.5,-294 365.5,-300 365.5,-300 365.5,-312 365.5,-312 365.5,-318 359.5,-324 353.5,-324"/>
|
||||
<text text-anchor="middle" x="301" y="-309" font-family="sans" font-size="10.00">phone_calls_r_features</text>
|
||||
<text text-anchor="middle" x="301" y="-298" font-family="sans" font-size="10.00">provider_key: rapids</text>
|
||||
</g>
|
||||
<!-- 4->8 -->
|
||||
<g id="edge15" class="edge">
|
||||
<title>4->8</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M282.18,-359.7C285,-351.81 288.39,-342.3 291.52,-333.55"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="294.82,-334.7 294.89,-324.1 288.23,-332.34 294.82,-334.7"/>
|
||||
</g>
|
||||
<!-- 5 -->
|
||||
<g id="node6" class="node">
|
||||
<title>5</title>
|
||||
<path fill="none" stroke="#afd856" stroke-width="2" d="M475.5,-468C475.5,-468 364.5,-468 364.5,-468 358.5,-468 352.5,-462 352.5,-456 352.5,-456 352.5,-444 352.5,-444 352.5,-438 358.5,-432 364.5,-432 364.5,-432 475.5,-432 475.5,-432 481.5,-432 487.5,-438 487.5,-444 487.5,-444 487.5,-456 487.5,-456 487.5,-462 481.5,-468 475.5,-468"/>
|
||||
<text text-anchor="middle" x="420" y="-453" font-family="sans" font-size="10.00">process_time_segments</text>
|
||||
<text text-anchor="middle" x="420" y="-442" font-family="sans" font-size="10.00">pid: nokia_0000003</text>
|
||||
</g>
|
||||
<!-- 5->4 -->
|
||||
<g id="edge12" class="edge">
|
||||
<title>5->4</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M384.77,-431.88C365.42,-422.47 341.23,-410.71 320.57,-400.67"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="321.89,-397.41 311.36,-396.19 318.83,-403.71 321.89,-397.41"/>
|
||||
</g>
|
||||
<!-- 5->8 -->
|
||||
<g id="edge16" class="edge">
|
||||
<title>5->8</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M415.13,-431.72C409.07,-412.57 397.25,-381.55 379,-360 369.03,-348.23 355.82,-337.94 343.12,-329.64"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="344.79,-326.55 334.45,-324.21 341.08,-332.49 344.79,-326.55"/>
|
||||
</g>
|
||||
<!-- 6 -->
|
||||
<g id="node7" class="node">
|
||||
<title>6</title>
|
||||
<path fill="none" stroke="#d86656" stroke-width="2" stroke-dasharray="5,2" d="M322.5,-468C322.5,-468 229.5,-468 229.5,-468 223.5,-468 217.5,-462 217.5,-456 217.5,-456 217.5,-444 217.5,-444 217.5,-438 223.5,-432 229.5,-432 229.5,-432 322.5,-432 322.5,-432 328.5,-432 334.5,-438 334.5,-444 334.5,-444 334.5,-456 334.5,-456 334.5,-462 328.5,-468 322.5,-468"/>
|
||||
<text text-anchor="middle" x="276" y="-447.5" font-family="sans" font-size="10.00">prepare_tzcodes_file</text>
|
||||
</g>
|
||||
<!-- 6->4 -->
|
||||
<g id="edge13" class="edge">
|
||||
<title>6->4</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M276,-431.7C276,-423.98 276,-414.71 276,-406.11"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="279.5,-406.1 276,-396.1 272.5,-406.1 279.5,-406.1"/>
|
||||
</g>
|
||||
<!-- 7 -->
|
||||
<g id="node8" class="node">
|
||||
<title>7</title>
|
||||
<path fill="none" stroke="#56d86b" stroke-width="2" stroke-dasharray="5,2" d="M370,-540C370,-540 182,-540 182,-540 176,-540 170,-534 170,-528 170,-528 170,-516 170,-516 170,-510 176,-504 182,-504 182,-504 370,-504 370,-504 376,-504 382,-510 382,-516 382,-516 382,-528 382,-528 382,-534 376,-540 370,-540"/>
|
||||
<text text-anchor="middle" x="276" y="-519.5" font-family="sans" font-size="10.00">query_usernames_device_empatica_ids</text>
|
||||
</g>
|
||||
<!-- 7->6 -->
|
||||
<g id="edge14" class="edge">
|
||||
<title>7->6</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M276,-503.7C276,-495.98 276,-486.71 276,-478.11"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="279.5,-478.1 276,-468.1 272.5,-478.1 279.5,-478.1"/>
|
||||
</g>
|
||||
<!-- 8->0 -->
|
||||
<g id="edge5" class="edge">
|
||||
<title>8->0</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M264.63,-287.8C250.06,-279.08 234.51,-267.11 225,-252 184.07,-186.97 182.71,-92.23 184.91,-46.17"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="188.41,-46.26 185.49,-36.07 181.42,-45.85 188.41,-46.26"/>
|
||||
</g>
|
||||
<!-- 9 -->
|
||||
<g id="node10" class="node">
|
||||
<title>9</title>
|
||||
<path fill="none" stroke="#d87556" stroke-width="2" d="M382,-252C382,-252 246,-252 246,-252 240,-252 234,-246 234,-240 234,-240 234,-228 234,-228 234,-222 240,-216 246,-216 246,-216 382,-216 382,-216 388,-216 394,-222 394,-228 394,-228 394,-240 394,-240 394,-246 388,-252 382,-252"/>
|
||||
<text text-anchor="middle" x="314" y="-237" font-family="sans" font-size="10.00">join_features_from_providers</text>
|
||||
<text text-anchor="middle" x="314" y="-226" font-family="sans" font-size="10.00">sensor_key: phone_calls</text>
|
||||
</g>
|
||||
<!-- 8->9 -->
|
||||
<g id="edge17" class="edge">
|
||||
<title>8->9</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M304.21,-287.7C305.65,-279.98 307.37,-270.71 308.96,-262.11"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="312.44,-262.58 310.82,-252.1 305.56,-261.3 312.44,-262.58"/>
|
||||
</g>
|
||||
<!-- 9->0 -->
|
||||
<g id="edge6" class="edge">
|
||||
<title>9->0</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M294.15,-215.87C283.81,-206.16 271.58,-193.31 263,-180 235.01,-136.57 243.3,-118.11 220,-72 215.36,-62.81 209.61,-53.14 204.23,-44.62"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="207.17,-42.72 198.81,-36.21 201.29,-46.51 207.17,-42.72"/>
|
||||
</g>
|
||||
<!-- 10 -->
|
||||
<g id="node11" class="node">
|
||||
<title>10</title>
|
||||
<path fill="none" stroke="#56d8d0" stroke-width="2" d="M526,-180C526,-180 284,-180 284,-180 278,-180 272,-174 272,-168 272,-168 272,-156 272,-156 272,-150 278,-144 284,-144 284,-144 526,-144 526,-144 532,-144 538,-150 538,-156 538,-156 538,-168 538,-168 538,-174 532,-180 526,-180"/>
|
||||
<text text-anchor="middle" x="405" y="-159.5" font-family="sans" font-size="10.00">merge_sensor_features_for_individual_participants</text>
|
||||
</g>
|
||||
<!-- 9->10 -->
|
||||
<g id="edge18" class="edge">
|
||||
<title>9->10</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M336.49,-215.7C347.96,-206.88 362.06,-196.03 374.48,-186.47"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="376.97,-188.98 382.76,-180.1 372.7,-183.43 376.97,-188.98"/>
|
||||
</g>
|
||||
<!-- 10->0 -->
|
||||
<g id="edge7" class="edge">
|
||||
<title>10->0</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M366.3,-143.85C346.21,-134.31 321.62,-121.63 301,-108 280.21,-94.25 277.55,-87.46 258,-72 245.35,-62 231.16,-51.3 218.81,-42.16"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="220.72,-39.22 210.59,-36.1 216.57,-44.85 220.72,-39.22"/>
|
||||
</g>
|
||||
<!-- 11 -->
|
||||
<g id="node12" class="node">
|
||||
<title>11</title>
|
||||
<path fill="none" stroke="#56d892" stroke-width="2" d="M528,-108C528,-108 322,-108 322,-108 316,-108 310,-102 310,-96 310,-96 310,-84 310,-84 310,-78 316,-72 322,-72 322,-72 528,-72 528,-72 534,-72 540,-78 540,-84 540,-84 540,-96 540,-96 540,-102 534,-108 528,-108"/>
|
||||
<text text-anchor="middle" x="425" y="-87.5" font-family="sans" font-size="10.00">merge_sensor_features_for_all_participants</text>
|
||||
</g>
|
||||
<!-- 10->11 -->
|
||||
<g id="edge19" class="edge">
|
||||
<title>10->11</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M409.94,-143.7C412.17,-135.9 414.85,-126.51 417.33,-117.83"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="420.73,-118.68 420.11,-108.1 414,-116.76 420.73,-118.68"/>
|
||||
</g>
|
||||
<!-- 11->0 -->
|
||||
<g id="edge8" class="edge">
|
||||
<title>11->0</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M367.08,-71.97C322.5,-58.85 262.21,-41.12 223.96,-29.87"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="224.84,-26.48 214.26,-27.02 222.87,-33.2 224.84,-26.48"/>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 13 KiB |
File diff suppressed because it is too large
Load Diff
After Width: | Height: | Size: 135 KiB |
|
@ -0,0 +1,68 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
|
||||
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<!-- Generated by graphviz version 2.43.0 (0)
|
||||
-->
|
||||
<!-- Title: snakemake_dag Pages: 1 -->
|
||||
<svg width="414pt" height="396pt"
|
||||
viewBox="0.00 0.00 414.00 396.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 392)">
|
||||
<title>snakemake_dag</title>
|
||||
<polygon fill="white" stroke="transparent" points="-4,4 -4,-392 410,-392 410,4 -4,4"/>
|
||||
<!-- 0 -->
|
||||
<g id="node1" class="node">
|
||||
<title>0</title>
|
||||
<text text-anchor="start" x="81" y="-71.6" font-family="sans" font-weight="bold" font-size="18.00">create_participants_files</text>
|
||||
<text text-anchor="start" x="81" y="-47.8" font-family="sans" font-size="10.00"> </text>
|
||||
<text text-anchor="start" x="85" y="-47.8" font-family="sans" font-weight="bold" font-size="14.00">↪ input</text>
|
||||
<text text-anchor="start" x="143" y="-47.8" font-family="sans" font-size="10.00"> </text>
|
||||
<text text-anchor="start" x="81" y="-28" font-family="monospace" font-size="10.00">data/external/example_participants.csv</text>
|
||||
<text text-anchor="start" x="319" y="-10" font-family="sans" font-size="10.00">  </text>
|
||||
<polygon fill="#acd957" stroke="#acd957" points="75,-62 75,-62 333,-62 333,-62 75,-62"/>
|
||||
<polygon fill="#acd957" stroke="#acd957" points="75,-22 75,-22 333,-22 333,-22 75,-22"/>
|
||||
<polygon fill="none" stroke="#acd957" stroke-width="2" points="74.5,-1 74.5,-91 331.5,-91 331.5,-1 74.5,-1"/>
|
||||
</g>
|
||||
<!-- 1 -->
|
||||
<g id="node2" class="node">
|
||||
<title>1</title>
|
||||
<text text-anchor="start" x="77" y="-221.6" font-family="sans" font-weight="bold" font-size="18.00">prepare_participants_csv</text>
|
||||
<text text-anchor="start" x="77" y="-197.8" font-family="sans" font-size="10.00"> </text>
|
||||
<text text-anchor="start" x="81" y="-197.8" font-family="sans" font-weight="bold" font-size="14.00">↪ input</text>
|
||||
<text text-anchor="start" x="139" y="-197.8" font-family="sans" font-size="10.00"> </text>
|
||||
<text text-anchor="start" x="77" y="-178" font-family="monospace" font-size="10.00">data/external/example_usernames.csv</text>
|
||||
<text text-anchor="start" x="251" y="-157.8" font-family="sans" font-size="10.00"> </text>
|
||||
<text text-anchor="start" x="255" y="-157.8" font-family="sans" font-weight="bold" font-size="14.00">output →</text>
|
||||
<text text-anchor="start" x="325" y="-157.8" font-family="sans" font-size="10.00"> </text>
|
||||
<text text-anchor="start" x="77" y="-138" font-family="monospace" font-size="10.00">data/external/example_participants.csv</text>
|
||||
<polygon fill="#57d99e" stroke="#57d99e" points="71,-212 71,-212 336,-212 336,-212 71,-212"/>
|
||||
<polygon fill="#57d99e" stroke="#57d99e" points="71,-172 71,-172 336,-172 336,-172 71,-172"/>
|
||||
<polygon fill="none" stroke="#57d99e" stroke-width="2" points="71,-129 71,-241 335,-241 335,-129 71,-129"/>
|
||||
</g>
|
||||
<!-- 1->0 -->
|
||||
<g id="edge1" class="edge">
|
||||
<title>1->0</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M203,-127.88C203,-119.48 203,-110.81 203,-102.42"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="206.5,-102.36 203,-92.36 199.5,-102.36 206.5,-102.36"/>
|
||||
</g>
|
||||
<!-- 2 -->
|
||||
<g id="node3" class="node">
|
||||
<title>2</title>
|
||||
<text text-anchor="start" x="7" y="-367.6" font-family="sans" font-weight="bold" font-size="18.00">query_usernames_device_empatica_ids</text>
|
||||
<text text-anchor="start" x="7" y="-346" font-family="sans" font-size="10.00">  </text>
|
||||
<text text-anchor="start" x="321" y="-325.8" font-family="sans" font-size="10.00"> </text>
|
||||
<text text-anchor="start" x="325" y="-325.8" font-family="sans" font-weight="bold" font-size="14.00">output →</text>
|
||||
<text text-anchor="start" x="395" y="-325.8" font-family="sans" font-size="10.00"> </text>
|
||||
<text text-anchor="start" x="7" y="-306" font-family="monospace" font-size="10.00">data/external/example_usernames.csv</text>
|
||||
<text text-anchor="start" x="7" y="-288" font-family="monospace" font-size="10.00">data/external/timezone.csv</text>
|
||||
<polygon fill="#86d957" stroke="#86d957" points="1,-358 1,-358 406,-358 406,-358 1,-358"/>
|
||||
<polygon fill="#86d957" stroke="#86d957" points="1,-340 1,-340 406,-340 406,-340 1,-340"/>
|
||||
<polygon fill="none" stroke="#86d957" stroke-width="2" points="1,-279 1,-387 405,-387 405,-279 1,-279"/>
|
||||
</g>
|
||||
<!-- 2->1 -->
|
||||
<g id="edge2" class="edge">
|
||||
<title>2->1</title>
|
||||
<path fill="none" stroke="grey" stroke-width="2" d="M203,-277.63C203,-269.45 203,-260.93 203,-252.53"/>
|
||||
<polygon fill="grey" stroke="grey" stroke-width="2" points="206.5,-252.36 203,-242.36 199.5,-252.36 206.5,-252.36"/>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 4.6 KiB |
|
@ -0,0 +1,71 @@
|
|||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||
from lightgbm import LGBMClassifier
|
||||
import xgboost as xg
|
||||
|
||||
class ClassificationModels():
    """Registry of classifiers, each paired with a four-slot metrics list.

    ``self.cmodels`` maps a model title to a dict with a freshly constructed
    estimator under ``'model'`` and a ``'metrics'`` list that starts at
    ``[0, 0, 0, 0]``. ``get_total_models_scores`` reports the four slots as
    accuracy, precision, recall and F1, in that order.
    """

    def __init__(self):
        self.cmodels = self.init_classification_models()

    def get_cmodels(self):
        """Return the dictionary of registered models."""
        return self.cmodels

    def init_classification_models(self):
        """Build the registry with all metric slots set to zero."""
        titled_estimators = [
            ('dummy_classifier', DummyClassifier(strategy="most_frequent")),
            ('logistic_regression', linear_model.LogisticRegression(max_iter=1000)),
            ('support_vector_machine', svm.SVC()),
            ('gaussian_naive_bayes', naive_bayes.GaussianNB()),
            ('stochastic_gradient_descent_classifier', linear_model.SGDClassifier()),
            ('knn', neighbors.KNeighborsClassifier()),
            ('decision_tree', tree.DecisionTreeClassifier()),
            ('random_forest_classifier', ensemble.RandomForestClassifier()),
            ('gradient_boosting_classifier', ensemble.GradientBoostingClassifier()),
            ('lgbm_classifier', LGBMClassifier()),
            ('XGBoost_classifier', xg.sklearn.XGBClassifier()),
        ]
        # Every entry gets its own metrics list so updates never leak between models.
        return {
            title: {'model': estimator, 'metrics': [0, 0, 0, 0]}
            for title, estimator in titled_estimators
        }

    def get_total_models_scores(self, n_clusters=1):
        """Print each model's metric slots divided by ``n_clusters``.

        NOTE(review): the division suggests callers accumulate the metrics
        over ``n_clusters`` runs before calling this - confirm with callers.
        """
        metric_labels = ("Acc:", "Precision:", "Recall:", "F1:")
        for title, entry in self.cmodels.items():
            print("\n************************************\n")
            print("Current model:", title, end="\n")
            for position, label in enumerate(metric_labels):
                print(label, entry['metrics'][position] / n_clusters)
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,163 @@
|
|||
# %%
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import yaml
|
||||
from pyprojroot import here
|
||||
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate, cross_val_predict
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.dummy import DummyRegressor
|
||||
from sklearn.decomposition import PCA
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
# Put the parent of the current working directory on the import path so the
# package imported right below can be resolved (presumably the project root
# when this is run from a subfolder - confirm).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
|
||||
|
||||
import machine_learning.helper
|
||||
|
||||
# %%
# Which time segment and which target the model input file is selected for.
segment = "intradaily_30_min"
target = "JCQ_job_demand"
csv_name = f"./data/{segment}_all_targets/input_{target}_mean.csv"
#csv_name = "./data/daily_18_hours_all_targets/input_JCQ_job_demand_mean.csv"

# %%
# Features, target and participant groups used by the cells further down.
data_x, data_y, data_groups = machine_learning.helper.prepare_model_input(csv_name)

# %%
data_y.head()

# %%
# Cross-validated scores of every regressor bundled in the helper module.
scores = machine_learning.helper.run_all_models(csv_name)
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": true}
# Leave-one-group-out splitter; the groups passed to it are `data_groups`.
logo = LeaveOneGroupOut()
logo.get_n_splits(
    data_x,
    data_y,
    groups=data_groups,
)

# %% [markdown]
# ### Baseline: Dummy Regression (mean)

# %%
# Kept in its own code cell: a statement placed inside the markdown cell above
# is not executed when this file is opened as a notebook.
dummy_regr = DummyRegressor(strategy="mean")

# %% jupyter={"source_hidden": true}
# NOTE: despite its name, `lin_reg_scores` holds the dummy regressor's scores.
lin_reg_scores = cross_validate(
    dummy_regr,
    X=data_x,
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
# Report the median over the cross-validation folds for each metric.
print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
print("R2", np.median(lin_reg_scores['test_r2']))
|
||||
|
||||
##################
|
||||
# %%
# Random forest, evaluated with the same splitter and metrics as the baseline.
chosen_model = "Random Forest"
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
rfr_score = cross_validate(
    rfr,
    X=data_x,
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
# Median over folds, printed in the same order as for the other models.
for _caption, _score_key in (
    ("Negative Mean Squared Error", 'test_neg_mean_squared_error'),
    ("Negative Mean Absolute Error", 'test_neg_mean_absolute_error'),
    ("Negative Root Mean Squared Error", 'test_neg_root_mean_squared_error'),
    ("R2", 'test_r2'),
):
    print(_caption, np.median(rfr_score[_score_key]))

# %%
# Out-of-fold predictions; a later model cell overwrites `y_predicted`.
y_predicted = cross_val_predict(rfr, data_x, data_y, groups=data_groups, cv=logo)
|
||||
#########################
|
||||
# %%
# Bayesian ridge, evaluated with the same splitter and metrics as the other models.
chosen_model = "Bayesian Ridge"
bayesian_ridge_reg = linear_model.BayesianRidge()
bayesian_ridge_reg_score = cross_validate(
    bayesian_ridge_reg,
    X=data_x,
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
)
# Median over folds. The MSE line was missing here although the score is
# computed above and reported for the other models.
print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
print("R2", np.median(bayesian_ridge_reg_score['test_r2']))

# %%
# Out-of-fold predictions of the model named in `chosen_model`.
y_predicted = cross_val_predict(bayesian_ridge_reg, data_x, data_y, groups=data_groups, cv=logo)
|
||||
|
||||
# %%
# Combine target, participant id and predictions in one frame.
# NOTE: this rebinds `data_y`, so the cell is not safe to run twice.
data_y = pd.concat([data_y, data_groups], axis=1).rename(columns={"target": "y_true"})
data_y["y_predicted"] = y_predicted

# %%
data_y.head()

# %%
# True vs. predicted values for the model named in `chosen_model`.
plot_title = ",".join([segment, target, chosen_model])
g1 = sns.relplot(data=data_y, x="y_true", y="y_predicted")
#g1.set_axis_labels("true", "predicted")
#g1.map(plt.axhline, y=0, color=".7", dashes=(2, 1), zorder=0)
#g1.map(plt.axline, xy1=(0,0), slope=1)
g1.set(title=plot_title)
display(g1)
g1.savefig("_".join([segment, target, chosen_model, "_relplot.pdf"]))
|
||||
|
||||
# %%
# Reshape so that y_true / y_predicted become rows: the part after "y_" goes
# into the new "value" index level and the numbers into column "y".
data_y_long = pd.wide_to_long(
    data_y.reset_index(),
    stubnames="y",
    i=["local_segment", "pid"],
    j="value",
    sep="_",
    suffix=".+",
)

# %%
data_y_long.head()
# %%
# Overlaid histograms of the "y" column, coloured by the "value" level.
g2 = sns.displot(data_y_long, x="y", hue="value", binwidth=0.1, height=5, aspect=1.5)
sns.move_legend(g2, "upper left", bbox_to_anchor=(.55, .45))
hist_title = ",".join([segment, target, chosen_model])
g2.set(title=hist_title)
g2.savefig("_".join([segment, target, chosen_model, "hist.pdf"]))
|
||||
|
||||
# %%
# Two-component PCA of the feature matrix.
pca = PCA(n_components=2)
pca.fit(data_x)
print(pca.explained_variance_ratio_)

# %%
data_x_pca = pca.fit_transform(data_x)
# Column labels must be an ordered list: a set has no defined order (the two
# components could end up swapped) and recent pandas rejects a set here.
data_pca = pd.concat(
    [
        data_y.reset_index()["y_true"],
        pd.DataFrame(data_x_pca, columns=["pca_0", "pca_1"]),
    ],
    axis=1,
)

# %%
data_pca
# %%

# Scatter of the two components, coloured by the true target value.
g3 = sns.relplot(data = data_pca, x = "pca_0", y = "pca_1", hue = "y_true", palette = sns.color_palette("Spectral", as_cmap=True))
g3.set(title=",".join([segment, target, chosen_model]) + "\n variance explained = " + str(round(sum(pca.explained_variance_ratio_), 2)))
g3.savefig("_".join([segment, target, chosen_model, "_PCA.pdf"]))
|
||||
|
||||
# %%
|
|
@ -1,6 +1,13 @@
|
|||
from pathlib import Path
|
||||
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.dummy import DummyRegressor
|
||||
from xgboost import XGBRegressor
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
|
||||
def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
|
||||
|
@ -55,3 +62,206 @@ def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> P
|
|||
export_filename = filename_prefix + "_" + data_type + ".csv"
|
||||
full_path = folder / export_filename
|
||||
return full_path
|
||||
|
||||
def insert_row(df, row):
    """Return a copy of ``df`` with ``row`` appended as its last record.

    ``row`` is a sequence of values given in the order of ``df.columns``.
    The input frame is left untouched and the result gets a fresh 0..n-1 index.
    """
    new_record = pd.DataFrame([row], columns=df.columns)
    stacked = [df, new_record]
    return pd.concat(stacked, ignore_index=True)
|
||||
|
||||
def prepare_model_input(input_csv):
    """Load a model-input CSV and split it into features, target and groups.

    The four ``local_segment*`` columns become the index. ``target`` and
    ``pid`` are split off; the remaining columns form the feature matrix, in
    which the listed categorical columns get their missing values replaced by
    the column mode and are then one-hot encoded.

    Args:
        input_csv: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
            It must contain the index columns, ``target``, ``pid`` and the
            categorical columns named in the body.

    Returns:
        A tuple ``(train_x, data_y, data_groups)``: the feature frame
        (numerical columns first, encoded categorical columns after them),
        the ``target`` column and the ``pid`` column.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
    model_input = pd.read_csv(input_csv).set_index(index_columns)

    data_y = model_input["target"]
    data_groups = model_input["pid"]
    data_x = model_input.drop(["target", "pid"], axis=1)

    categorical_feature_colnames = ["gender", "startlanguage", "limesurvey_demand_control_ratio_quartile"]
    #TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
    #additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
    #TODO: check if mostcommonactivity is indeed a categorical features after aggregating
    #categorical_feature_colnames += additional_categorical_features
    categorical_features = data_x[categorical_feature_colnames].copy()

    # The emptiness check has to come first: for a frame without rows
    # ``mode()`` is empty too, and taking its first row raises IndexError.
    if not categorical_features.empty:
        mode_categorical_features = categorical_features.mode().iloc[0]
        # fillna with mode
        categorical_features = categorical_features.fillna(mode_categorical_features)
        # one-hot encoding
        categorical_features = categorical_features.apply(lambda col: col.astype("category"))
        categorical_features = pd.get_dummies(categorical_features)

    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
    train_x = pd.concat([numerical_features, categorical_features], axis=1)

    return train_x, data_y, data_groups
|
||||
|
||||
|
||||
def run_all_models(input_csv):
    """Cross-validate a fixed set of regressors on one model-input CSV.

    The data is loaded with ``prepare_model_input`` and every regressor is
    scored with leave-one-group-out cross-validation (groups are the ``pid``
    values returned by ``prepare_model_input``) using the R2 metric. For each
    model a heading and the median fold score are printed.

    Returns a DataFrame with one row per model and the columns ``method``,
    ``median`` and ``max`` (median and maximum of the per-fold R2 scores).
    """
    # Prepare data
    train_x, data_y, data_groups = prepare_model_input(input_csv)

    # Prepare cross validation
    logo = LeaveOneGroupOut()
    logo.get_n_splits(train_x, data_y, groups=data_groups)

    # One entry per model: (printed heading, label stored in the score table,
    # estimator class, constructor keyword arguments). Order matters: it is
    # the order of both the console output and the returned rows.
    candidates = [
        ("Linear regression:", "Linear regression", linear_model.LinearRegression, {}),
        ("Ridge regression", "Ridge regression", linear_model.Ridge, {"alpha": .5}),
        ("Lasso regression", "Lasso regression", linear_model.Lasso, {"alpha": 0.1}),
        ("Bayesian Ridge", "Bayesian Ridge", linear_model.BayesianRidge, {}),
        ("RANSAC (outlier robust regression)", "RANSAC", linear_model.RANSACRegressor, {}),
        ("Support vector regression", "Support vector regression", svm.SVR, {}),
        ("Kernel Ridge regression", "Kernel Ridge regression", kernel_ridge.KernelRidge, {}),
        ("Gaussian Process Regression", "Gaussian Process Regression", gaussian_process.GaussianProcessRegressor, {}),
        ("Random Forest Regression", "Random Forest Regression", ensemble.RandomForestRegressor, {"max_features": 0.3, "n_jobs": -1}),
        ("XGBoost Regressor", "XGBoost Regressor", XGBRegressor, {}),
        ("ADA Boost Regressor", "ADA Boost Regressor", ensemble.AdaBoostRegressor, {}),
    ]

    scores = pd.DataFrame(columns=["method", "median", "max"])

    # Validate models
    for heading, method_label, estimator_class, estimator_kwargs in candidates:
        fold_scores = cross_val_score(
            estimator_class(**estimator_kwargs),
            X=train_x,
            y=data_y,
            groups=data_groups,
            cv=logo,
            n_jobs=-1,
            scoring="r2",
        )
        print(heading)
        print(np.median(fold_scores))
        scores = insert_row(scores, [method_label, np.median(fold_scores), np.max(fold_scores)])

    return scores
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
import datetime
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the parent of the current working directory to the import path so the
# project-local imports that follow can be resolved.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
|
||||
|
||||
import pandas as pd
|
||||
from features.timezone import get_timezone_data
|
||||
from pyprojroot import here
|
||||
|
||||
import participants.query_db
|
||||
|
||||
# Usernames queried from the database for the main study.
participants_inactive_usernames = participants.query_db.get_usernames(
    tester=False,  # True participants are wanted.
    active=False,  # They have all finished their participation.
    # This is the timeframe of the main study.
    collection_start=datetime.date.fromisoformat("2020-08-01"),
    last_upload=datetime.date.fromisoformat("2021-09-01"),
)

# Per-country overview sheets, read from the folder configured in Snakemake.
baseline_folder = snakemake.params["baseline_folder"]
participants_overview_si = pd.read_csv(
    baseline_folder + "Participants_overview_Slovenia.csv", sep=";"
)
participants_overview_be = pd.read_csv(
    baseline_folder + "Participants_overview_Belgium.csv", sep=";"
)

# Drop rows whose wristband serial number is the literal "DECLINED" (Slovenia)
# or whose phone description starts with "Not" (Belgium).
wristband_not_declined = participants_overview_si["Wristband_SerialNo"] != "DECLINED"
participants_true_si = participants_overview_si[wristband_not_declined]
phone_description_prefix = participants_overview_be["SmartphoneBrand+Generation"].str.slice(0, 3)
participants_true_be = participants_overview_be[phone_description_prefix != "Not"]

# Concatenate participants from both countries.
participants_usernames_empatica = pd.concat(
    [participants_true_be, participants_true_si]
)
# Filter only the participants from the main study (queried from the database).
in_main_study = participants_usernames_empatica["Username"].isin(
    participants_inactive_usernames
)
participants_usernames_empatica = participants_usernames_empatica[in_main_study]
# Rename and select columns.
participants_usernames_empatica = participants_usernames_empatica.rename(
    columns={"Username": "label", "Wristband_SerialNo": "empatica_id"}
)
participants_usernames_empatica = participants_usernames_empatica[
    ["label", "empatica_id"]
]
# Adapt for csv export: commas inside the id field are swapped for semicolons.
participants_usernames_empatica["empatica_id"] = participants_usernames_empatica[
    "empatica_id"
].str.replace(",", ";")

# Both exports share the same CSV options.
# NOTE(review): newer pandas releases spell this keyword `lineterminator` -
# confirm against the pandas version pinned for this project.
csv_export_options = dict(header=True, index=False, line_terminator="\n")

participants_usernames_empatica.to_csv(
    snakemake.output["usernames_file"],
    **csv_export_options,
)

timezone_df = get_timezone_data(participants_inactive_usernames)

timezone_df.to_csv(
    snakemake.output["timezone_file"],
    **csv_export_options,
)
|
|
@ -0,0 +1 @@
|
|||
Subproject commit f78aa3e7b3567423b44045766b230cd60d557cb0
|
|
@ -6,7 +6,7 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.12.0
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
|
@ -14,25 +14,7 @@
|
|||
# ---
|
||||
|
||||
# %%
|
||||
# %matplotlib inline
|
||||
import datetime
|
||||
import os
|
||||
import sys
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
import participants.query_db
|
||||
from features.esm import *
|
||||
|
||||
# %%
|
||||
SAVE_FIGS = True
|
||||
SAVE_FIGS = False
|
||||
FIG_HEIGHT = 5
|
||||
FIG_ASPECT = 1.7
|
||||
FIG_COLOUR = "#28827C"
|
||||
|
|
Loading…
Reference in New Issue