Compare commits
80 Commits
c66e046014
...
777d30365e
Author | SHA1 | Date |
---|---|---|
junos | 777d30365e | |
junos | ca85131ed2 | |
junos | 2aa5c5cb07 | |
junos | 82b53bc0d3 | |
junos | c688580fe8 | |
junos | 8c0b66eddc | |
junos | e3ff4846e1 | |
junos | 825380a47e | |
junos | ef26772038 | |
junos | 64c05ec5ec | |
junos | c0236b251c | |
junos | 2f22f2052a | |
junos | ec51d7d406 | |
junos | 2aca64aa09 | |
junos | 9a87a0a34a | |
junos | 91a9f20839 | |
junos | e3be17e56e | |
junos | 5c0e2e2621 | |
junos | 51201c2bc9 | |
junos | fed4b33611 | |
junos | 471ce7c2cb | |
junos | dbb2033f78 | |
junos | 1b77fb119c | |
junos | ae2ca63bc4 | |
junos | 577f1330da | |
junos | 4af360f411 | |
junos | 96bbe32f56 | |
junos | 40170339c2 | |
junos | 7b0c0037f7 | |
junos | db06584ddd | |
junos | 112d968715 | |
junos | 9cc6bf7c21 | |
junos | 78807b941c | |
junos | a9af113c9c | |
junos | 97113fe9ab | |
junos | bc78a1d498 | |
junos | aca84b214d | |
junos | aa13123136 | |
junos | c51e0da0f7 | |
junos | a2401b5e36 | |
junos | 70232949c3 | |
junos | 8a9595c615 | |
junos | 045b9fa0dc | |
junos | bb4445f1a8 | |
junos | 1318ae3609 | |
junos | 45441c288d | |
junos | fa45b30955 | |
junos | 2336edffb6 | |
junos | b756ed5feb | |
junos | cad28c3fe8 | |
junos | 38a405d378 | |
junos | 2c5a0b4157 | |
junos | 0409c9e982 | |
junos | a7446cc34a | |
junos | 118e686491 | |
junos | 9417a1b9f1 | |
junos | 7b5db88f1d | |
junos | 0f8f0b0fb6 | |
junos | 26c7d22b83 | |
junos | 87781840d4 | |
junos | 3091328fc5 | |
junos | 055e87dbac | |
junos | f58d20ffc2 | |
junos | 075fdab9ea | |
junos | 91e7352480 | |
junos | 35c09374dd | |
junos | b505fb2b6a | |
junos | 47b1ecdbb9 | |
junos | 24744c288d | |
junos | caeaf03239 | |
junos | cd5d8b6a10 | |
junos | 3e38b64b45 | |
junos | 76071fd550 | |
Primoz | 26804cf8ea | |
Primoz | 865225994b | |
Primoz | 259be708aa | |
Primoz | 0594993133 | |
Primoz | 1cbc743cf7 | |
Primoz | 2a8f1ee613 | |
Primoz | ce13a9e13b |
|
@ -0,0 +1,9 @@
|
||||||
|
[flake8]
|
||||||
|
max-line-length = 88
|
||||||
|
extend-ignore =
|
||||||
|
E203,
|
||||||
|
# E501 line too long for docstrings
|
||||||
|
D501
|
||||||
|
per-file-ignores =
|
||||||
|
exploration/*.py:E501
|
||||||
|
docstring-convention = numpy
|
|
@ -12,12 +12,15 @@ __pycache__/
|
||||||
/data/*input*.csv
|
/data/*input*.csv
|
||||||
/data/daily*
|
/data/daily*
|
||||||
/data/intradaily*
|
/data/intradaily*
|
||||||
|
/data/raw
|
||||||
/data/stressfulness_event*
|
/data/stressfulness_event*
|
||||||
/data/30min*
|
/data/30min*
|
||||||
/presentation/*scores.csv
|
/presentation/*scores.csv
|
||||||
/presentation/Results.ods
|
/presentation/Results.ods
|
||||||
|
/presentation/results/
|
||||||
.Rproj.user
|
.Rproj.user
|
||||||
.Rhistory
|
.Rhistory
|
||||||
/presentation/*.nb.html
|
/presentation/*.nb.html
|
||||||
presentation/event_stressful_detection_half_loso.csv
|
presentation/event_stressful_detection_half_loso.csv
|
||||||
presentation/event_stressful_detection_loso.csv
|
presentation/event_stressful_detection_loso.csv
|
||||||
|
/statistical_analysis/scale_reliability.nb.html
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
<component name="ProjectCodeStyleConfiguration">
|
||||||
|
<code_scheme name="Project" version="173">
|
||||||
|
<option name="RIGHT_MARGIN" value="150" />
|
||||||
|
<option name="SOFT_MARGINS" value="88" />
|
||||||
|
</code_scheme>
|
||||||
|
</component>
|
|
@ -0,0 +1,5 @@
|
||||||
|
<component name="ProjectCodeStyleConfiguration">
|
||||||
|
<state>
|
||||||
|
<option name="USE_PER_PROJECT_SETTINGS" value="true" />
|
||||||
|
</state>
|
||||||
|
</component>
|
|
@ -0,0 +1,3 @@
|
||||||
|
<component name="ProjectDictionaryState">
|
||||||
|
<dictionary name="junos" />
|
||||||
|
</component>
|
|
@ -17,7 +17,15 @@
|
||||||
</RMarkdownRenderProfile>
|
</RMarkdownRenderProfile>
|
||||||
</value>
|
</value>
|
||||||
</entry>
|
</entry>
|
||||||
|
<entry key="file://$PROJECT_DIR$/statistical_analysis/scale_reliability.rmd">
|
||||||
|
<value>
|
||||||
|
<RMarkdownRenderProfile>
|
||||||
|
<option name="lastOutput" value="$PROJECT_DIR$/statistical_analysis/scale_reliability.nb.html" />
|
||||||
|
<option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/statistical_analysis" />
|
||||||
|
</RMarkdownRenderProfile>
|
||||||
|
</value>
|
||||||
|
</entry>
|
||||||
</map>
|
</map>
|
||||||
</option>
|
</option>
|
||||||
</component>
|
</component>
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="RGraphicsSettings">
|
||||||
|
<option name="height" value="600" />
|
||||||
|
<option name="resolution" value="75" />
|
||||||
|
<option name="version" value="2" />
|
||||||
|
<option name="width" value="960" />
|
||||||
|
</component>
|
||||||
|
</project>
|
|
@ -0,0 +1,7 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="RMarkdownGraphicsSettings">
|
||||||
|
<option name="globalResolution" value="75" />
|
||||||
|
<option name="version" value="2" />
|
||||||
|
</component>
|
||||||
|
</project>
|
|
@ -0,0 +1,6 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="RSettings">
|
||||||
|
<option name="interpreterPath" value="C:\Program Files\R\R-4.3.1\bin\R.exe" />
|
||||||
|
</component>
|
||||||
|
</project>
|
|
@ -0,0 +1,30 @@
|
||||||
|
repos:
|
||||||
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
|
rev: v4.4.0
|
||||||
|
hooks:
|
||||||
|
- id: check-yaml
|
||||||
|
- id: end-of-file-fixer
|
||||||
|
- id: trailing-whitespace
|
||||||
|
- repo: https://github.com/pycqa/isort
|
||||||
|
rev: 5.12.0
|
||||||
|
hooks:
|
||||||
|
- id: isort
|
||||||
|
name: isort (python)
|
||||||
|
- repo: https://github.com/psf/black
|
||||||
|
rev: 23.3.0
|
||||||
|
hooks:
|
||||||
|
- id: black
|
||||||
|
language_version: python3
|
||||||
|
- repo: https://github.com/pycqa/flake8
|
||||||
|
rev: 6.0.0
|
||||||
|
hooks:
|
||||||
|
- id: flake8
|
||||||
|
# - repo: https://github.com/mwouts/jupytext
|
||||||
|
# rev: v1.14.7
|
||||||
|
# hooks:
|
||||||
|
# - id: jupytext
|
||||||
|
# args: [ --from, "py:percent", --to, "ipynb" ]
|
||||||
|
# additional_dependencies:
|
||||||
|
# - isort==5.12.0 # Matches hook
|
||||||
|
# - black==23.3.0
|
||||||
|
# - flake8==6.0.0
|
|
@ -6,6 +6,7 @@ dependencies:
|
||||||
- black
|
- black
|
||||||
- isort
|
- isort
|
||||||
- flake8
|
- flake8
|
||||||
|
- flake8-docstrings
|
||||||
- imbalanced-learn=0.10.0
|
- imbalanced-learn=0.10.0
|
||||||
- jupyterlab
|
- jupyterlab
|
||||||
- jupytext
|
- jupytext
|
||||||
|
@ -14,6 +15,7 @@ dependencies:
|
||||||
- nodejs
|
- nodejs
|
||||||
- pandas
|
- pandas
|
||||||
- psycopg2 >= 2.9.1
|
- psycopg2 >= 2.9.1
|
||||||
|
- pre-commit
|
||||||
- python-dotenv
|
- python-dotenv
|
||||||
- pytz
|
- pytz
|
||||||
- pyprojroot
|
- pyprojroot
|
||||||
|
@ -23,4 +25,4 @@ dependencies:
|
||||||
- sqlalchemy
|
- sqlalchemy
|
||||||
- statsmodels
|
- statsmodels
|
||||||
- tabulate
|
- tabulate
|
||||||
- xgboost
|
- xgboost
|
||||||
|
|
|
@ -14,15 +14,9 @@
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
import os, sys
|
|
||||||
import importlib
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
# import plotly.graph_objects as go
|
from rapids.src.features.utils.utils import chunk_episodes
|
||||||
from importlib import util
|
|
||||||
from pathlib import Path
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
phone_data_yield = pd.read_csv(
|
phone_data_yield = pd.read_csv(
|
||||||
|
@ -36,23 +30,29 @@ time_segments_labels = pd.read_csv(
|
||||||
# %%
|
# %%
|
||||||
phone_data_yield["assigned_segments"] = phone_data_yield[
|
phone_data_yield["assigned_segments"] = phone_data_yield[
|
||||||
"assigned_segments"
|
"assigned_segments"
|
||||||
].str.replace(r"_RR\d+SS#", "#")
|
].str.replace(r"_RR\d+SS#", "#", regex=True)
|
||||||
time_segments_labels["label"] = time_segments_labels["label"].str.replace(
|
time_segments_labels["label"] = time_segments_labels["label"].str.replace(
|
||||||
r"_RR\d+SS$", ""
|
r"_RR\d+SS$", "", regex=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# %% tags=[]
|
# %% tags=[]
|
||||||
def filter_data_by_segment(data, time_segment):
|
def filter_data_by_segment(data, time_segment_current):
|
||||||
data.dropna(subset=["assigned_segments"], inplace=True)
|
data.dropna(subset=["assigned_segments"], inplace=True)
|
||||||
if data.shape[0] == 0: # data is empty
|
if data.shape[0] == 0: # data is empty
|
||||||
data["local_segment"] = data["timestamps_segment"] = None
|
data["local_segment"] = data["timestamps_segment"] = None
|
||||||
return data
|
return data
|
||||||
|
|
||||||
datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
|
datetime_regex = (
|
||||||
timestamps_regex = "[0-9]{13}"
|
r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
|
||||||
segment_regex = "\[({}#{},{};{},{})\]".format(
|
)
|
||||||
time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
|
timestamps_regex = r"[0-9]{13}"
|
||||||
|
segment_regex = r"\[({}#{},{};{},{})\]".format(
|
||||||
|
time_segment_current,
|
||||||
|
datetime_regex,
|
||||||
|
datetime_regex,
|
||||||
|
timestamps_regex,
|
||||||
|
timestamps_regex,
|
||||||
)
|
)
|
||||||
data["local_segment"] = data["assigned_segments"].str.extract(
|
data["local_segment"] = data["assigned_segments"].str.extract(
|
||||||
segment_regex, expand=True
|
segment_regex, expand=True
|
||||||
|
@ -147,14 +147,17 @@ def getDataForPlot(phone_data_yield_per_segment):
|
||||||
.fillna(0)
|
.fillna(0)
|
||||||
)
|
)
|
||||||
|
|
||||||
# transpose the dataframe per local start datetime of the segment and discard the useless index layer
|
# transpose the dataframe per local start datetime of the segment
|
||||||
|
# and discard the useless index layer
|
||||||
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
||||||
"local_segment_start_datetimes"
|
"local_segment_start_datetimes"
|
||||||
)[["minutes_after_segment_start", "sensor"]].apply(
|
)[["minutes_after_segment_start", "sensor"]].apply(
|
||||||
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
||||||
)
|
)
|
||||||
phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
|
phone_data_yield_per_segment.index = (
|
||||||
"local_segment_start_datetimes"
|
phone_data_yield_per_segment.index.get_level_values(
|
||||||
|
"local_segment_start_datetimes"
|
||||||
|
)
|
||||||
)
|
)
|
||||||
return phone_data_yield_per_segment
|
return phone_data_yield_per_segment
|
||||||
|
|
||||||
|
@ -227,9 +230,13 @@ phone_data_yield_per_segment.tail()
|
||||||
# # A workaround
|
# # A workaround
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
phone_data_yield_per_segment["local_segment_start_datetimes", "minutes_after_segment_start"] = phone_data_yield_per_segment[
|
phone_data_yield_per_segment[
|
||||||
|
"local_segment_start_datetimes", "minutes_after_segment_start"
|
||||||
|
] = phone_data_yield_per_segment[
|
||||||
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
||||||
].drop_duplicates(keep="first")
|
].drop_duplicates(
|
||||||
|
keep="first"
|
||||||
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
phone_data_yield_per_segment.set_index(
|
phone_data_yield_per_segment.set_index(
|
||||||
|
@ -244,8 +251,9 @@ phone_data_yield_per_segment.head()
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# # Retry
|
# # Retry
|
||||||
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
def getDataForPlot(phone_data_yield_per_segment):
|
def get_data_for_plot(phone_data_yield_per_segment):
|
||||||
# calculate the length (in minute) of per segment instance
|
# calculate the length (in minute) of per segment instance
|
||||||
phone_data_yield_per_segment["length"] = (
|
phone_data_yield_per_segment["length"] = (
|
||||||
phone_data_yield_per_segment["timestamps_segment"]
|
phone_data_yield_per_segment["timestamps_segment"]
|
||||||
|
@ -292,7 +300,10 @@ def getDataForPlot(phone_data_yield_per_segment):
|
||||||
full_index,
|
full_index,
|
||||||
names=("local_segment_start_datetimes", "minutes_after_segment_start"),
|
names=("local_segment_start_datetimes", "minutes_after_segment_start"),
|
||||||
)
|
)
|
||||||
phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(subset=["local_segment_start_datetimes", "minutes_after_segment_start"],keep="first")
|
phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(
|
||||||
|
subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
|
||||||
|
keep="first",
|
||||||
|
)
|
||||||
phone_data_yield_per_segment = (
|
phone_data_yield_per_segment = (
|
||||||
phone_data_yield_per_segment.set_index(
|
phone_data_yield_per_segment.set_index(
|
||||||
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
||||||
|
@ -302,14 +313,17 @@ def getDataForPlot(phone_data_yield_per_segment):
|
||||||
.fillna(0)
|
.fillna(0)
|
||||||
)
|
)
|
||||||
|
|
||||||
# transpose the dataframe per local start datetime of the segment and discard the useless index layer
|
# transpose the dataframe per local start datetime of the segment
|
||||||
|
# and discard the useless index layer
|
||||||
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
||||||
"local_segment_start_datetimes"
|
"local_segment_start_datetimes"
|
||||||
)[["minutes_after_segment_start", "sensor"]].apply(
|
)[["minutes_after_segment_start", "sensor"]].apply(
|
||||||
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
||||||
)
|
)
|
||||||
phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
|
phone_data_yield_per_segment.index = (
|
||||||
"local_segment_start_datetimes"
|
phone_data_yield_per_segment.index.get_level_values(
|
||||||
|
"local_segment_start_datetimes"
|
||||||
|
)
|
||||||
)
|
)
|
||||||
return phone_data_yield_per_segment
|
return phone_data_yield_per_segment
|
||||||
|
|
||||||
|
@ -318,6 +332,6 @@ def getDataForPlot(phone_data_yield_per_segment):
|
||||||
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
|
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
|
data_for_plot_per_segment = get_data_for_plot(phone_data_yield_per_segment)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
# extension: .py
|
# extension: .py
|
||||||
# format_name: percent
|
# format_name: percent
|
||||||
# format_version: '1.3'
|
# format_version: '1.3'
|
||||||
# jupytext_version: 1.13.0
|
# jupytext_version: 1.14.5
|
||||||
# kernelspec:
|
# kernelspec:
|
||||||
# display_name: straw2analysis
|
# display_name: straw2analysis
|
||||||
# language: python
|
# language: python
|
||||||
|
@ -15,19 +15,33 @@
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
|
|
||||||
nb_dir = os.path.split(os.getcwd())[0]
|
|
||||||
if nb_dir not in sys.path:
|
|
||||||
sys.path.append(nb_dir)
|
|
||||||
import participants.query_db
|
import participants.query_db
|
||||||
from features.esm import *
|
from features.esm import (
|
||||||
from features.esm_JCQ import *
|
QUESTIONNAIRE_IDS,
|
||||||
from features.esm_SAM import *
|
clean_up_esm,
|
||||||
|
get_esm_data,
|
||||||
|
increment_answers,
|
||||||
|
preprocess_esm,
|
||||||
|
reassign_question_ids,
|
||||||
|
)
|
||||||
|
from features.esm_COPE import DICT_COPE_QUESTION_IDS
|
||||||
|
from features.esm_JCQ import reverse_jcq_demand_control_scoring
|
||||||
|
from features.esm_SAM import DICT_SAM_QUESTION_IDS, extract_stressful_events
|
||||||
|
|
||||||
|
# import os
|
||||||
|
# import sys
|
||||||
|
# nb_dir = os.path.split(os.getcwd())[0]
|
||||||
|
# if nb_dir not in sys.path:
|
||||||
|
# sys.path.append(nb_dir)
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
save_figs = False
|
||||||
|
export_data = True
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
participants_inactive_usernames = participants.query_db.get_usernames(
|
participants_inactive_usernames = participants.query_db.get_usernames(
|
||||||
|
@ -43,8 +57,14 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_PANAS = df_esm_preprocessed[
|
df_esm_PANAS = df_esm_preprocessed[
|
||||||
(df_esm_preprocessed["questionnaire_id"] == 8)
|
(
|
||||||
| (df_esm_preprocessed["questionnaire_id"] == 9)
|
df_esm_preprocessed["questionnaire_id"]
|
||||||
|
== QUESTIONNAIRE_IDS["PANAS_positive_affect"]
|
||||||
|
)
|
||||||
|
| (
|
||||||
|
df_esm_preprocessed["questionnaire_id"]
|
||||||
|
== QUESTIONNAIRE_IDS["PANAS_negative_affect"]
|
||||||
|
)
|
||||||
]
|
]
|
||||||
df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)
|
df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)
|
||||||
|
|
||||||
|
@ -65,35 +85,47 @@ df_esm_PANAS_daily_means = (
|
||||||
# %%
|
# %%
|
||||||
df_esm_PANAS_summary_participant = (
|
df_esm_PANAS_summary_participant = (
|
||||||
df_esm_PANAS_daily_means.groupby(["participant_id", "questionnaire_id"])
|
df_esm_PANAS_daily_means.groupby(["participant_id", "questionnaire_id"])
|
||||||
.agg(["mean", "median", "std"])
|
.esm_numeric_mean.agg(["mean", "median", "std"])
|
||||||
.reset_index(col_level=1)
|
.reset_index(col_level=1)
|
||||||
)
|
)
|
||||||
df_esm_PANAS_summary_participant.columns = df_esm_PANAS_summary_participant.columns.get_level_values(
|
|
||||||
1
|
|
||||||
)
|
|
||||||
df_esm_PANAS_summary_participant[
|
df_esm_PANAS_summary_participant[
|
||||||
"PANAS_subscale"
|
"PANAS subscale"
|
||||||
] = df_esm_PANAS_daily_means.questionnaire_id.astype("category").cat.rename_categories(
|
] = df_esm_PANAS_daily_means.questionnaire_id.astype("category").cat.rename_categories(
|
||||||
{8.0: "PA", 9.0: "NA"}
|
{8.0: "positive affect", 9.0: "negative affect"}
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(
|
df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["mean"]
|
||||||
data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS_subscale", binwidth=0.2
|
|
||||||
|
# %%
|
||||||
|
df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["std"]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_esm_PANAS_summary_participant.query("std == 0")
|
||||||
|
|
||||||
|
# %%
|
||||||
|
fig1 = sns.displot(
|
||||||
|
data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS subscale", binwidth=0.2
|
||||||
)
|
)
|
||||||
|
fig1.set_axis_labels(x_var="participant mean", y_var="frequency")
|
||||||
|
if save_figs:
|
||||||
|
fig1.figure.savefig("PANAS_mean_participant.pdf", dpi=300)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(
|
sns.displot(
|
||||||
data=df_esm_PANAS_summary_participant,
|
data=df_esm_PANAS_summary_participant,
|
||||||
x="median",
|
x="median",
|
||||||
hue="PANAS_subscale",
|
hue="PANAS subscale",
|
||||||
binwidth=0.2,
|
binwidth=0.2,
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(
|
fig2 = sns.displot(
|
||||||
data=df_esm_PANAS_summary_participant, x="std", hue="PANAS_subscale", binwidth=0.05
|
data=df_esm_PANAS_summary_participant, x="std", hue="PANAS subscale", binwidth=0.05
|
||||||
)
|
)
|
||||||
|
fig2.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
||||||
|
if save_figs:
|
||||||
|
fig2.figure.savefig("PANAS_std_participant.pdf", dpi=300)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
|
df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
|
||||||
|
@ -109,8 +141,14 @@ df_SAM_all.head()
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM = df_esm_preprocessed[
|
df_esm_SAM = df_esm_preprocessed[
|
||||||
(df_esm_preprocessed["questionnaire_id"] >= 87)
|
(
|
||||||
& (df_esm_preprocessed["questionnaire_id"] <= 93)
|
df_esm_preprocessed["questionnaire_id"]
|
||||||
|
>= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
||||||
|
)
|
||||||
|
& (
|
||||||
|
df_esm_preprocessed["questionnaire_id"]
|
||||||
|
<= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
|
||||||
|
)
|
||||||
]
|
]
|
||||||
df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
|
df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
|
||||||
|
|
||||||
|
@ -118,9 +156,10 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
|
||||||
# ## Stressful events
|
# ## Stressful events
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign(
|
df_esm_SAM_event = df_esm_SAM_clean[
|
||||||
stressful_event=lambda x: (x.esm_user_answer_numeric > 0)
|
df_esm_SAM_clean["questionnaire_id"]
|
||||||
)
|
== QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
||||||
|
].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0))
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_daily_events = (
|
df_esm_SAM_daily_events = (
|
||||||
|
@ -131,20 +170,22 @@ df_esm_SAM_daily_events = (
|
||||||
)
|
)
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# Calculate the daily mean of YES (1) or NO (0) answers to the question about a stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
|
# Calculate the daily mean of YES (1) or NO (0) answers to the question about stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_event_summary_participant = (
|
df_esm_SAM_event_summary_participant = (
|
||||||
df_esm_SAM_daily_events.groupby(["participant_id"])
|
df_esm_SAM_daily_events.groupby(["participant_id"])
|
||||||
.agg(["mean", "median", "std"])
|
.SAM_event_ratio.agg(["mean", "median", "std"])
|
||||||
.reset_index(col_level=1)
|
.reset_index(col_level=1)
|
||||||
)
|
)
|
||||||
df_esm_SAM_event_summary_participant.columns = df_esm_SAM_event_summary_participant.columns.get_level_values(
|
|
||||||
1
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
|
fig6 = sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
|
||||||
|
fig6.set_axis_labels(
|
||||||
|
x_var="participant proportion of stressful events", y_var="frequency"
|
||||||
|
)
|
||||||
|
if save_figs:
|
||||||
|
fig6.figure.savefig("SAM_events_mean_participant.pdf", dpi=300)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
|
sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
|
||||||
|
@ -155,7 +196,12 @@ sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# * Example of threat: "Did this event make you feel anxious?"
|
# * Example of threat: "Did this event make you feel anxious?"
|
||||||
# * Example of challenge: "How eager are you to tackle this event?"
|
# * Example of challenge: "How eager are you to tackle this event?"
|
||||||
# * Possible answers: 0 - Not at all, 1 - Slightly, 2 - Moderately, 3 - Considerably, 4 - Extremely
|
# * Possible answers:
|
||||||
|
# 0 - Not at all,
|
||||||
|
# 1 - Slightly,
|
||||||
|
# 2 - Moderately,
|
||||||
|
# 3 - Considerably,
|
||||||
|
# 4 - Extremely
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_daily = (
|
df_esm_SAM_daily = (
|
||||||
|
@ -167,27 +213,45 @@ df_esm_SAM_daily = (
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
|
df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
|
||||||
(df_esm_SAM_daily["questionnaire_id"] == 88)
|
(df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
|
||||||
| (df_esm_SAM_daily["questionnaire_id"] == 89)
|
| (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
|
||||||
]
|
]
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_summary_participant = (
|
df_esm_SAM_summary_participant = (
|
||||||
df_esm_SAM_daily.groupby(["participant_id", "questionnaire_id"])
|
df_esm_SAM_daily.groupby(["participant_id", "questionnaire_id"])
|
||||||
.agg(["mean", "median", "std"])
|
.esm_numeric_mean.agg(["mean", "median", "std"])
|
||||||
.reset_index(col_level=1)
|
.reset_index(col_level=1)
|
||||||
)
|
)
|
||||||
df_esm_SAM_summary_participant.columns = df_esm_SAM_summary_participant.columns.get_level_values(
|
|
||||||
1
|
# %%
|
||||||
|
df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[
|
||||||
|
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||||
|
== QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
||||||
|
]
|
||||||
|
df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_esm_SAM_event_stressfulness_summary_participant.describe()["std"]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
sns.displot(
|
||||||
|
data=df_esm_SAM_event_stressfulness_summary_participant, x="mean", binwidth=0.2
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
|
df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
|
||||||
(df_esm_SAM_summary_participant["questionnaire_id"] == 88)
|
(
|
||||||
| (df_esm_SAM_summary_participant["questionnaire_id"] == 89)
|
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||||
|
== QUESTIONNAIRE_IDS["appraisal_threat"]
|
||||||
|
)
|
||||||
|
| (
|
||||||
|
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||||
|
== QUESTIONNAIRE_IDS["appraisal_challenge"]
|
||||||
|
)
|
||||||
]
|
]
|
||||||
df_esm_SAM_threat_challenge_summary_participant[
|
df_esm_SAM_threat_challenge_summary_participant[
|
||||||
"event_subscale"
|
"event subscale"
|
||||||
] = df_esm_SAM_threat_challenge_summary_participant.questionnaire_id.astype(
|
] = df_esm_SAM_threat_challenge_summary_participant.questionnaire_id.astype(
|
||||||
"category"
|
"category"
|
||||||
).cat.rename_categories(
|
).cat.rename_categories(
|
||||||
|
@ -198,26 +262,84 @@ df_esm_SAM_threat_challenge_summary_participant[
|
||||||
sns.displot(
|
sns.displot(
|
||||||
data=df_esm_SAM_threat_challenge_summary_participant,
|
data=df_esm_SAM_threat_challenge_summary_participant,
|
||||||
x="mean",
|
x="mean",
|
||||||
hue="event_subscale",
|
hue="event subscale",
|
||||||
binwidth=0.2,
|
binwidth=0.2,
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(
|
fig3 = sns.displot(
|
||||||
data=df_esm_SAM_threat_challenge_summary_participant,
|
data=df_esm_SAM_threat_challenge_summary_participant,
|
||||||
x="std",
|
x="std",
|
||||||
hue="event_subscale",
|
hue="event subscale",
|
||||||
binwidth=0.1,
|
binwidth=0.1,
|
||||||
)
|
)
|
||||||
|
fig3.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
||||||
|
if save_figs:
|
||||||
|
fig3.figure.savefig("SAM_std_participant.pdf", dpi=300)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
|
||||||
|
"mean"
|
||||||
|
]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
|
||||||
|
"std"
|
||||||
|
]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_esm_SAM_clean.columns
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_esm_SAM_clean.esm_status.value_counts()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
if export_data:
|
||||||
|
df_esm_SAM_fixed = reassign_question_ids(df_esm_SAM_clean, DICT_SAM_QUESTION_IDS)
|
||||||
|
df_esm_SAM_fixed = increment_answers(df_esm_SAM_fixed)
|
||||||
|
df_esm_SAM_for_export = df_esm_SAM_fixed[
|
||||||
|
[
|
||||||
|
"participant_id",
|
||||||
|
"username",
|
||||||
|
"device_id",
|
||||||
|
"_id",
|
||||||
|
"esm_trigger",
|
||||||
|
"esm_session",
|
||||||
|
"esm_notification_id",
|
||||||
|
"question_id",
|
||||||
|
"questionnaire_id",
|
||||||
|
"esm_instructions",
|
||||||
|
"double_esm_user_answer_timestamp",
|
||||||
|
"datetime_lj",
|
||||||
|
"date_lj",
|
||||||
|
"time",
|
||||||
|
"esm_user_answer",
|
||||||
|
"esm_user_answer_numeric",
|
||||||
|
]
|
||||||
|
]
|
||||||
|
df_esm_SAM_for_export.sort_values(
|
||||||
|
by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
|
||||||
|
)
|
||||||
|
print(df_esm_SAM_for_export.head())
|
||||||
|
df_esm_SAM_for_export.to_csv(
|
||||||
|
"../data/raw/df_esm_SAM_threat_challenge.csv", index=False
|
||||||
|
)
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ## Stressfulness of period
|
# ## Stressfulness of period
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
|
df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
|
||||||
df_esm_SAM_summary_participant["questionnaire_id"] == 93
|
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||||
|
== QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_esm_SAM_period_summary_participant.describe()["mean"]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_esm_SAM_period_summary_participant.describe()["std"]
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(data=df_esm_SAM_period_summary_participant, x="mean", binwidth=0.2)
|
sns.displot(data=df_esm_SAM_period_summary_participant, x="mean", binwidth=0.2)
|
||||||
|
|
||||||
|
@ -229,8 +351,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_JCQ_demand_control = df_esm_preprocessed[
|
df_esm_JCQ_demand_control = df_esm_preprocessed[
|
||||||
(df_esm_preprocessed["questionnaire_id"] >= 10)
|
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"])
|
||||||
& (df_esm_preprocessed["questionnaire_id"] <= 11)
|
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"])
|
||||||
]
|
]
|
||||||
df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)
|
df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)
|
||||||
|
|
||||||
|
@ -250,14 +372,11 @@ df_esm_JCQ_daily = (
|
||||||
)
|
)
|
||||||
df_esm_JCQ_summary_participant = (
|
df_esm_JCQ_summary_participant = (
|
||||||
df_esm_JCQ_daily.groupby(["participant_id", "questionnaire_id"])
|
df_esm_JCQ_daily.groupby(["participant_id", "questionnaire_id"])
|
||||||
.agg(["mean", "median", "std"])
|
.esm_score_mean.agg(["mean", "median", "std"])
|
||||||
.reset_index(col_level=1)
|
.reset_index(col_level=1)
|
||||||
)
|
)
|
||||||
df_esm_JCQ_summary_participant.columns = df_esm_JCQ_summary_participant.columns.get_level_values(
|
|
||||||
1
|
|
||||||
)
|
|
||||||
df_esm_JCQ_summary_participant[
|
df_esm_JCQ_summary_participant[
|
||||||
"JCQ_subscale"
|
"JCQ subscale"
|
||||||
] = df_esm_JCQ_summary_participant.questionnaire_id.astype(
|
] = df_esm_JCQ_summary_participant.questionnaire_id.astype(
|
||||||
"category"
|
"category"
|
||||||
).cat.rename_categories(
|
).cat.rename_categories(
|
||||||
|
@ -265,11 +384,71 @@ df_esm_JCQ_summary_participant[
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(
|
df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["mean"]
|
||||||
data=df_esm_JCQ_summary_participant, x="mean", hue="JCQ_subscale", binwidth=0.1,
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(
|
df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["std"]
|
||||||
data=df_esm_JCQ_summary_participant, x="std", hue="JCQ_subscale", binwidth=0.05,
|
|
||||||
|
# %%
|
||||||
|
fig4 = sns.displot(
|
||||||
|
data=df_esm_JCQ_summary_participant,
|
||||||
|
x="mean",
|
||||||
|
hue="JCQ subscale",
|
||||||
|
binwidth=0.1,
|
||||||
)
|
)
|
||||||
|
fig4.set_axis_labels(x_var="participant mean", y_var="frequency")
|
||||||
|
if save_figs:
|
||||||
|
fig4.figure.savefig("JCQ_mean_participant.pdf", dpi=300)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
fig5 = sns.displot(
|
||||||
|
data=df_esm_JCQ_summary_participant,
|
||||||
|
x="std",
|
||||||
|
hue="JCQ subscale",
|
||||||
|
binwidth=0.05,
|
||||||
|
)
|
||||||
|
fig6.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
||||||
|
if save_figs:
|
||||||
|
fig5.figure.savefig("JCQ_std_participant.pdf", dpi=300)
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# # COPE Inventory
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_esm_COPE = df_esm_preprocessed[
|
||||||
|
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"])
|
||||||
|
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"])
|
||||||
|
]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_esm_COPE_clean = clean_up_esm(df_esm_COPE)
|
||||||
|
df_esm_COPE_clean = increment_answers(df_esm_COPE_clean)
|
||||||
|
df_esm_COPE_fixed = reassign_question_ids(df_esm_COPE_clean, DICT_COPE_QUESTION_IDS)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
if export_data:
|
||||||
|
df_esm_COPE_for_export = df_esm_COPE_fixed[
|
||||||
|
[
|
||||||
|
"participant_id",
|
||||||
|
"username",
|
||||||
|
"device_id",
|
||||||
|
"_id",
|
||||||
|
"esm_trigger",
|
||||||
|
"esm_session",
|
||||||
|
"esm_notification_id",
|
||||||
|
"question_id",
|
||||||
|
"questionnaire_id",
|
||||||
|
"esm_instructions",
|
||||||
|
"double_esm_user_answer_timestamp",
|
||||||
|
"datetime_lj",
|
||||||
|
"date_lj",
|
||||||
|
"time",
|
||||||
|
"esm_user_answer",
|
||||||
|
"esm_user_answer_numeric",
|
||||||
|
]
|
||||||
|
]
|
||||||
|
df_esm_COPE_for_export.sort_values(
|
||||||
|
by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
|
||||||
|
)
|
||||||
|
print(df_esm_COPE_for_export.head())
|
||||||
|
df_esm_COPE_for_export.to_csv("../data/raw/df_esm_COPE.csv", index=False)
|
||||||
|
|
|
@ -20,30 +20,74 @@ import numpy as np
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
from sklearn.metrics import recall_score, f1_score
|
||||||
|
|
||||||
nb_dir = os.path.split(os.getcwd())[0]
|
nb_dir = os.path.split(os.getcwd())[0]
|
||||||
if nb_dir not in sys.path:
|
if nb_dir not in sys.path:
|
||||||
sys.path.append(nb_dir)
|
sys.path.append(nb_dir)
|
||||||
|
|
||||||
from machine_learning.cross_validation import CrossValidation
|
from machine_learning.cross_validation import CrossValidation
|
||||||
from machine_learning.preprocessing import Preprocessing
|
from machine_learning.preprocessing import Preprocessing
|
||||||
|
from machine_learning.feature_selection import FeatureSelection
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
|
df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
|
||||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||||
df.set_index(index_columns, inplace=True)
|
df.set_index(index_columns, inplace=True)
|
||||||
|
|
||||||
|
# Create binary target
|
||||||
|
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||||
|
df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
|
||||||
|
|
||||||
|
|
||||||
|
nan_cols = df.columns[df.isna().any()].tolist()
|
||||||
|
df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
|
||||||
|
|
||||||
cv = CrossValidation(data=df, cv_method="logo")
|
cv = CrossValidation(data=df, cv_method="logo")
|
||||||
|
|
||||||
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
|
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
|
||||||
interval_feature_list, other_feature_list = [], []
|
interval_feature_list, other_feature_list = [], []
|
||||||
|
|
||||||
print(df.columns.tolist())
|
# %%
|
||||||
|
|
||||||
for split in cv.get_splits():
|
for split in cv.get_splits():
|
||||||
train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
|
train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
|
||||||
pre = Preprocessing(train_X, train_y, test_X, test_y)
|
pre = Preprocessing(train_X, train_y, test_X, test_y)
|
||||||
pre.one_hot_encode_train_and_test_sets(categorical_columns)
|
pre.one_hot_encode_train_and_test_sets(categorical_columns)
|
||||||
train_X, train_y, test_X, test_y = pre.get_train_test_sets()
|
train_X, train_y, test_X, test_y = pre.get_train_test_sets()
|
||||||
|
|
||||||
|
|
||||||
|
print(train_X.shape, test_X.shape)
|
||||||
|
# Predict before feature selection
|
||||||
|
rfc = RandomForestClassifier(n_estimators=10)
|
||||||
|
rfc.fit(train_X, train_y)
|
||||||
|
predictions = rfc.predict(test_X)
|
||||||
|
|
||||||
|
print("Recall:", recall_score(test_y, predictions))
|
||||||
|
print("F1:", f1_score(test_y, predictions))
|
||||||
|
|
||||||
|
# Feature selection on train set
|
||||||
|
train_groups, test_groups = cv.get_groups_sets(split)
|
||||||
|
|
||||||
|
fs = FeatureSelection(train_X, train_y, train_groups)
|
||||||
|
selected_features = fs.select_features(n_min=20, n_max=29, k=40,
|
||||||
|
ml_type="classification_bin",
|
||||||
|
metric="recall", n_tolerance=20)
|
||||||
|
|
||||||
|
train_X = train_X[selected_features]
|
||||||
|
test_X = test_X[selected_features]
|
||||||
|
|
||||||
|
print(selected_features)
|
||||||
|
print(len(selected_features))
|
||||||
|
|
||||||
|
# Predict after feature selection
|
||||||
|
rfc = RandomForestClassifier(n_estimators=500)
|
||||||
|
rfc.fit(train_X, train_y)
|
||||||
|
predictions = rfc.predict(test_X)
|
||||||
|
|
||||||
|
print("Recall:", recall_score(test_y, predictions))
|
||||||
|
print("F1:", f1_score(test_y, predictions))
|
||||||
|
|
||||||
break
|
break
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
|
|
@ -6,457 +6,129 @@
|
||||||
# extension: .py
|
# extension: .py
|
||||||
# format_name: percent
|
# format_name: percent
|
||||||
# format_version: '1.3'
|
# format_version: '1.3'
|
||||||
# jupytext_version: 1.13.0
|
# jupytext_version: 1.14.5
|
||||||
# kernelspec:
|
# kernelspec:
|
||||||
# display_name: straw2analysis
|
# display_name: straw2analysis
|
||||||
# language: python
|
# language: python
|
||||||
# name: straw2analysis
|
# name: straw2analysis
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||||
# %matplotlib inline
|
# from IPython.core.interactiveshell import InteractiveShell
|
||||||
import os
|
from pathlib import Path
|
||||||
import sys
|
|
||||||
|
|
||||||
import numpy as np
|
# matplotlib inline
|
||||||
import matplotlib.pyplot as plt
|
# import os
|
||||||
|
# import sys
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
from machine_learning.helper import (
|
||||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
|
impute_encode_categorical_features,
|
||||||
from sklearn.dummy import DummyClassifier
|
prepare_cross_validator,
|
||||||
from sklearn.impute import SimpleImputer
|
prepare_sklearn_data_format,
|
||||||
|
run_all_classification_models,
|
||||||
|
)
|
||||||
|
|
||||||
from lightgbm import LGBMClassifier
|
# InteractiveShell.ast_node_interactivity = "all"
|
||||||
import xgboost as xg
|
|
||||||
from IPython.core.interactiveshell import InteractiveShell
|
|
||||||
InteractiveShell.ast_node_interactivity = "all"
|
|
||||||
|
|
||||||
nb_dir = os.path.split(os.getcwd())[0]
|
|
||||||
if nb_dir not in sys.path:
|
|
||||||
sys.path.append(nb_dir)
|
|
||||||
|
|
||||||
import machine_learning.helper
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# # RAPIDS models
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ## Set script's parameters
|
|
||||||
#
|
#
|
||||||
|
# nb_dir = os.path.split(os.getcwd())[0]
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
# if nb_dir not in sys.path:
|
||||||
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
# sys.path.append(nb_dir)
|
||||||
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
|
||||||
undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
|
|
||||||
# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
|
||||||
model_input.set_index(index_columns, inplace=True)
|
|
||||||
model_input['target'].value_counts()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
# bins = [-10, 0, 10] # bins for z-scored targets
|
|
||||||
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
|
|
||||||
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
|
|
||||||
model_input['target'].value_counts(), edges
|
|
||||||
# model_input = model_input[model_input['target'] != "medium"]
|
|
||||||
model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
|
||||||
|
|
||||||
model_input['target'].value_counts()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
# UnderSampling
|
|
||||||
if undersampling:
|
|
||||||
no_stress = model_input[model_input['target'] == 0]
|
|
||||||
stress = model_input[model_input['target'] == 1]
|
|
||||||
|
|
||||||
no_stress = no_stress.sample(n=len(stress))
|
|
||||||
model_input = pd.concat([stress,no_stress], axis=0)
|
|
||||||
|
|
||||||
# model_input_new = pd.DataFrame(columns=model_input.columns)
|
|
||||||
# for pid in model_input["pid"].unique():
|
|
||||||
# stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
|
|
||||||
# no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
|
|
||||||
# if (len(stress) == 0):
|
|
||||||
# continue
|
|
||||||
# if (len(no_stress) == 0):
|
|
||||||
# continue
|
|
||||||
# model_input_new = pd.concat([model_input_new, stress], axis=0)
|
|
||||||
|
|
||||||
# no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
|
|
||||||
# # In case there are more stress samples than no_stress, take all instances of no_stress.
|
|
||||||
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
|
||||||
# model_input = model_input_new
|
|
||||||
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
|
||||||
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
if cv_method_str == 'half_logo':
|
|
||||||
model_input['pid_index'] = model_input.groupby('pid').cumcount()
|
|
||||||
model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
|
|
||||||
|
|
||||||
model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
|
|
||||||
model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
|
|
||||||
|
|
||||||
data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
|
|
||||||
else:
|
|
||||||
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
|
|
||||||
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
|
||||||
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
|
||||||
categorical_feature_colnames += additional_categorical_features
|
|
||||||
|
|
||||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
|
||||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
|
||||||
|
|
||||||
# fillna with mode
|
|
||||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
|
||||||
|
|
||||||
# one-hot encoding
|
|
||||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
|
||||||
if not categorical_features.empty:
|
|
||||||
categorical_features = pd.get_dummies(categorical_features)
|
|
||||||
|
|
||||||
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
|
||||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
|
||||||
train_x.dtypes
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
|
|
||||||
if cv_method_str == 'logo' or cv_method_str == 'half_logo':
|
|
||||||
cv_method = LeaveOneGroupOut()
|
|
||||||
cv_method.get_n_splits(
|
|
||||||
train_x,
|
|
||||||
data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Baseline: Dummy Classifier (most frequent)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
|
||||||
dummy_class = DummyClassifier(strategy="most_frequent")
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
dummy_classifier = cross_validate(
|
|
||||||
dummy_class,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cv_method,
|
|
||||||
n_jobs=-1,
|
|
||||||
error_score='raise',
|
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
|
||||||
)
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
|
|
||||||
print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
|
|
||||||
print("Precision", np.mean(dummy_classifier['test_precision']))
|
|
||||||
print("Recall", np.mean(dummy_classifier['test_recall']))
|
|
||||||
print("F1", np.mean(dummy_classifier['test_f1']))
|
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
|
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
|
|
||||||
|
|
||||||
# %% [markdown] nteract={"transient": {"deleting": false}}
|
|
||||||
# ### All models
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
|
||||||
final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
|
||||||
# %%
|
# %%
|
||||||
final_scores.index.name = "metric"
|
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
||||||
final_scores = final_scores.set_index(["method", final_scores.index])
|
# Cross-validation method (could be regarded as a hyperparameter)
|
||||||
final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
|
print("CV_METHOD: " + CV_METHOD)
|
||||||
|
N_SL = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||||
|
UNDERSAMPLING = False
|
||||||
|
# (bool) If True this will train and test data on balanced dataset
|
||||||
|
# (using undersampling method)
|
||||||
|
|
||||||
# %% [markdown]
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||||
# ### Logistic Regression
|
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
SEGMENT_TYPE = "period"
|
||||||
logistic_regression = linear_model.LogisticRegression()
|
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||||
|
SEGMENT_LENGTH = "30_minutes_before"
|
||||||
|
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||||
|
TARGET_VARIABLE = "JCQ_job_control"
|
||||||
|
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
||||||
log_reg_scores = cross_validate(
|
TARGET_VARIABLE += "_"
|
||||||
logistic_regression,
|
TARGET_VARIABLE += SEGMENT_TYPE
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||||
groups=data_groups,
|
|
||||||
cv=cv_method,
|
model_input = pd.read_csv(PATH_FULL)
|
||||||
n_jobs=-1,
|
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
if SEGMENT_LENGTH == "daily":
|
||||||
|
DAY_LENGTH = "daily" # or "working"
|
||||||
|
print(DAY_LENGTH)
|
||||||
|
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
||||||
|
|
||||||
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||||
|
model_input["target"].value_counts()
|
||||||
|
|
||||||
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||||
|
# bins = [-10, 0, 10] # bins for z-scored targets
|
||||||
|
BINS = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||||
|
print("BINS: ", BINS)
|
||||||
|
model_input["target"], edges = pd.cut(
|
||||||
|
model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
|
||||||
|
) # ['low', 'medium', 'high']
|
||||||
|
print(model_input["target"].value_counts())
|
||||||
|
REMOVE_MEDIUM = True
|
||||||
|
if ("medium" in model_input["target"]) and REMOVE_MEDIUM:
|
||||||
|
model_input = model_input[model_input["target"] != "medium"]
|
||||||
|
model_input["target"] = (
|
||||||
|
model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
model_input["target"] = model_input["target"].map(
|
||||||
|
{"low": 0, "medium": 1, "high": 2}
|
||||||
|
)
|
||||||
|
print(model_input["target"].value_counts())
|
||||||
|
|
||||||
|
|
||||||
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||||
|
# UnderSampling
|
||||||
|
if UNDERSAMPLING:
|
||||||
|
no_stress = model_input[model_input["target"] == 0]
|
||||||
|
stress = model_input[model_input["target"] == 1]
|
||||||
|
|
||||||
|
no_stress = no_stress.sample(n=len(stress))
|
||||||
|
model_input = pd.concat([stress, no_stress], axis=0)
|
||||||
|
|
||||||
|
|
||||||
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||||
|
model_input_encoded = impute_encode_categorical_features(model_input)
|
||||||
|
# %%
|
||||||
|
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
||||||
|
model_input_encoded, CV_METHOD
|
||||||
)
|
)
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
||||||
print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
|
|
||||||
print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
|
|
||||||
print("Precision", np.mean(log_reg_scores['test_precision']))
|
|
||||||
print("Recall", np.mean(log_reg_scores['test_recall']))
|
|
||||||
print("F1", np.mean(log_reg_scores['test_f1']))
|
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
# %%
|
||||||
# ### Support Vector Machine
|
data_y.head()
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
# %%
|
||||||
svc = svm.SVC()
|
data_y.tail()
|
||||||
|
# %%
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
data_y.shape
|
||||||
svc_scores = cross_validate(
|
# %%
|
||||||
svc,
|
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
|
||||||
X=imputer.fit_transform(train_x),
|
# %%
|
||||||
y=data_y,
|
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||||
groups=data_groups,
|
path_output_full = PATH_OUTPUT / (
|
||||||
cv=cv_method,
|
TARGET_VARIABLE
|
||||||
n_jobs=-1,
|
+ "_"
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
+ SEGMENT_LENGTH
|
||||||
|
+ "_classification"
|
||||||
|
+ str(BINS)
|
||||||
|
+ "_"
|
||||||
|
+ CV_METHOD
|
||||||
|
+ ".csv"
|
||||||
)
|
)
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
scores.to_csv(path_output_full, index=False)
|
||||||
print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
|
|
||||||
print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
|
|
||||||
print("Precision", np.mean(svc_scores['test_precision']))
|
|
||||||
print("Recall", np.mean(svc_scores['test_recall']))
|
|
||||||
print("F1", np.mean(svc_scores['test_f1']))
|
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Gaussian Naive Bayes
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
gaussian_nb = naive_bayes.GaussianNB()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
gaussian_nb_scores = cross_validate(
|
|
||||||
gaussian_nb,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cv_method,
|
|
||||||
n_jobs=-1,
|
|
||||||
error_score='raise',
|
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
|
||||||
)
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
|
|
||||||
print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
|
|
||||||
print("Precision", np.mean(gaussian_nb_scores['test_precision']))
|
|
||||||
print("Recall", np.mean(gaussian_nb_scores['test_recall']))
|
|
||||||
print("F1", np.mean(gaussian_nb_scores['test_f1']))
|
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Stochastic Gradient Descent Classifier
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
sgdc = linear_model.SGDClassifier()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
sgdc_scores = cross_validate(
|
|
||||||
sgdc,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cv_method,
|
|
||||||
n_jobs=-1,
|
|
||||||
error_score='raise',
|
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
|
||||||
)
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
|
|
||||||
print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
|
|
||||||
print("Precision", np.mean(sgdc_scores['test_precision']))
|
|
||||||
print("Recall", np.mean(sgdc_scores['test_recall']))
|
|
||||||
print("F1", np.mean(sgdc_scores['test_f1']))
|
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n_sl)[:n_sl]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### K-nearest neighbors
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
knn = neighbors.KNeighborsClassifier()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
knn_scores = cross_validate(
|
|
||||||
knn,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cv_method,
|
|
||||||
n_jobs=-1,
|
|
||||||
error_score='raise',
|
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
|
||||||
)
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
|
|
||||||
print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
|
|
||||||
print("Precision", np.mean(knn_scores['test_precision']))
|
|
||||||
print("Recall", np.mean(knn_scores['test_recall']))
|
|
||||||
print("F1", np.mean(knn_scores['test_f1']))
|
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n_sl)[:n_sl]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Decision Tree
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
dtree = tree.DecisionTreeClassifier()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
dtree_scores = cross_validate(
|
|
||||||
dtree,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cv_method,
|
|
||||||
n_jobs=-1,
|
|
||||||
error_score='raise',
|
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
|
||||||
)
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
|
|
||||||
print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
|
|
||||||
print("Precision", np.mean(dtree_scores['test_precision']))
|
|
||||||
print("Recall", np.mean(dtree_scores['test_recall']))
|
|
||||||
print("F1", np.mean(dtree_scores['test_f1']))
|
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n_sl)[:n_sl]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Random Forest Classifier
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
rfc = ensemble.RandomForestClassifier()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
rfc_scores = cross_validate(
|
|
||||||
rfc,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cv_method,
|
|
||||||
n_jobs=-1,
|
|
||||||
error_score='raise',
|
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1'),
|
|
||||||
return_estimator=True
|
|
||||||
)
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
|
|
||||||
print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
|
|
||||||
print("Precision", np.mean(rfc_scores['test_precision']))
|
|
||||||
print("Recall", np.mean(rfc_scores['test_recall']))
|
|
||||||
print("F1", np.mean(rfc_scores['test_f1']))
|
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Feature importance (RFC)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns))
|
|
||||||
for idx, estimator in enumerate(rfc_scores['estimator']):
|
|
||||||
feature_importances = pd.DataFrame(estimator.feature_importances_,
|
|
||||||
index = list(train_x.columns),
|
|
||||||
columns=['importance'])
|
|
||||||
# print("\nFeatures sorted by their score for estimator {}:".format(idx))
|
|
||||||
# print(feature_importances.sort_values('importance', ascending=False).head(10))
|
|
||||||
rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean()
|
|
||||||
|
|
||||||
pd.set_option('display.max_rows', 100)
|
|
||||||
print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
|
|
||||||
|
|
||||||
rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
|
|
||||||
|
|
||||||
rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
|
|
||||||
|
|
||||||
train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Gradient Boosting Classifier
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
gbc = ensemble.GradientBoostingClassifier()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
gbc_scores = cross_validate(
|
|
||||||
gbc,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cv_method,
|
|
||||||
n_jobs=-1,
|
|
||||||
error_score='raise',
|
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
|
||||||
)
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
|
|
||||||
print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
|
|
||||||
print("Precision", np.mean(gbc_scores['test_precision']))
|
|
||||||
print("Recall", np.mean(gbc_scores['test_recall']))
|
|
||||||
print("F1", np.mean(gbc_scores['test_f1']))
|
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n_sl)[:n_sl]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### LGBM Classifier
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
lgbm = LGBMClassifier()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
lgbm_scores = cross_validate(
|
|
||||||
lgbm,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cv_method,
|
|
||||||
n_jobs=-1,
|
|
||||||
error_score='raise',
|
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
|
||||||
)
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
|
|
||||||
print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
|
|
||||||
print("Precision", np.mean(lgbm_scores['test_precision']))
|
|
||||||
print("Recall", np.mean(lgbm_scores['test_recall']))
|
|
||||||
print("F1", np.mean(lgbm_scores['test_f1']))
|
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n_sl)[:n_sl]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### XGBoost Classifier
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
xgb_classifier = xg.sklearn.XGBClassifier()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
xgb_classifier_scores = cross_validate(
|
|
||||||
xgb_classifier,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cv_method,
|
|
||||||
n_jobs=-1,
|
|
||||||
error_score='raise',
|
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
|
||||||
)
|
|
||||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
||||||
print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
|
|
||||||
print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
|
|
||||||
print("Precision", np.mean(xgb_classifier_scores['test_precision']))
|
|
||||||
print("Recall", np.mean(xgb_classifier_scores['test_recall']))
|
|
||||||
print("F1", np.mean(xgb_classifier_scores['test_f1']))
|
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,177 @@
|
||||||
|
# ---
|
||||||
|
# jupyter:
|
||||||
|
# jupytext:
|
||||||
|
# formats: ipynb,py:percent
|
||||||
|
# text_representation:
|
||||||
|
# extension: .py
|
||||||
|
# format_name: percent
|
||||||
|
# format_version: '1.3'
|
||||||
|
# jupytext_version: 1.14.5
|
||||||
|
# kernelspec:
|
||||||
|
# display_name: straw2analysis
|
||||||
|
# language: python
|
||||||
|
# name: straw2analysis
|
||||||
|
# ---
|
||||||
|
|
||||||
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
from sklearn.decomposition import PCA
|
||||||
|
|
||||||
|
from machine_learning.helper import (
|
||||||
|
impute_encode_categorical_features,
|
||||||
|
prepare_cross_validator,
|
||||||
|
prepare_sklearn_data_format,
|
||||||
|
run_all_classification_models,
|
||||||
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
||||||
|
# Cross-validation method (could be regarded as a hyperparameter)
|
||||||
|
print("CV_METHOD: " + CV_METHOD)
|
||||||
|
N_SL = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||||
|
UNDERSAMPLING = False
|
||||||
|
# (bool) If True this will train and test data on balanced dataset
|
||||||
|
# (using undersampling method)
|
||||||
|
|
||||||
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||||
|
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||||
|
|
||||||
|
SEGMENT_TYPE = "period"
|
||||||
|
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||||
|
SEGMENT_LENGTH = "30_minutes_before"
|
||||||
|
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||||
|
|
||||||
|
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"
|
||||||
|
|
||||||
|
all_features_with_baseline = pd.read_csv(PATH_FULL)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
TARGETS = [
|
||||||
|
"PANAS_negative_affect_mean",
|
||||||
|
"PANAS_positive_affect_mean",
|
||||||
|
"JCQ_job_demand_mean",
|
||||||
|
"JCQ_job_control_mean",
|
||||||
|
"appraisal_stressfulness_period_mean",
|
||||||
|
]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
all_features_cleaned = pd.DataFrame()
|
||||||
|
for target in TARGETS:
|
||||||
|
PATH_FULL = (
|
||||||
|
PATH_BASE
|
||||||
|
/ SEGMENT_LENGTH
|
||||||
|
/ "features"
|
||||||
|
/ ("all_sensor_features_cleaned_straw_py_(" + target + ").csv")
|
||||||
|
)
|
||||||
|
current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
|
||||||
|
if all_features_cleaned.empty:
|
||||||
|
all_features_cleaned = current_features
|
||||||
|
else:
|
||||||
|
all_features_cleaned = all_features_cleaned.join(
|
||||||
|
current_features[("phone_esm_straw_" + target)],
|
||||||
|
how="inner",
|
||||||
|
rsuffix="_" + target,
|
||||||
|
)
|
||||||
|
print(all_features_cleaned.shape)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
pca = PCA(n_components=1)
|
||||||
|
TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]
|
||||||
|
pca.fit(all_features_cleaned[TARGETS_PREFIXED])
|
||||||
|
print(pca.explained_variance_ratio_)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
|
||||||
|
model_input["target"] = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])
|
||||||
|
|
||||||
|
# %%
|
||||||
|
sns.histplot(data=model_input, x="target")
|
||||||
|
|
||||||
|
# %%
|
||||||
|
model_input.target.quantile(0.6)
|
||||||
|
|
||||||
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||||
|
# bins = [-10, 0, 10] # bins for z-scored targets
|
||||||
|
BINS = [-10, 0, 10] # bins for stressfulness (0-4) target
|
||||||
|
print("BINS: ", BINS)
|
||||||
|
model_input["target"], edges = pd.cut(
|
||||||
|
model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
|
||||||
|
) # ['low', 'medium', 'high']
|
||||||
|
print(model_input["target"].value_counts())
|
||||||
|
REMOVE_MEDIUM = True
|
||||||
|
if REMOVE_MEDIUM:
|
||||||
|
if "medium" in model_input["target"]:
|
||||||
|
model_input = model_input[model_input["target"] != "medium"]
|
||||||
|
model_input["target"] = (
|
||||||
|
model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
model_input["target"] = model_input["target"].map(
|
||||||
|
{"low": 0, "medium": 1, "high": 2}
|
||||||
|
)
|
||||||
|
print(model_input["target"].value_counts())
|
||||||
|
|
||||||
|
|
||||||
|
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||||
|
# UnderSampling
|
||||||
|
if UNDERSAMPLING:
|
||||||
|
no_stress = model_input[model_input["target"] == 0]
|
||||||
|
stress = model_input[model_input["target"] == 1]
|
||||||
|
|
||||||
|
no_stress = no_stress.sample(n=len(stress))
|
||||||
|
model_input = pd.concat([stress, no_stress], axis=0)
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
TARGET_VARIABLE = "PANAS_negative_affect"
|
||||||
|
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||||
|
|
||||||
|
PATH_FULL_HELP = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||||
|
|
||||||
|
model_input_with_baseline = pd.read_csv(PATH_FULL_HELP, index_col="local_segment")
|
||||||
|
|
||||||
|
# %%
|
||||||
|
baseline_col_names = [
|
||||||
|
col for col in model_input_with_baseline.columns if col not in model_input.columns
|
||||||
|
]
|
||||||
|
print(baseline_col_names)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
model_input = model_input.join(
|
||||||
|
model_input_with_baseline[baseline_col_names], how="left"
|
||||||
|
)
|
||||||
|
model_input.reset_index(inplace=True)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
model_input_encoded = impute_encode_categorical_features(model_input)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
||||||
|
model_input_encoded, CV_METHOD
|
||||||
|
)
|
||||||
|
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
data_y.head()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
data_y.tail()
|
||||||
|
# %%
|
||||||
|
data_y.shape
|
||||||
|
# %%
|
||||||
|
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
|
||||||
|
# %%
|
||||||
|
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||||
|
path_output_full = PATH_OUTPUT / (
|
||||||
|
"composite_"
|
||||||
|
+ SEGMENT_LENGTH
|
||||||
|
+ "_classification"
|
||||||
|
+ str(BINS)
|
||||||
|
+ "_"
|
||||||
|
+ CV_METHOD
|
||||||
|
+ ".csv"
|
||||||
|
)
|
||||||
|
scores.to_csv(path_output_full, index=False)
|
|
@ -6,7 +6,7 @@
|
||||||
# extension: .py
|
# extension: .py
|
||||||
# format_name: percent
|
# format_name: percent
|
||||||
# format_version: '1.3'
|
# format_version: '1.3'
|
||||||
# jupytext_version: 1.13.0
|
# jupytext_version: 1.14.5
|
||||||
# kernelspec:
|
# kernelspec:
|
||||||
# display_name: straw2analysis
|
# display_name: straw2analysis
|
||||||
# language: python
|
# language: python
|
||||||
|
@ -14,80 +14,85 @@
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
# %matplotlib inline
|
from pathlib import Path
|
||||||
import datetime
|
|
||||||
import importlib
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import seaborn as sns
|
|
||||||
from scipy import stats
|
|
||||||
|
|
||||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
|
|
||||||
from sklearn.impute import SimpleImputer
|
|
||||||
|
|
||||||
from sklearn.dummy import DummyClassifier
|
|
||||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
|
||||||
import xgboost as xg
|
|
||||||
|
|
||||||
from sklearn.cluster import KMeans
|
from sklearn.cluster import KMeans
|
||||||
|
from sklearn.impute import SimpleImputer
|
||||||
|
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_validate
|
||||||
|
|
||||||
from IPython.core.interactiveshell import InteractiveShell
|
|
||||||
InteractiveShell.ast_node_interactivity = "all"
|
|
||||||
|
|
||||||
nb_dir = os.path.split(os.getcwd())[0]
|
|
||||||
if nb_dir not in sys.path:
|
|
||||||
sys.path.append(nb_dir)
|
|
||||||
|
|
||||||
import machine_learning.labels
|
|
||||||
import machine_learning.model
|
|
||||||
from machine_learning.classification_models import ClassificationModels
|
from machine_learning.classification_models import ClassificationModels
|
||||||
|
|
||||||
# %% [markdown]
|
# %%
|
||||||
# # RAPIDS models
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ## Set script's parameters
|
# ## Set script's parameters
|
||||||
n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
|
N_CLUSTERS = 4 # Number of clusters (could be regarded as a hyperparameter)
|
||||||
cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
CV_METHOD = "logo" # logo, halflogo, 5kfold
|
||||||
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
|
# Cross-validation method (could be regarded as a hyperparameter)
|
||||||
|
N_SL = 1 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||||
|
|
||||||
|
# %%
|
||||||
|
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||||
|
|
||||||
|
SEGMENT_TYPE = "period"
|
||||||
|
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||||
|
SEGMENT_LENGTH = "30_minutes_before"
|
||||||
|
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||||
|
TARGET_VARIABLE = "appraisal_stressfulness"
|
||||||
|
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||||
|
|
||||||
|
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
||||||
|
TARGET_VARIABLE += "_"
|
||||||
|
TARGET_VARIABLE += SEGMENT_TYPE
|
||||||
|
|
||||||
|
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||||
|
|
||||||
|
model_input = pd.read_csv(PATH_FULL)
|
||||||
|
|
||||||
|
if SEGMENT_LENGTH == "daily":
|
||||||
|
DAY_LENGTH = "daily" # or "working"
|
||||||
|
print(DAY_LENGTH)
|
||||||
|
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
|
index_columns = [
|
||||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
"local_segment",
|
||||||
|
"local_segment_label",
|
||||||
|
"local_segment_start_datetime",
|
||||||
|
"local_segment_end_datetime",
|
||||||
|
]
|
||||||
|
|
||||||
clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
|
CLUST_COL = "limesurvey_demand_control_ratio_quartile"
|
||||||
|
print("CLUST_COL: " + CLUST_COL)
|
||||||
|
|
||||||
model_input.columns[list(model_input.columns).index('age'):-1]
|
BINS = [-1, 0, 4]
|
||||||
|
print("BINS: " + str(BINS))
|
||||||
|
|
||||||
lime_cols = [col for col in model_input if col.startswith('limesurvey')]
|
model_input[CLUST_COL].describe()
|
||||||
lime_cols
|
|
||||||
lime_col = 'limesurvey_demand_control_ratio_quartile'
|
|
||||||
clust_col = lime_col
|
|
||||||
|
|
||||||
model_input[clust_col].describe()
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
model_input["target"].value_counts()
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
|
# Filter-out outlier rows by clust_col
|
||||||
|
# model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
||||||
|
|
||||||
# Filter-out outlier rows by clust_col
|
uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
|
||||||
#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
|
||||||
|
|
||||||
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
|
|
||||||
uniq = uniq.dropna()
|
uniq = uniq.dropna()
|
||||||
plt.bar(uniq['pid'], uniq[clust_col])
|
plt.bar(uniq["pid"], uniq[CLUST_COL])
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
# Get clusters by cluster col & and merge the clusters to main df
|
# Get clusters by cluster col & and merge the clusters to main df
|
||||||
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
|
km = KMeans(n_clusters=N_CLUSTERS).fit_predict(uniq.set_index("pid"))
|
||||||
np.unique(km, return_counts=True)
|
np.unique(km, return_counts=True)
|
||||||
uniq['cluster'] = km
|
uniq["cluster"] = km
|
||||||
uniq
|
|
||||||
|
|
||||||
model_input = model_input.merge(uniq[['pid', 'cluster']])
|
model_input = model_input.merge(uniq[["pid", "cluster"]])
|
||||||
|
|
||||||
|
# %%
|
||||||
|
model_input[["cluster", "target"]].value_counts().sort_index()
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
model_input.set_index(index_columns, inplace=True)
|
model_input.set_index(index_columns, inplace=True)
|
||||||
|
@ -98,31 +103,56 @@ cm = ClassificationModels()
|
||||||
cmodels = cm.get_cmodels()
|
cmodels = cm.get_cmodels()
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
for k in range(n_clusters):
|
for k in range(N_CLUSTERS):
|
||||||
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
||||||
bins = [-10, -1, 1, 10] # bins for z-scored targets
|
model_input_subset.loc[:, "target"] = pd.cut(
|
||||||
model_input_subset.loc[:, 'target'] = \
|
model_input_subset.loc[:, "target"],
|
||||||
pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']
|
bins=BINS,
|
||||||
model_input_subset['target'].value_counts()
|
labels=["low", "high"],
|
||||||
model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
|
right=True,
|
||||||
model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
) # ['low', 'medium', 'high']
|
||||||
|
model_input_subset["target"].value_counts()
|
||||||
|
# model_input_subset = model_input_subset[model_input_subset["target"] != "medium"]
|
||||||
|
model_input_subset["target"] = (
|
||||||
|
model_input_subset["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||||
|
)
|
||||||
|
|
||||||
model_input_subset['target'].value_counts()
|
print(model_input_subset["target"].value_counts())
|
||||||
|
|
||||||
if cv_method_str == 'half_logo':
|
|
||||||
model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
|
|
||||||
model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
|
|
||||||
|
|
||||||
model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
|
if CV_METHOD == "half_logo":
|
||||||
model_input_subset["pid_half"] = model_input_subset["pid"] + "_" + model_input_subset["pid_index"].astype(int).astype(str)
|
model_input_subset["pid_index"] = model_input_subset.groupby("pid").cumcount()
|
||||||
|
model_input_subset["pid_count"] = model_input_subset.groupby("pid")[
|
||||||
|
"pid"
|
||||||
|
].transform("count")
|
||||||
|
|
||||||
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"]
|
model_input_subset["pid_index"] = (
|
||||||
|
model_input_subset["pid_index"] / model_input_subset["pid_count"] + 1
|
||||||
|
).round()
|
||||||
|
model_input_subset["pid_half"] = (
|
||||||
|
model_input_subset["pid"]
|
||||||
|
+ "_"
|
||||||
|
+ model_input_subset["pid_index"].astype(int).astype(str)
|
||||||
|
)
|
||||||
|
|
||||||
|
data_x, data_y, data_groups = (
|
||||||
|
model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
|
||||||
|
model_input_subset["target"],
|
||||||
|
model_input_subset["pid_half"],
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]
|
data_x, data_y, data_groups = (
|
||||||
|
model_input_subset.drop(["target", "pid"], axis=1),
|
||||||
|
model_input_subset["target"],
|
||||||
|
model_input_subset["pid"],
|
||||||
|
)
|
||||||
|
|
||||||
# Treat categorical features
|
# Treat categorical features
|
||||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||||
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
additional_categorical_features = [
|
||||||
|
col
|
||||||
|
for col in data_x.columns
|
||||||
|
if "mostcommonactivity" in col or "homelabel" in col
|
||||||
|
]
|
||||||
categorical_feature_colnames += additional_categorical_features
|
categorical_feature_colnames += additional_categorical_features
|
||||||
|
|
||||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||||
|
@ -132,7 +162,9 @@ for k in range(n_clusters):
|
||||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||||
|
|
||||||
# one-hot encoding
|
# one-hot encoding
|
||||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
categorical_features = categorical_features.apply(
|
||||||
|
lambda col: col.astype("category")
|
||||||
|
)
|
||||||
if not categorical_features.empty:
|
if not categorical_features.empty:
|
||||||
categorical_features = pd.get_dummies(categorical_features)
|
categorical_features = pd.get_dummies(categorical_features)
|
||||||
|
|
||||||
|
@ -140,8 +172,10 @@ for k in range(n_clusters):
|
||||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||||
|
|
||||||
# Establish cv method
|
# Establish cv method
|
||||||
cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
|
cv_method = StratifiedKFold(
|
||||||
if cv_method_str == 'logo' or cv_method_str == 'half_logo':
|
n_splits=5, shuffle=True
|
||||||
|
) # Defaults to 5 k-folds in cross_validate method
|
||||||
|
if CV_METHOD == "logo" or CV_METHOD == "half_logo":
|
||||||
cv_method = LeaveOneGroupOut()
|
cv_method = LeaveOneGroupOut()
|
||||||
cv_method.get_n_splits(
|
cv_method.get_n_splits(
|
||||||
train_x,
|
train_x,
|
||||||
|
@ -149,36 +183,57 @@ for k in range(n_clusters):
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
)
|
)
|
||||||
|
|
||||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
|
||||||
|
|
||||||
for model_title, model in cmodels.items():
|
for model_title, model in cmodels.items():
|
||||||
|
|
||||||
classifier = cross_validate(
|
classifier = cross_validate(
|
||||||
model['model'],
|
model["model"],
|
||||||
X=imputer.fit_transform(train_x),
|
X=imputer.fit_transform(train_x),
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cv_method,
|
cv=cv_method,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
error_score='raise',
|
error_score="raise",
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
scoring=("accuracy", "precision", "recall", "f1"),
|
||||||
)
|
)
|
||||||
|
|
||||||
print("\n-------------------------------------\n")
|
print("\n-------------------------------------\n")
|
||||||
print("Current cluster:", k, end="\n")
|
print("Current cluster:", k, end="\n")
|
||||||
print("Current model:", model_title, end="\n")
|
print("Current model:", model_title, end="\n")
|
||||||
print("Acc", np.mean(classifier['test_accuracy']))
|
print("Acc", np.mean(classifier["test_accuracy"]))
|
||||||
print("Precision", np.mean(classifier['test_precision']))
|
print("Precision", np.mean(classifier["test_precision"]))
|
||||||
print("Recall", np.mean(classifier['test_recall']))
|
print("Recall", np.mean(classifier["test_recall"]))
|
||||||
print("F1", np.mean(classifier['test_f1']))
|
print("F1", np.mean(classifier["test_f1"]))
|
||||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
|
print(
|
||||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl]))
|
f"Largest {N_SL} ACC:",
|
||||||
|
np.sort(-np.partition(-classifier["test_accuracy"], N_SL)[:N_SL])[::-1],
|
||||||
cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
|
)
|
||||||
cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
|
print(
|
||||||
cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall'])
|
f"Smallest {N_SL} ACC:",
|
||||||
cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])
|
np.sort(np.partition(classifier["test_accuracy"], N_SL)[:N_SL]),
|
||||||
|
)
|
||||||
|
|
||||||
|
cmodels[model_title]["metrics"][0] += np.mean(classifier["test_accuracy"])
|
||||||
|
cmodels[model_title]["metrics"][1] += np.mean(classifier["test_precision"])
|
||||||
|
cmodels[model_title]["metrics"][2] += np.mean(classifier["test_recall"])
|
||||||
|
cmodels[model_title]["metrics"][3] += np.mean(classifier["test_f1"])
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
# Get overall results
|
# Get overall results
|
||||||
cm.get_total_models_scores(n_clusters=n_clusters)
|
scores = cm.get_total_models_scores(n_clusters=N_CLUSTERS)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||||
|
path_output_full = PATH_OUTPUT / (
|
||||||
|
TARGET_VARIABLE
|
||||||
|
+ "_"
|
||||||
|
+ SEGMENT_LENGTH
|
||||||
|
+ "_classification_"
|
||||||
|
+ CV_METHOD
|
||||||
|
+ str(BINS)
|
||||||
|
+ "_clust_"
|
||||||
|
+ CLUST_COL
|
||||||
|
+ str(N_CLUSTERS)
|
||||||
|
+ ".csv"
|
||||||
|
)
|
||||||
|
scores.to_csv(path_output_full, index=False)
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
# extension: .py
|
# extension: .py
|
||||||
# format_name: percent
|
# format_name: percent
|
||||||
# format_version: '1.3'
|
# format_version: '1.3'
|
||||||
# jupytext_version: 1.13.0
|
# jupytext_version: 1.14.5
|
||||||
# kernelspec:
|
# kernelspec:
|
||||||
# display_name: straw2analysis
|
# display_name: straw2analysis
|
||||||
# language: python
|
# language: python
|
||||||
|
@ -14,92 +14,83 @@
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
# %matplotlib inline
|
from pathlib import Path
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from scipy import stats
|
from scipy import stats
|
||||||
|
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
from sklearn.impute import SimpleImputer
|
|
||||||
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
|
||||||
|
|
||||||
from sklearn.cluster import KMeans
|
from sklearn.cluster import KMeans
|
||||||
|
from sklearn.impute import SimpleImputer
|
||||||
from IPython.core.interactiveshell import InteractiveShell
|
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||||
InteractiveShell.ast_node_interactivity = "all"
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
nb_dir = os.path.split(os.getcwd())[0]
|
|
||||||
if nb_dir not in sys.path:
|
|
||||||
sys.path.append(nb_dir)
|
|
||||||
|
|
||||||
from machine_learning.classification_models import ClassificationModels
|
from machine_learning.classification_models import ClassificationModels
|
||||||
|
from machine_learning.helper import impute_encode_categorical_features
|
||||||
# %% [markdown]
|
|
||||||
# # RAPIDS models
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# # Useful method
|
|
||||||
def treat_categorical_features(input_set):
|
|
||||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
|
||||||
additional_categorical_features = [col for col in input_set.columns if "mostcommonactivity" in col or "homelabel" in col]
|
|
||||||
categorical_feature_colnames += additional_categorical_features
|
|
||||||
|
|
||||||
categorical_features = input_set[categorical_feature_colnames].copy()
|
|
||||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
|
||||||
|
|
||||||
# fillna with mode
|
|
||||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
|
||||||
|
|
||||||
# one-hot encoding
|
|
||||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
|
||||||
if not categorical_features.empty:
|
|
||||||
categorical_features = pd.get_dummies(categorical_features)
|
|
||||||
|
|
||||||
numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
|
|
||||||
|
|
||||||
return pd.concat([numerical_features, categorical_features], axis=1)
|
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ## Set script's parameters
|
# ## Set script's parameters
|
||||||
n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
|
#
|
||||||
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
|
||||||
|
# %%
|
||||||
|
n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
|
||||||
|
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||||
|
|
||||||
|
# %%
|
||||||
|
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||||
|
|
||||||
|
SEGMENT_TYPE = "period"
|
||||||
|
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||||
|
SEGMENT_LENGTH = "30_minutes_before"
|
||||||
|
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||||
|
TARGET_VARIABLE = "appraisal_stressfulness"
|
||||||
|
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||||
|
|
||||||
|
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
||||||
|
TARGET_VARIABLE += "_"
|
||||||
|
TARGET_VARIABLE += SEGMENT_TYPE
|
||||||
|
|
||||||
|
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||||
|
|
||||||
|
model_input = pd.read_csv(PATH_FULL)
|
||||||
|
|
||||||
|
if SEGMENT_LENGTH == "daily":
|
||||||
|
DAY_LENGTH = "daily" # or "working"
|
||||||
|
print(DAY_LENGTH)
|
||||||
|
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
|
CLUST_COL = "limesurvey_demand_control_ratio"
|
||||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
print("CLUST_COL: " + CLUST_COL)
|
||||||
|
|
||||||
clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
|
BINS = [-1, 0, 4]
|
||||||
|
print("BINS: " + str(BINS))
|
||||||
|
|
||||||
model_input.columns[list(model_input.columns).index('age'):-1]
|
index_columns = [
|
||||||
|
"local_segment",
|
||||||
|
"local_segment_label",
|
||||||
|
"local_segment_start_datetime",
|
||||||
|
"local_segment_end_datetime",
|
||||||
|
]
|
||||||
|
|
||||||
lime_cols = [col for col in model_input if col.startswith('limesurvey')]
|
model_input[CLUST_COL].describe()
|
||||||
lime_cols
|
|
||||||
lime_col = 'limesurvey_demand_control_ratio'
|
|
||||||
clust_col = lime_col
|
|
||||||
|
|
||||||
model_input[clust_col].describe()
|
|
||||||
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
|
# Filter-out outlier rows by clust_col
|
||||||
|
model_input = model_input[(np.abs(stats.zscore(model_input[CLUST_COL])) < 3)]
|
||||||
|
|
||||||
# Filter-out outlier rows by clust_col
|
uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
|
||||||
model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
plt.bar(uniq["pid"], uniq[CLUST_COL])
|
||||||
|
|
||||||
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
|
|
||||||
plt.bar(uniq['pid'], uniq[clust_col])
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
# Get clusters by cluster col & and merge the clusters to main df
|
# Get clusters by cluster col & and merge the clusters to main df
|
||||||
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
|
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index("pid"))
|
||||||
np.unique(km, return_counts=True)
|
np.unique(km, return_counts=True)
|
||||||
uniq['cluster'] = km
|
uniq["cluster"] = km
|
||||||
uniq
|
print(uniq)
|
||||||
|
|
||||||
model_input = model_input.merge(uniq[['pid', 'cluster']])
|
model_input = model_input.merge(uniq[["pid", "cluster"]])
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
model_input.set_index(index_columns, inplace=True)
|
model_input.set_index(index_columns, inplace=True)
|
||||||
|
@ -109,50 +100,64 @@ model_input.set_index(index_columns, inplace=True)
|
||||||
cm = ClassificationModels()
|
cm = ClassificationModels()
|
||||||
cmodels = cm.get_cmodels()
|
cmodels = cm.get_cmodels()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
model_input["target"].value_counts()
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
for k in range(n_clusters):
|
for k in range(n_clusters):
|
||||||
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
||||||
|
|
||||||
# Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, seperated by z-score of 0.
|
|
||||||
model_input_subset['numerical_target'] = model_input_subset['target']
|
|
||||||
bins = [-10, 0, 10] # bins for z-scored targets
|
|
||||||
model_input_subset.loc[:, 'target'] = \
|
|
||||||
pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
|
|
||||||
|
|
||||||
p15 = np.percentile(model_input_subset['numerical_target'], 15)
|
|
||||||
p85 = np.percentile(model_input_subset['numerical_target'], 85)
|
|
||||||
|
|
||||||
# Treat categorical features
|
|
||||||
model_input_subset = treat_categorical_features(model_input_subset)
|
|
||||||
|
|
||||||
# Split to train, validate, and test subsets
|
|
||||||
train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
|
|
||||||
test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
|
|
||||||
|
|
||||||
train_set['target'].value_counts()
|
# Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, seperated by z-score of 0.
|
||||||
test_set['target'].value_counts()
|
# model_input_subset['numerical_target'] = model_input_subset['target']
|
||||||
|
|
||||||
|
model_input_subset.loc[:, "target"] = pd.cut(
|
||||||
|
model_input_subset.loc[:, "target"], bins=BINS, labels=[0, 1], right=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# p15 = np.percentile(model_input_subset['numerical_target'], 15)
|
||||||
|
# p85 = np.percentile(model_input_subset['numerical_target'], 85)
|
||||||
|
|
||||||
|
# Treat categorical features
|
||||||
|
model_input_subset = impute_encode_categorical_features(model_input_subset)
|
||||||
|
|
||||||
|
# Split to train, validate, and test subsets
|
||||||
|
# train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
|
||||||
|
# test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
|
||||||
|
train_set, test_set = train_test_split(
|
||||||
|
model_input_subset,
|
||||||
|
test_size=0.3,
|
||||||
|
stratify=model_input_subset["pid"],
|
||||||
|
random_state=42,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(train_set["target"].value_counts())
|
||||||
|
print(test_set["target"].value_counts())
|
||||||
|
|
||||||
train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
|
train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
|
||||||
|
|
||||||
validate_x, test_x, validate_y, test_y = \
|
validate_x, test_x, validate_y, test_y = train_test_split(
|
||||||
train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)
|
test_set.drop(["target", "pid"], axis=1),
|
||||||
|
test_set["target"],
|
||||||
|
test_size=0.50,
|
||||||
|
random_state=42,
|
||||||
|
)
|
||||||
|
|
||||||
# Impute missing values
|
# Impute missing values
|
||||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
|
||||||
|
|
||||||
train_x = imputer.fit_transform(train_x)
|
train_x = imputer.fit_transform(train_x)
|
||||||
validate_x = imputer.fit_transform(validate_x)
|
validate_x = imputer.fit_transform(validate_x)
|
||||||
test_x = imputer.fit_transform(test_x)
|
test_x = imputer.fit_transform(test_x)
|
||||||
|
|
||||||
for model_title, model in cmodels.items():
|
for model_title, model in cmodels.items():
|
||||||
model['model'].fit(train_x, train_y)
|
model["model"].fit(train_x, train_y)
|
||||||
y_pred = model['model'].predict(validate_x)
|
y_pred = model["model"].predict(validate_x)
|
||||||
|
|
||||||
acc = accuracy_score(validate_y, y_pred)
|
acc = accuracy_score(validate_y, y_pred)
|
||||||
prec = precision_score(validate_y, y_pred)
|
prec = precision_score(validate_y, y_pred)
|
||||||
rec = recall_score(validate_y, y_pred)
|
rec = recall_score(validate_y, y_pred)
|
||||||
f1 = f1_score(validate_y, y_pred)
|
f1 = f1_score(validate_y, y_pred)
|
||||||
|
|
||||||
print("\n-------------------------------------\n")
|
print("\n-------------------------------------\n")
|
||||||
print("Current cluster:", k, end="\n")
|
print("Current cluster:", k, end="\n")
|
||||||
print("Current model:", model_title, end="\n")
|
print("Current model:", model_title, end="\n")
|
||||||
|
@ -160,12 +165,30 @@ for k in range(n_clusters):
|
||||||
print("Precision", prec)
|
print("Precision", prec)
|
||||||
print("Recall", rec)
|
print("Recall", rec)
|
||||||
print("F1", f1)
|
print("F1", f1)
|
||||||
|
|
||||||
cmodels[model_title]['metrics'][0] += acc
|
cmodels[model_title]["metrics"][0] += acc
|
||||||
cmodels[model_title]['metrics'][1] += prec
|
cmodels[model_title]["metrics"][1] += prec
|
||||||
cmodels[model_title]['metrics'][2] += rec
|
cmodels[model_title]["metrics"][2] += rec
|
||||||
cmodels[model_title]['metrics'][3] += f1
|
cmodels[model_title]["metrics"][3] += f1
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
# Get overall results
|
# Get overall results
|
||||||
cm.get_total_models_scores(n_clusters=n_clusters)
|
scores = cm.get_total_models_scores(n_clusters=n_clusters)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
print(scores)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||||
|
path_output_full = PATH_OUTPUT / (
|
||||||
|
TARGET_VARIABLE
|
||||||
|
+ "_"
|
||||||
|
+ SEGMENT_LENGTH
|
||||||
|
+ "_classification"
|
||||||
|
+ str(BINS)
|
||||||
|
+ "_CLUST_"
|
||||||
|
+ CLUST_COL
|
||||||
|
+ +str(n_clusters)
|
||||||
|
+ ".csv"
|
||||||
|
)
|
||||||
|
scores.to_csv(path_output_full, index=False)
|
||||||
|
|
|
@ -6,445 +6,61 @@
|
||||||
# extension: .py
|
# extension: .py
|
||||||
# format_name: percent
|
# format_name: percent
|
||||||
# format_version: '1.3'
|
# format_version: '1.3'
|
||||||
# jupytext_version: 1.13.0
|
# jupytext_version: 1.14.5
|
||||||
# kernelspec:
|
# kernelspec:
|
||||||
# display_name: straw2analysis
|
# display_name: straw2analysis
|
||||||
# language: python
|
# language: python
|
||||||
# name: straw2analysis
|
# name: straw2analysis
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %%
|
||||||
# %matplotlib inline
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import xgboost as xg
|
|
||||||
from machine_learning.helper import prepare_regression_model_input
|
|
||||||
from sklearn import gaussian_process, kernel_ridge, linear_model, svm
|
|
||||||
from sklearn.dummy import DummyRegressor
|
|
||||||
from sklearn.impute import SimpleImputer
|
|
||||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
|
|
||||||
|
|
||||||
# from IPython.core.interactiveshell import InteractiveShell
|
from machine_learning.helper import (
|
||||||
# InteractiveShell.ast_node_interactivity = "all"
|
impute_encode_categorical_features,
|
||||||
|
prepare_cross_validator,
|
||||||
|
prepare_sklearn_data_format,
|
||||||
|
run_all_regression_models,
|
||||||
|
)
|
||||||
|
|
||||||
nb_dir = os.path.split(os.getcwd())[0]
|
nb_dir = os.path.split(os.getcwd())[0]
|
||||||
if nb_dir not in sys.path:
|
if nb_dir not in sys.path:
|
||||||
sys.path.append(nb_dir)
|
sys.path.append(nb_dir)
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %%
|
||||||
model_input = pd.read_csv(
|
model_input = pd.read_csv(
|
||||||
"../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
|
"../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
|
||||||
)
|
)
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %%
|
||||||
cv_method = "half_logo" # logo, half_logo, 5kfold
|
model_input = model_input[model_input["local_segment"].str.contains("daily")]
|
||||||
|
|
||||||
train_x, data_y, data_groups = prepare_regression_model_input(model_input, cv_method)
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
logo = LeaveOneGroupOut()
|
|
||||||
logo.get_n_splits(
|
|
||||||
train_x,
|
|
||||||
data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Defaults to 5 k folds in cross_validate method
|
|
||||||
if cv_method != "logo" and cv_method != "half_logo":
|
|
||||||
logo = None
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
sum(data_y.isna())
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Baseline: Dummy Regression (mean)
|
|
||||||
dummy_regr = DummyRegressor(strategy="mean")
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
dummy_regressor = cross_validate(
|
|
||||||
dummy_regr,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=logo,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=(
|
|
||||||
"r2",
|
|
||||||
"neg_mean_squared_error",
|
|
||||||
"neg_mean_absolute_error",
|
|
||||||
"neg_root_mean_squared_error",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Squared Error",
|
|
||||||
np.median(dummy_regressor["test_neg_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Absolute Error",
|
|
||||||
np.median(dummy_regressor["test_neg_mean_absolute_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Root Mean Squared Error",
|
|
||||||
np.median(dummy_regressor["test_neg_root_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print("R2", np.median(dummy_regressor["test_r2"]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Linear Regression
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
lin_reg_rapids = linear_model.LinearRegression()
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
lin_reg_scores = cross_validate(
|
|
||||||
lin_reg_rapids,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=logo,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=(
|
|
||||||
"r2",
|
|
||||||
"neg_mean_squared_error",
|
|
||||||
"neg_mean_absolute_error",
|
|
||||||
"neg_root_mean_squared_error",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Squared Error",
|
|
||||||
np.median(lin_reg_scores["test_neg_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Absolute Error",
|
|
||||||
np.median(lin_reg_scores["test_neg_mean_absolute_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Root Mean Squared Error",
|
|
||||||
np.median(lin_reg_scores["test_neg_root_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print("R2", np.median(lin_reg_scores["test_r2"]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### XGBRegressor Linear Regression
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
xgb_r = xg.XGBRegressor(objective="reg:squarederror", n_estimators=10)
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
xgb_reg_scores = cross_validate(
|
|
||||||
xgb_r,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=logo,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=(
|
|
||||||
"r2",
|
|
||||||
"neg_mean_squared_error",
|
|
||||||
"neg_mean_absolute_error",
|
|
||||||
"neg_root_mean_squared_error",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Squared Error",
|
|
||||||
np.median(xgb_reg_scores["test_neg_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Absolute Error",
|
|
||||||
np.median(xgb_reg_scores["test_neg_mean_absolute_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Root Mean Squared Error",
|
|
||||||
np.median(xgb_reg_scores["test_neg_root_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print("R2", np.median(xgb_reg_scores["test_r2"]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### XGBRegressor Pseudo Huber Error Regression
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
xgb_psuedo_huber_r = xg.XGBRegressor(objective="reg:pseudohubererror", n_estimators=10)
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
xgb_psuedo_huber_reg_scores = cross_validate(
|
|
||||||
xgb_psuedo_huber_r,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=logo,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=(
|
|
||||||
"r2",
|
|
||||||
"neg_mean_squared_error",
|
|
||||||
"neg_mean_absolute_error",
|
|
||||||
"neg_root_mean_squared_error",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Squared Error",
|
|
||||||
np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Absolute Error",
|
|
||||||
np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_absolute_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Root Mean Squared Error",
|
|
||||||
np.median(xgb_psuedo_huber_reg_scores["test_neg_root_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print("R2", np.median(xgb_psuedo_huber_reg_scores["test_r2"]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Ridge regression
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
ridge_reg = linear_model.Ridge(alpha=0.5)
|
|
||||||
|
|
||||||
# %% tags=[] jupyter={"source_hidden": true}
|
|
||||||
ridge_reg_scores = cross_validate(
|
|
||||||
ridge_reg,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=logo,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=(
|
|
||||||
"r2",
|
|
||||||
"neg_mean_squared_error",
|
|
||||||
"neg_mean_absolute_error",
|
|
||||||
"neg_root_mean_squared_error",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Squared Error",
|
|
||||||
np.median(ridge_reg_scores["test_neg_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Absolute Error",
|
|
||||||
np.median(ridge_reg_scores["test_neg_mean_absolute_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Root Mean Squared Error",
|
|
||||||
np.median(ridge_reg_scores["test_neg_root_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print("R2", np.median(ridge_reg_scores["test_r2"]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Lasso
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
lasso_reg = linear_model.Lasso(alpha=0.1)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
lasso_reg_score = cross_validate(
|
|
||||||
lasso_reg,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=logo,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=(
|
|
||||||
"r2",
|
|
||||||
"neg_mean_squared_error",
|
|
||||||
"neg_mean_absolute_error",
|
|
||||||
"neg_root_mean_squared_error",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Squared Error",
|
|
||||||
np.median(lasso_reg_score["test_neg_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Absolute Error",
|
|
||||||
np.median(lasso_reg_score["test_neg_mean_absolute_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Root Mean Squared Error",
|
|
||||||
np.median(lasso_reg_score["test_neg_root_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print("R2", np.median(lasso_reg_score["test_r2"]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Bayesian Ridge
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
bayesian_ridge_reg = linear_model.BayesianRidge()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
bayesian_ridge_reg_score = cross_validate(
|
|
||||||
bayesian_ridge_reg,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=logo,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=(
|
|
||||||
"r2",
|
|
||||||
"neg_mean_squared_error",
|
|
||||||
"neg_mean_absolute_error",
|
|
||||||
"neg_root_mean_squared_error",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Squared Error",
|
|
||||||
np.median(bayesian_ridge_reg_score["test_neg_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Absolute Error",
|
|
||||||
np.median(bayesian_ridge_reg_score["test_neg_mean_absolute_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Root Mean Squared Error",
|
|
||||||
np.median(bayesian_ridge_reg_score["test_neg_root_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print("R2", np.median(bayesian_ridge_reg_score["test_r2"]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### RANSAC (outlier robust regression)
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
ransac_reg = linear_model.RANSACRegressor()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
ransac_reg_scores = cross_validate(
|
|
||||||
ransac_reg,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=logo,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=(
|
|
||||||
"r2",
|
|
||||||
"neg_mean_squared_error",
|
|
||||||
"neg_mean_absolute_error",
|
|
||||||
"neg_root_mean_squared_error",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Squared Error",
|
|
||||||
np.median(ransac_reg_scores["test_neg_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Absolute Error",
|
|
||||||
np.median(ransac_reg_scores["test_neg_mean_absolute_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Root Mean Squared Error",
|
|
||||||
np.median(ransac_reg_scores["test_neg_root_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print("R2", np.median(ransac_reg_scores["test_r2"]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Support vector regression
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
svr = svm.SVR()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
svr_scores = cross_validate(
|
|
||||||
svr,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=logo,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=(
|
|
||||||
"r2",
|
|
||||||
"neg_mean_squared_error",
|
|
||||||
"neg_mean_absolute_error",
|
|
||||||
"neg_root_mean_squared_error",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Squared Error", np.median(svr_scores["test_neg_mean_squared_error"])
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Absolute Error",
|
|
||||||
np.median(svr_scores["test_neg_mean_absolute_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Root Mean Squared Error",
|
|
||||||
np.median(svr_scores["test_neg_root_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print("R2", np.median(svr_scores["test_r2"]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Kernel Ridge regression
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
kridge = kernel_ridge.KernelRidge()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
kridge_scores = cross_validate(
|
|
||||||
kridge,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=logo,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=(
|
|
||||||
"r2",
|
|
||||||
"neg_mean_squared_error",
|
|
||||||
"neg_mean_absolute_error",
|
|
||||||
"neg_root_mean_squared_error",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Squared Error",
|
|
||||||
np.median(kridge_scores["test_neg_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Absolute Error",
|
|
||||||
np.median(kridge_scores["test_neg_mean_absolute_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Root Mean Squared Error",
|
|
||||||
np.median(kridge_scores["test_neg_root_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print("R2", np.median(kridge_scores["test_r2"]))
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# ### Gaussian Process Regression
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
gpr = gaussian_process.GaussianProcessRegressor()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
|
||||||
|
|
||||||
gpr_scores = cross_validate(
|
|
||||||
gpr,
|
|
||||||
X=imputer.fit_transform(train_x),
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=logo,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=(
|
|
||||||
"r2",
|
|
||||||
"neg_mean_squared_error",
|
|
||||||
"neg_mean_absolute_error",
|
|
||||||
"neg_root_mean_squared_error",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Squared Error", np.median(gpr_scores["test_neg_mean_squared_error"])
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Mean Absolute Error",
|
|
||||||
np.median(gpr_scores["test_neg_mean_absolute_error"]),
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"Negative Root Mean Squared Error",
|
|
||||||
np.median(gpr_scores["test_neg_root_mean_squared_error"]),
|
|
||||||
)
|
|
||||||
print("R2", np.median(gpr_scores["test_r2"]))
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
||||||
|
|
||||||
|
model_input_encoded = impute_encode_categorical_features(model_input)
|
||||||
|
# %%
|
||||||
|
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
||||||
|
model_input_encoded, CV_METHOD
|
||||||
|
)
|
||||||
|
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
||||||
|
# %%
|
||||||
|
data_y.head()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
data_y.tail()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
data_y.shape
|
||||||
|
|
||||||
|
# %%
|
||||||
|
scores = run_all_regression_models(data_x, data_y, data_groups, cross_validator)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
scores.to_csv(
|
||||||
|
"../presentation/JCQ_supervisor_support_regression_" + CV_METHOD + ".csv",
|
||||||
|
index=False,
|
||||||
|
)
|
||||||
|
|
|
@ -0,0 +1,217 @@
|
||||||
|
# ---
|
||||||
|
# jupyter:
|
||||||
|
# jupytext:
|
||||||
|
# text_representation:
|
||||||
|
# extension: .py
|
||||||
|
# format_name: percent
|
||||||
|
# format_version: '1.3'
|
||||||
|
# jupytext_version: 1.14.5
|
||||||
|
# kernelspec:
|
||||||
|
# display_name: straw2analysis
|
||||||
|
# language: python
|
||||||
|
# name: straw2analysis
|
||||||
|
# ---
|
||||||
|
|
||||||
|
# %%
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from features.esm_JCQ import DICT_JCQ_DEMAND_CONTROL_REVERSE
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_questions = pd.read_csv(
|
||||||
|
"E:/STRAWbaseline/survey637813+question_text.csv", header=None
|
||||||
|
).T
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_questions
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(
|
||||||
|
r"\.\s", expand=True, n=1
|
||||||
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_questions
|
||||||
|
|
||||||
|
# %%
|
||||||
|
demand_reverse_lime_rows = (
|
||||||
|
limesurvey_questions["text"].str.startswith(" [Od mene se ne zahteva,")
|
||||||
|
| limesurvey_questions["text"].str.startswith(" [Imam dovolj časa, da končam")
|
||||||
|
| limesurvey_questions["text"].str.startswith(
|
||||||
|
" [Pri svojem delu se ne srečujem s konfliktnimi"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
control_reverse_lime_rows = limesurvey_questions["text"].str.startswith(
|
||||||
|
" [Moje delo vključuje veliko ponavljajočega"
|
||||||
|
) | limesurvey_questions["text"].str.startswith(
|
||||||
|
" [Pri svojem delu imam zelo malo svobode"
|
||||||
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows]
|
||||||
|
demand_reverse_lime.loc[:, "qid"] = demand_reverse_lime["code"].str.extract(
|
||||||
|
r"\[(\d+)\]"
|
||||||
|
)
|
||||||
|
control_reverse_lime = limesurvey_questions[control_reverse_lime_rows]
|
||||||
|
control_reverse_lime.loc[:, "qid"] = control_reverse_lime["code"].str.extract(
|
||||||
|
r"\[(\d+)\]"
|
||||||
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_questions.loc[89, "text"]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_questions[limesurvey_questions["code"].str.startswith("JobEisen")]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
demand_reverse_lime
|
||||||
|
|
||||||
|
# %%
|
||||||
|
control_reverse_lime
|
||||||
|
|
||||||
|
# %%
|
||||||
|
participant_info = pd.read_csv(
|
||||||
|
"C:/Users/junos/Documents/FWO-ARRS/Analysis/straw2analysis/rapids/data/raw/p031/participant_baseline_raw.csv",
|
||||||
|
parse_dates=["date_of_birth"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
participant_info_t = participant_info.T
|
||||||
|
|
||||||
|
# %%
|
||||||
|
rows_baseline = participant_info_t.index
|
||||||
|
|
||||||
|
# %%
|
||||||
|
rows_demand = rows_baseline.str.startswith("JobEisen") & ~rows_baseline.str.endswith(
|
||||||
|
"Time"
|
||||||
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
rows_baseline[rows_demand]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_control = (
|
||||||
|
participant_info_t[rows_demand]
|
||||||
|
.reset_index()
|
||||||
|
.rename(columns={"index": "question", 0: "score_original"})
|
||||||
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_control
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_control["qid"] = (
|
||||||
|
limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
|
||||||
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_control["score"] = limesurvey_control["score_original"]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_control["qid"][0]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
rows_demand_reverse = limesurvey_control["qid"].isin(
|
||||||
|
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
||||||
|
)
|
||||||
|
limesurvey_control.loc[rows_demand_reverse, "score"] = (
|
||||||
|
4 + 1 - limesurvey_control.loc[rows_demand_reverse, "score_original"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
JCQ_DEMAND = "JobEisen"
|
||||||
|
JCQ_CONTROL = "JobControle"
|
||||||
|
dict_JCQ_demand_control_reverse = {
|
||||||
|
JCQ_DEMAND: {
|
||||||
|
3: " [Od mene se ne zahteva,",
|
||||||
|
4: " [Imam dovolj časa, da končam",
|
||||||
|
5: " [Pri svojem delu se ne srečujem s konfliktnimi",
|
||||||
|
},
|
||||||
|
JCQ_CONTROL: {
|
||||||
|
2: " |Moje delo vključuje veliko ponavljajočega",
|
||||||
|
6: " [Pri svojem delu imam zelo malo svobode",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_control
|
||||||
|
|
||||||
|
# %%
|
||||||
|
test = pd.DataFrame(
|
||||||
|
data={"question": "one", "score_original": 3, "score": 3, "qid": 10}, index=[0]
|
||||||
|
)
|
||||||
|
|
||||||
|
# %%
|
||||||
|
pd.concat([test, limesurvey_control]).reset_index()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_control["score"].sum()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
rows_demand_reverse
|
||||||
|
|
||||||
|
# %%
|
||||||
|
dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
limesurvey_control
|
||||||
|
|
||||||
|
# %%
|
||||||
|
DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
|
||||||
|
DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
|
||||||
|
|
||||||
|
JCQ_NORMS = {
|
||||||
|
"F": {
|
||||||
|
0: DEMAND_CONTROL_RATIO_MIN,
|
||||||
|
1: 0.45,
|
||||||
|
2: 0.52,
|
||||||
|
3: 0.62,
|
||||||
|
4: DEMAND_CONTROL_RATIO_MAX,
|
||||||
|
},
|
||||||
|
"M": {
|
||||||
|
0: DEMAND_CONTROL_RATIO_MIN,
|
||||||
|
1: 0.41,
|
||||||
|
2: 0.48,
|
||||||
|
3: 0.56,
|
||||||
|
4: DEMAND_CONTROL_RATIO_MAX,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# %%
|
||||||
|
JCQ_NORMS[participant_info.loc[0, "gender"]][0]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
participant_info_t.index.str.startswith("JobControle")
|
||||||
|
|
||||||
|
# %%
|
||||||
|
columns_baseline = participant_info.columns
|
||||||
|
|
||||||
|
# %%
|
||||||
|
columns_demand = columns_baseline.str.startswith(
|
||||||
|
"JobControle"
|
||||||
|
) & ~columns_baseline.str.endswith("Time")
|
||||||
|
|
||||||
|
# %%
|
||||||
|
columns_baseline[columns_demand]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
participant_control = participant_info.loc[:, columns_demand]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
participant_control["id"] = participant_control.index
|
||||||
|
|
||||||
|
# %%
|
||||||
|
participant_control
|
||||||
|
|
||||||
|
# %%
|
||||||
|
pd.wide_to_long(
|
||||||
|
participant_control,
|
||||||
|
stubnames="JobControle",
|
||||||
|
i="id",
|
||||||
|
j="qid",
|
||||||
|
sep="[",
|
||||||
|
suffix="(\\d+)]",
|
||||||
|
)
|
195
features/esm.py
195
features/esm.py
|
@ -20,11 +20,47 @@ ANSWER_DAY_OFF = "DayOff3421"
|
||||||
ANSWER_SET_EVENING = "DayFinishedSetEvening"
|
ANSWER_SET_EVENING = "DayFinishedSetEvening"
|
||||||
|
|
||||||
MAX_MORNING_LENGTH = 3
|
MAX_MORNING_LENGTH = 3
|
||||||
# When the participants was not yet at work at the time of the first (morning) EMA,
|
# When the participant was not yet at work at the time of the first (morning) EMA,
|
||||||
# only three items were answered.
|
# only three items were answered.
|
||||||
# Two sleep related items and one indicating NOT starting work yet.
|
# Two sleep related items and one indicating NOT starting work yet.
|
||||||
# Daytime EMAs are all longer, in fact they always consist of at least 6 items.
|
# Daytime EMAs are all longer, in fact they always consist of at least 6 items.
|
||||||
|
|
||||||
|
QUESTIONNAIRE_IDS = {
|
||||||
|
"sleep_quality": 1,
|
||||||
|
"PANAS_positive_affect": 8,
|
||||||
|
"PANAS_negative_affect": 9,
|
||||||
|
"JCQ_job_demand": 10,
|
||||||
|
"JCQ_job_control": 11,
|
||||||
|
"JCQ_supervisor_support": 12,
|
||||||
|
"JCQ_coworker_support": 13,
|
||||||
|
"PFITS_supervisor": 14,
|
||||||
|
"PFITS_coworkers": 15,
|
||||||
|
"UWES_vigor": 16,
|
||||||
|
"UWES_dedication": 17,
|
||||||
|
"UWES_absorption": 18,
|
||||||
|
"COPE_active": 19,
|
||||||
|
"COPE_support": 20,
|
||||||
|
"COPE_emotions": 21,
|
||||||
|
"balance_life_work": 22,
|
||||||
|
"balance_work_life": 23,
|
||||||
|
"recovery_experience_detachment": 24,
|
||||||
|
"recovery_experience_relaxation": 25,
|
||||||
|
"symptoms": 26,
|
||||||
|
"appraisal_stressfulness_event": 87,
|
||||||
|
"appraisal_threat": 88,
|
||||||
|
"appraisal_challenge": 89,
|
||||||
|
"appraisal_event_time": 90,
|
||||||
|
"appraisal_event_duration": 91,
|
||||||
|
"appraisal_event_work_related": 92,
|
||||||
|
"appraisal_stressfulness_period": 93,
|
||||||
|
"late_work": 94,
|
||||||
|
"work_hours": 95,
|
||||||
|
"left_work": 96,
|
||||||
|
"activities": 97,
|
||||||
|
"coffee_breaks": 98,
|
||||||
|
"at_work_yet": 99,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_esm_data(usernames: Collection) -> pd.DataFrame:
|
def get_esm_data(usernames: Collection) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
|
@ -52,8 +88,10 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame:
|
||||||
|
|
||||||
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
|
Convert timestamps and expand JSON column.
|
||||||
|
|
||||||
Convert timestamps into human-readable datetimes and dates
|
Convert timestamps into human-readable datetimes and dates
|
||||||
and expand the JSON column into several Pandas DF columns.
|
and expand the JSON column into several Pandas DF columns.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
@ -63,7 +101,8 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_esm_preprocessed: pd.DataFrame
|
df_esm_preprocessed: pd.DataFrame
|
||||||
A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
|
A dataframe with added columns: datetime in Ljubljana timezone
|
||||||
|
and all fields from ESM_JSON column.
|
||||||
"""
|
"""
|
||||||
df_esm = helper.get_date_from_timestamp(df_esm)
|
df_esm = helper.get_date_from_timestamp(df_esm)
|
||||||
|
|
||||||
|
@ -76,31 +115,39 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||||
def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
For each distinct EMA session, determine how the participant responded to it.
|
For each distinct EMA session, determine how the participant responded to it.
|
||||||
Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE
|
|
||||||
|
Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
|
||||||
|
and SESSION_STATUS_COMPLETE
|
||||||
|
|
||||||
This is done in three steps.
|
This is done in three steps.
|
||||||
|
|
||||||
First, the esm_status is considered.
|
First, the esm_status is considered.
|
||||||
If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
|
If any of the ESMs in a session has a status *other than* "answered",
|
||||||
|
then this session is taken as unfinished.
|
||||||
|
|
||||||
Second, the sessions which do not represent full questionnaires are identified.
|
Second, the sessions which do not represent full questionnaires are identified.
|
||||||
These are sessions where participants only marked they are finished with the day or have not yet started working.
|
These are sessions where participants only marked they are finished with the day
|
||||||
|
or have not yet started working.
|
||||||
|
|
||||||
Third, the sessions with only one item are marked with their trigger.
|
Third, the sessions with only one item are marked with their trigger.
|
||||||
We never offered questionnaires with single items, so we can be sure these are unfinished.
|
We never offered questionnaires with single items,
|
||||||
|
so we can be sure these are unfinished.
|
||||||
|
|
||||||
Finally, all sessions that remain are marked as completed.
|
Finally, all sessions that remain are marked as completed.
|
||||||
By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.
|
By going through different possibilities in expl_esm_adherence.ipynb,
|
||||||
|
this turned out to be a reasonable option.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
df_esm_preprocessed: pd.DataFrame
|
df_esm_preprocessed: pd.DataFrame
|
||||||
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
A preprocessed dataframe of esm data,
|
||||||
|
which must include the session ID (esm_session).
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_session_counts: pd.Dataframe
|
df_session_counts: pd.Dataframe
|
||||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
|
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
|
||||||
|
with their statuses and the number of items.
|
||||||
"""
|
"""
|
||||||
sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
|
sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
|
||||||
|
|
||||||
|
@ -155,17 +202,22 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat
|
||||||
|
|
||||||
def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening.)
|
Classify EMA sessions into morning, workday, or evening.
|
||||||
|
|
||||||
|
For each EMA session, determine the time of the first user answer
|
||||||
|
and its time type (morning, workday, or evening).
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
df_esm_preprocessed: pd.DataFrame
|
df_esm_preprocessed: pd.DataFrame
|
||||||
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
A preprocessed dataframe of esm data,
|
||||||
|
which must include the session ID (esm_session).
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_session_time: pd.DataFrame
|
df_session_time: pd.DataFrame
|
||||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
|
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
|
||||||
|
with their time type and timestamp of first answer.
|
||||||
"""
|
"""
|
||||||
df_session_time = (
|
df_session_time = (
|
||||||
df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
|
df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
|
||||||
|
@ -179,13 +231,17 @@ def classify_sessions_by_completion_time(
|
||||||
df_esm_preprocessed: pd.DataFrame,
|
df_esm_preprocessed: pd.DataFrame,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
The point of this function is to not only classify sessions by using the previously defined functions.
|
Classify sessions and correct the time type.
|
||||||
|
|
||||||
|
The point of this function is to not only classify sessions
|
||||||
|
by using the previously defined functions.
|
||||||
It also serves to "correct" the time type of some EMA sessions.
|
It also serves to "correct" the time type of some EMA sessions.
|
||||||
|
|
||||||
A morning questionnaire could seamlessly transition into a daytime questionnaire,
|
A morning questionnaire could seamlessly transition into a daytime questionnaire,
|
||||||
if the participant was already at work.
|
if the participant was already at work.
|
||||||
In this case, the "time" label changed mid-session.
|
In this case, the "time" label changed mid-session.
|
||||||
Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
|
Because of the way classify_sessions_by_time works,
|
||||||
|
this questionnaire was classified as "morning".
|
||||||
But for all intents and purposes, it can be treated as a "daytime" EMA.
|
But for all intents and purposes, it can be treated as a "daytime" EMA.
|
||||||
|
|
||||||
The way this scenario is differentiated from a true "morning" questionnaire,
|
The way this scenario is differentiated from a true "morning" questionnaire,
|
||||||
|
@ -194,13 +250,16 @@ def classify_sessions_by_completion_time(
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
df_esm_preprocessed: pd.DataFrame
|
df_esm_preprocessed: pd.DataFrame
|
||||||
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
A preprocessed dataframe of esm data,
|
||||||
|
which must include the session ID (esm_session).
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_session_counts_time: pd.DataFrame
|
df_session_counts_time: pd.DataFrame
|
||||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
|
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
|
||||||
their time type (with some morning EMAs reclassified) and timestamp of first answer.
|
the number of items,
|
||||||
|
their time type (with some morning EMAs reclassified)
|
||||||
|
and timestamp of first answer.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
|
df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
|
||||||
|
@ -219,7 +278,8 @@ def classify_sessions_by_completion_time(
|
||||||
|
|
||||||
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
This function eliminates invalid ESM responses.
|
Eliminate invalid ESM responses.
|
||||||
|
|
||||||
It removes unanswered ESMs and those that indicate end of work and similar.
|
It removes unanswered ESMs and those that indicate end of work and similar.
|
||||||
It also extracts a numeric answer from strings such as "4 - I strongly agree".
|
It also extracts a numeric answer from strings such as "4 - I strongly agree".
|
||||||
|
|
||||||
|
@ -256,3 +316,100 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return df_esm_clean
|
return df_esm_clean
|
||||||
|
|
||||||
|
|
||||||
|
def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1):
|
||||||
|
"""
|
||||||
|
Increment answers to keep in line with original scoring.
|
||||||
|
|
||||||
|
We always used 0 for the lowest value of user answer.
|
||||||
|
Some scales originally used other scoring, such as starting from 1.
|
||||||
|
This restores original scoring so that the values are comparable to references.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
df_esm_clean: pd.DataFrame
|
||||||
|
A cleaned ESM dataframe, which must also include esm_user_answer_numeric.
|
||||||
|
increment_by:
|
||||||
|
A number to add to the user answer.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
df_esm_clean: pd.DataFrame
|
||||||
|
The same df with addition of a column 'esm_user_answer_numeric'.
|
||||||
|
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
df_esm_clean = df_esm_clean.assign(
|
||||||
|
esm_user_score=lambda x: x.esm_user_answer_numeric + increment_by
|
||||||
|
)
|
||||||
|
except AttributeError as e:
|
||||||
|
print("Please, clean the dataframe first using features.esm.clean_up_esm.")
|
||||||
|
print(e)
|
||||||
|
return df_esm_clean
|
||||||
|
|
||||||
|
|
||||||
|
def reassign_question_ids(
|
||||||
|
df_esm_cleaned: pd.DataFrame, question_ids_content: dict
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
"""
|
||||||
|
Fix question IDs to match their actual content.
|
||||||
|
|
||||||
|
Unfortunately, when altering the protocol to adapt to COVID pandemic,
|
||||||
|
we did not retain original question IDs.
|
||||||
|
This means that for participants before 2021, they are different
|
||||||
|
from for the rest of them.
|
||||||
|
This function searches for question IDs by matching their strings.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
df_esm_cleaned: pd.DataFrame
|
||||||
|
A cleaned up dataframe, which must also include esm_user_answer_numeric.
|
||||||
|
question_ids_content: dict
|
||||||
|
A dictionary, linking question IDs with their content ("instructions").
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
df_esm_fixed: pd.DataFrame
|
||||||
|
The same dataframe but with fixed question IDs.
|
||||||
|
"""
|
||||||
|
df_esm_unique_questions = (
|
||||||
|
df_esm_cleaned.groupby("question_id")
|
||||||
|
.esm_instructions.value_counts()
|
||||||
|
.rename()
|
||||||
|
.reset_index()
|
||||||
|
)
|
||||||
|
# Tabulate all possible answers to each question (group by question ID).
|
||||||
|
|
||||||
|
# First, check that we anticipated all esm instructions.
|
||||||
|
for q_id in question_ids_content.keys():
|
||||||
|
# Look for all questions ("instructions") occurring in the dataframe.
|
||||||
|
actual_questions = df_esm_unique_questions.loc[
|
||||||
|
df_esm_unique_questions["question_id"] == q_id,
|
||||||
|
"esm_instructions",
|
||||||
|
]
|
||||||
|
# These are all answers to a given question (by q_id).
|
||||||
|
questions_matches = actual_questions.str.startswith(
|
||||||
|
question_ids_content.get(q_id)
|
||||||
|
)
|
||||||
|
# See if they are expected, i.e. included in the dictionary.
|
||||||
|
if ~actual_questions.all():
|
||||||
|
print("One of the questions that occur in the data was undefined.")
|
||||||
|
print("This were the questions found in the data: ")
|
||||||
|
raise KeyError(actual_questions[~questions_matches])
|
||||||
|
# In case there is an unexpected answer, raise an exception.
|
||||||
|
|
||||||
|
# Next, replace question IDs.
|
||||||
|
df_esm_fixed = df_esm_cleaned.copy()
|
||||||
|
df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].apply(
|
||||||
|
lambda x: next(
|
||||||
|
(
|
||||||
|
key
|
||||||
|
for key, values in question_ids_content.items()
|
||||||
|
if x.startswith(values)
|
||||||
|
),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return df_esm_fixed
|
||||||
|
|
|
@ -0,0 +1,125 @@
|
||||||
|
COPE_ORIGINAL_MAX = 4
|
||||||
|
COPE_ORIGINAL_MIN = 1
|
||||||
|
|
||||||
|
DICT_COPE_QUESTION_IDS = {
|
||||||
|
164: (
|
||||||
|
"I took additional action to try to get rid of the problem",
|
||||||
|
"Ik deed extra mijn best om er iets aan te doen",
|
||||||
|
"Vložila sem dodaten napor, da bi rešila problem",
|
||||||
|
"Vložil sem dodaten napor, da bi rešil problem",
|
||||||
|
),
|
||||||
|
165: (
|
||||||
|
"I concentrated my efforts on doing something about it",
|
||||||
|
"Ik probeerde de situatie te verbeteren",
|
||||||
|
"Svoje sile sem usmerila v reševanje nastale situacije",
|
||||||
|
"Svoje sile sem usmeril v reševanje nastale situacije",
|
||||||
|
),
|
||||||
|
166: (
|
||||||
|
"I did what had to be done, one step at a time",
|
||||||
|
"Ik deed stap voor stap wat nodig was",
|
||||||
|
"Naredila sem, kar je bilo potrebno – korak za korakom",
|
||||||
|
"Naredil sem, kar je bilo potrebno – korak za korakom",
|
||||||
|
),
|
||||||
|
167: (
|
||||||
|
"I took direct action to get around the problem",
|
||||||
|
"Ik handelde vlug om het probleem te verhelpen",
|
||||||
|
"Nekaj sem naredila, da sem zaobšla problem",
|
||||||
|
"Nekaj sem naredil, da sem zaobšel problem",
|
||||||
|
),
|
||||||
|
168: (
|
||||||
|
"I tried to come up with a strategy about what to do",
|
||||||
|
"Ik probeerde te verzinnen wat ik er aan kon doen",
|
||||||
|
"Skušala sem najti ustrezen način za rešitev situacije",
|
||||||
|
"Skušal sem najti ustrezen način za rešitev situacije",
|
||||||
|
),
|
||||||
|
169: (
|
||||||
|
"I made a plan of action",
|
||||||
|
"Ik maakte een plan",
|
||||||
|
"Naredila sem načrt za delovanje",
|
||||||
|
"Naredil sem načrt za delovanje",
|
||||||
|
),
|
||||||
|
170: (
|
||||||
|
"I thought hard about what steps to take",
|
||||||
|
"Ik dacht hard na over wat ik moest doen",
|
||||||
|
"Dobro sem premislila, katere korake moram narediti, da rešim problem",
|
||||||
|
"Dobro sem premislil, katere korake moram narediti, da rešim problem",
|
||||||
|
),
|
||||||
|
171: (
|
||||||
|
"I thought about how I might best handle the problem",
|
||||||
|
"lk dacht na over hoe ik het probleem het best kon aanpakken",
|
||||||
|
"Razmišljala sem, kaj bi bilo najbolje narediti s problemom",
|
||||||
|
"Razmišljal sem, kaj bi bilo najbolje narediti s problemom",
|
||||||
|
),
|
||||||
|
172: (
|
||||||
|
"I asked people who have had similar experiences what they did",
|
||||||
|
"Ik vroeg aan mensen met dergelijke ervaringen hoe zij reageerden",
|
||||||
|
"Vprašala sem posameznike s podobnimi izkušnjami, kaj so storili",
|
||||||
|
"Vprašal sem posameznike s podobnimi izkušnjami, kaj so storili",
|
||||||
|
),
|
||||||
|
173: (
|
||||||
|
"I tried to get advice from someone about what to do",
|
||||||
|
"lk vroeg advies aan iemand",
|
||||||
|
"Pri drugih sem poskušala dobiti nasvet, kaj naj storim",
|
||||||
|
"Pri drugih sem poskušal dobiti nasvet, kaj naj storim",
|
||||||
|
),
|
||||||
|
174: (
|
||||||
|
"I talked to someone to find out more about the situation",
|
||||||
|
"Ik sprak met iemand om meer te weten te komen over de situatie",
|
||||||
|
"Z nekom sem se pogovorila, da bi izvedela še kaj o svojem problemu",
|
||||||
|
"Z nekom sem se pogovoril, da bi izvedel še kaj o svojem problemu",
|
||||||
|
),
|
||||||
|
175: (
|
||||||
|
"I talked to someone who could do something concrete about the problem",
|
||||||
|
"Ik sprak met iemand die iets aan het probleem kon doen",
|
||||||
|
"Pogovorila sem se s kom, ki bi lahko naredil kaj konkretnega",
|
||||||
|
"Pogovoril sem se s kom, ki bi lahko naredil kaj konkretnega",
|
||||||
|
),
|
||||||
|
176: (
|
||||||
|
"I talked to someone about how I felt",
|
||||||
|
"Ik sprak met iemand over hoe ik mij voelde",
|
||||||
|
"Z nekom sem se pogovorila o tem, kako sem se počutila",
|
||||||
|
"Z nekom sem se pogovoril o tem, kako sem se počutil",
|
||||||
|
),
|
||||||
|
177: (
|
||||||
|
"I tried to get emotional support from friends or relatives",
|
||||||
|
"Ik zocht steun bij vrienden of familie",
|
||||||
|
"Skušala sem dobiti čustveno podporo prijateljev ali sorodnikov",
|
||||||
|
"Skušal sem dobiti čustveno podporo prijateljev ali sorodnikov",
|
||||||
|
),
|
||||||
|
178: (
|
||||||
|
"I discussed my feelings with someone",
|
||||||
|
"lk besprak mijn gevoelens met iemand",
|
||||||
|
"O svojih občutkih sem se z nekom pogovorila",
|
||||||
|
"O svojih občutkih sem se z nekom pogovoril",
|
||||||
|
),
|
||||||
|
179: (
|
||||||
|
"I got sympathy and understanding from someone",
|
||||||
|
"Ik vroeg medeleven en begrip van iemand",
|
||||||
|
"Poiskala sem naklonjenost in razumevanje drugih",
|
||||||
|
"Poiskal sem naklonjenost in razumevanje drugih",
|
||||||
|
),
|
||||||
|
180: (
|
||||||
|
"I got upset and let my emotions out",
|
||||||
|
"Ik raakte van streek",
|
||||||
|
"Razburila sem se in to tudi pokazala",
|
||||||
|
"Razburil sem se in to tudi pokazal",
|
||||||
|
),
|
||||||
|
181: (
|
||||||
|
"I let my feelings out",
|
||||||
|
"Ik toonde mijn gevoelens",
|
||||||
|
"Svojim čustvom sem dala prosto pot",
|
||||||
|
"Svojim čustvom sem dal prosto pot",
|
||||||
|
),
|
||||||
|
182: (
|
||||||
|
"I felt a lot of emotional distress and I found myself expressing",
|
||||||
|
"lk liet duidelijk blijken hoe ellendig ik mij voelde",
|
||||||
|
"Doživljala sem veliko stresa in opažala, da sem čustva",
|
||||||
|
"Doživljal sem veliko stresa in opažal, da sem čustva",
|
||||||
|
),
|
||||||
|
183: (
|
||||||
|
"I got upset, and I was really aware of it",
|
||||||
|
"Ik merkte dat ik erg van streek was",
|
||||||
|
"Razburila sem se in razmišljala samo o tem",
|
||||||
|
"Razburil sem se in razmišljal samo o tem",
|
||||||
|
),
|
||||||
|
}
|
|
@ -1,9 +1,11 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from features.esm import increment_answers
|
||||||
|
|
||||||
JCQ_ORIGINAL_MAX = 4
|
JCQ_ORIGINAL_MAX = 4
|
||||||
JCQ_ORIGINAL_MIN = 1
|
JCQ_ORIGINAL_MIN = 1
|
||||||
|
|
||||||
dict_JCQ_demand_control_reverse = {
|
DICT_JCQ_DEMAND_CONTROL_REVERSE = {
|
||||||
75: (
|
75: (
|
||||||
"I was NOT asked",
|
"I was NOT asked",
|
||||||
"Men legde mij geen overdreven",
|
"Men legde mij geen overdreven",
|
||||||
|
@ -40,10 +42,14 @@ def reverse_jcq_demand_control_scoring(
|
||||||
df_esm_jcq_demand_control: pd.DataFrame,
|
df_esm_jcq_demand_control: pd.DataFrame,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
This function recodes answers in Job content questionnaire by first incrementing them by 1,
|
Reverse JCQ demand and control answers.
|
||||||
to be in line with original (1-4) scoring.
|
|
||||||
Then, some answers are reversed (i.e. 1 becomes 4 etc.), because the questions are negatively phrased.
|
This function recodes answers in Job content questionnaire
|
||||||
These answers are listed in dict_JCQ_demand_control_reverse and identified by their question ID.
|
by first incrementing them by 1, to be in line with original (1-4) scoring.
|
||||||
|
Then, some answers are reversed (i.e. 1 becomes 4 etc.),
|
||||||
|
because the questions are negatively phrased.
|
||||||
|
These answers are listed in DICT_JCQ_DEMAND_CONTROL_REVERSE
|
||||||
|
and identified by their question ID.
|
||||||
However, the existing data is checked against literal phrasing of these questions
|
However, the existing data is checked against literal phrasing of these questions
|
||||||
to protect against wrong numbering of questions (differing question IDs).
|
to protect against wrong numbering of questions (differing question IDs).
|
||||||
|
|
||||||
|
@ -55,7 +61,8 @@ def reverse_jcq_demand_control_scoring(
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_esm_jcq_demand_control: pd.DataFrame
|
df_esm_jcq_demand_control: pd.DataFrame
|
||||||
The same dataframe with a column esm_user_score containing answers recoded and reversed.
|
The same dataframe with a column esm_user_score
|
||||||
|
containing answers recoded and reversed.
|
||||||
"""
|
"""
|
||||||
df_esm_jcq_demand_control_unique_answers = (
|
df_esm_jcq_demand_control_unique_answers = (
|
||||||
df_esm_jcq_demand_control.groupby("question_id")
|
df_esm_jcq_demand_control.groupby("question_id")
|
||||||
|
@ -64,7 +71,7 @@ def reverse_jcq_demand_control_scoring(
|
||||||
.reset_index()
|
.reset_index()
|
||||||
)
|
)
|
||||||
# Tabulate all possible answers to each question (group by question ID).
|
# Tabulate all possible answers to each question (group by question ID).
|
||||||
for q_id in dict_JCQ_demand_control_reverse.keys():
|
for q_id in DICT_JCQ_DEMAND_CONTROL_REVERSE.keys():
|
||||||
# Look through all answers that need to be reversed.
|
# Look through all answers that need to be reversed.
|
||||||
possible_answers = df_esm_jcq_demand_control_unique_answers.loc[
|
possible_answers = df_esm_jcq_demand_control_unique_answers.loc[
|
||||||
df_esm_jcq_demand_control_unique_answers["question_id"] == q_id,
|
df_esm_jcq_demand_control_unique_answers["question_id"] == q_id,
|
||||||
|
@ -72,7 +79,7 @@ def reverse_jcq_demand_control_scoring(
|
||||||
]
|
]
|
||||||
# These are all answers to a given question (by q_id).
|
# These are all answers to a given question (by q_id).
|
||||||
answers_matches = possible_answers.str.startswith(
|
answers_matches = possible_answers.str.startswith(
|
||||||
dict_JCQ_demand_control_reverse.get(q_id)
|
DICT_JCQ_DEMAND_CONTROL_REVERSE.get(q_id)
|
||||||
)
|
)
|
||||||
# See if they are expected, i.e. included in the dictionary.
|
# See if they are expected, i.e. included in the dictionary.
|
||||||
if ~answers_matches.all():
|
if ~answers_matches.all():
|
||||||
|
@ -82,18 +89,16 @@ def reverse_jcq_demand_control_scoring(
|
||||||
# In case there is an unexpected answer, raise an exception.
|
# In case there is an unexpected answer, raise an exception.
|
||||||
|
|
||||||
try:
|
try:
|
||||||
df_esm_jcq_demand_control = df_esm_jcq_demand_control.assign(
|
df_esm_jcq_demand_control = increment_answers(df_esm_jcq_demand_control)
|
||||||
esm_user_score=lambda x: x.esm_user_answer_numeric + 1
|
# Increment the original answer by 1 to keep in line
|
||||||
)
|
# with traditional scoring (from JCQ_ORIGINAL_MIN to JCQ_ORIGINAL_MAX).
|
||||||
# Increment the original answer by 1
|
|
||||||
# to keep in line with traditional scoring (JCQ_ORIGINAL_MIN - JCQ_ORIGINAL_MAX).
|
|
||||||
df_esm_jcq_demand_control[
|
df_esm_jcq_demand_control[
|
||||||
df_esm_jcq_demand_control["question_id"].isin(
|
df_esm_jcq_demand_control["question_id"].isin(
|
||||||
dict_JCQ_demand_control_reverse.keys()
|
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
||||||
)
|
)
|
||||||
] = df_esm_jcq_demand_control[
|
] = df_esm_jcq_demand_control[
|
||||||
df_esm_jcq_demand_control["question_id"].isin(
|
df_esm_jcq_demand_control["question_id"].isin(
|
||||||
dict_JCQ_demand_control_reverse.keys()
|
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
||||||
)
|
)
|
||||||
].assign(
|
].assign(
|
||||||
esm_user_score=lambda x: JCQ_ORIGINAL_MAX
|
esm_user_score=lambda x: JCQ_ORIGINAL_MAX
|
||||||
|
|
|
@ -3,6 +3,9 @@ import pandas as pd
|
||||||
|
|
||||||
import features.esm
|
import features.esm
|
||||||
|
|
||||||
|
SAM_ORIGINAL_MAX = 5
|
||||||
|
SAM_ORIGINAL_MIN = 1
|
||||||
|
|
||||||
QUESTIONNAIRE_ID_SAM = {
|
QUESTIONNAIRE_ID_SAM = {
|
||||||
"event_stress": 87,
|
"event_stress": 87,
|
||||||
"event_threat": 88,
|
"event_threat": 88,
|
||||||
|
@ -20,10 +23,107 @@ GROUP_QUESTIONNAIRES_BY = [
|
||||||
"device_id",
|
"device_id",
|
||||||
"esm_session",
|
"esm_session",
|
||||||
]
|
]
|
||||||
# Each questionnaire occurs only once within each esm_session on the same device within the same participant.
|
# Each questionnaire occurs only once within each esm_session on the same device
|
||||||
|
# within the same participant.
|
||||||
|
|
||||||
|
|
||||||
|
DICT_SAM_QUESTION_IDS = {
|
||||||
|
87: (
|
||||||
|
"Was there a particular event that created tension in you?",
|
||||||
|
"Was er een bepaalde gebeurtenis die spanning veroorzaakte?",
|
||||||
|
"Je prišlo do kakega dogodka, ki je v vas ustvaril napetost?",
|
||||||
|
),
|
||||||
|
88: (
|
||||||
|
"Did this event make you feel anxious?",
|
||||||
|
"Voelde je je angstig door deze gebeurtenis?",
|
||||||
|
"Ste se zaradi tega dogodka počutili tesnob",
|
||||||
|
),
|
||||||
|
89: (
|
||||||
|
"Will the outcome of this event be negative?",
|
||||||
|
"Zal de uitkomst van deze gebeurtenis negatief zijn?",
|
||||||
|
"Bo izid tega dogodka negativen?",
|
||||||
|
),
|
||||||
|
90: (
|
||||||
|
"How threatening was this event?",
|
||||||
|
"Hoe bedreigend was deze gebeurtenis?",
|
||||||
|
"Kako grozeč je bil ta dogodek?",
|
||||||
|
),
|
||||||
|
91: (
|
||||||
|
"Is this going to have a negative impact on you?",
|
||||||
|
"Zal dit een negatieve impact op je hebben?",
|
||||||
|
"Ali bo to negativno vplivalo na vas?",
|
||||||
|
),
|
||||||
|
92: (
|
||||||
|
"Is this going to have a positive impact on you?",
|
||||||
|
"Zal dit een positief effect op je hebben?",
|
||||||
|
"Ali bo to pozitivno vplivalo na vas?",
|
||||||
|
),
|
||||||
|
93: (
|
||||||
|
"How eager are you to tackle this event?",
|
||||||
|
"Hoe graag wil je deze gebeurtenis aanpakken?",
|
||||||
|
"Kako zagnani ste bili",
|
||||||
|
),
|
||||||
|
94: (
|
||||||
|
"To what extent can you become a stronger person because of this event?",
|
||||||
|
"In welke mate kan je een sterkere persoon worden door deze gebeurtenis?",
|
||||||
|
"V kolikšni meri lahko zaradi tega dogodka postanete močnejša oseba?",
|
||||||
|
),
|
||||||
|
95: (
|
||||||
|
"To what extent are you excited thinking about the outcome of this event?",
|
||||||
|
"In welke mate ben je enthousiast bij de gedachte",
|
||||||
|
"V kolikšni meri vas misel na izid tega dogodka navdušuje?",
|
||||||
|
),
|
||||||
|
96: (
|
||||||
|
"At what time did this event occur?",
|
||||||
|
"Hoe laat vond deze gebeurtenis plaats?",
|
||||||
|
"Kdaj se je ta dogodek zgodil?",
|
||||||
|
),
|
||||||
|
97: (
|
||||||
|
"How long did this event last?",
|
||||||
|
"Hoe lang duurde deze gebeurtenis?",
|
||||||
|
"Kako dolgo je trajal ta dogodek?",
|
||||||
|
),
|
||||||
|
98: (
|
||||||
|
"Was/is this event work-related?",
|
||||||
|
"Was/is deze gebeurtenis werkgerelateerd?",
|
||||||
|
"Je (bil) ta dogodek povezan s službo?",
|
||||||
|
"Je bil ali je ta dogodek povezan s službo?",
|
||||||
|
),
|
||||||
|
99: (
|
||||||
|
"Did this overall period create tension in you?",
|
||||||
|
"Heeft deze globale periode spanning veroorzaakt?",
|
||||||
|
"Je to obdobje kot celota v vas ustvarilo napetost?",
|
||||||
|
"Je to celo obdobje v vas ustvarilo napetost?",
|
||||||
|
),
|
||||||
|
100: (
|
||||||
|
"To what extent do you perceive this overall period as stressful?",
|
||||||
|
"In welke mate ervaar je deze globale periode als stressvol?",
|
||||||
|
"V kolikšni meri ste to obdobje dojemali kot stresno?",
|
||||||
|
"V kolikšni meri ste celo to obdobje dojemali kot stresno?",
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
|
def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
"""
|
||||||
|
Extract information about stressful events.
|
||||||
|
|
||||||
|
Participants were asked: "Was there a particular event that created tension in you?"
|
||||||
|
Then a subset of questions related to this event followed.
|
||||||
|
This function goes through the follow-up questions one by one
|
||||||
|
and preprocesses them, so that it adds new columns to the dataframe.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
df_esm: pd.DataFrame
|
||||||
|
A raw dataframe of all ESM data.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
df_esm_events: pd.DataFrame
|
||||||
|
A cleaned up df of Stress Appraisal Measure items with additional columns.
|
||||||
|
|
||||||
|
"""
|
||||||
# 0. Select only questions from Stress Appraisal Measure.
|
# 0. Select only questions from Stress Appraisal Measure.
|
||||||
df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
|
df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
|
||||||
df_esm_sam = df_esm_preprocessed[
|
df_esm_sam = df_esm_preprocessed[
|
||||||
|
@ -78,7 +178,8 @@ def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
|
||||||
def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
This function calculates challenge and threat (two Stress Appraisal Measure subscales) means,
|
This function calculates challenge and threat
|
||||||
|
(two Stress Appraisal Measure subscales) means,
|
||||||
for each ESM session (within participants and devices).
|
for each ESM session (within participants and devices).
|
||||||
It creates a grouped dataframe with means in two columns.
|
It creates a grouped dataframe with means in two columns.
|
||||||
|
|
||||||
|
@ -90,7 +191,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_esm_event_threat_challenge_mean_wide: pd.DataFrame
|
df_esm_event_threat_challenge_mean_wide: pd.DataFrame
|
||||||
A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
|
A dataframe of unique ESM sessions (by participants and devices)
|
||||||
|
with threat and challenge means.
|
||||||
"""
|
"""
|
||||||
# Select only threat and challenge assessments for events
|
# Select only threat and challenge assessments for events
|
||||||
df_esm_event_threat_challenge = df_esm_sam_clean[
|
df_esm_event_threat_challenge = df_esm_sam_clean[
|
||||||
|
@ -112,8 +214,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
|
||||||
aggfunc="mean",
|
aggfunc="mean",
|
||||||
)
|
)
|
||||||
# Drop unnecessary column values.
|
# Drop unnecessary column values.
|
||||||
df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
|
df_esm_event_threat_challenge_mean_wide.columns = (
|
||||||
1
|
df_esm_event_threat_challenge_mean_wide.columns.get_level_values(1)
|
||||||
)
|
)
|
||||||
df_esm_event_threat_challenge_mean_wide.columns.name = None
|
df_esm_event_threat_challenge_mean_wide.columns.name = None
|
||||||
df_esm_event_threat_challenge_mean_wide.rename(
|
df_esm_event_threat_challenge_mean_wide.rename(
|
||||||
|
@ -189,10 +291,12 @@ def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
|
||||||
def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
This function only serves to convert the string datetime answer into a real datetime type.
|
This function only serves to convert the string datetime answer
|
||||||
Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
|
into a real datetime type.
|
||||||
NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
|
Errors during this conversion are coerced, meaning that non-datetime answers
|
||||||
the NaTs can be interpreted to mean this.
|
are assigned Not a Time (NaT).
|
||||||
|
NOTE: Since the only available non-datetime answer to this question was
|
||||||
|
"0 - I do not remember", the NaTs can be interpreted to mean this.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
@ -208,9 +312,13 @@ def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
|
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
|
||||||
].assign(
|
].assign(
|
||||||
event_time=lambda x: pd.to_datetime(
|
event_time=lambda x: pd.to_datetime(
|
||||||
x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
|
x.esm_user_answer,
|
||||||
|
errors="coerce",
|
||||||
|
format="%Y-%m-%d %H:%M:%S %z",
|
||||||
|
exact=True,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
# Example answer: 2020-09-29 00:05:00 +0200
|
||||||
return df_esm_event_time
|
return df_esm_event_time
|
||||||
|
|
||||||
|
|
||||||
|
@ -241,9 +349,12 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
== QUESTIONNAIRE_ID_SAM.get("event_duration")
|
== QUESTIONNAIRE_ID_SAM.get("event_duration")
|
||||||
].assign(
|
].assign(
|
||||||
event_duration=lambda x: pd.to_datetime(
|
event_duration=lambda x: pd.to_datetime(
|
||||||
x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
|
x.esm_user_answer.str.slice(start=0, stop=-6),
|
||||||
|
errors="coerce",
|
||||||
|
format="%Y-%m-%d %H:%M:%S",
|
||||||
).dt.time
|
).dt.time
|
||||||
)
|
)
|
||||||
|
# Example answer: 2020-09-29 00:05:00 +0200
|
||||||
# TODO Explore the values recorded in event_duration and possibly fix mistakes.
|
# TODO Explore the values recorded in event_duration and possibly fix mistakes.
|
||||||
# For example, participants reported setting 23:50:00 instead of 00:50:00.
|
# For example, participants reported setting 23:50:00 instead of 00:50:00.
|
||||||
|
|
||||||
|
@ -251,7 +362,7 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
# we can determine whether:
|
# we can determine whether:
|
||||||
# - this event is still going on ("1 - It is still going on")
|
# - this event is still going on ("1 - It is still going on")
|
||||||
# - the participant couldn't remember it's duration ("0 - I do not remember")
|
# - the participant couldn't remember it's duration ("0 - I do not remember")
|
||||||
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
|
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm
|
||||||
# but only the numeric types of questions and answers.
|
# but only the numeric types of questions and answers.
|
||||||
# Since this was of "datetime" type, convert these specific answers here again.
|
# Since this was of "datetime" type, convert these specific answers here again.
|
||||||
df_esm_event_duration["event_duration_info"] = np.nan
|
df_esm_event_duration["event_duration_info"] = np.nan
|
||||||
|
@ -264,4 +375,5 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
return df_esm_event_duration
|
return df_esm_event_duration
|
||||||
|
|
||||||
|
|
||||||
# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
|
# TODO: How many questions about the stressfulness of the period were asked
|
||||||
|
# and how does this relate to events?
|
||||||
|
|
|
@ -1,71 +1,123 @@
|
||||||
from sklearn.dummy import DummyClassifier
|
import pandas as pd
|
||||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
import xgboost as xg
|
||||||
from lightgbm import LGBMClassifier
|
from lightgbm import LGBMClassifier
|
||||||
import xgboost as xg
|
from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
|
||||||
|
from sklearn.dummy import DummyClassifier
|
||||||
|
|
||||||
class ClassificationModels():
|
|
||||||
|
class ClassificationModels:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.cmodels = self.init_classification_models()
|
self.cmodels = self.init_classification_models()
|
||||||
|
|
||||||
def get_cmodels(self):
|
def get_cmodels(self):
|
||||||
return self.cmodels
|
return self.cmodels
|
||||||
|
|
||||||
def init_classification_models(self):
|
def init_classification_models(self):
|
||||||
cmodels = {
|
cmodels = {
|
||||||
'dummy_classifier': {
|
"dummy_classifier": {
|
||||||
'model': DummyClassifier(strategy="most_frequent"),
|
"model": DummyClassifier(strategy="most_frequent"),
|
||||||
'metrics': [0, 0, 0, 0]
|
"metrics": [0, 0, 0, 0],
|
||||||
},
|
},
|
||||||
'logistic_regression': {
|
"logistic_regression": {
|
||||||
'model': linear_model.LogisticRegression(max_iter=1000),
|
"model": linear_model.LogisticRegression(max_iter=1000),
|
||||||
'metrics': [0, 0, 0, 0]
|
"metrics": [0, 0, 0, 0],
|
||||||
},
|
},
|
||||||
'support_vector_machine': {
|
"support_vector_machine": {"model": svm.SVC(), "metrics": [0, 0, 0, 0]},
|
||||||
'model': svm.SVC(),
|
"gaussian_naive_bayes": {
|
||||||
'metrics': [0, 0, 0, 0]
|
"model": naive_bayes.GaussianNB(),
|
||||||
|
"metrics": [0, 0, 0, 0],
|
||||||
},
|
},
|
||||||
'gaussian_naive_bayes': {
|
"stochastic_gradient_descent_classifier": {
|
||||||
'model': naive_bayes.GaussianNB(),
|
"model": linear_model.SGDClassifier(),
|
||||||
'metrics': [0, 0, 0, 0]
|
"metrics": [0, 0, 0, 0],
|
||||||
},
|
},
|
||||||
'stochastic_gradient_descent_classifier': {
|
"knn": {"model": neighbors.KNeighborsClassifier(), "metrics": [0, 0, 0, 0]},
|
||||||
'model': linear_model.SGDClassifier(),
|
"decision_tree": {
|
||||||
'metrics': [0, 0, 0, 0]
|
"model": tree.DecisionTreeClassifier(),
|
||||||
|
"metrics": [0, 0, 0, 0],
|
||||||
},
|
},
|
||||||
'knn': {
|
"random_forest_classifier": {
|
||||||
'model': neighbors.KNeighborsClassifier(),
|
"model": ensemble.RandomForestClassifier(),
|
||||||
'metrics': [0, 0, 0, 0]
|
"metrics": [0, 0, 0, 0],
|
||||||
},
|
},
|
||||||
'decision_tree': {
|
"gradient_boosting_classifier": {
|
||||||
'model': tree.DecisionTreeClassifier(),
|
"model": ensemble.GradientBoostingClassifier(),
|
||||||
'metrics': [0, 0, 0, 0]
|
"metrics": [0, 0, 0, 0],
|
||||||
},
|
},
|
||||||
'random_forest_classifier': {
|
"lgbm_classifier": {"model": LGBMClassifier(), "metrics": [0, 0, 0, 0]},
|
||||||
'model': ensemble.RandomForestClassifier(),
|
"XGBoost_classifier": {
|
||||||
'metrics': [0, 0, 0, 0]
|
"model": xg.sklearn.XGBClassifier(),
|
||||||
|
"metrics": [0, 0, 0, 0],
|
||||||
},
|
},
|
||||||
'gradient_boosting_classifier': {
|
|
||||||
'model': ensemble.GradientBoostingClassifier(),
|
|
||||||
'metrics': [0, 0, 0, 0]
|
|
||||||
},
|
|
||||||
'lgbm_classifier': {
|
|
||||||
'model': LGBMClassifier(),
|
|
||||||
'metrics': [0, 0, 0, 0]
|
|
||||||
},
|
|
||||||
'XGBoost_classifier': {
|
|
||||||
'model': xg.sklearn.XGBClassifier(),
|
|
||||||
'metrics': [0, 0, 0, 0]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return cmodels
|
return cmodels
|
||||||
|
|
||||||
def get_total_models_scores(self, n_clusters=1):
|
def get_total_models_scores(self, n_clusters=1):
|
||||||
|
scores = pd.DataFrame(columns=["method", "metric", "mean"])
|
||||||
for model_title, model in self.cmodels.items():
|
for model_title, model in self.cmodels.items():
|
||||||
|
scores_df = pd.DataFrame(columns=["method", "metric", "mean"])
|
||||||
print("\n************************************\n")
|
print("\n************************************\n")
|
||||||
print("Current model:", model_title, end="\n")
|
print("Current model:", model_title, end="\n")
|
||||||
print("Acc:", model['metrics'][0]/n_clusters)
|
print("Acc:", model["metrics"][0] / n_clusters)
|
||||||
print("Precision:", model['metrics'][1]/n_clusters)
|
scores_df = pd.concat(
|
||||||
print("Recall:", model['metrics'][2]/n_clusters)
|
[
|
||||||
print("F1:", model['metrics'][3]/n_clusters)
|
scores_df,
|
||||||
|
pd.DataFrame(
|
||||||
|
{
|
||||||
|
"method": model_title,
|
||||||
|
"metric": "test_accuracy",
|
||||||
|
"mean": model["metrics"][0] / n_clusters,
|
||||||
|
},
|
||||||
|
index=[0],
|
||||||
|
),
|
||||||
|
],
|
||||||
|
ignore_index=True,
|
||||||
|
)
|
||||||
|
print("Precision:", model["metrics"][1] / n_clusters)
|
||||||
|
scores_df = pd.concat(
|
||||||
|
[
|
||||||
|
scores_df,
|
||||||
|
pd.DataFrame(
|
||||||
|
{
|
||||||
|
"method": model_title,
|
||||||
|
"metric": "test_precision",
|
||||||
|
"mean": model["metrics"][1] / n_clusters,
|
||||||
|
},
|
||||||
|
index=[0],
|
||||||
|
),
|
||||||
|
],
|
||||||
|
ignore_index=True,
|
||||||
|
)
|
||||||
|
print("Recall:", model["metrics"][2] / n_clusters)
|
||||||
|
scores_df = pd.concat(
|
||||||
|
[
|
||||||
|
scores_df,
|
||||||
|
pd.DataFrame(
|
||||||
|
{
|
||||||
|
"method": model_title,
|
||||||
|
"metric": "test_recall",
|
||||||
|
"mean": model["metrics"][2] / n_clusters,
|
||||||
|
},
|
||||||
|
index=[0],
|
||||||
|
),
|
||||||
|
],
|
||||||
|
ignore_index=True,
|
||||||
|
)
|
||||||
|
print("F1:", model["metrics"][3] / n_clusters)
|
||||||
|
scores_df = pd.concat(
|
||||||
|
[
|
||||||
|
scores_df,
|
||||||
|
pd.DataFrame(
|
||||||
|
{
|
||||||
|
"method": model_title,
|
||||||
|
"metric": "test_f1",
|
||||||
|
"mean": model["metrics"][3] / n_clusters,
|
||||||
|
},
|
||||||
|
index=[0],
|
||||||
|
),
|
||||||
|
],
|
||||||
|
ignore_index=True,
|
||||||
|
)
|
||||||
|
scores = pd.concat([scores, scores_df])
|
||||||
|
return scores
|
||||||
|
|
|
@ -49,8 +49,8 @@ class CrossValidation():
|
||||||
|
|
||||||
data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
|
data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
|
||||||
|
|
||||||
elif self.cv_method == "5kfold":
|
elif self.cv_method == "Stratified5kfold":
|
||||||
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
|
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None
|
||||||
|
|
||||||
self.X, self.y, self.groups = data_X, data_y, data_groups
|
self.X, self.y, self.groups = data_X, data_y, data_groups
|
||||||
|
|
||||||
|
@ -71,7 +71,7 @@ class CrossValidation():
|
||||||
|
|
||||||
if self.cv_method in ["logo", "half_logo"]:
|
if self.cv_method in ["logo", "half_logo"]:
|
||||||
self.cv = LeaveOneGroupOut()
|
self.cv = LeaveOneGroupOut()
|
||||||
elif self.cv_method == "5kfold":
|
elif self.cv_method == "Stratified5kfold":
|
||||||
self.cv = StratifiedKFold(n_splits=5, shuffle=True)
|
self.cv = StratifiedKFold(n_splits=5, shuffle=True)
|
||||||
|
|
||||||
|
|
||||||
|
@ -118,4 +118,11 @@ class CrossValidation():
|
||||||
"""
|
"""
|
||||||
return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
|
return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
|
||||||
|
|
||||||
|
def get_groups_sets(self, split):
|
||||||
|
|
||||||
|
if self.groups is None:
|
||||||
|
return None, None
|
||||||
|
else:
|
||||||
|
return self.groups.iloc[split[0]], self.groups.iloc[split[1]]
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,13 @@
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import warnings
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from sklearn.feature_selection import SequentialFeatureSelector
|
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
|
||||||
|
from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold
|
||||||
from sklearn.naive_bayes import GaussianNB
|
from sklearn.naive_bayes import GaussianNB
|
||||||
from sklearn.linear_model import Lasso
|
from sklearn.linear_model import Lasso
|
||||||
|
|
||||||
|
@ -21,13 +23,15 @@ from sklearn.linear_model import Lasso
|
||||||
|
|
||||||
class FeatureSelection:
|
class FeatureSelection:
|
||||||
|
|
||||||
def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
|
def __init__(self, X, y, groups):
|
||||||
pass # TODO....
|
self.X = X
|
||||||
|
self.y = y
|
||||||
|
self.groups = groups
|
||||||
|
|
||||||
|
|
||||||
def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
|
def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
|
||||||
"""The method selects the best feature by testing the prediction on the feature set with or without the current feature.
|
"""The method selects the best feature by testing the prediction on the feature set with or without the current feature.
|
||||||
The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat
|
The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular
|
||||||
feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
|
feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
|
||||||
specified as a parameter.
|
specified as a parameter.
|
||||||
|
|
||||||
|
@ -35,7 +39,11 @@ class FeatureSelection:
|
||||||
df (DataFrame): Input data on which the predictions will be made.
|
df (DataFrame): Input data on which the predictions will be made.
|
||||||
features (list): List of features to select the best/worst from
|
features (list): List of features to select the best/worst from
|
||||||
method (str, optional): remove or add features. Defaults to "remove".
|
method (str, optional): remove or add features. Defaults to "remove".
|
||||||
ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and metric. Defaults to "classification".
|
ml_category (str, optional): Either classification or regression ml problem controls the ML algorithm and metric.
|
||||||
|
Defaults to "classification".
|
||||||
|
ml_subcategory (str, optional): In case of classification '_bin' for binary classification
|
||||||
|
and 'multi' for multiclass classification. For regression an empty string '' is sufficient.
|
||||||
|
Defaults to "bin".
|
||||||
metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
|
metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
|
||||||
stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to [].
|
stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to [].
|
||||||
|
|
||||||
|
@ -49,173 +57,189 @@ class FeatureSelection:
|
||||||
|
|
||||||
best_feature = None
|
best_feature = None
|
||||||
|
|
||||||
if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
|
# Validacije tipov ML in specificiranimi metrikami
|
||||||
raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
|
if ml_category == "classification":
|
||||||
elif ml_type == "regression" and metric not in ['r2']:
|
if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
|
||||||
|
raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
|
||||||
|
elif ml_subcategory == "multi":
|
||||||
|
ml_subcategory_error = False
|
||||||
|
if metric != "accuracy" and "_" in metric:
|
||||||
|
metric_s, metric_t = metric.split("_")
|
||||||
|
if metric_s not in ['accuracy', 'precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
|
||||||
|
ml_subcategory_error = True
|
||||||
|
else:
|
||||||
|
ml_subcategory_error = True
|
||||||
|
|
||||||
|
if ml_subcategory_error:
|
||||||
|
raise ValueError(""""Classification metric for multi-class classification must be specified precisely.
|
||||||
|
Available metric are: 'accuracy', 'precision', 'recall' and 'f1'.
|
||||||
|
Only accuracy must be specified as 'accuracy'.
|
||||||
|
For others please add appropriate suffixes: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'""")
|
||||||
|
elif ml_category == "regression" and metric not in ['r2']:
|
||||||
raise ValueError("Regression metric not recognized. Please choose 'r2'")
|
raise ValueError("Regression metric not recognized. Please choose 'r2'")
|
||||||
|
|
||||||
for feat in features:
|
for feat in features:
|
||||||
if method == "remove":
|
if method == "remove":
|
||||||
pred_features = [col for col in df.columns if feat != col] # All but feat
|
pred_features = [col for col in self.X.columns if feat != col] # All but feat
|
||||||
elif method == "add":
|
elif method == "add":
|
||||||
pred_features = [feat] + stored_features # Feat with stored features
|
pred_features = [feat] + stored_features # Feat with stored features
|
||||||
|
|
||||||
X, y = df.drop(columns=['target', 'pid'])[pred_features], df['target']
|
X = self.X[pred_features].copy()
|
||||||
|
|
||||||
if ml_type == "classification":
|
if self.groups is not None:
|
||||||
|
cv = GroupKFold(n_splits=5)
|
||||||
|
else:
|
||||||
|
cv = StratifiedKFold(n_splits=5, shuffle=True)
|
||||||
|
|
||||||
|
# See link about scoring for multiclassfication
|
||||||
|
# http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
|
||||||
|
if ml_category == "classification":
|
||||||
nb = GaussianNB()
|
nb = GaussianNB()
|
||||||
model_cv = cross_validate(
|
model_cv = cross_validate(
|
||||||
nb,
|
nb,
|
||||||
X=X,
|
X=X,
|
||||||
y=y,
|
y=self.y,
|
||||||
cv=StratifiedKFold(n_splits=5, shuffle=True),
|
cv=cv,
|
||||||
|
groups=self.groups,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
scoring=(metric)
|
||||||
)
|
)
|
||||||
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
|
|
||||||
|
|
||||||
if metric == "accuracy":
|
|
||||||
acc = np.mean(model_cv['test_accuracy'])
|
|
||||||
acc_std = np.std(model_cv['test_accuracy'])
|
|
||||||
|
|
||||||
if not best_feature or (acc > best_metric_score):
|
|
||||||
best_feature = feat
|
|
||||||
best_metric_score = acc
|
|
||||||
best_metric_score_std = acc_std
|
|
||||||
|
|
||||||
elif metric == "precision":
|
|
||||||
prec = np.mean(model_cv['test_precision'])
|
|
||||||
prec_std = np.std(model_cv['test_precision'])
|
|
||||||
|
|
||||||
if not best_feature or (prec > best_metric_score):
|
|
||||||
best_feature = feat
|
|
||||||
best_metric_score = prec
|
|
||||||
best_metric_score_std = prec_std
|
|
||||||
|
|
||||||
elif metric == "recall":
|
|
||||||
rec = np.mean(model_cv['test_recall'])
|
|
||||||
rec_std = np.std(model_cv['test_recall'])
|
|
||||||
|
|
||||||
if not best_feature or (rec > best_metric_score):
|
|
||||||
best_feature = feat
|
|
||||||
best_metric_score = rec
|
|
||||||
best_metric_score_std = rec_std
|
|
||||||
|
|
||||||
else:
|
|
||||||
f1 = np.mean(model_cv['test_f1'])
|
|
||||||
f1_std = np.std(model_cv['test_f1'])
|
|
||||||
|
|
||||||
if not best_feature or (f1 > best_metric_score):
|
|
||||||
best_feature = feat
|
|
||||||
best_metric_score = f1
|
|
||||||
best_metric_score_std = f1_std
|
|
||||||
|
|
||||||
elif ml_type == "regression":
|
elif ml_category == "regression":
|
||||||
lass = Lasso()
|
lass = Lasso()
|
||||||
model_cv = cross_validate(
|
model_cv = cross_validate(
|
||||||
lass,
|
lass,
|
||||||
X=X,
|
X=X,
|
||||||
y=y,
|
y=y,
|
||||||
cv=StratifiedKFold(n_splits=5, shuffle=True),
|
cv=cv,
|
||||||
|
groups=self.groups,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=('r2')
|
scoring=('r2')
|
||||||
)
|
)
|
||||||
|
|
||||||
if metric == "r2":
|
|
||||||
r2 = np.mean(model_cv['test_r2'])
|
|
||||||
r2_std = np.std(model_cv['test_r2'])
|
|
||||||
|
|
||||||
if not best_feature or (r2 > best_metric_score):
|
|
||||||
best_feature = feat
|
|
||||||
best_metric_score = r2
|
|
||||||
best_metric_score_std = r2_std
|
|
||||||
else:
|
else:
|
||||||
raise ValueError("ML type not yet implemented!")
|
raise ValueError("ML type not yet implemented!")
|
||||||
|
|
||||||
|
# Section of metrics' scores comparison.
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
|
||||||
|
|
||||||
|
metric_score = np.nanmean(model_cv["test_score"])
|
||||||
|
metric_score_std = np.nanstd(model_cv["test_score"])
|
||||||
|
|
||||||
|
if not best_feature or (metric_score > best_metric_score):
|
||||||
|
best_feature = feat
|
||||||
|
best_metric_score = metric_score
|
||||||
|
best_metric_score_std = metric_score_std
|
||||||
|
|
||||||
return best_feature, best_metric_score, best_metric_score_std
|
return best_feature, best_metric_score, best_metric_score_std
|
||||||
|
|
||||||
|
|
||||||
def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
|
def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
|
||||||
|
"""This method selects a set of features and returns them as a list. It returns number of features
|
||||||
|
determined in the interval of [n_min, n_max].
|
||||||
|
|
||||||
n_features = df.shape[1] - 2 # -2 beacause pid and target are not considered
|
The method consists of two steps:
|
||||||
if n_max > n_features:
|
(1) The method uses sklearn kBest method which selects k best features dependent on the ml_type parameter.
|
||||||
n_max = n_features
|
(2) The sequential features removal procedure is executed. Using the remaing features from (1).
|
||||||
|
The best score is detected using a removal procedure. The procedure sequentially removes the features
|
||||||
|
that attribute the least to the choosen evaluation metric. If in this sequence the score ML score is
|
||||||
|
improved the next feature is remove otherwise there is a tolerance criteria (n_tolerance)
|
||||||
|
with which the next n removed features are inspected whether currently best score is improved.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
n_min (int, optional): Minimal amount of features returned.
|
||||||
|
n_max (int, optional): Maximal amount of features returned.
|
||||||
|
k (int, optional): Determines the k in the k-best features method.
|
||||||
|
If None, SelectKBest feature selection does not execute.
|
||||||
|
ml_type(str, optional): Type of ML problem. Currently implemented options:
|
||||||
|
'classification_bin', 'classification_multi', and 'regression_'
|
||||||
|
method (str, optional): "remove" or "add" features. Defaults to "remove".
|
||||||
|
n_tolerance (int, optional): If the best score is not improved in n that is specified by this parameter
|
||||||
|
the method returns index of feature with current best score as a tipping point feature.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list: list of selected features
|
||||||
|
"""
|
||||||
|
|
||||||
|
if k is not None and k <= n_max:
|
||||||
|
raise ValueError("The k parameter needs to be greater than the n_max parameter.")
|
||||||
|
|
||||||
|
# Select k-best feature dependent on the type of ML task
|
||||||
|
ml_category, ml_subcategory = ml_type.split("_")
|
||||||
|
|
||||||
|
if k is not None:
|
||||||
|
if ml_category == "classification":
|
||||||
|
if ml_subcategory== "bin":
|
||||||
|
selector = SelectKBest(mutual_info_classif, k=k)
|
||||||
|
elif ml_subcategory== "multi":
|
||||||
|
selector = SelectKBest(f_classif, k=k)
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
|
||||||
|
elif ml_category == "regression":
|
||||||
|
selector = SelectKBest(f_regression, k=k)
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
|
||||||
|
|
||||||
|
selector.fit(self.X, self.y)
|
||||||
|
cols_idxs = selector.get_support(indices=True)
|
||||||
|
self.X = self.X.iloc[:,cols_idxs]
|
||||||
|
|
||||||
|
print("All columns (after SelectKBest method):")
|
||||||
|
print(self.X.columns)
|
||||||
|
|
||||||
|
# Sequential feature addition / removal
|
||||||
|
n_features = self.X.shape[1]
|
||||||
|
if n_max >= n_features:
|
||||||
|
n_max = n_features-1 # The algorithm removes at least one feature
|
||||||
|
|
||||||
if n_min > n_features:
|
if n_min > n_features:
|
||||||
raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
|
raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")
|
||||||
|
|
||||||
if n_max < n_min:
|
if n_max < n_min:
|
||||||
raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
|
raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
|
||||||
|
|
||||||
features = df.columns.tolist()
|
features = self.X.columns.tolist()
|
||||||
features.remove("pid")
|
|
||||||
features.remove("target")
|
|
||||||
feature_importance = []
|
feature_importance = []
|
||||||
if method == "remove":
|
if method == "remove":
|
||||||
|
best_score = 0
|
||||||
|
best_feature_indx = None
|
||||||
|
i_worse = 0
|
||||||
for i in reversed(range(n_features)):
|
for i in reversed(range(n_features)):
|
||||||
|
|
||||||
|
if i+1 == n_min:
|
||||||
|
break
|
||||||
|
|
||||||
best_feature, best_metric_score, best_metric_score_std = \
|
best_feature, best_metric_score, best_metric_score_std = \
|
||||||
self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
|
self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric)
|
||||||
feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
|
|
||||||
|
feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
|
||||||
|
|
||||||
features.remove(best_feature)
|
features.remove(best_feature)
|
||||||
|
print("Features left:", i)
|
||||||
|
|
||||||
|
if i <= n_max:
|
||||||
|
if best_metric_score >= best_score:
|
||||||
|
best_score = best_metric_score
|
||||||
|
best_feature_indx = i+1
|
||||||
|
i_worse = 0
|
||||||
|
else:
|
||||||
|
i_worse += 1
|
||||||
|
|
||||||
|
if i_worse == n_tolerance:
|
||||||
|
break
|
||||||
|
|
||||||
feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
|
feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
|
||||||
|
|
||||||
# Selekcijski kriterij značilk v rangu max-min
|
|
||||||
# Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
|
|
||||||
|
|
||||||
# Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
|
|
||||||
|
|
||||||
# "Tipping point" značilka mora biti v rangu max-min
|
|
||||||
|
|
||||||
selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
|
|
||||||
selection_area.set_index(["i", "name"], inplace=True)
|
|
||||||
diffrences = selection_area.diff()
|
|
||||||
diffrences.dropna(how='any', inplace=True)
|
|
||||||
|
|
||||||
# Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo
|
|
||||||
cumulative_sumation = diffrences.cumsum()
|
|
||||||
tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
|
|
||||||
|
|
||||||
# Zelo konzervativna metoda, ki ob prvem neizboljšanjem rezultata preneha z iskanjem boljše alternative
|
print(feature_importance_df)
|
||||||
tipping_feature_indx_2 = None
|
print("best_feature_indx", best_feature_indx)
|
||||||
for indx, row in diffrences.iterrows():
|
print("best_score", best_score)
|
||||||
if row["metric"] > 0:
|
|
||||||
tipping_feature_indx_2 = indx
|
features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
|
||||||
else:
|
selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
|
||||||
break
|
|
||||||
|
|
||||||
# Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score
|
|
||||||
tipping_feature_indx_3 = None
|
|
||||||
cum_sum_score = 0
|
|
||||||
i_worse = 0
|
|
||||||
# TODO: morda bi bilo smisleno združiti diff, cumsum in scores stolpce ...
|
|
||||||
for indx, row in selection_area.iterrows():
|
|
||||||
if row["metric"] > 0:
|
|
||||||
tipping_feature_indx_3 = indx
|
|
||||||
cum_sum_score += row["metric"]
|
|
||||||
i_worse = 0
|
|
||||||
else:
|
|
||||||
i_worse += 1
|
|
||||||
|
|
||||||
if i_worse == n_not_improve:
|
|
||||||
break
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
return selected_features
|
||||||
|
|
||||||
|
else:
|
||||||
def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
|
raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
|
||||||
pass
|
|
||||||
|
|
||||||
def vizualize_feature_selection_process():
|
|
||||||
pass
|
|
||||||
|
|
||||||
def execute_feature_selection_step():
|
|
||||||
pass
|
|
|
@ -11,7 +11,13 @@ from sklearn import (
|
||||||
svm,
|
svm,
|
||||||
)
|
)
|
||||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
from sklearn.dummy import DummyClassifier, DummyRegressor
|
||||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
|
from sklearn.metrics import confusion_matrix
|
||||||
|
from sklearn.model_selection import (
|
||||||
|
BaseCrossValidator,
|
||||||
|
LeaveOneGroupOut,
|
||||||
|
StratifiedKFold,
|
||||||
|
cross_validate,
|
||||||
|
)
|
||||||
from xgboost import XGBClassifier, XGBRegressor
|
from xgboost import XGBClassifier, XGBRegressor
|
||||||
|
|
||||||
|
|
||||||
|
@ -73,7 +79,40 @@ def insert_row(df, row):
|
||||||
return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
|
return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
|
||||||
|
|
||||||
|
|
||||||
def prepare_regression_model_input(model_input, cv_method="logo"):
|
def impute_encode_categorical_features(model_input: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
categorical_feature_col_names = [
|
||||||
|
"gender",
|
||||||
|
"startlanguage",
|
||||||
|
"limesurvey_demand_control_ratio_quartile",
|
||||||
|
]
|
||||||
|
additional_categorical_features = [
|
||||||
|
col
|
||||||
|
for col in model_input.columns
|
||||||
|
if "mostcommonactivity" in col or "homelabel" in col
|
||||||
|
]
|
||||||
|
categorical_feature_col_names += additional_categorical_features
|
||||||
|
|
||||||
|
categorical_features = model_input[categorical_feature_col_names].copy()
|
||||||
|
|
||||||
|
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||||
|
# fillna with mode
|
||||||
|
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||||
|
# one-hot encoding
|
||||||
|
categorical_features = categorical_features.apply(
|
||||||
|
lambda col: col.astype("category")
|
||||||
|
)
|
||||||
|
if not categorical_features.empty:
|
||||||
|
categorical_features = pd.get_dummies(categorical_features)
|
||||||
|
|
||||||
|
numerical_features = model_input.drop(categorical_feature_col_names, axis=1)
|
||||||
|
|
||||||
|
model_input = pd.concat([numerical_features, categorical_features], axis=1)
|
||||||
|
return model_input
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_sklearn_data_format(
|
||||||
|
model_input: pd.DataFrame, cv_method: str = "logo"
|
||||||
|
) -> tuple:
|
||||||
index_columns = [
|
index_columns = [
|
||||||
"local_segment",
|
"local_segment",
|
||||||
"local_segment_label",
|
"local_segment_label",
|
||||||
|
@ -82,13 +121,7 @@ def prepare_regression_model_input(model_input, cv_method="logo"):
|
||||||
]
|
]
|
||||||
model_input.set_index(index_columns, inplace=True)
|
model_input.set_index(index_columns, inplace=True)
|
||||||
|
|
||||||
if cv_method == "logo":
|
if cv_method == "half_logo":
|
||||||
data_x, data_y, data_groups = (
|
|
||||||
model_input.drop(["target", "pid"], axis=1),
|
|
||||||
model_input["target"],
|
|
||||||
model_input["pid"],
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
model_input["pid_index"] = model_input.groupby("pid").cumcount()
|
model_input["pid_index"] = model_input.groupby("pid").cumcount()
|
||||||
model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
|
model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
|
||||||
|
|
||||||
|
@ -104,52 +137,53 @@ def prepare_regression_model_input(model_input, cv_method="logo"):
|
||||||
model_input["target"],
|
model_input["target"],
|
||||||
model_input["pid_half"],
|
model_input["pid_half"],
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
data_x, data_y, data_groups = (
|
||||||
|
model_input.drop(["target", "pid"], axis=1),
|
||||||
|
model_input["target"],
|
||||||
|
model_input["pid"],
|
||||||
|
)
|
||||||
|
return data_x, data_y, data_groups
|
||||||
|
|
||||||
categorical_feature_colnames = [
|
|
||||||
"gender",
|
|
||||||
"startlanguage",
|
|
||||||
"limesurvey_demand_control_ratio_quartile",
|
|
||||||
]
|
|
||||||
additional_categorical_features = [
|
|
||||||
col
|
|
||||||
for col in data_x.columns
|
|
||||||
if "mostcommonactivity" in col or "homelabel" in col
|
|
||||||
]
|
|
||||||
categorical_feature_colnames += additional_categorical_features
|
|
||||||
|
|
||||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
def prepare_cross_validator(
|
||||||
|
data_x: pd.DataFrame,
|
||||||
|
data_y: pd.DataFrame,
|
||||||
|
data_groups: pd.DataFrame,
|
||||||
|
cv_method: str = "logo",
|
||||||
|
) -> BaseCrossValidator:
|
||||||
|
if cv_method == "logo" or cv_method == "half_logo":
|
||||||
|
cv = LeaveOneGroupOut()
|
||||||
|
cv.get_n_splits(
|
||||||
|
data_x,
|
||||||
|
data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
cv = StratifiedKFold(n_splits=5, shuffle=True)
|
||||||
|
return cv
|
||||||
|
|
||||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
|
||||||
# fillna with mode
|
def aggregate_and_transpose(df: pd.DataFrame, statistics=None) -> pd.DataFrame:
|
||||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
if statistics is None:
|
||||||
# one-hot encoding
|
statistics = ["max", "mean"]
|
||||||
categorical_features = categorical_features.apply(
|
return (
|
||||||
lambda col: col.astype("category")
|
df.agg(statistics)
|
||||||
|
.transpose()
|
||||||
|
.reset_index()
|
||||||
|
.rename(columns={"index": "test_metric"})
|
||||||
)
|
)
|
||||||
if not categorical_features.empty:
|
|
||||||
categorical_features = pd.get_dummies(categorical_features)
|
|
||||||
|
|
||||||
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
|
||||||
|
|
||||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
|
||||||
|
|
||||||
return train_x, data_y, data_groups
|
|
||||||
|
|
||||||
|
|
||||||
def run_all_regression_models(input_csv):
|
def run_all_regression_models(
|
||||||
# Prepare data
|
data_x: pd.DataFrame,
|
||||||
data_x, data_y, data_groups = prepare_regression_model_input(input_csv)
|
data_y: pd.DataFrame,
|
||||||
|
data_groups: pd.DataFrame,
|
||||||
# Prepare cross validation
|
cross_validator: BaseCrossValidator,
|
||||||
logo = LeaveOneGroupOut()
|
) -> pd.DataFrame:
|
||||||
logo.get_n_splits(
|
|
||||||
data_x,
|
|
||||||
data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
)
|
|
||||||
metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
|
metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
|
||||||
test_metrics = ["test_" + metric for metric in metrics]
|
test_metrics = ["test_" + metric for metric in metrics]
|
||||||
scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
|
scores = pd.DataFrame(columns=["method", "test_metric", "max", "nanmedian"])
|
||||||
|
|
||||||
# Validate models
|
# Validate models
|
||||||
dummy_regr = DummyRegressor(strategy="mean")
|
dummy_regr = DummyRegressor(strategy="mean")
|
||||||
|
@ -158,7 +192,7 @@ def run_all_regression_models(input_csv):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=logo,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
|
@ -166,17 +200,19 @@ def run_all_regression_models(input_csv):
|
||||||
print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
|
print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
|
||||||
|
|
||||||
scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
|
scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "dummy"
|
scores_df["method"] = "dummy"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del dummy_regr
|
||||||
|
del dummy_regr_scores
|
||||||
|
|
||||||
lin_reg_rapids = linear_model.LinearRegression()
|
lin_reg = linear_model.LinearRegression()
|
||||||
lin_reg_scores = cross_validate(
|
lin_reg_scores = cross_validate(
|
||||||
lin_reg_rapids,
|
lin_reg,
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=logo,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
|
@ -184,9 +220,11 @@ def run_all_regression_models(input_csv):
|
||||||
print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
|
print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
|
||||||
|
|
||||||
scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
|
scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "linear_reg"
|
scores_df["method"] = "linear_reg"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del lin_reg
|
||||||
|
del lin_reg_scores
|
||||||
|
|
||||||
ridge_reg = linear_model.Ridge(alpha=0.5)
|
ridge_reg = linear_model.Ridge(alpha=0.5)
|
||||||
ridge_reg_scores = cross_validate(
|
ridge_reg_scores = cross_validate(
|
||||||
|
@ -194,16 +232,18 @@ def run_all_regression_models(input_csv):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=logo,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
print("Ridge regression")
|
print("Ridge regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
|
scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "ridge_reg"
|
scores_df["method"] = "ridge_reg"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del ridge_reg
|
||||||
|
del ridge_reg_scores
|
||||||
|
|
||||||
lasso_reg = linear_model.Lasso(alpha=0.1)
|
lasso_reg = linear_model.Lasso(alpha=0.1)
|
||||||
lasso_reg_score = cross_validate(
|
lasso_reg_score = cross_validate(
|
||||||
|
@ -211,16 +251,18 @@ def run_all_regression_models(input_csv):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=logo,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
print("Lasso regression")
|
print("Lasso regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
|
scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "lasso_reg"
|
scores_df["method"] = "lasso_reg"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del lasso_reg
|
||||||
|
del lasso_reg_score
|
||||||
|
|
||||||
bayesian_ridge_reg = linear_model.BayesianRidge()
|
bayesian_ridge_reg = linear_model.BayesianRidge()
|
||||||
bayesian_ridge_reg_score = cross_validate(
|
bayesian_ridge_reg_score = cross_validate(
|
||||||
|
@ -228,16 +270,18 @@ def run_all_regression_models(input_csv):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=logo,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
print("Bayesian Ridge")
|
print("Bayesian Ridge")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
|
scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "bayesian_ridge"
|
scores_df["method"] = "bayesian_ridge"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del bayesian_ridge_reg
|
||||||
|
del bayesian_ridge_reg_score
|
||||||
|
|
||||||
ransac_reg = linear_model.RANSACRegressor()
|
ransac_reg = linear_model.RANSACRegressor()
|
||||||
ransac_reg_score = cross_validate(
|
ransac_reg_score = cross_validate(
|
||||||
|
@ -245,27 +289,37 @@ def run_all_regression_models(input_csv):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=logo,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
print("RANSAC (outlier robust regression)")
|
print("RANSAC (outlier robust regression)")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
|
scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "RANSAC"
|
scores_df["method"] = "RANSAC"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del ransac_reg
|
||||||
|
del ransac_reg_score
|
||||||
|
|
||||||
svr = svm.SVR()
|
svr = svm.SVR()
|
||||||
svr_score = cross_validate(
|
svr_score = cross_validate(
|
||||||
svr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
|
svr,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
print("Support vector regression")
|
print("Support vector regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(svr_score)[test_metrics]
|
scores_df = pd.DataFrame(svr_score)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "SVR"
|
scores_df["method"] = "SVR"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del svr
|
||||||
|
del svr_score
|
||||||
|
|
||||||
kridge = kernel_ridge.KernelRidge()
|
kridge = kernel_ridge.KernelRidge()
|
||||||
kridge_score = cross_validate(
|
kridge_score = cross_validate(
|
||||||
|
@ -273,69 +327,130 @@ def run_all_regression_models(input_csv):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=logo,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
print("Kernel Ridge regression")
|
print("Kernel Ridge regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(kridge_score)[test_metrics]
|
scores_df = pd.DataFrame(kridge_score)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "kernel_ridge"
|
scores_df["method"] = "kernel_ridge"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del kridge
|
||||||
|
del kridge_score
|
||||||
|
|
||||||
gpr = gaussian_process.GaussianProcessRegressor()
|
gpr = gaussian_process.GaussianProcessRegressor()
|
||||||
gpr_score = cross_validate(
|
gpr_score = cross_validate(
|
||||||
gpr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
|
gpr,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
print("Gaussian Process Regression")
|
print("Gaussian Process Regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(gpr_score)[test_metrics]
|
scores_df = pd.DataFrame(gpr_score)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "gaussian_proc"
|
scores_df["method"] = "gaussian_proc"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del gpr
|
||||||
|
del gpr_score
|
||||||
|
|
||||||
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
|
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
|
||||||
rfr_score = cross_validate(
|
rfr_score = cross_validate(
|
||||||
rfr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
|
rfr,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
print("Random Forest Regression")
|
print("Random Forest Regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(rfr_score)[test_metrics]
|
scores_df = pd.DataFrame(rfr_score)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "random_forest"
|
scores_df["method"] = "random_forest"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del rfr
|
||||||
|
del rfr_score
|
||||||
|
|
||||||
xgb = XGBRegressor()
|
xgb = XGBRegressor()
|
||||||
xgb_score = cross_validate(
|
xgb_score = cross_validate(
|
||||||
xgb, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
|
xgb,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
print("XGBoost Regressor")
|
print("XGBoost Regressor")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(xgb_score)[test_metrics]
|
scores_df = pd.DataFrame(xgb_score)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "XGBoost"
|
scores_df["method"] = "XGBoost"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del xgb
|
||||||
|
del xgb_score
|
||||||
|
|
||||||
ada = ensemble.AdaBoostRegressor()
|
ada = ensemble.AdaBoostRegressor()
|
||||||
ada_score = cross_validate(
|
ada_score = cross_validate(
|
||||||
ada, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
|
ada,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
print("ADA Boost Regressor")
|
print("ADA Boost Regressor")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(ada_score)[test_metrics]
|
scores_df = pd.DataFrame(ada_score)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||||
scores_df["method"] = "ADA_boost"
|
scores_df["method"] = "ADA_boost"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del ada
|
||||||
|
del ada_score
|
||||||
|
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
|
|
||||||
def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
def confusion_matrix_scorer(clf, X, y):
|
||||||
metrics = ["accuracy", "average_precision", "recall", "f1"]
|
y_pred = clf.predict(X)
|
||||||
|
cm = confusion_matrix(y, y_pred)
|
||||||
|
return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]}
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate_confusion_matrix(scores_dict: dict) -> pd.DataFrame:
|
||||||
|
scores_aggregated = aggregate_and_transpose(
|
||||||
|
pd.DataFrame(scores_dict), statistics=["sum"]
|
||||||
|
)
|
||||||
|
return scores_aggregated[
|
||||||
|
~scores_aggregated.test_metric.isin(["fit_time", "score_time"])
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def run_all_classification_models(
|
||||||
|
data_x: pd.DataFrame,
|
||||||
|
data_y: pd.DataFrame,
|
||||||
|
data_groups: pd.DataFrame,
|
||||||
|
cross_validator: BaseCrossValidator,
|
||||||
|
):
|
||||||
|
data_y_value_counts = data_y.value_counts()
|
||||||
|
if len(data_y_value_counts) == 1:
|
||||||
|
raise (ValueError("There is only one unique value in data_y."))
|
||||||
|
if len(data_y_value_counts) == 2:
|
||||||
|
metrics = ["accuracy", "average_precision", "recall", "f1"]
|
||||||
|
else:
|
||||||
|
metrics = ["accuracy", "precision_micro", "recall_micro", "f1_micro"]
|
||||||
|
|
||||||
test_metrics = ["test_" + metric for metric in metrics]
|
test_metrics = ["test_" + metric for metric in metrics]
|
||||||
|
|
||||||
scores = pd.DataFrame(columns=["method", "max", "mean"])
|
scores = pd.DataFrame(columns=["method", "test_metric", "max", "mean"])
|
||||||
|
|
||||||
dummy_class = DummyClassifier(strategy="most_frequent")
|
dummy_class = DummyClassifier(strategy="most_frequent")
|
||||||
|
|
||||||
|
@ -344,17 +459,39 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cv_method,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
error_score="raise",
|
error_score="raise",
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
|
dummy_confusion_matrix = cross_validate(
|
||||||
|
dummy_class,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
error_score="raise",
|
||||||
|
scoring=confusion_matrix_scorer,
|
||||||
|
)
|
||||||
print("Dummy")
|
print("Dummy")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(dummy_score)[test_metrics]
|
scores_df = pd.DataFrame(dummy_score)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||||
scores_df["method"] = "Dummy"
|
scores_df = pd.concat(
|
||||||
|
[
|
||||||
|
scores_df,
|
||||||
|
aggregate_confusion_matrix(dummy_confusion_matrix).rename(
|
||||||
|
columns={"sum": "mean"}
|
||||||
|
# Note: the column is misleadingly renamed to get concise output.
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
scores_df["method"] = "dummy_classifier"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del dummy_class
|
||||||
|
del dummy_score
|
||||||
|
del dummy_confusion_matrix
|
||||||
|
|
||||||
logistic_regression = linear_model.LogisticRegression()
|
logistic_regression = linear_model.LogisticRegression()
|
||||||
|
|
||||||
|
@ -363,16 +500,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cv_method,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
|
log_reg_confusion_matrix = cross_validate(
|
||||||
|
logistic_regression,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=confusion_matrix_scorer,
|
||||||
|
)
|
||||||
print("Logistic regression")
|
print("Logistic regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
|
scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||||
scores_df["method"] = "logistic_reg"
|
scores_df = pd.concat(
|
||||||
|
[
|
||||||
|
scores_df,
|
||||||
|
aggregate_confusion_matrix(log_reg_confusion_matrix).rename(
|
||||||
|
columns={"sum": "mean"}
|
||||||
|
# Note: the column is misleadingly renamed to get concise output.
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
scores_df["method"] = "logistic_regression"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del logistic_regression
|
||||||
|
del log_reg_scores
|
||||||
|
del log_reg_confusion_matrix
|
||||||
|
|
||||||
svc = svm.SVC()
|
svc = svm.SVC()
|
||||||
|
|
||||||
|
@ -381,16 +539,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cv_method,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
|
svc_confusion_matrix = cross_validate(
|
||||||
|
svc,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=confusion_matrix_scorer,
|
||||||
|
)
|
||||||
print("Support Vector Machine")
|
print("Support Vector Machine")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(svc_scores)[test_metrics]
|
scores_df = pd.DataFrame(svc_scores)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||||
scores_df["method"] = "svc"
|
scores_df = pd.concat(
|
||||||
|
[
|
||||||
|
scores_df,
|
||||||
|
aggregate_confusion_matrix(svc_confusion_matrix).rename(
|
||||||
|
columns={"sum": "mean"}
|
||||||
|
# Note: the column is misleadingly renamed to get concise output.
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
scores_df["method"] = "SVC"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del svc
|
||||||
|
del svc_scores
|
||||||
|
del svc_confusion_matrix
|
||||||
|
|
||||||
gaussian_nb = naive_bayes.GaussianNB()
|
gaussian_nb = naive_bayes.GaussianNB()
|
||||||
|
|
||||||
|
@ -399,16 +578,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cv_method,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
|
gaussian_nb_confusion_matrix = cross_validate(
|
||||||
|
gaussian_nb,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=confusion_matrix_scorer,
|
||||||
|
)
|
||||||
print("Gaussian Naive Bayes")
|
print("Gaussian Naive Bayes")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
|
scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||||
|
scores_df = pd.concat(
|
||||||
|
[
|
||||||
|
scores_df,
|
||||||
|
aggregate_confusion_matrix(gaussian_nb_confusion_matrix).rename(
|
||||||
|
columns={"sum": "mean"}
|
||||||
|
# Note: the column is misleadingly renamed to get concise output.
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
scores_df["method"] = "gaussian_naive_bayes"
|
scores_df["method"] = "gaussian_naive_bayes"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del gaussian_nb
|
||||||
|
del gaussian_nb_scores
|
||||||
|
del gaussian_nb_confusion_matrix
|
||||||
|
|
||||||
sgdc = linear_model.SGDClassifier()
|
sgdc = linear_model.SGDClassifier()
|
||||||
|
|
||||||
|
@ -417,16 +617,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cv_method,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
|
sgdc_confusion_matrix = cross_validate(
|
||||||
|
sgdc,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=confusion_matrix_scorer,
|
||||||
|
)
|
||||||
print("Stochastic Gradient Descent")
|
print("Stochastic Gradient Descent")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
|
scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||||
scores_df["method"] = "stochastic_gradient_descent"
|
scores_df = pd.concat(
|
||||||
|
[
|
||||||
|
scores_df,
|
||||||
|
aggregate_confusion_matrix(sgdc_confusion_matrix).rename(
|
||||||
|
columns={"sum": "mean"}
|
||||||
|
# Note: the column is misleadingly renamed to get concise output.
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
scores_df["method"] = "stochastic_gradient_descent_classifier"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del sgdc
|
||||||
|
del sgdc_scores
|
||||||
|
del sgdc_confusion_matrix
|
||||||
|
|
||||||
rfc = ensemble.RandomForestClassifier()
|
rfc = ensemble.RandomForestClassifier()
|
||||||
|
|
||||||
|
@ -435,16 +656,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cv_method,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
|
rfc_confusion_matrix = cross_validate(
|
||||||
|
rfc,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=confusion_matrix_scorer,
|
||||||
|
)
|
||||||
print("Random Forest")
|
print("Random Forest")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(rfc_scores)[test_metrics]
|
scores_df = pd.DataFrame(rfc_scores)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||||
scores_df["method"] = "random_forest"
|
scores_df = pd.concat(
|
||||||
|
[
|
||||||
|
scores_df,
|
||||||
|
aggregate_confusion_matrix(rfc_confusion_matrix).rename(
|
||||||
|
columns={"sum": "mean"}
|
||||||
|
# Note: the column is misleadingly renamed to get concise output.
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
scores_df["method"] = "random_forest_classifier"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del rfc
|
||||||
|
del rfc_scores
|
||||||
|
del rfc_confusion_matrix
|
||||||
|
|
||||||
xgb_classifier = XGBClassifier()
|
xgb_classifier = XGBClassifier()
|
||||||
|
|
||||||
|
@ -453,15 +695,36 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cv_method,
|
cv=cross_validator,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics,
|
||||||
)
|
)
|
||||||
|
xgb_confusion_matrix = cross_validate(
|
||||||
|
xgb_classifier,
|
||||||
|
X=data_x,
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cross_validator,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=confusion_matrix_scorer,
|
||||||
|
)
|
||||||
print("XGBoost")
|
print("XGBoost")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(xgb_scores)[test_metrics]
|
scores_df = pd.DataFrame(xgb_scores)[test_metrics]
|
||||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||||
scores_df["method"] = "xgboost"
|
scores_df = pd.concat(
|
||||||
|
[
|
||||||
|
scores_df,
|
||||||
|
aggregate_confusion_matrix(xgb_confusion_matrix).rename(
|
||||||
|
columns={"sum": "mean"}
|
||||||
|
# Note: the column is misleadingly renamed to get concise output.
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
scores_df["method"] = "XGBoost_classifier"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
|
del xgb_classifier
|
||||||
|
del xgb_scores
|
||||||
|
del xgb_confusion_matrix
|
||||||
|
|
||||||
return scores
|
return scores
|
||||||
|
|
|
@ -33,7 +33,7 @@ class Preprocessing:
|
||||||
Args:
|
Args:
|
||||||
categorical_features (DataFrame): DataFrame including only categorical columns.
|
categorical_features (DataFrame): DataFrame including only categorical columns.
|
||||||
numerical_features (_type_): DataFrame including only numerical columns.
|
numerical_features (_type_): DataFrame including only numerical columns.
|
||||||
mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
|
mode (int): Mode of the column with which DataFrame is filled.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
DataFrame: Hot-One Encoded DataFrame.
|
DataFrame: Hot-One Encoded DataFrame.
|
||||||
|
@ -46,7 +46,7 @@ class Preprocessing:
|
||||||
if not categorical_features.empty:
|
if not categorical_features.empty:
|
||||||
categorical_features = pd.get_dummies(categorical_features)
|
categorical_features = pd.get_dummies(categorical_features)
|
||||||
|
|
||||||
return pd.concat([numerical_features, categorical_features], axis=1)
|
return pd.concat([numerical_features, categorical_features], axis=1), categorical_features.columns.tolist()
|
||||||
|
|
||||||
|
|
||||||
def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
|
def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
|
||||||
|
@ -68,20 +68,27 @@ class Preprocessing:
|
||||||
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
|
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
|
||||||
|
|
||||||
# For train set
|
# For train set
|
||||||
|
|
||||||
train_X_categorical_features = self.train_X[categorical_columns].copy()
|
train_X_categorical_features = self.train_X[categorical_columns].copy()
|
||||||
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
|
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
|
||||||
mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
|
mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
|
||||||
|
|
||||||
self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
|
self.train_X, train_cat_col_names = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
|
||||||
|
encoded_categorical_features = [col for col in self.train_X.columns if col.startswith(tuple(categorical_columns))]
|
||||||
|
|
||||||
# For test set
|
# For test set
|
||||||
|
|
||||||
test_X_categorical_features = self.test_X[categorical_columns].copy()
|
test_X_categorical_features = self.test_X[categorical_columns].copy()
|
||||||
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
|
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
|
||||||
|
|
||||||
self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
|
self.test_X, test_cat_col_names = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
|
||||||
|
|
||||||
|
# Create categorical columns that were not found in test set and fill them with 0
|
||||||
|
missing_cols = [col for col in train_cat_col_names if col not in test_cat_col_names]
|
||||||
|
self.test_X[missing_cols] = 0
|
||||||
|
|
||||||
|
# Sort column names alphabetically
|
||||||
|
self.train_X = self.train_X.reindex(sorted(self.train_X.columns), axis=1)
|
||||||
|
self.test_X = self.test_X.reindex(sorted(self.test_X.columns), axis=1)
|
||||||
|
|
||||||
|
|
||||||
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
|
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
|
||||||
|
|
||||||
|
|
|
@ -1,29 +0,0 @@
|
||||||
method,metric,max,mean
|
|
||||||
Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
|
|
||||||
Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
|
|
||||||
Dummy,test_recall,0.0,0.0
|
|
||||||
Dummy,test_f1,0.0,0.0
|
|
||||||
logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
|
|
||||||
logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
|
|
||||||
logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
|
|
||||||
logistic_reg,test_f1,0.3909774436090226,0.318943511424051
|
|
||||||
svc,test_accuracy,0.8557046979865772,0.8548446932649828
|
|
||||||
svc,test_average_precision,0.44514416839823046,0.4068200938341621
|
|
||||||
svc,test_recall,0.0,0.0
|
|
||||||
svc,test_f1,0.0,0.0
|
|
||||||
gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
|
|
||||||
gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
|
|
||||||
gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
|
|
||||||
gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
|
|
||||||
stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
|
|
||||||
stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
|
|
||||||
stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
|
|
||||||
stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
|
|
||||||
random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
|
|
||||||
random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
|
|
||||||
random_forest,test_recall,0.4069767441860465,0.35356856455493185
|
|
||||||
random_forest,test_f1,0.5691056910569107,0.5078402513053142
|
|
||||||
xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
|
|
||||||
xgboost,test_average_precision,0.7366643049075349,0.698622165966308
|
|
||||||
xgboost,test_recall,0.5287356321839081,0.44346431435445066
|
|
||||||
xgboost,test_f1,0.638888888888889,0.5633957169928393
|
|
|
|
@ -1,29 +0,0 @@
|
||||||
method,metric,max,mean
|
|
||||||
Dummy,test_accuracy,1.0,0.8524114578096439
|
|
||||||
Dummy,test_average_precision,0.7,0.14758854219035614
|
|
||||||
Dummy,test_recall,0.0,0.0
|
|
||||||
Dummy,test_f1,0.0,0.0
|
|
||||||
logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
|
|
||||||
logistic_reg,test_average_precision,1.0,0.44605167668563583
|
|
||||||
logistic_reg,test_recall,1.0,0.25353566685532386
|
|
||||||
logistic_reg,test_f1,0.823529411764706,0.27951926390778625
|
|
||||||
svc,test_accuracy,1.0,0.8524114578096439
|
|
||||||
svc,test_average_precision,0.9612401707068228,0.44179454944271934
|
|
||||||
svc,test_recall,0.0,0.0
|
|
||||||
svc,test_f1,0.0,0.0
|
|
||||||
gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
|
|
||||||
gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
|
|
||||||
gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
|
|
||||||
gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
|
|
||||||
stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
|
|
||||||
stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
|
|
||||||
stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
|
|
||||||
stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
|
|
||||||
random_forest,test_accuracy,1.0,0.8722158105763481
|
|
||||||
random_forest,test_average_precision,1.0,0.49817066323226833
|
|
||||||
random_forest,test_recall,1.0,0.18161263127840668
|
|
||||||
random_forest,test_f1,1.0,0.2508096532365307
|
|
||||||
xgboost,test_accuracy,1.0,0.8812627400277729
|
|
||||||
xgboost,test_average_precision,1.0,0.5505695112208401
|
|
||||||
xgboost,test_recall,1.0,0.2896161238315027
|
|
||||||
xgboost,test_f1,0.9411764705882353,0.36887408735855665
|
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
[tool.isort]
|
||||||
|
profile = "black"
|
||||||
|
py_version = 311
|
||||||
|
skip_gitignore = "true"
|
||||||
|
|
||||||
|
[tool.black]
|
||||||
|
target-version = ["py311"]
|
2
rapids
2
rapids
|
@ -1 +1 @@
|
||||||
Subproject commit 63f5a526fce4d288499168e1701adadb8b885d82
|
Subproject commit 059774bda10545a83ab282f59eb7a329fef9ee4c
|
5
setup.py
5
setup.py
|
@ -1,8 +1,7 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import sqlalchemy.engine.url
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from sqlalchemy import create_engine
|
from sqlalchemy import URL, create_engine
|
||||||
from sqlalchemy.orm import sessionmaker
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
@ -11,7 +10,7 @@ testing: bool = False
|
||||||
|
|
||||||
db_password = os.getenv("DB_PASSWORD")
|
db_password = os.getenv("DB_PASSWORD")
|
||||||
|
|
||||||
db_uri = sqlalchemy.engine.url.URL(
|
db_uri = URL.create(
|
||||||
drivername="postgresql+psycopg2",
|
drivername="postgresql+psycopg2",
|
||||||
username="staw_db",
|
username="staw_db",
|
||||||
password=db_password,
|
password=db_password,
|
||||||
|
|
|
@ -0,0 +1,60 @@
|
||||||
|
---
|
||||||
|
title: "Reliability of SAM threat and challenge and COPE"
|
||||||
|
output: html_notebook
|
||||||
|
---
|
||||||
|
|
||||||
|
|
||||||
|
```{r libraries, message=FALSE, warning=FALSE, include=FALSE, cache=FALSE}
|
||||||
|
library(conflicted)
|
||||||
|
library(here)
|
||||||
|
library(tidyverse)
|
||||||
|
library(magrittr)
|
||||||
|
library(lavaan)
|
||||||
|
library(kableExtra)
|
||||||
|
|
||||||
|
conflicts_prefer(
|
||||||
|
readr::col_factor,
|
||||||
|
purrr::discard,
|
||||||
|
dplyr::filter,
|
||||||
|
dplyr::lag,
|
||||||
|
purrr::set_names,
|
||||||
|
tidyr::extract,
|
||||||
|
kableExtra::group_rows
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r style, include=FALSE, cache=FALSE}
|
||||||
|
styler::style_file(
|
||||||
|
here("statistical_analysis", "scale_reliability.Rmd"),
|
||||||
|
scope = "tokens",
|
||||||
|
indent_by = 4L
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
The data were preprocessed and cleaned using [expl_esm_labels.py](../exploration/expl_esm_labels.py) script and read as csv here.
|
||||||
|
|
||||||
|
```{r read_data}
|
||||||
|
COL_TYPES <- cols(
|
||||||
|
.default = col_double(),
|
||||||
|
participant_id = col_factor(),
|
||||||
|
username = col_factor(),
|
||||||
|
device_id = col_factor(),
|
||||||
|
esm_trigger = col_factor(),
|
||||||
|
esm_instructions = col_factor(),
|
||||||
|
double_esm_user_answer_timestamp = col_double(),
|
||||||
|
datetime_lj = col_datetime(format = ""),
|
||||||
|
date_lj = col_date(format = ""),
|
||||||
|
time = col_factor(),
|
||||||
|
esm_user_answer = col_factor()
|
||||||
|
)
|
||||||
|
df_SAM <- read_csv(here("data", "raw", "df_esm_SAM_threat_challenge.csv"), col_types = COL_TYPES)
|
||||||
|
df_COPE <- read_csv(here("data", "raw", "df_esm_COPE.csv"), col_types = COL_TYPES)
|
||||||
|
```
|
||||||
|
|
||||||
|
Demonstrate factor analysis for a single participant.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
df_COPE %>%
|
||||||
|
group_by(question_id, questionnaire_id) %>%
|
||||||
|
count()
|
||||||
|
```
|
|
@ -0,0 +1,20 @@
|
||||||
|
Version: 1.0
|
||||||
|
|
||||||
|
RestoreWorkspace: Default
|
||||||
|
SaveWorkspace: Default
|
||||||
|
AlwaysSaveHistory: Default
|
||||||
|
|
||||||
|
EnableCodeIndexing: Yes
|
||||||
|
UseSpacesForTab: No
|
||||||
|
NumSpacesForTab: 4
|
||||||
|
Encoding: UTF-8
|
||||||
|
|
||||||
|
RnwWeave: Sweave
|
||||||
|
LaTeX: XeLaTeX
|
||||||
|
|
||||||
|
AutoAppendNewline: Yes
|
||||||
|
StripTrailingWhitespace: Yes
|
||||||
|
|
||||||
|
PythonType: conda
|
||||||
|
PythonVersion: 3.11.3
|
||||||
|
PythonPath: E:/ProgramData/mambaforge/envs/straw2analysis/python.exe
|
Loading…
Reference in New Issue