Compare commits
80 Commits
c66e046014
...
777d30365e
Author | SHA1 | Date |
---|---|---|
junos | 777d30365e | |
junos | ca85131ed2 | |
junos | 2aa5c5cb07 | |
junos | 82b53bc0d3 | |
junos | c688580fe8 | |
junos | 8c0b66eddc | |
junos | e3ff4846e1 | |
junos | 825380a47e | |
junos | ef26772038 | |
junos | 64c05ec5ec | |
junos | c0236b251c | |
junos | 2f22f2052a | |
junos | ec51d7d406 | |
junos | 2aca64aa09 | |
junos | 9a87a0a34a | |
junos | 91a9f20839 | |
junos | e3be17e56e | |
junos | 5c0e2e2621 | |
junos | 51201c2bc9 | |
junos | fed4b33611 | |
junos | 471ce7c2cb | |
junos | dbb2033f78 | |
junos | 1b77fb119c | |
junos | ae2ca63bc4 | |
junos | 577f1330da | |
junos | 4af360f411 | |
junos | 96bbe32f56 | |
junos | 40170339c2 | |
junos | 7b0c0037f7 | |
junos | db06584ddd | |
junos | 112d968715 | |
junos | 9cc6bf7c21 | |
junos | 78807b941c | |
junos | a9af113c9c | |
junos | 97113fe9ab | |
junos | bc78a1d498 | |
junos | aca84b214d | |
junos | aa13123136 | |
junos | c51e0da0f7 | |
junos | a2401b5e36 | |
junos | 70232949c3 | |
junos | 8a9595c615 | |
junos | 045b9fa0dc | |
junos | bb4445f1a8 | |
junos | 1318ae3609 | |
junos | 45441c288d | |
junos | fa45b30955 | |
junos | 2336edffb6 | |
junos | b756ed5feb | |
junos | cad28c3fe8 | |
junos | 38a405d378 | |
junos | 2c5a0b4157 | |
junos | 0409c9e982 | |
junos | a7446cc34a | |
junos | 118e686491 | |
junos | 9417a1b9f1 | |
junos | 7b5db88f1d | |
junos | 0f8f0b0fb6 | |
junos | 26c7d22b83 | |
junos | 87781840d4 | |
junos | 3091328fc5 | |
junos | 055e87dbac | |
junos | f58d20ffc2 | |
junos | 075fdab9ea | |
junos | 91e7352480 | |
junos | 35c09374dd | |
junos | b505fb2b6a | |
junos | 47b1ecdbb9 | |
junos | 24744c288d | |
junos | caeaf03239 | |
junos | cd5d8b6a10 | |
junos | 3e38b64b45 | |
junos | 76071fd550 | |
Primoz | 26804cf8ea | |
Primoz | 865225994b | |
Primoz | 259be708aa | |
Primoz | 0594993133 | |
Primoz | 1cbc743cf7 | |
Primoz | 2a8f1ee613 | |
Primoz | ce13a9e13b |
|
@ -0,0 +1,9 @@
|
|||
[flake8]
|
||||
max-line-length = 88
|
||||
extend-ignore =
|
||||
E203,
|
||||
# E501 line too long for docstrings
|
||||
D501
|
||||
per-file-ignores =
|
||||
exploration/*.py:E501
|
||||
docstring-convention = numpy
|
|
@ -12,12 +12,15 @@ __pycache__/
|
|||
/data/*input*.csv
|
||||
/data/daily*
|
||||
/data/intradaily*
|
||||
/data/raw
|
||||
/data/stressfulness_event*
|
||||
/data/30min*
|
||||
/presentation/*scores.csv
|
||||
/presentation/Results.ods
|
||||
/presentation/results/
|
||||
.Rproj.user
|
||||
.Rhistory
|
||||
/presentation/*.nb.html
|
||||
presentation/event_stressful_detection_half_loso.csv
|
||||
presentation/event_stressful_detection_loso.csv
|
||||
/statistical_analysis/scale_reliability.nb.html
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
<component name="ProjectCodeStyleConfiguration">
|
||||
<code_scheme name="Project" version="173">
|
||||
<option name="RIGHT_MARGIN" value="150" />
|
||||
<option name="SOFT_MARGINS" value="88" />
|
||||
</code_scheme>
|
||||
</component>
|
|
@ -0,0 +1,5 @@
|
|||
<component name="ProjectCodeStyleConfiguration">
|
||||
<state>
|
||||
<option name="USE_PER_PROJECT_SETTINGS" value="true" />
|
||||
</state>
|
||||
</component>
|
|
@ -0,0 +1,3 @@
|
|||
<component name="ProjectDictionaryState">
|
||||
<dictionary name="junos" />
|
||||
</component>
|
|
@ -17,6 +17,14 @@
|
|||
</RMarkdownRenderProfile>
|
||||
</value>
|
||||
</entry>
|
||||
<entry key="file://$PROJECT_DIR$/statistical_analysis/scale_reliability.rmd">
|
||||
<value>
|
||||
<RMarkdownRenderProfile>
|
||||
<option name="lastOutput" value="$PROJECT_DIR$/statistical_analysis/scale_reliability.nb.html" />
|
||||
<option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/statistical_analysis" />
|
||||
</RMarkdownRenderProfile>
|
||||
</value>
|
||||
</entry>
|
||||
</map>
|
||||
</option>
|
||||
</component>
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="RGraphicsSettings">
|
||||
<option name="height" value="600" />
|
||||
<option name="resolution" value="75" />
|
||||
<option name="version" value="2" />
|
||||
<option name="width" value="960" />
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="RMarkdownGraphicsSettings">
|
||||
<option name="globalResolution" value="75" />
|
||||
<option name="version" value="2" />
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="RSettings">
|
||||
<option name="interpreterPath" value="C:\Program Files\R\R-4.3.1\bin\R.exe" />
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,30 @@
|
|||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.4.0
|
||||
hooks:
|
||||
- id: check-yaml
|
||||
- id: end-of-file-fixer
|
||||
- id: trailing-whitespace
|
||||
- repo: https://github.com/pycqa/isort
|
||||
rev: 5.12.0
|
||||
hooks:
|
||||
- id: isort
|
||||
name: isort (python)
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 23.3.0
|
||||
hooks:
|
||||
- id: black
|
||||
language_version: python3
|
||||
- repo: https://github.com/pycqa/flake8
|
||||
rev: 6.0.0
|
||||
hooks:
|
||||
- id: flake8
|
||||
# - repo: https://github.com/mwouts/jupytext
|
||||
# rev: v1.14.7
|
||||
# hooks:
|
||||
# - id: jupytext
|
||||
# args: [ --from, "py:percent", --to, "ipynb" ]
|
||||
# additional_dependencies:
|
||||
# - isort==5.12.0 # Matches hook
|
||||
# - black==23.3.0
|
||||
# - flake8==6.0.0
|
|
@ -6,6 +6,7 @@ dependencies:
|
|||
- black
|
||||
- isort
|
||||
- flake8
|
||||
- flake8-docstrings
|
||||
- imbalanced-learn=0.10.0
|
||||
- jupyterlab
|
||||
- jupytext
|
||||
|
@ -14,6 +15,7 @@ dependencies:
|
|||
- nodejs
|
||||
- pandas
|
||||
- psycopg2 >= 2.9.1
|
||||
- pre-commit
|
||||
- python-dotenv
|
||||
- pytz
|
||||
- pyprojroot
|
||||
|
|
|
@ -14,15 +14,9 @@
|
|||
# ---
|
||||
|
||||
# %%
|
||||
import os, sys
|
||||
import importlib
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# import plotly.graph_objects as go
|
||||
from importlib import util
|
||||
from pathlib import Path
|
||||
import yaml
|
||||
from rapids.src.features.utils.utils import chunk_episodes
|
||||
|
||||
# %%
|
||||
phone_data_yield = pd.read_csv(
|
||||
|
@ -36,23 +30,29 @@ time_segments_labels = pd.read_csv(
|
|||
# %%
|
||||
phone_data_yield["assigned_segments"] = phone_data_yield[
|
||||
"assigned_segments"
|
||||
].str.replace(r"_RR\d+SS#", "#")
|
||||
].str.replace(r"_RR\d+SS#", "#", regex=True)
|
||||
time_segments_labels["label"] = time_segments_labels["label"].str.replace(
|
||||
r"_RR\d+SS$", ""
|
||||
r"_RR\d+SS$", "", regex=True
|
||||
)
|
||||
|
||||
|
||||
# %% tags=[]
|
||||
def filter_data_by_segment(data, time_segment):
|
||||
def filter_data_by_segment(data, time_segment_current):
|
||||
data.dropna(subset=["assigned_segments"], inplace=True)
|
||||
if data.shape[0] == 0: # data is empty
|
||||
data["local_segment"] = data["timestamps_segment"] = None
|
||||
return data
|
||||
|
||||
datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
|
||||
timestamps_regex = "[0-9]{13}"
|
||||
segment_regex = "\[({}#{},{};{},{})\]".format(
|
||||
time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
|
||||
datetime_regex = (
|
||||
r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
|
||||
)
|
||||
timestamps_regex = r"[0-9]{13}"
|
||||
segment_regex = r"\[({}#{},{};{},{})\]".format(
|
||||
time_segment_current,
|
||||
datetime_regex,
|
||||
datetime_regex,
|
||||
timestamps_regex,
|
||||
timestamps_regex,
|
||||
)
|
||||
data["local_segment"] = data["assigned_segments"].str.extract(
|
||||
segment_regex, expand=True
|
||||
|
@ -147,14 +147,17 @@ def getDataForPlot(phone_data_yield_per_segment):
|
|||
.fillna(0)
|
||||
)
|
||||
|
||||
# transpose the dataframe per local start datetime of the segment and discard the useless index layer
|
||||
# transpose the dataframe per local start datetime of the segment
|
||||
# and discard the useless index layer
|
||||
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
||||
"local_segment_start_datetimes"
|
||||
)[["minutes_after_segment_start", "sensor"]].apply(
|
||||
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
||||
)
|
||||
phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
|
||||
"local_segment_start_datetimes"
|
||||
phone_data_yield_per_segment.index = (
|
||||
phone_data_yield_per_segment.index.get_level_values(
|
||||
"local_segment_start_datetimes"
|
||||
)
|
||||
)
|
||||
return phone_data_yield_per_segment
|
||||
|
||||
|
@ -227,9 +230,13 @@ phone_data_yield_per_segment.tail()
|
|||
# # A workaround
|
||||
|
||||
# %%
|
||||
phone_data_yield_per_segment["local_segment_start_datetimes", "minutes_after_segment_start"] = phone_data_yield_per_segment[
|
||||
phone_data_yield_per_segment[
|
||||
"local_segment_start_datetimes", "minutes_after_segment_start"
|
||||
] = phone_data_yield_per_segment[
|
||||
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
||||
].drop_duplicates(keep="first")
|
||||
].drop_duplicates(
|
||||
keep="first"
|
||||
)
|
||||
|
||||
# %%
|
||||
phone_data_yield_per_segment.set_index(
|
||||
|
@ -244,8 +251,9 @@ phone_data_yield_per_segment.head()
|
|||
# %% [markdown]
|
||||
# # Retry
|
||||
|
||||
|
||||
# %%
|
||||
def getDataForPlot(phone_data_yield_per_segment):
|
||||
def get_data_for_plot(phone_data_yield_per_segment):
|
||||
# calculate the length (in minute) of per segment instance
|
||||
phone_data_yield_per_segment["length"] = (
|
||||
phone_data_yield_per_segment["timestamps_segment"]
|
||||
|
@ -292,7 +300,10 @@ def getDataForPlot(phone_data_yield_per_segment):
|
|||
full_index,
|
||||
names=("local_segment_start_datetimes", "minutes_after_segment_start"),
|
||||
)
|
||||
phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(subset=["local_segment_start_datetimes", "minutes_after_segment_start"],keep="first")
|
||||
phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(
|
||||
subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
|
||||
keep="first",
|
||||
)
|
||||
phone_data_yield_per_segment = (
|
||||
phone_data_yield_per_segment.set_index(
|
||||
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
||||
|
@ -302,14 +313,17 @@ def getDataForPlot(phone_data_yield_per_segment):
|
|||
.fillna(0)
|
||||
)
|
||||
|
||||
# transpose the dataframe per local start datetime of the segment and discard the useless index layer
|
||||
# transpose the dataframe per local start datetime of the segment
|
||||
# and discard the useless index layer
|
||||
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
||||
"local_segment_start_datetimes"
|
||||
)[["minutes_after_segment_start", "sensor"]].apply(
|
||||
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
||||
)
|
||||
phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
|
||||
"local_segment_start_datetimes"
|
||||
phone_data_yield_per_segment.index = (
|
||||
phone_data_yield_per_segment.index.get_level_values(
|
||||
"local_segment_start_datetimes"
|
||||
)
|
||||
)
|
||||
return phone_data_yield_per_segment
|
||||
|
||||
|
@ -318,6 +332,6 @@ def getDataForPlot(phone_data_yield_per_segment):
|
|||
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
|
||||
|
||||
# %%
|
||||
data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
|
||||
data_for_plot_per_segment = get_data_for_plot(phone_data_yield_per_segment)
|
||||
|
||||
# %%
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# jupytext_version: 1.14.5
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
|
@ -15,19 +15,33 @@
|
|||
# ---
|
||||
|
||||
# %%
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
import seaborn as sns
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
import participants.query_db
|
||||
from features.esm import *
|
||||
from features.esm_JCQ import *
|
||||
from features.esm_SAM import *
|
||||
from features.esm import (
|
||||
QUESTIONNAIRE_IDS,
|
||||
clean_up_esm,
|
||||
get_esm_data,
|
||||
increment_answers,
|
||||
preprocess_esm,
|
||||
reassign_question_ids,
|
||||
)
|
||||
from features.esm_COPE import DICT_COPE_QUESTION_IDS
|
||||
from features.esm_JCQ import reverse_jcq_demand_control_scoring
|
||||
from features.esm_SAM import DICT_SAM_QUESTION_IDS, extract_stressful_events
|
||||
|
||||
# import os
|
||||
# import sys
|
||||
# nb_dir = os.path.split(os.getcwd())[0]
|
||||
# if nb_dir not in sys.path:
|
||||
# sys.path.append(nb_dir)
|
||||
|
||||
|
||||
# %%
|
||||
save_figs = False
|
||||
export_data = True
|
||||
|
||||
# %%
|
||||
participants_inactive_usernames = participants.query_db.get_usernames(
|
||||
|
@ -43,8 +57,14 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)
|
|||
|
||||
# %%
|
||||
df_esm_PANAS = df_esm_preprocessed[
|
||||
(df_esm_preprocessed["questionnaire_id"] == 8)
|
||||
| (df_esm_preprocessed["questionnaire_id"] == 9)
|
||||
(
|
||||
df_esm_preprocessed["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["PANAS_positive_affect"]
|
||||
)
|
||||
| (
|
||||
df_esm_preprocessed["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["PANAS_negative_affect"]
|
||||
)
|
||||
]
|
||||
df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)
|
||||
|
||||
|
@ -65,35 +85,47 @@ df_esm_PANAS_daily_means = (
|
|||
# %%
|
||||
df_esm_PANAS_summary_participant = (
|
||||
df_esm_PANAS_daily_means.groupby(["participant_id", "questionnaire_id"])
|
||||
.agg(["mean", "median", "std"])
|
||||
.esm_numeric_mean.agg(["mean", "median", "std"])
|
||||
.reset_index(col_level=1)
|
||||
)
|
||||
df_esm_PANAS_summary_participant.columns = df_esm_PANAS_summary_participant.columns.get_level_values(
|
||||
1
|
||||
)
|
||||
df_esm_PANAS_summary_participant[
|
||||
"PANAS_subscale"
|
||||
"PANAS subscale"
|
||||
] = df_esm_PANAS_daily_means.questionnaire_id.astype("category").cat.rename_categories(
|
||||
{8.0: "PA", 9.0: "NA"}
|
||||
{8.0: "positive affect", 9.0: "negative affect"}
|
||||
)
|
||||
|
||||
# %%
|
||||
sns.displot(
|
||||
data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS_subscale", binwidth=0.2
|
||||
df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["mean"]
|
||||
|
||||
# %%
|
||||
df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["std"]
|
||||
|
||||
# %%
|
||||
df_esm_PANAS_summary_participant.query("std == 0")
|
||||
|
||||
# %%
|
||||
fig1 = sns.displot(
|
||||
data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS subscale", binwidth=0.2
|
||||
)
|
||||
fig1.set_axis_labels(x_var="participant mean", y_var="frequency")
|
||||
if save_figs:
|
||||
fig1.figure.savefig("PANAS_mean_participant.pdf", dpi=300)
|
||||
|
||||
# %%
|
||||
sns.displot(
|
||||
data=df_esm_PANAS_summary_participant,
|
||||
x="median",
|
||||
hue="PANAS_subscale",
|
||||
hue="PANAS subscale",
|
||||
binwidth=0.2,
|
||||
)
|
||||
|
||||
# %%
|
||||
sns.displot(
|
||||
data=df_esm_PANAS_summary_participant, x="std", hue="PANAS_subscale", binwidth=0.05
|
||||
fig2 = sns.displot(
|
||||
data=df_esm_PANAS_summary_participant, x="std", hue="PANAS subscale", binwidth=0.05
|
||||
)
|
||||
fig2.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
||||
if save_figs:
|
||||
fig2.figure.savefig("PANAS_std_participant.pdf", dpi=300)
|
||||
|
||||
# %%
|
||||
df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
|
||||
|
@ -109,8 +141,14 @@ df_SAM_all.head()
|
|||
|
||||
# %%
|
||||
df_esm_SAM = df_esm_preprocessed[
|
||||
(df_esm_preprocessed["questionnaire_id"] >= 87)
|
||||
& (df_esm_preprocessed["questionnaire_id"] <= 93)
|
||||
(
|
||||
df_esm_preprocessed["questionnaire_id"]
|
||||
>= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
||||
)
|
||||
& (
|
||||
df_esm_preprocessed["questionnaire_id"]
|
||||
<= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
|
||||
)
|
||||
]
|
||||
df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
|
||||
|
||||
|
@ -118,9 +156,10 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
|
|||
# ## Stressful events
|
||||
|
||||
# %%
|
||||
df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign(
|
||||
stressful_event=lambda x: (x.esm_user_answer_numeric > 0)
|
||||
)
|
||||
df_esm_SAM_event = df_esm_SAM_clean[
|
||||
df_esm_SAM_clean["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
||||
].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0))
|
||||
|
||||
# %%
|
||||
df_esm_SAM_daily_events = (
|
||||
|
@ -131,20 +170,22 @@ df_esm_SAM_daily_events = (
|
|||
)
|
||||
|
||||
# %% [markdown]
|
||||
# Calculate the daily mean of YES (1) or NO (0) answers to the question about a stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
|
||||
# Calculate the daily mean of YES (1) or NO (0) answers to the question about stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
|
||||
|
||||
# %%
|
||||
df_esm_SAM_event_summary_participant = (
|
||||
df_esm_SAM_daily_events.groupby(["participant_id"])
|
||||
.agg(["mean", "median", "std"])
|
||||
.SAM_event_ratio.agg(["mean", "median", "std"])
|
||||
.reset_index(col_level=1)
|
||||
)
|
||||
df_esm_SAM_event_summary_participant.columns = df_esm_SAM_event_summary_participant.columns.get_level_values(
|
||||
1
|
||||
)
|
||||
|
||||
# %%
|
||||
sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
|
||||
fig6 = sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
|
||||
fig6.set_axis_labels(
|
||||
x_var="participant proportion of stressful events", y_var="frequency"
|
||||
)
|
||||
if save_figs:
|
||||
fig6.figure.savefig("SAM_events_mean_participant.pdf", dpi=300)
|
||||
|
||||
# %%
|
||||
sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
|
||||
|
@ -155,7 +196,12 @@ sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
|
|||
# %% [markdown]
|
||||
# * Example of threat: "Did this event make you feel anxious?"
|
||||
# * Example of challenge: "How eager are you to tackle this event?"
|
||||
# * Possible answers: 0 - Not at all, 1 - Slightly, 2 - Moderately, 3 - Considerably, 4 - Extremely
|
||||
# * Possible answers:
|
||||
# 0 - Not at all,
|
||||
# 1 - Slightly,
|
||||
# 2 - Moderately,
|
||||
# 3 - Considerably,
|
||||
# 4 - Extremely
|
||||
|
||||
# %%
|
||||
df_esm_SAM_daily = (
|
||||
|
@ -167,27 +213,45 @@ df_esm_SAM_daily = (
|
|||
|
||||
# %%
|
||||
df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
|
||||
(df_esm_SAM_daily["questionnaire_id"] == 88)
|
||||
| (df_esm_SAM_daily["questionnaire_id"] == 89)
|
||||
(df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
|
||||
| (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
|
||||
]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_summary_participant = (
|
||||
df_esm_SAM_daily.groupby(["participant_id", "questionnaire_id"])
|
||||
.agg(["mean", "median", "std"])
|
||||
.esm_numeric_mean.agg(["mean", "median", "std"])
|
||||
.reset_index(col_level=1)
|
||||
)
|
||||
df_esm_SAM_summary_participant.columns = df_esm_SAM_summary_participant.columns.get_level_values(
|
||||
1
|
||||
|
||||
# %%
|
||||
df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[
|
||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
||||
]
|
||||
df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_event_stressfulness_summary_participant.describe()["std"]
|
||||
|
||||
# %%
|
||||
sns.displot(
|
||||
data=df_esm_SAM_event_stressfulness_summary_participant, x="mean", binwidth=0.2
|
||||
)
|
||||
|
||||
# %%
|
||||
df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
|
||||
(df_esm_SAM_summary_participant["questionnaire_id"] == 88)
|
||||
| (df_esm_SAM_summary_participant["questionnaire_id"] == 89)
|
||||
(
|
||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["appraisal_threat"]
|
||||
)
|
||||
| (
|
||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["appraisal_challenge"]
|
||||
)
|
||||
]
|
||||
df_esm_SAM_threat_challenge_summary_participant[
|
||||
"event_subscale"
|
||||
"event subscale"
|
||||
] = df_esm_SAM_threat_challenge_summary_participant.questionnaire_id.astype(
|
||||
"category"
|
||||
).cat.rename_categories(
|
||||
|
@ -198,26 +262,84 @@ df_esm_SAM_threat_challenge_summary_participant[
|
|||
sns.displot(
|
||||
data=df_esm_SAM_threat_challenge_summary_participant,
|
||||
x="mean",
|
||||
hue="event_subscale",
|
||||
hue="event subscale",
|
||||
binwidth=0.2,
|
||||
)
|
||||
|
||||
# %%
|
||||
sns.displot(
|
||||
fig3 = sns.displot(
|
||||
data=df_esm_SAM_threat_challenge_summary_participant,
|
||||
x="std",
|
||||
hue="event_subscale",
|
||||
hue="event subscale",
|
||||
binwidth=0.1,
|
||||
)
|
||||
fig3.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
||||
if save_figs:
|
||||
fig3.figure.savefig("SAM_std_participant.pdf", dpi=300)
|
||||
|
||||
# %%
|
||||
df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
|
||||
"mean"
|
||||
]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
|
||||
"std"
|
||||
]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_clean.columns
|
||||
|
||||
# %%
|
||||
df_esm_SAM_clean.esm_status.value_counts()
|
||||
|
||||
# %%
|
||||
if export_data:
|
||||
df_esm_SAM_fixed = reassign_question_ids(df_esm_SAM_clean, DICT_SAM_QUESTION_IDS)
|
||||
df_esm_SAM_fixed = increment_answers(df_esm_SAM_fixed)
|
||||
df_esm_SAM_for_export = df_esm_SAM_fixed[
|
||||
[
|
||||
"participant_id",
|
||||
"username",
|
||||
"device_id",
|
||||
"_id",
|
||||
"esm_trigger",
|
||||
"esm_session",
|
||||
"esm_notification_id",
|
||||
"question_id",
|
||||
"questionnaire_id",
|
||||
"esm_instructions",
|
||||
"double_esm_user_answer_timestamp",
|
||||
"datetime_lj",
|
||||
"date_lj",
|
||||
"time",
|
||||
"esm_user_answer",
|
||||
"esm_user_answer_numeric",
|
||||
]
|
||||
]
|
||||
df_esm_SAM_for_export.sort_values(
|
||||
by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
|
||||
)
|
||||
print(df_esm_SAM_for_export.head())
|
||||
df_esm_SAM_for_export.to_csv(
|
||||
"../data/raw/df_esm_SAM_threat_challenge.csv", index=False
|
||||
)
|
||||
|
||||
# %% [markdown]
|
||||
# ## Stressfulness of period
|
||||
|
||||
# %%
|
||||
df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
|
||||
df_esm_SAM_summary_participant["questionnaire_id"] == 93
|
||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
|
||||
]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_period_summary_participant.describe()["mean"]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_period_summary_participant.describe()["std"]
|
||||
|
||||
# %%
|
||||
sns.displot(data=df_esm_SAM_period_summary_participant, x="mean", binwidth=0.2)
|
||||
|
||||
|
@ -229,8 +351,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1)
|
|||
|
||||
# %%
|
||||
df_esm_JCQ_demand_control = df_esm_preprocessed[
|
||||
(df_esm_preprocessed["questionnaire_id"] >= 10)
|
||||
& (df_esm_preprocessed["questionnaire_id"] <= 11)
|
||||
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"])
|
||||
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"])
|
||||
]
|
||||
df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)
|
||||
|
||||
|
@ -250,14 +372,11 @@ df_esm_JCQ_daily = (
|
|||
)
|
||||
df_esm_JCQ_summary_participant = (
|
||||
df_esm_JCQ_daily.groupby(["participant_id", "questionnaire_id"])
|
||||
.agg(["mean", "median", "std"])
|
||||
.esm_score_mean.agg(["mean", "median", "std"])
|
||||
.reset_index(col_level=1)
|
||||
)
|
||||
df_esm_JCQ_summary_participant.columns = df_esm_JCQ_summary_participant.columns.get_level_values(
|
||||
1
|
||||
)
|
||||
df_esm_JCQ_summary_participant[
|
||||
"JCQ_subscale"
|
||||
"JCQ subscale"
|
||||
] = df_esm_JCQ_summary_participant.questionnaire_id.astype(
|
||||
"category"
|
||||
).cat.rename_categories(
|
||||
|
@ -265,11 +384,71 @@ df_esm_JCQ_summary_participant[
|
|||
)
|
||||
|
||||
# %%
|
||||
sns.displot(
|
||||
data=df_esm_JCQ_summary_participant, x="mean", hue="JCQ_subscale", binwidth=0.1,
|
||||
)
|
||||
df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["mean"]
|
||||
|
||||
# %%
|
||||
sns.displot(
|
||||
data=df_esm_JCQ_summary_participant, x="std", hue="JCQ_subscale", binwidth=0.05,
|
||||
df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["std"]
|
||||
|
||||
# %%
|
||||
fig4 = sns.displot(
|
||||
data=df_esm_JCQ_summary_participant,
|
||||
x="mean",
|
||||
hue="JCQ subscale",
|
||||
binwidth=0.1,
|
||||
)
|
||||
fig4.set_axis_labels(x_var="participant mean", y_var="frequency")
|
||||
if save_figs:
|
||||
fig4.figure.savefig("JCQ_mean_participant.pdf", dpi=300)
|
||||
|
||||
# %%
|
||||
fig5 = sns.displot(
|
||||
data=df_esm_JCQ_summary_participant,
|
||||
x="std",
|
||||
hue="JCQ subscale",
|
||||
binwidth=0.05,
|
||||
)
|
||||
fig6.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
||||
if save_figs:
|
||||
fig5.figure.savefig("JCQ_std_participant.pdf", dpi=300)
|
||||
|
||||
# %% [markdown]
|
||||
# # COPE Inventory
|
||||
|
||||
# %%
|
||||
df_esm_COPE = df_esm_preprocessed[
|
||||
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"])
|
||||
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"])
|
||||
]
|
||||
|
||||
# %%
|
||||
df_esm_COPE_clean = clean_up_esm(df_esm_COPE)
|
||||
df_esm_COPE_clean = increment_answers(df_esm_COPE_clean)
|
||||
df_esm_COPE_fixed = reassign_question_ids(df_esm_COPE_clean, DICT_COPE_QUESTION_IDS)
|
||||
|
||||
# %%
|
||||
if export_data:
|
||||
df_esm_COPE_for_export = df_esm_COPE_fixed[
|
||||
[
|
||||
"participant_id",
|
||||
"username",
|
||||
"device_id",
|
||||
"_id",
|
||||
"esm_trigger",
|
||||
"esm_session",
|
||||
"esm_notification_id",
|
||||
"question_id",
|
||||
"questionnaire_id",
|
||||
"esm_instructions",
|
||||
"double_esm_user_answer_timestamp",
|
||||
"datetime_lj",
|
||||
"date_lj",
|
||||
"time",
|
||||
"esm_user_answer",
|
||||
"esm_user_answer_numeric",
|
||||
]
|
||||
]
|
||||
df_esm_COPE_for_export.sort_values(
|
||||
by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
|
||||
)
|
||||
print(df_esm_COPE_for_export.head())
|
||||
df_esm_COPE_for_export.to_csv("../data/raw/df_esm_COPE.csv", index=False)
|
||||
|
|
|
@ -20,30 +20,74 @@ import numpy as np
|
|||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.metrics import recall_score, f1_score
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
from machine_learning.cross_validation import CrossValidation
|
||||
from machine_learning.preprocessing import Preprocessing
|
||||
from machine_learning.feature_selection import FeatureSelection
|
||||
|
||||
# %%
|
||||
df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
df.set_index(index_columns, inplace=True)
|
||||
|
||||
# Create binary target
|
||||
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||
df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
|
||||
|
||||
|
||||
nan_cols = df.columns[df.isna().any()].tolist()
|
||||
df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))
|
||||
|
||||
cv = CrossValidation(data=df, cv_method="logo")
|
||||
|
||||
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
|
||||
interval_feature_list, other_feature_list = [], []
|
||||
|
||||
print(df.columns.tolist())
|
||||
|
||||
# %%
|
||||
for split in cv.get_splits():
|
||||
train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
|
||||
pre = Preprocessing(train_X, train_y, test_X, test_y)
|
||||
pre.one_hot_encode_train_and_test_sets(categorical_columns)
|
||||
train_X, train_y, test_X, test_y = pre.get_train_test_sets()
|
||||
|
||||
|
||||
print(train_X.shape, test_X.shape)
|
||||
# Predict before feature selection
|
||||
rfc = RandomForestClassifier(n_estimators=10)
|
||||
rfc.fit(train_X, train_y)
|
||||
predictions = rfc.predict(test_X)
|
||||
|
||||
print("Recall:", recall_score(test_y, predictions))
|
||||
print("F1:", f1_score(test_y, predictions))
|
||||
|
||||
# Feature selection on train set
|
||||
train_groups, test_groups = cv.get_groups_sets(split)
|
||||
|
||||
fs = FeatureSelection(train_X, train_y, train_groups)
|
||||
selected_features = fs.select_features(n_min=20, n_max=29, k=40,
|
||||
ml_type="classification_bin",
|
||||
metric="recall", n_tolerance=20)
|
||||
|
||||
train_X = train_X[selected_features]
|
||||
test_X = test_X[selected_features]
|
||||
|
||||
print(selected_features)
|
||||
print(len(selected_features))
|
||||
|
||||
# Predict after feature selection
|
||||
rfc = RandomForestClassifier(n_estimators=500)
|
||||
rfc.fit(train_X, train_y)
|
||||
predictions = rfc.predict(test_X)
|
||||
|
||||
print("Recall:", recall_score(test_y, predictions))
|
||||
print("F1:", f1_score(test_y, predictions))
|
||||
|
||||
break
|
||||
|
||||
# %%
|
||||
|
|
|
@ -6,457 +6,129 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# jupytext_version: 1.14.5
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
# %matplotlib inline
|
||||
import os
|
||||
import sys
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
# from IPython.core.interactiveshell import InteractiveShell
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
# matplotlib inline
|
||||
# import os
|
||||
# import sys
|
||||
import pandas as pd
|
||||
|
||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
|
||||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn.impute import SimpleImputer
|
||||
from machine_learning.helper import (
|
||||
impute_encode_categorical_features,
|
||||
prepare_cross_validator,
|
||||
prepare_sklearn_data_format,
|
||||
run_all_classification_models,
|
||||
)
|
||||
|
||||
from lightgbm import LGBMClassifier
|
||||
import xgboost as xg
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
import machine_learning.helper
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# ## Set script's parameters
|
||||
# InteractiveShell.ast_node_interactivity = "all"
|
||||
#
|
||||
# nb_dir = os.path.split(os.getcwd())[0]
|
||||
# if nb_dir not in sys.path:
|
||||
# sys.path.append(nb_dir)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
||||
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
|
||||
# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
|
||||
# %%
|
||||
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
||||
# Cross-validation method (could be regarded as a hyperparameter)
|
||||
print("CV_METHOD: " + CV_METHOD)
|
||||
N_SL = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
UNDERSAMPLING = False
|
||||
# (bool) If True this will train and test data on balanced dataset
|
||||
# (using undersampling method)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
model_input['target'].value_counts()
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
SEGMENT_TYPE = "period"
|
||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||
SEGMENT_LENGTH = "30_minutes_before"
|
||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||
TARGET_VARIABLE = "JCQ_job_control"
|
||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||
|
||||
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
||||
TARGET_VARIABLE += "_"
|
||||
TARGET_VARIABLE += SEGMENT_TYPE
|
||||
|
||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||
|
||||
model_input = pd.read_csv(PATH_FULL)
|
||||
|
||||
if SEGMENT_LENGTH == "daily":
|
||||
DAY_LENGTH = "daily" # or "working"
|
||||
print(DAY_LENGTH)
|
||||
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
model_input["target"].value_counts()
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
# bins = [-10, 0, 10] # bins for z-scored targets
|
||||
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
|
||||
model_input['target'].value_counts(), edges
|
||||
# model_input = model_input[model_input['target'] != "medium"]
|
||||
model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||
BINS = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||
print("BINS: ", BINS)
|
||||
model_input["target"], edges = pd.cut(
|
||||
model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
|
||||
) # ['low', 'medium', 'high']
|
||||
print(model_input["target"].value_counts())
|
||||
REMOVE_MEDIUM = True
|
||||
if ("medium" in model_input["target"]) and REMOVE_MEDIUM:
|
||||
model_input = model_input[model_input["target"] != "medium"]
|
||||
model_input["target"] = (
|
||||
model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||
)
|
||||
else:
|
||||
model_input["target"] = model_input["target"].map(
|
||||
{"low": 0, "medium": 1, "high": 2}
|
||||
)
|
||||
print(model_input["target"].value_counts())
|
||||
|
||||
model_input['target'].value_counts()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
# UnderSampling
|
||||
if undersampling:
|
||||
no_stress = model_input[model_input['target'] == 0]
|
||||
stress = model_input[model_input['target'] == 1]
|
||||
if UNDERSAMPLING:
|
||||
no_stress = model_input[model_input["target"] == 0]
|
||||
stress = model_input[model_input["target"] == 1]
|
||||
|
||||
no_stress = no_stress.sample(n=len(stress))
|
||||
model_input = pd.concat([stress,no_stress], axis=0)
|
||||
|
||||
# model_input_new = pd.DataFrame(columns=model_input.columns)
|
||||
# for pid in model_input["pid"].unique():
|
||||
# stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
|
||||
# no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
|
||||
# if (len(stress) == 0):
|
||||
# continue
|
||||
# if (len(no_stress) == 0):
|
||||
# continue
|
||||
# model_input_new = pd.concat([model_input_new, stress], axis=0)
|
||||
|
||||
# no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
|
||||
# # In case there are more stress samples than no_stress, take all instances of no_stress.
|
||||
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
||||
# model_input = model_input_new
|
||||
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
||||
model_input = pd.concat([stress, no_stress], axis=0)
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
if cv_method_str == 'half_logo':
|
||||
model_input['pid_index'] = model_input.groupby('pid').cumcount()
|
||||
model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
|
||||
|
||||
model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
|
||||
model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
|
||||
|
||||
data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
|
||||
else:
|
||||
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
train_x.dtypes
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
|
||||
if cv_method_str == 'logo' or cv_method_str == 'half_logo':
|
||||
cv_method = LeaveOneGroupOut()
|
||||
cv_method.get_n_splits(
|
||||
train_x,
|
||||
data_y,
|
||||
groups=data_groups,
|
||||
)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||
|
||||
# %% [markdown]
|
||||
# ### Baseline: Dummy Classifier (most frequent)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
dummy_class = DummyClassifier(strategy="most_frequent")
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
dummy_classifier = cross_validate(
|
||||
dummy_class,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
|
||||
print("Precision", np.mean(dummy_classifier['test_precision']))
|
||||
print("Recall", np.mean(dummy_classifier['test_recall']))
|
||||
print("F1", np.mean(dummy_classifier['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown] nteract={"transient": {"deleting": false}}
|
||||
# ### All models
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
model_input_encoded = impute_encode_categorical_features(model_input)
|
||||
# %%
|
||||
final_scores.index.name = "metric"
|
||||
final_scores = final_scores.set_index(["method", final_scores.index])
|
||||
final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Logistic Regression
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
logistic_regression = linear_model.LogisticRegression()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
log_reg_scores = cross_validate(
|
||||
logistic_regression,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
||||
model_input_encoded, CV_METHOD
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
|
||||
print("Precision", np.mean(log_reg_scores['test_precision']))
|
||||
print("Recall", np.mean(log_reg_scores['test_recall']))
|
||||
print("F1", np.mean(log_reg_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Support Vector Machine
|
||||
# %%
|
||||
data_y.head()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
svc = svm.SVC()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
svc_scores = cross_validate(
|
||||
svc,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
# %%
|
||||
data_y.tail()
|
||||
# %%
|
||||
data_y.shape
|
||||
# %%
|
||||
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
|
||||
# %%
|
||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||
path_output_full = PATH_OUTPUT / (
|
||||
TARGET_VARIABLE
|
||||
+ "_"
|
||||
+ SEGMENT_LENGTH
|
||||
+ "_classification"
|
||||
+ str(BINS)
|
||||
+ "_"
|
||||
+ CV_METHOD
|
||||
+ ".csv"
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
|
||||
print("Precision", np.mean(svc_scores['test_precision']))
|
||||
print("Recall", np.mean(svc_scores['test_recall']))
|
||||
print("F1", np.mean(svc_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Gaussian Naive Bayes
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
gaussian_nb = naive_bayes.GaussianNB()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
gaussian_nb_scores = cross_validate(
|
||||
gaussian_nb,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
|
||||
print("Precision", np.mean(gaussian_nb_scores['test_precision']))
|
||||
print("Recall", np.mean(gaussian_nb_scores['test_recall']))
|
||||
print("F1", np.mean(gaussian_nb_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Stochastic Gradient Descent Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
sgdc = linear_model.SGDClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
sgdc_scores = cross_validate(
|
||||
sgdc,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
|
||||
print("Precision", np.mean(sgdc_scores['test_precision']))
|
||||
print("Recall", np.mean(sgdc_scores['test_recall']))
|
||||
print("F1", np.mean(sgdc_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### K-nearest neighbors
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
knn = neighbors.KNeighborsClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
knn_scores = cross_validate(
|
||||
knn,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
|
||||
print("Precision", np.mean(knn_scores['test_precision']))
|
||||
print("Recall", np.mean(knn_scores['test_recall']))
|
||||
print("F1", np.mean(knn_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Decision Tree
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
dtree = tree.DecisionTreeClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
dtree_scores = cross_validate(
|
||||
dtree,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
|
||||
print("Precision", np.mean(dtree_scores['test_precision']))
|
||||
print("Recall", np.mean(dtree_scores['test_recall']))
|
||||
print("F1", np.mean(dtree_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Random Forest Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
rfc = ensemble.RandomForestClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
rfc_scores = cross_validate(
|
||||
rfc,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1'),
|
||||
return_estimator=True
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
|
||||
print("Precision", np.mean(rfc_scores['test_precision']))
|
||||
print("Recall", np.mean(rfc_scores['test_recall']))
|
||||
print("F1", np.mean(rfc_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Feature importance (RFC)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns))
|
||||
for idx, estimator in enumerate(rfc_scores['estimator']):
|
||||
feature_importances = pd.DataFrame(estimator.feature_importances_,
|
||||
index = list(train_x.columns),
|
||||
columns=['importance'])
|
||||
# print("\nFeatures sorted by their score for estimator {}:".format(idx))
|
||||
# print(feature_importances.sort_values('importance', ascending=False).head(10))
|
||||
rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean()
|
||||
|
||||
pd.set_option('display.max_rows', 100)
|
||||
print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
|
||||
|
||||
rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
|
||||
|
||||
rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
|
||||
|
||||
train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Gradient Boosting Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
gbc = ensemble.GradientBoostingClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
gbc_scores = cross_validate(
|
||||
gbc,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
|
||||
print("Precision", np.mean(gbc_scores['test_precision']))
|
||||
print("Recall", np.mean(gbc_scores['test_recall']))
|
||||
print("F1", np.mean(gbc_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### LGBM Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
lgbm = LGBMClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
lgbm_scores = cross_validate(
|
||||
lgbm,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
|
||||
print("Precision", np.mean(lgbm_scores['test_precision']))
|
||||
print("Recall", np.mean(lgbm_scores['test_recall']))
|
||||
print("F1", np.mean(lgbm_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### XGBoost Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
xgb_classifier = xg.sklearn.XGBClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
xgb_classifier_scores = cross_validate(
|
||||
xgb_classifier,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
|
||||
print("Precision", np.mean(xgb_classifier_scores['test_precision']))
|
||||
print("Recall", np.mean(xgb_classifier_scores['test_recall']))
|
||||
print("F1", np.mean(xgb_classifier_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
scores.to_csv(path_output_full, index=False)
|
||||
|
|
|
@ -0,0 +1,177 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.14.5
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
from sklearn.decomposition import PCA
|
||||
|
||||
from machine_learning.helper import (
|
||||
impute_encode_categorical_features,
|
||||
prepare_cross_validator,
|
||||
prepare_sklearn_data_format,
|
||||
run_all_classification_models,
|
||||
)
|
||||
|
||||
# %%
|
||||
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
||||
# Cross-validation method (could be regarded as a hyperparameter)
|
||||
print("CV_METHOD: " + CV_METHOD)
|
||||
N_SL = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
UNDERSAMPLING = False
|
||||
# (bool) If True this will train and test data on balanced dataset
|
||||
# (using undersampling method)
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||
|
||||
SEGMENT_TYPE = "period"
|
||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||
SEGMENT_LENGTH = "30_minutes_before"
|
||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||
|
||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"
|
||||
|
||||
all_features_with_baseline = pd.read_csv(PATH_FULL)
|
||||
|
||||
# %%
|
||||
TARGETS = [
|
||||
"PANAS_negative_affect_mean",
|
||||
"PANAS_positive_affect_mean",
|
||||
"JCQ_job_demand_mean",
|
||||
"JCQ_job_control_mean",
|
||||
"appraisal_stressfulness_period_mean",
|
||||
]
|
||||
|
||||
# %%
|
||||
all_features_cleaned = pd.DataFrame()
|
||||
for target in TARGETS:
|
||||
PATH_FULL = (
|
||||
PATH_BASE
|
||||
/ SEGMENT_LENGTH
|
||||
/ "features"
|
||||
/ ("all_sensor_features_cleaned_straw_py_(" + target + ").csv")
|
||||
)
|
||||
current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
|
||||
if all_features_cleaned.empty:
|
||||
all_features_cleaned = current_features
|
||||
else:
|
||||
all_features_cleaned = all_features_cleaned.join(
|
||||
current_features[("phone_esm_straw_" + target)],
|
||||
how="inner",
|
||||
rsuffix="_" + target,
|
||||
)
|
||||
print(all_features_cleaned.shape)
|
||||
|
||||
# %%
|
||||
pca = PCA(n_components=1)
|
||||
TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]
|
||||
pca.fit(all_features_cleaned[TARGETS_PREFIXED])
|
||||
print(pca.explained_variance_ratio_)
|
||||
|
||||
# %%
|
||||
model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
|
||||
model_input["target"] = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])
|
||||
|
||||
# %%
|
||||
sns.histplot(data=model_input, x="target")
|
||||
|
||||
# %%
|
||||
model_input.target.quantile(0.6)
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
# bins = [-10, 0, 10] # bins for z-scored targets
|
||||
BINS = [-10, 0, 10] # bins for stressfulness (0-4) target
|
||||
print("BINS: ", BINS)
|
||||
model_input["target"], edges = pd.cut(
|
||||
model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
|
||||
) # ['low', 'medium', 'high']
|
||||
print(model_input["target"].value_counts())
|
||||
REMOVE_MEDIUM = True
|
||||
if REMOVE_MEDIUM:
|
||||
if "medium" in model_input["target"]:
|
||||
model_input = model_input[model_input["target"] != "medium"]
|
||||
model_input["target"] = (
|
||||
model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||
)
|
||||
else:
|
||||
model_input["target"] = model_input["target"].map(
|
||||
{"low": 0, "medium": 1, "high": 2}
|
||||
)
|
||||
print(model_input["target"].value_counts())
|
||||
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
# UnderSampling
|
||||
if UNDERSAMPLING:
|
||||
no_stress = model_input[model_input["target"] == 0]
|
||||
stress = model_input[model_input["target"] == 1]
|
||||
|
||||
no_stress = no_stress.sample(n=len(stress))
|
||||
model_input = pd.concat([stress, no_stress], axis=0)
|
||||
|
||||
|
||||
# %%
|
||||
TARGET_VARIABLE = "PANAS_negative_affect"
|
||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||
|
||||
PATH_FULL_HELP = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||
|
||||
model_input_with_baseline = pd.read_csv(PATH_FULL_HELP, index_col="local_segment")
|
||||
|
||||
# %%
|
||||
baseline_col_names = [
|
||||
col for col in model_input_with_baseline.columns if col not in model_input.columns
|
||||
]
|
||||
print(baseline_col_names)
|
||||
|
||||
# %%
|
||||
model_input = model_input.join(
|
||||
model_input_with_baseline[baseline_col_names], how="left"
|
||||
)
|
||||
model_input.reset_index(inplace=True)
|
||||
|
||||
# %%
|
||||
model_input_encoded = impute_encode_categorical_features(model_input)
|
||||
|
||||
# %%
|
||||
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
||||
model_input_encoded, CV_METHOD
|
||||
)
|
||||
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
||||
|
||||
# %%
|
||||
data_y.head()
|
||||
|
||||
# %%
|
||||
data_y.tail()
|
||||
# %%
|
||||
data_y.shape
|
||||
# %%
|
||||
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
|
||||
# %%
|
||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||
path_output_full = PATH_OUTPUT / (
|
||||
"composite_"
|
||||
+ SEGMENT_LENGTH
|
||||
+ "_classification"
|
||||
+ str(BINS)
|
||||
+ "_"
|
||||
+ CV_METHOD
|
||||
+ ".csv"
|
||||
)
|
||||
scores.to_csv(path_output_full, index=False)
|
|
@ -6,7 +6,7 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# jupytext_version: 1.14.5
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
|
@ -14,80 +14,85 @@
|
|||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# %matplotlib inline
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
|
||||
from sklearn.impute import SimpleImputer
|
||||
|
||||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||
import xgboost as xg
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_validate
|
||||
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
import machine_learning.labels
|
||||
import machine_learning.model
|
||||
from machine_learning.classification_models import ClassificationModels
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# %%
|
||||
# ## Set script's parameters
|
||||
n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
|
||||
cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
||||
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
N_CLUSTERS = 4 # Number of clusters (could be regarded as a hyperparameter)
|
||||
CV_METHOD = "logo" # logo, halflogo, 5kfold
|
||||
# Cross-validation method (could be regarded as a hyperparameter)
|
||||
N_SL = 1 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
|
||||
# %%
|
||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||
|
||||
SEGMENT_TYPE = "period"
|
||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||
SEGMENT_LENGTH = "30_minutes_before"
|
||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||
TARGET_VARIABLE = "appraisal_stressfulness"
|
||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||
|
||||
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
||||
TARGET_VARIABLE += "_"
|
||||
TARGET_VARIABLE += SEGMENT_TYPE
|
||||
|
||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||
|
||||
model_input = pd.read_csv(PATH_FULL)
|
||||
|
||||
if SEGMENT_LENGTH == "daily":
|
||||
DAY_LENGTH = "daily" # or "working"
|
||||
print(DAY_LENGTH)
|
||||
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
index_columns = [
|
||||
"local_segment",
|
||||
"local_segment_label",
|
||||
"local_segment_start_datetime",
|
||||
"local_segment_end_datetime",
|
||||
]
|
||||
|
||||
clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
|
||||
CLUST_COL = "limesurvey_demand_control_ratio_quartile"
|
||||
print("CLUST_COL: " + CLUST_COL)
|
||||
|
||||
model_input.columns[list(model_input.columns).index('age'):-1]
|
||||
BINS = [-1, 0, 4]
|
||||
print("BINS: " + str(BINS))
|
||||
|
||||
lime_cols = [col for col in model_input if col.startswith('limesurvey')]
|
||||
lime_cols
|
||||
lime_col = 'limesurvey_demand_control_ratio_quartile'
|
||||
clust_col = lime_col
|
||||
model_input[CLUST_COL].describe()
|
||||
|
||||
model_input[clust_col].describe()
|
||||
|
||||
# %%
|
||||
model_input["target"].value_counts()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
|
||||
# Filter-out outlier rows by clust_col
|
||||
#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
||||
# model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
||||
|
||||
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
|
||||
uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
|
||||
uniq = uniq.dropna()
|
||||
plt.bar(uniq['pid'], uniq[clust_col])
|
||||
plt.bar(uniq["pid"], uniq[CLUST_COL])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Get clusters by cluster col & and merge the clusters to main df
|
||||
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
|
||||
km = KMeans(n_clusters=N_CLUSTERS).fit_predict(uniq.set_index("pid"))
|
||||
np.unique(km, return_counts=True)
|
||||
uniq['cluster'] = km
|
||||
uniq
|
||||
uniq["cluster"] = km
|
||||
|
||||
model_input = model_input.merge(uniq[['pid', 'cluster']])
|
||||
model_input = model_input.merge(uniq[["pid", "cluster"]])
|
||||
|
||||
# %%
|
||||
model_input[["cluster", "target"]].value_counts().sort_index()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
|
@ -98,31 +103,56 @@ cm = ClassificationModels()
|
|||
cmodels = cm.get_cmodels()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
for k in range(n_clusters):
|
||||
for k in range(N_CLUSTERS):
|
||||
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
||||
bins = [-10, -1, 1, 10] # bins for z-scored targets
|
||||
model_input_subset.loc[:, 'target'] = \
|
||||
pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']
|
||||
model_input_subset['target'].value_counts()
|
||||
model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
|
||||
model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||
model_input_subset.loc[:, "target"] = pd.cut(
|
||||
model_input_subset.loc[:, "target"],
|
||||
bins=BINS,
|
||||
labels=["low", "high"],
|
||||
right=True,
|
||||
) # ['low', 'medium', 'high']
|
||||
model_input_subset["target"].value_counts()
|
||||
# model_input_subset = model_input_subset[model_input_subset["target"] != "medium"]
|
||||
model_input_subset["target"] = (
|
||||
model_input_subset["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||
)
|
||||
|
||||
model_input_subset['target'].value_counts()
|
||||
print(model_input_subset["target"].value_counts())
|
||||
|
||||
if cv_method_str == 'half_logo':
|
||||
model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
|
||||
model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
|
||||
if CV_METHOD == "half_logo":
|
||||
model_input_subset["pid_index"] = model_input_subset.groupby("pid").cumcount()
|
||||
model_input_subset["pid_count"] = model_input_subset.groupby("pid")[
|
||||
"pid"
|
||||
].transform("count")
|
||||
|
||||
model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
|
||||
model_input_subset["pid_half"] = model_input_subset["pid"] + "_" + model_input_subset["pid_index"].astype(int).astype(str)
|
||||
model_input_subset["pid_index"] = (
|
||||
model_input_subset["pid_index"] / model_input_subset["pid_count"] + 1
|
||||
).round()
|
||||
model_input_subset["pid_half"] = (
|
||||
model_input_subset["pid"]
|
||||
+ "_"
|
||||
+ model_input_subset["pid_index"].astype(int).astype(str)
|
||||
)
|
||||
|
||||
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"]
|
||||
data_x, data_y, data_groups = (
|
||||
model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
|
||||
model_input_subset["target"],
|
||||
model_input_subset["pid_half"],
|
||||
)
|
||||
else:
|
||||
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]
|
||||
data_x, data_y, data_groups = (
|
||||
model_input_subset.drop(["target", "pid"], axis=1),
|
||||
model_input_subset["target"],
|
||||
model_input_subset["pid"],
|
||||
)
|
||||
|
||||
# Treat categorical features
|
||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
additional_categorical_features = [
|
||||
col
|
||||
for col in data_x.columns
|
||||
if "mostcommonactivity" in col or "homelabel" in col
|
||||
]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||
|
@ -132,7 +162,9 @@ for k in range(n_clusters):
|
|||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
categorical_features = categorical_features.apply(
|
||||
lambda col: col.astype("category")
|
||||
)
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
|
@ -140,8 +172,10 @@ for k in range(n_clusters):
|
|||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
|
||||
# Establish cv method
|
||||
cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
|
||||
if cv_method_str == 'logo' or cv_method_str == 'half_logo':
|
||||
cv_method = StratifiedKFold(
|
||||
n_splits=5, shuffle=True
|
||||
) # Defaults to 5 k-folds in cross_validate method
|
||||
if CV_METHOD == "logo" or CV_METHOD == "half_logo":
|
||||
cv_method = LeaveOneGroupOut()
|
||||
cv_method.get_n_splits(
|
||||
train_x,
|
||||
|
@ -149,36 +183,57 @@ for k in range(n_clusters):
|
|||
groups=data_groups,
|
||||
)
|
||||
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
|
||||
|
||||
for model_title, model in cmodels.items():
|
||||
|
||||
classifier = cross_validate(
|
||||
model['model'],
|
||||
model["model"],
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
error_score="raise",
|
||||
scoring=("accuracy", "precision", "recall", "f1"),
|
||||
)
|
||||
|
||||
print("\n-------------------------------------\n")
|
||||
print("Current cluster:", k, end="\n")
|
||||
print("Current model:", model_title, end="\n")
|
||||
print("Acc", np.mean(classifier['test_accuracy']))
|
||||
print("Precision", np.mean(classifier['test_precision']))
|
||||
print("Recall", np.mean(classifier['test_recall']))
|
||||
print("F1", np.mean(classifier['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl]))
|
||||
print("Acc", np.mean(classifier["test_accuracy"]))
|
||||
print("Precision", np.mean(classifier["test_precision"]))
|
||||
print("Recall", np.mean(classifier["test_recall"]))
|
||||
print("F1", np.mean(classifier["test_f1"]))
|
||||
print(
|
||||
f"Largest {N_SL} ACC:",
|
||||
np.sort(-np.partition(-classifier["test_accuracy"], N_SL)[:N_SL])[::-1],
|
||||
)
|
||||
print(
|
||||
f"Smallest {N_SL} ACC:",
|
||||
np.sort(np.partition(classifier["test_accuracy"], N_SL)[:N_SL]),
|
||||
)
|
||||
|
||||
cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
|
||||
cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
|
||||
cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall'])
|
||||
cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])
|
||||
cmodels[model_title]["metrics"][0] += np.mean(classifier["test_accuracy"])
|
||||
cmodels[model_title]["metrics"][1] += np.mean(classifier["test_precision"])
|
||||
cmodels[model_title]["metrics"][2] += np.mean(classifier["test_recall"])
|
||||
cmodels[model_title]["metrics"][3] += np.mean(classifier["test_f1"])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Get overall results
|
||||
cm.get_total_models_scores(n_clusters=n_clusters)
|
||||
scores = cm.get_total_models_scores(n_clusters=N_CLUSTERS)
|
||||
|
||||
# %%
|
||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||
path_output_full = PATH_OUTPUT / (
|
||||
TARGET_VARIABLE
|
||||
+ "_"
|
||||
+ SEGMENT_LENGTH
|
||||
+ "_classification_"
|
||||
+ CV_METHOD
|
||||
+ str(BINS)
|
||||
+ "_clust_"
|
||||
+ CLUST_COL
|
||||
+ str(N_CLUSTERS)
|
||||
+ ".csv"
|
||||
)
|
||||
scores.to_csv(path_output_full, index=False)
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# jupytext_version: 1.14.5
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
|
@ -14,92 +14,83 @@
|
|||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# %matplotlib inline
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from scipy import stats
|
||||
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from machine_learning.classification_models import ClassificationModels
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# # Useful method
|
||||
def treat_categorical_features(input_set):
|
||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||
additional_categorical_features = [col for col in input_set.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
categorical_features = input_set[categorical_feature_colnames].copy()
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
|
||||
|
||||
return pd.concat([numerical_features, categorical_features], axis=1)
|
||||
from machine_learning.helper import impute_encode_categorical_features
|
||||
|
||||
# %% [markdown]
|
||||
# ## Set script's parameters
|
||||
n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
|
||||
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
#
|
||||
|
||||
# %%
|
||||
n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
|
||||
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
|
||||
# %%
|
||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||
|
||||
SEGMENT_TYPE = "period"
|
||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||
SEGMENT_LENGTH = "30_minutes_before"
|
||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||
TARGET_VARIABLE = "appraisal_stressfulness"
|
||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||
|
||||
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
||||
TARGET_VARIABLE += "_"
|
||||
TARGET_VARIABLE += SEGMENT_TYPE
|
||||
|
||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||
|
||||
model_input = pd.read_csv(PATH_FULL)
|
||||
|
||||
if SEGMENT_LENGTH == "daily":
|
||||
DAY_LENGTH = "daily" # or "working"
|
||||
print(DAY_LENGTH)
|
||||
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
CLUST_COL = "limesurvey_demand_control_ratio"
|
||||
print("CLUST_COL: " + CLUST_COL)
|
||||
|
||||
clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
|
||||
BINS = [-1, 0, 4]
|
||||
print("BINS: " + str(BINS))
|
||||
|
||||
model_input.columns[list(model_input.columns).index('age'):-1]
|
||||
index_columns = [
|
||||
"local_segment",
|
||||
"local_segment_label",
|
||||
"local_segment_start_datetime",
|
||||
"local_segment_end_datetime",
|
||||
]
|
||||
|
||||
lime_cols = [col for col in model_input if col.startswith('limesurvey')]
|
||||
lime_cols
|
||||
lime_col = 'limesurvey_demand_control_ratio'
|
||||
clust_col = lime_col
|
||||
|
||||
model_input[clust_col].describe()
|
||||
model_input[CLUST_COL].describe()
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
|
||||
# Filter-out outlier rows by clust_col
|
||||
model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
||||
model_input = model_input[(np.abs(stats.zscore(model_input[CLUST_COL])) < 3)]
|
||||
|
||||
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
|
||||
plt.bar(uniq['pid'], uniq[clust_col])
|
||||
uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
|
||||
plt.bar(uniq["pid"], uniq[CLUST_COL])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Get clusters by cluster col & and merge the clusters to main df
|
||||
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
|
||||
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index("pid"))
|
||||
np.unique(km, return_counts=True)
|
||||
uniq['cluster'] = km
|
||||
uniq
|
||||
uniq["cluster"] = km
|
||||
print(uniq)
|
||||
|
||||
model_input = model_input.merge(uniq[['pid', 'cluster']])
|
||||
model_input = model_input.merge(uniq[["pid", "cluster"]])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
|
@ -109,44 +100,58 @@ model_input.set_index(index_columns, inplace=True)
|
|||
cm = ClassificationModels()
|
||||
cmodels = cm.get_cmodels()
|
||||
|
||||
# %%
|
||||
model_input["target"].value_counts()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
for k in range(n_clusters):
|
||||
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
||||
|
||||
# Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, seperated by z-score of 0.
|
||||
model_input_subset['numerical_target'] = model_input_subset['target']
|
||||
bins = [-10, 0, 10] # bins for z-scored targets
|
||||
model_input_subset.loc[:, 'target'] = \
|
||||
pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
|
||||
# model_input_subset['numerical_target'] = model_input_subset['target']
|
||||
|
||||
p15 = np.percentile(model_input_subset['numerical_target'], 15)
|
||||
p85 = np.percentile(model_input_subset['numerical_target'], 85)
|
||||
model_input_subset.loc[:, "target"] = pd.cut(
|
||||
model_input_subset.loc[:, "target"], bins=BINS, labels=[0, 1], right=True
|
||||
)
|
||||
|
||||
# p15 = np.percentile(model_input_subset['numerical_target'], 15)
|
||||
# p85 = np.percentile(model_input_subset['numerical_target'], 85)
|
||||
|
||||
# Treat categorical features
|
||||
model_input_subset = treat_categorical_features(model_input_subset)
|
||||
model_input_subset = impute_encode_categorical_features(model_input_subset)
|
||||
|
||||
# Split to train, validate, and test subsets
|
||||
train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
|
||||
test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
|
||||
# train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
|
||||
# test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
|
||||
train_set, test_set = train_test_split(
|
||||
model_input_subset,
|
||||
test_size=0.3,
|
||||
stratify=model_input_subset["pid"],
|
||||
random_state=42,
|
||||
)
|
||||
|
||||
train_set['target'].value_counts()
|
||||
test_set['target'].value_counts()
|
||||
print(train_set["target"].value_counts())
|
||||
print(test_set["target"].value_counts())
|
||||
|
||||
train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
|
||||
|
||||
validate_x, test_x, validate_y, test_y = \
|
||||
train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)
|
||||
validate_x, test_x, validate_y, test_y = train_test_split(
|
||||
test_set.drop(["target", "pid"], axis=1),
|
||||
test_set["target"],
|
||||
test_size=0.50,
|
||||
random_state=42,
|
||||
)
|
||||
|
||||
# Impute missing values
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
|
||||
|
||||
train_x = imputer.fit_transform(train_x)
|
||||
validate_x = imputer.fit_transform(validate_x)
|
||||
test_x = imputer.fit_transform(test_x)
|
||||
|
||||
for model_title, model in cmodels.items():
|
||||
model['model'].fit(train_x, train_y)
|
||||
y_pred = model['model'].predict(validate_x)
|
||||
model["model"].fit(train_x, train_y)
|
||||
y_pred = model["model"].predict(validate_x)
|
||||
|
||||
acc = accuracy_score(validate_y, y_pred)
|
||||
prec = precision_score(validate_y, y_pred)
|
||||
|
@ -161,11 +166,29 @@ for k in range(n_clusters):
|
|||
print("Recall", rec)
|
||||
print("F1", f1)
|
||||
|
||||
cmodels[model_title]['metrics'][0] += acc
|
||||
cmodels[model_title]['metrics'][1] += prec
|
||||
cmodels[model_title]['metrics'][2] += rec
|
||||
cmodels[model_title]['metrics'][3] += f1
|
||||
cmodels[model_title]["metrics"][0] += acc
|
||||
cmodels[model_title]["metrics"][1] += prec
|
||||
cmodels[model_title]["metrics"][2] += rec
|
||||
cmodels[model_title]["metrics"][3] += f1
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Get overall results
|
||||
cm.get_total_models_scores(n_clusters=n_clusters)
|
||||
scores = cm.get_total_models_scores(n_clusters=n_clusters)
|
||||
|
||||
# %%
|
||||
print(scores)
|
||||
|
||||
# %%
|
||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||
path_output_full = PATH_OUTPUT / (
|
||||
TARGET_VARIABLE
|
||||
+ "_"
|
||||
+ SEGMENT_LENGTH
|
||||
+ "_classification"
|
||||
+ str(BINS)
|
||||
+ "_CLUST_"
|
||||
+ CLUST_COL
|
||||
+ +str(n_clusters)
|
||||
+ ".csv"
|
||||
)
|
||||
scores.to_csv(path_output_full, index=False)
|
||||
|
|
|
@ -6,445 +6,61 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.13.0
|
||||
# jupytext_version: 1.14.5
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# %matplotlib inline
|
||||
# %%
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import xgboost as xg
|
||||
from machine_learning.helper import prepare_regression_model_input
|
||||
from sklearn import gaussian_process, kernel_ridge, linear_model, svm
|
||||
from sklearn.dummy import DummyRegressor
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
|
||||
|
||||
# from IPython.core.interactiveshell import InteractiveShell
|
||||
# InteractiveShell.ast_node_interactivity = "all"
|
||||
from machine_learning.helper import (
|
||||
impute_encode_categorical_features,
|
||||
prepare_cross_validator,
|
||||
prepare_sklearn_data_format,
|
||||
run_all_regression_models,
|
||||
)
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# %%
|
||||
model_input = pd.read_csv(
|
||||
"../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
|
||||
)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
cv_method = "half_logo" # logo, half_logo, 5kfold
|
||||
|
||||
train_x, data_y, data_groups = prepare_regression_model_input(model_input, cv_method)
|
||||
# %% jupyter={"source_hidden": true}
|
||||
logo = LeaveOneGroupOut()
|
||||
logo.get_n_splits(
|
||||
train_x,
|
||||
data_y,
|
||||
groups=data_groups,
|
||||
)
|
||||
|
||||
# Defaults to 5 k folds in cross_validate method
|
||||
if cv_method != "logo" and cv_method != "half_logo":
|
||||
logo = None
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
sum(data_y.isna())
|
||||
|
||||
# %% [markdown]
|
||||
# ### Baseline: Dummy Regression (mean)
|
||||
dummy_regr = DummyRegressor(strategy="mean")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
dummy_regressor = cross_validate(
|
||||
dummy_regr,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=(
|
||||
"r2",
|
||||
"neg_mean_squared_error",
|
||||
"neg_mean_absolute_error",
|
||||
"neg_root_mean_squared_error",
|
||||
),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Squared Error",
|
||||
np.median(dummy_regressor["test_neg_mean_squared_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Absolute Error",
|
||||
np.median(dummy_regressor["test_neg_mean_absolute_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Root Mean Squared Error",
|
||||
np.median(dummy_regressor["test_neg_root_mean_squared_error"]),
|
||||
)
|
||||
print("R2", np.median(dummy_regressor["test_r2"]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Linear Regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lin_reg_rapids = linear_model.LinearRegression()
|
||||
# %% jupyter={"source_hidden": true}
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lin_reg_scores = cross_validate(
|
||||
lin_reg_rapids,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=(
|
||||
"r2",
|
||||
"neg_mean_squared_error",
|
||||
"neg_mean_absolute_error",
|
||||
"neg_root_mean_squared_error",
|
||||
),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Squared Error",
|
||||
np.median(lin_reg_scores["test_neg_mean_squared_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Absolute Error",
|
||||
np.median(lin_reg_scores["test_neg_mean_absolute_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Root Mean Squared Error",
|
||||
np.median(lin_reg_scores["test_neg_root_mean_squared_error"]),
|
||||
)
|
||||
print("R2", np.median(lin_reg_scores["test_r2"]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### XGBRegressor Linear Regression
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_r = xg.XGBRegressor(objective="reg:squarederror", n_estimators=10)
|
||||
# %% jupyter={"source_hidden": true}
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_reg_scores = cross_validate(
|
||||
xgb_r,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=(
|
||||
"r2",
|
||||
"neg_mean_squared_error",
|
||||
"neg_mean_absolute_error",
|
||||
"neg_root_mean_squared_error",
|
||||
),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Squared Error",
|
||||
np.median(xgb_reg_scores["test_neg_mean_squared_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Absolute Error",
|
||||
np.median(xgb_reg_scores["test_neg_mean_absolute_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Root Mean Squared Error",
|
||||
np.median(xgb_reg_scores["test_neg_root_mean_squared_error"]),
|
||||
)
|
||||
print("R2", np.median(xgb_reg_scores["test_r2"]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### XGBRegressor Pseudo Huber Error Regression
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_psuedo_huber_r = xg.XGBRegressor(objective="reg:pseudohubererror", n_estimators=10)
|
||||
# %% jupyter={"source_hidden": true}
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_psuedo_huber_reg_scores = cross_validate(
|
||||
xgb_psuedo_huber_r,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=(
|
||||
"r2",
|
||||
"neg_mean_squared_error",
|
||||
"neg_mean_absolute_error",
|
||||
"neg_root_mean_squared_error",
|
||||
),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Squared Error",
|
||||
np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_squared_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Absolute Error",
|
||||
np.median(xgb_psuedo_huber_reg_scores["test_neg_mean_absolute_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Root Mean Squared Error",
|
||||
np.median(xgb_psuedo_huber_reg_scores["test_neg_root_mean_squared_error"]),
|
||||
)
|
||||
print("R2", np.median(xgb_psuedo_huber_reg_scores["test_r2"]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Ridge regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
ridge_reg = linear_model.Ridge(alpha=0.5)
|
||||
|
||||
# %% tags=[] jupyter={"source_hidden": true}
|
||||
ridge_reg_scores = cross_validate(
|
||||
ridge_reg,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=(
|
||||
"r2",
|
||||
"neg_mean_squared_error",
|
||||
"neg_mean_absolute_error",
|
||||
"neg_root_mean_squared_error",
|
||||
),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Squared Error",
|
||||
np.median(ridge_reg_scores["test_neg_mean_squared_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Absolute Error",
|
||||
np.median(ridge_reg_scores["test_neg_mean_absolute_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Root Mean Squared Error",
|
||||
np.median(ridge_reg_scores["test_neg_root_mean_squared_error"]),
|
||||
)
|
||||
print("R2", np.median(ridge_reg_scores["test_r2"]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Lasso
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lasso_reg = linear_model.Lasso(alpha=0.1)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lasso_reg_score = cross_validate(
|
||||
lasso_reg,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=(
|
||||
"r2",
|
||||
"neg_mean_squared_error",
|
||||
"neg_mean_absolute_error",
|
||||
"neg_root_mean_squared_error",
|
||||
),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Squared Error",
|
||||
np.median(lasso_reg_score["test_neg_mean_squared_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Absolute Error",
|
||||
np.median(lasso_reg_score["test_neg_mean_absolute_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Root Mean Squared Error",
|
||||
np.median(lasso_reg_score["test_neg_root_mean_squared_error"]),
|
||||
)
|
||||
print("R2", np.median(lasso_reg_score["test_r2"]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Bayesian Ridge
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
bayesian_ridge_reg = linear_model.BayesianRidge()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
bayesian_ridge_reg_score = cross_validate(
|
||||
bayesian_ridge_reg,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=(
|
||||
"r2",
|
||||
"neg_mean_squared_error",
|
||||
"neg_mean_absolute_error",
|
||||
"neg_root_mean_squared_error",
|
||||
),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Squared Error",
|
||||
np.median(bayesian_ridge_reg_score["test_neg_mean_squared_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Absolute Error",
|
||||
np.median(bayesian_ridge_reg_score["test_neg_mean_absolute_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Root Mean Squared Error",
|
||||
np.median(bayesian_ridge_reg_score["test_neg_root_mean_squared_error"]),
|
||||
)
|
||||
print("R2", np.median(bayesian_ridge_reg_score["test_r2"]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### RANSAC (outlier robust regression)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
ransac_reg = linear_model.RANSACRegressor()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
ransac_reg_scores = cross_validate(
|
||||
ransac_reg,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=(
|
||||
"r2",
|
||||
"neg_mean_squared_error",
|
||||
"neg_mean_absolute_error",
|
||||
"neg_root_mean_squared_error",
|
||||
),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Squared Error",
|
||||
np.median(ransac_reg_scores["test_neg_mean_squared_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Absolute Error",
|
||||
np.median(ransac_reg_scores["test_neg_mean_absolute_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Root Mean Squared Error",
|
||||
np.median(ransac_reg_scores["test_neg_root_mean_squared_error"]),
|
||||
)
|
||||
print("R2", np.median(ransac_reg_scores["test_r2"]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Support vector regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
svr = svm.SVR()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
svr_scores = cross_validate(
|
||||
svr,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=(
|
||||
"r2",
|
||||
"neg_mean_squared_error",
|
||||
"neg_mean_absolute_error",
|
||||
"neg_root_mean_squared_error",
|
||||
),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Squared Error", np.median(svr_scores["test_neg_mean_squared_error"])
|
||||
)
|
||||
print(
|
||||
"Negative Mean Absolute Error",
|
||||
np.median(svr_scores["test_neg_mean_absolute_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Root Mean Squared Error",
|
||||
np.median(svr_scores["test_neg_root_mean_squared_error"]),
|
||||
)
|
||||
print("R2", np.median(svr_scores["test_r2"]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Kernel Ridge regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
kridge = kernel_ridge.KernelRidge()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
kridge_scores = cross_validate(
|
||||
kridge,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=(
|
||||
"r2",
|
||||
"neg_mean_squared_error",
|
||||
"neg_mean_absolute_error",
|
||||
"neg_root_mean_squared_error",
|
||||
),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Squared Error",
|
||||
np.median(kridge_scores["test_neg_mean_squared_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Absolute Error",
|
||||
np.median(kridge_scores["test_neg_mean_absolute_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Root Mean Squared Error",
|
||||
np.median(kridge_scores["test_neg_root_mean_squared_error"]),
|
||||
)
|
||||
print("R2", np.median(kridge_scores["test_r2"]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Gaussian Process Regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
gpr = gaussian_process.GaussianProcessRegressor()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
|
||||
gpr_scores = cross_validate(
|
||||
gpr,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=(
|
||||
"r2",
|
||||
"neg_mean_squared_error",
|
||||
"neg_mean_absolute_error",
|
||||
"neg_root_mean_squared_error",
|
||||
),
|
||||
)
|
||||
print(
|
||||
"Negative Mean Squared Error", np.median(gpr_scores["test_neg_mean_squared_error"])
|
||||
)
|
||||
print(
|
||||
"Negative Mean Absolute Error",
|
||||
np.median(gpr_scores["test_neg_mean_absolute_error"]),
|
||||
)
|
||||
print(
|
||||
"Negative Root Mean Squared Error",
|
||||
np.median(gpr_scores["test_neg_root_mean_squared_error"]),
|
||||
)
|
||||
print("R2", np.median(gpr_scores["test_r2"]))
|
||||
# %%
|
||||
model_input = model_input[model_input["local_segment"].str.contains("daily")]
|
||||
|
||||
# %%
|
||||
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
||||
|
||||
model_input_encoded = impute_encode_categorical_features(model_input)
|
||||
# %%
|
||||
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
||||
model_input_encoded, CV_METHOD
|
||||
)
|
||||
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
||||
# %%
|
||||
data_y.head()
|
||||
|
||||
# %%
|
||||
data_y.tail()
|
||||
|
||||
# %%
|
||||
data_y.shape
|
||||
|
||||
# %%
|
||||
scores = run_all_regression_models(data_x, data_y, data_groups, cross_validator)
|
||||
|
||||
# %%
|
||||
scores.to_csv(
|
||||
"../presentation/JCQ_supervisor_support_regression_" + CV_METHOD + ".csv",
|
||||
index=False,
|
||||
)
|
||||
|
|
|
@ -0,0 +1,217 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.14.5
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %%
|
||||
import pandas as pd
|
||||
|
||||
from features.esm_JCQ import DICT_JCQ_DEMAND_CONTROL_REVERSE
|
||||
|
||||
# %%
|
||||
limesurvey_questions = pd.read_csv(
|
||||
"E:/STRAWbaseline/survey637813+question_text.csv", header=None
|
||||
).T
|
||||
|
||||
# %%
|
||||
limesurvey_questions
|
||||
|
||||
# %%
|
||||
limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(
|
||||
r"\.\s", expand=True, n=1
|
||||
)
|
||||
|
||||
# %%
|
||||
limesurvey_questions
|
||||
|
||||
# %%
|
||||
demand_reverse_lime_rows = (
|
||||
limesurvey_questions["text"].str.startswith(" [Od mene se ne zahteva,")
|
||||
| limesurvey_questions["text"].str.startswith(" [Imam dovolj časa, da končam")
|
||||
| limesurvey_questions["text"].str.startswith(
|
||||
" [Pri svojem delu se ne srečujem s konfliktnimi"
|
||||
)
|
||||
)
|
||||
control_reverse_lime_rows = limesurvey_questions["text"].str.startswith(
|
||||
" [Moje delo vključuje veliko ponavljajočega"
|
||||
) | limesurvey_questions["text"].str.startswith(
|
||||
" [Pri svojem delu imam zelo malo svobode"
|
||||
)
|
||||
|
||||
# %%
|
||||
demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows]
|
||||
demand_reverse_lime.loc[:, "qid"] = demand_reverse_lime["code"].str.extract(
|
||||
r"\[(\d+)\]"
|
||||
)
|
||||
control_reverse_lime = limesurvey_questions[control_reverse_lime_rows]
|
||||
control_reverse_lime.loc[:, "qid"] = control_reverse_lime["code"].str.extract(
|
||||
r"\[(\d+)\]"
|
||||
)
|
||||
|
||||
# %%
|
||||
limesurvey_questions.loc[89, "text"]
|
||||
|
||||
# %%
|
||||
limesurvey_questions[limesurvey_questions["code"].str.startswith("JobEisen")]
|
||||
|
||||
# %%
|
||||
demand_reverse_lime
|
||||
|
||||
# %%
|
||||
control_reverse_lime
|
||||
|
||||
# %%
|
||||
participant_info = pd.read_csv(
|
||||
"C:/Users/junos/Documents/FWO-ARRS/Analysis/straw2analysis/rapids/data/raw/p031/participant_baseline_raw.csv",
|
||||
parse_dates=["date_of_birth"],
|
||||
)
|
||||
|
||||
# %%
|
||||
participant_info_t = participant_info.T
|
||||
|
||||
# %%
|
||||
rows_baseline = participant_info_t.index
|
||||
|
||||
# %%
|
||||
rows_demand = rows_baseline.str.startswith("JobEisen") & ~rows_baseline.str.endswith(
|
||||
"Time"
|
||||
)
|
||||
|
||||
# %%
|
||||
rows_baseline[rows_demand]
|
||||
|
||||
# %%
|
||||
limesurvey_control = (
|
||||
participant_info_t[rows_demand]
|
||||
.reset_index()
|
||||
.rename(columns={"index": "question", 0: "score_original"})
|
||||
)
|
||||
|
||||
# %%
|
||||
limesurvey_control
|
||||
|
||||
# %%
|
||||
limesurvey_control["qid"] = (
|
||||
limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
|
||||
)
|
||||
|
||||
# %%
|
||||
limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
|
||||
|
||||
# %%
|
||||
limesurvey_control["score"] = limesurvey_control["score_original"]
|
||||
|
||||
# %%
|
||||
limesurvey_control["qid"][0]
|
||||
|
||||
# %%
|
||||
rows_demand_reverse = limesurvey_control["qid"].isin(
|
||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
||||
)
|
||||
limesurvey_control.loc[rows_demand_reverse, "score"] = (
|
||||
4 + 1 - limesurvey_control.loc[rows_demand_reverse, "score_original"]
|
||||
)
|
||||
|
||||
# %%
|
||||
JCQ_DEMAND = "JobEisen"
|
||||
JCQ_CONTROL = "JobControle"
|
||||
dict_JCQ_demand_control_reverse = {
|
||||
JCQ_DEMAND: {
|
||||
3: " [Od mene se ne zahteva,",
|
||||
4: " [Imam dovolj časa, da končam",
|
||||
5: " [Pri svojem delu se ne srečujem s konfliktnimi",
|
||||
},
|
||||
JCQ_CONTROL: {
|
||||
2: " |Moje delo vključuje veliko ponavljajočega",
|
||||
6: " [Pri svojem delu imam zelo malo svobode",
|
||||
},
|
||||
}
|
||||
|
||||
# %%
|
||||
limesurvey_control
|
||||
|
||||
# %%
|
||||
test = pd.DataFrame(
|
||||
data={"question": "one", "score_original": 3, "score": 3, "qid": 10}, index=[0]
|
||||
)
|
||||
|
||||
# %%
|
||||
pd.concat([test, limesurvey_control]).reset_index()
|
||||
|
||||
# %%
|
||||
limesurvey_control["score"].sum()
|
||||
|
||||
# %%
|
||||
rows_demand_reverse
|
||||
|
||||
# %%
|
||||
dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
|
||||
|
||||
# %%
|
||||
limesurvey_control
|
||||
|
||||
# %%
|
||||
DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
|
||||
DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
|
||||
|
||||
JCQ_NORMS = {
|
||||
"F": {
|
||||
0: DEMAND_CONTROL_RATIO_MIN,
|
||||
1: 0.45,
|
||||
2: 0.52,
|
||||
3: 0.62,
|
||||
4: DEMAND_CONTROL_RATIO_MAX,
|
||||
},
|
||||
"M": {
|
||||
0: DEMAND_CONTROL_RATIO_MIN,
|
||||
1: 0.41,
|
||||
2: 0.48,
|
||||
3: 0.56,
|
||||
4: DEMAND_CONTROL_RATIO_MAX,
|
||||
},
|
||||
}
|
||||
|
||||
# %%
|
||||
JCQ_NORMS[participant_info.loc[0, "gender"]][0]
|
||||
|
||||
# %%
|
||||
participant_info_t.index.str.startswith("JobControle")
|
||||
|
||||
# %%
|
||||
columns_baseline = participant_info.columns
|
||||
|
||||
# %%
|
||||
columns_demand = columns_baseline.str.startswith(
|
||||
"JobControle"
|
||||
) & ~columns_baseline.str.endswith("Time")
|
||||
|
||||
# %%
|
||||
columns_baseline[columns_demand]
|
||||
|
||||
# %%
|
||||
participant_control = participant_info.loc[:, columns_demand]
|
||||
|
||||
# %%
|
||||
participant_control["id"] = participant_control.index
|
||||
|
||||
# %%
|
||||
participant_control
|
||||
|
||||
# %%
|
||||
pd.wide_to_long(
|
||||
participant_control,
|
||||
stubnames="JobControle",
|
||||
i="id",
|
||||
j="qid",
|
||||
sep="[",
|
||||
suffix="(\\d+)]",
|
||||
)
|
195
features/esm.py
195
features/esm.py
|
@ -20,11 +20,47 @@ ANSWER_DAY_OFF = "DayOff3421"
|
|||
ANSWER_SET_EVENING = "DayFinishedSetEvening"
|
||||
|
||||
MAX_MORNING_LENGTH = 3
|
||||
# When the participants was not yet at work at the time of the first (morning) EMA,
|
||||
# When the participant was not yet at work at the time of the first (morning) EMA,
|
||||
# only three items were answered.
|
||||
# Two sleep related items and one indicating NOT starting work yet.
|
||||
# Daytime EMAs are all longer, in fact they always consist of at least 6 items.
|
||||
|
||||
QUESTIONNAIRE_IDS = {
|
||||
"sleep_quality": 1,
|
||||
"PANAS_positive_affect": 8,
|
||||
"PANAS_negative_affect": 9,
|
||||
"JCQ_job_demand": 10,
|
||||
"JCQ_job_control": 11,
|
||||
"JCQ_supervisor_support": 12,
|
||||
"JCQ_coworker_support": 13,
|
||||
"PFITS_supervisor": 14,
|
||||
"PFITS_coworkers": 15,
|
||||
"UWES_vigor": 16,
|
||||
"UWES_dedication": 17,
|
||||
"UWES_absorption": 18,
|
||||
"COPE_active": 19,
|
||||
"COPE_support": 20,
|
||||
"COPE_emotions": 21,
|
||||
"balance_life_work": 22,
|
||||
"balance_work_life": 23,
|
||||
"recovery_experience_detachment": 24,
|
||||
"recovery_experience_relaxation": 25,
|
||||
"symptoms": 26,
|
||||
"appraisal_stressfulness_event": 87,
|
||||
"appraisal_threat": 88,
|
||||
"appraisal_challenge": 89,
|
||||
"appraisal_event_time": 90,
|
||||
"appraisal_event_duration": 91,
|
||||
"appraisal_event_work_related": 92,
|
||||
"appraisal_stressfulness_period": 93,
|
||||
"late_work": 94,
|
||||
"work_hours": 95,
|
||||
"left_work": 96,
|
||||
"activities": 97,
|
||||
"coffee_breaks": 98,
|
||||
"at_work_yet": 99,
|
||||
}
|
||||
|
||||
|
||||
def get_esm_data(usernames: Collection) -> pd.DataFrame:
|
||||
"""
|
||||
|
@ -52,8 +88,10 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame:
|
|||
|
||||
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Convert timestamps and expand JSON column.
|
||||
|
||||
Convert timestamps into human-readable datetimes and dates
|
||||
and expand the JSON column into several Pandas DF columns.
|
||||
and expand the JSON column into several Pandas DF columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
@ -63,7 +101,8 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
|||
Returns
|
||||
-------
|
||||
df_esm_preprocessed: pd.DataFrame
|
||||
A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
|
||||
A dataframe with added columns: datetime in Ljubljana timezone
|
||||
and all fields from ESM_JSON column.
|
||||
"""
|
||||
df_esm = helper.get_date_from_timestamp(df_esm)
|
||||
|
||||
|
@ -76,31 +115,39 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
|||
def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
For each distinct EMA session, determine how the participant responded to it.
|
||||
Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE
|
||||
|
||||
Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
|
||||
and SESSION_STATUS_COMPLETE
|
||||
|
||||
This is done in three steps.
|
||||
|
||||
First, the esm_status is considered.
|
||||
If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
|
||||
If any of the ESMs in a session has a status *other than* "answered",
|
||||
then this session is taken as unfinished.
|
||||
|
||||
Second, the sessions which do not represent full questionnaires are identified.
|
||||
These are sessions where participants only marked they are finished with the day or have not yet started working.
|
||||
These are sessions where participants only marked they are finished with the day
|
||||
or have not yet started working.
|
||||
|
||||
Third, the sessions with only one item are marked with their trigger.
|
||||
We never offered questionnaires with single items, so we can be sure these are unfinished.
|
||||
We never offered questionnaires with single items,
|
||||
so we can be sure these are unfinished.
|
||||
|
||||
Finally, all sessions that remain are marked as completed.
|
||||
By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.
|
||||
By going through different possibilities in expl_esm_adherence.ipynb,
|
||||
this turned out to be a reasonable option.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_esm_preprocessed: pd.DataFrame
|
||||
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
||||
A preprocessed dataframe of esm data,
|
||||
which must include the session ID (esm_session).
|
||||
|
||||
Returns
|
||||
-------
|
||||
df_session_counts: pd.Dataframe
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
|
||||
with their statuses and the number of items.
|
||||
"""
|
||||
sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
|
||||
|
||||
|
@ -155,17 +202,22 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat
|
|||
|
||||
def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening.)
|
||||
Classify EMA sessions into morning, workday, or evening.
|
||||
|
||||
For each EMA session, determine the time of the first user answer
|
||||
and its time type (morning, workday, or evening).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_esm_preprocessed: pd.DataFrame
|
||||
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
||||
A preprocessed dataframe of esm data,
|
||||
which must include the session ID (esm_session).
|
||||
|
||||
Returns
|
||||
-------
|
||||
df_session_time: pd.DataFrame
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
|
||||
with their time type and timestamp of first answer.
|
||||
"""
|
||||
df_session_time = (
|
||||
df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
|
||||
|
@ -179,13 +231,17 @@ def classify_sessions_by_completion_time(
|
|||
df_esm_preprocessed: pd.DataFrame,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
The point of this function is to not only classify sessions by using the previously defined functions.
|
||||
Classify sessions and correct the time type.
|
||||
|
||||
The point of this function is to not only classify sessions
|
||||
by using the previously defined functions.
|
||||
It also serves to "correct" the time type of some EMA sessions.
|
||||
|
||||
A morning questionnaire could seamlessly transition into a daytime questionnaire,
|
||||
if the participant was already at work.
|
||||
In this case, the "time" label changed mid-session.
|
||||
Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
|
||||
Because of the way classify_sessions_by_time works,
|
||||
this questionnaire was classified as "morning".
|
||||
But for all intents and purposes, it can be treated as a "daytime" EMA.
|
||||
|
||||
The way this scenario is differentiated from a true "morning" questionnaire,
|
||||
|
@ -194,13 +250,16 @@ def classify_sessions_by_completion_time(
|
|||
Parameters
|
||||
----------
|
||||
df_esm_preprocessed: pd.DataFrame
|
||||
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
||||
A preprocessed dataframe of esm data,
|
||||
which must include the session ID (esm_session).
|
||||
|
||||
Returns
|
||||
-------
|
||||
df_session_counts_time: pd.DataFrame
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
|
||||
their time type (with some morning EMAs reclassified) and timestamp of first answer.
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
|
||||
the number of items,
|
||||
their time type (with some morning EMAs reclassified)
|
||||
and timestamp of first answer.
|
||||
|
||||
"""
|
||||
df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
|
||||
|
@ -219,7 +278,8 @@ def classify_sessions_by_completion_time(
|
|||
|
||||
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
This function eliminates invalid ESM responses.
|
||||
Eliminate invalid ESM responses.
|
||||
|
||||
It removes unanswered ESMs and those that indicate end of work and similar.
|
||||
It also extracts a numeric answer from strings such as "4 - I strongly agree".
|
||||
|
||||
|
@ -256,3 +316,100 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
|||
)
|
||||
)
|
||||
return df_esm_clean
|
||||
|
||||
|
||||
def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1):
|
||||
"""
|
||||
Increment answers to keep in line with original scoring.
|
||||
|
||||
We always used 0 for the lowest value of user answer.
|
||||
Some scales originally used other scoring, such as starting from 1.
|
||||
This restores original scoring so that the values are comparable to references.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_esm_clean: pd.DataFrame
|
||||
A cleaned ESM dataframe, which must also include esm_user_answer_numeric.
|
||||
increment_by:
|
||||
A number to add to the user answer.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df_esm_clean: pd.DataFrame
|
||||
The same df with addition of a column 'esm_user_answer_numeric'.
|
||||
|
||||
"""
|
||||
try:
|
||||
df_esm_clean = df_esm_clean.assign(
|
||||
esm_user_score=lambda x: x.esm_user_answer_numeric + increment_by
|
||||
)
|
||||
except AttributeError as e:
|
||||
print("Please, clean the dataframe first using features.esm.clean_up_esm.")
|
||||
print(e)
|
||||
return df_esm_clean
|
||||
|
||||
|
||||
def reassign_question_ids(
|
||||
df_esm_cleaned: pd.DataFrame, question_ids_content: dict
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Fix question IDs to match their actual content.
|
||||
|
||||
Unfortunately, when altering the protocol to adapt to COVID pandemic,
|
||||
we did not retain original question IDs.
|
||||
This means that for participants before 2021, they are different
|
||||
from for the rest of them.
|
||||
This function searches for question IDs by matching their strings.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_esm_cleaned: pd.DataFrame
|
||||
A cleaned up dataframe, which must also include esm_user_answer_numeric.
|
||||
question_ids_content: dict
|
||||
A dictionary, linking question IDs with their content ("instructions").
|
||||
|
||||
Returns
|
||||
-------
|
||||
df_esm_fixed: pd.DataFrame
|
||||
The same dataframe but with fixed question IDs.
|
||||
"""
|
||||
df_esm_unique_questions = (
|
||||
df_esm_cleaned.groupby("question_id")
|
||||
.esm_instructions.value_counts()
|
||||
.rename()
|
||||
.reset_index()
|
||||
)
|
||||
# Tabulate all possible answers to each question (group by question ID).
|
||||
|
||||
# First, check that we anticipated all esm instructions.
|
||||
for q_id in question_ids_content.keys():
|
||||
# Look for all questions ("instructions") occurring in the dataframe.
|
||||
actual_questions = df_esm_unique_questions.loc[
|
||||
df_esm_unique_questions["question_id"] == q_id,
|
||||
"esm_instructions",
|
||||
]
|
||||
# These are all answers to a given question (by q_id).
|
||||
questions_matches = actual_questions.str.startswith(
|
||||
question_ids_content.get(q_id)
|
||||
)
|
||||
# See if they are expected, i.e. included in the dictionary.
|
||||
if ~actual_questions.all():
|
||||
print("One of the questions that occur in the data was undefined.")
|
||||
print("This were the questions found in the data: ")
|
||||
raise KeyError(actual_questions[~questions_matches])
|
||||
# In case there is an unexpected answer, raise an exception.
|
||||
|
||||
# Next, replace question IDs.
|
||||
df_esm_fixed = df_esm_cleaned.copy()
|
||||
df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].apply(
|
||||
lambda x: next(
|
||||
(
|
||||
key
|
||||
for key, values in question_ids_content.items()
|
||||
if x.startswith(values)
|
||||
),
|
||||
None,
|
||||
)
|
||||
)
|
||||
|
||||
return df_esm_fixed
|
||||
|
|
|
@ -0,0 +1,125 @@
|
|||
COPE_ORIGINAL_MAX = 4
|
||||
COPE_ORIGINAL_MIN = 1
|
||||
|
||||
DICT_COPE_QUESTION_IDS = {
|
||||
164: (
|
||||
"I took additional action to try to get rid of the problem",
|
||||
"Ik deed extra mijn best om er iets aan te doen",
|
||||
"Vložila sem dodaten napor, da bi rešila problem",
|
||||
"Vložil sem dodaten napor, da bi rešil problem",
|
||||
),
|
||||
165: (
|
||||
"I concentrated my efforts on doing something about it",
|
||||
"Ik probeerde de situatie te verbeteren",
|
||||
"Svoje sile sem usmerila v reševanje nastale situacije",
|
||||
"Svoje sile sem usmeril v reševanje nastale situacije",
|
||||
),
|
||||
166: (
|
||||
"I did what had to be done, one step at a time",
|
||||
"Ik deed stap voor stap wat nodig was",
|
||||
"Naredila sem, kar je bilo potrebno – korak za korakom",
|
||||
"Naredil sem, kar je bilo potrebno – korak za korakom",
|
||||
),
|
||||
167: (
|
||||
"I took direct action to get around the problem",
|
||||
"Ik handelde vlug om het probleem te verhelpen",
|
||||
"Nekaj sem naredila, da sem zaobšla problem",
|
||||
"Nekaj sem naredil, da sem zaobšel problem",
|
||||
),
|
||||
168: (
|
||||
"I tried to come up with a strategy about what to do",
|
||||
"Ik probeerde te verzinnen wat ik er aan kon doen",
|
||||
"Skušala sem najti ustrezen način za rešitev situacije",
|
||||
"Skušal sem najti ustrezen način za rešitev situacije",
|
||||
),
|
||||
169: (
|
||||
"I made a plan of action",
|
||||
"Ik maakte een plan",
|
||||
"Naredila sem načrt za delovanje",
|
||||
"Naredil sem načrt za delovanje",
|
||||
),
|
||||
170: (
|
||||
"I thought hard about what steps to take",
|
||||
"Ik dacht hard na over wat ik moest doen",
|
||||
"Dobro sem premislila, katere korake moram narediti, da rešim problem",
|
||||
"Dobro sem premislil, katere korake moram narediti, da rešim problem",
|
||||
),
|
||||
171: (
|
||||
"I thought about how I might best handle the problem",
|
||||
"lk dacht na over hoe ik het probleem het best kon aanpakken",
|
||||
"Razmišljala sem, kaj bi bilo najbolje narediti s problemom",
|
||||
"Razmišljal sem, kaj bi bilo najbolje narediti s problemom",
|
||||
),
|
||||
172: (
|
||||
"I asked people who have had similar experiences what they did",
|
||||
"Ik vroeg aan mensen met dergelijke ervaringen hoe zij reageerden",
|
||||
"Vprašala sem posameznike s podobnimi izkušnjami, kaj so storili",
|
||||
"Vprašal sem posameznike s podobnimi izkušnjami, kaj so storili",
|
||||
),
|
||||
173: (
|
||||
"I tried to get advice from someone about what to do",
|
||||
"lk vroeg advies aan iemand",
|
||||
"Pri drugih sem poskušala dobiti nasvet, kaj naj storim",
|
||||
"Pri drugih sem poskušal dobiti nasvet, kaj naj storim",
|
||||
),
|
||||
174: (
|
||||
"I talked to someone to find out more about the situation",
|
||||
"Ik sprak met iemand om meer te weten te komen over de situatie",
|
||||
"Z nekom sem se pogovorila, da bi izvedela še kaj o svojem problemu",
|
||||
"Z nekom sem se pogovoril, da bi izvedel še kaj o svojem problemu",
|
||||
),
|
||||
175: (
|
||||
"I talked to someone who could do something concrete about the problem",
|
||||
"Ik sprak met iemand die iets aan het probleem kon doen",
|
||||
"Pogovorila sem se s kom, ki bi lahko naredil kaj konkretnega",
|
||||
"Pogovoril sem se s kom, ki bi lahko naredil kaj konkretnega",
|
||||
),
|
||||
176: (
|
||||
"I talked to someone about how I felt",
|
||||
"Ik sprak met iemand over hoe ik mij voelde",
|
||||
"Z nekom sem se pogovorila o tem, kako sem se počutila",
|
||||
"Z nekom sem se pogovoril o tem, kako sem se počutil",
|
||||
),
|
||||
177: (
|
||||
"I tried to get emotional support from friends or relatives",
|
||||
"Ik zocht steun bij vrienden of familie",
|
||||
"Skušala sem dobiti čustveno podporo prijateljev ali sorodnikov",
|
||||
"Skušal sem dobiti čustveno podporo prijateljev ali sorodnikov",
|
||||
),
|
||||
178: (
|
||||
"I discussed my feelings with someone",
|
||||
"lk besprak mijn gevoelens met iemand",
|
||||
"O svojih občutkih sem se z nekom pogovorila",
|
||||
"O svojih občutkih sem se z nekom pogovoril",
|
||||
),
|
||||
179: (
|
||||
"I got sympathy and understanding from someone",
|
||||
"Ik vroeg medeleven en begrip van iemand",
|
||||
"Poiskala sem naklonjenost in razumevanje drugih",
|
||||
"Poiskal sem naklonjenost in razumevanje drugih",
|
||||
),
|
||||
180: (
|
||||
"I got upset and let my emotions out",
|
||||
"Ik raakte van streek",
|
||||
"Razburila sem se in to tudi pokazala",
|
||||
"Razburil sem se in to tudi pokazal",
|
||||
),
|
||||
181: (
|
||||
"I let my feelings out",
|
||||
"Ik toonde mijn gevoelens",
|
||||
"Svojim čustvom sem dala prosto pot",
|
||||
"Svojim čustvom sem dal prosto pot",
|
||||
),
|
||||
182: (
|
||||
"I felt a lot of emotional distress and I found myself expressing",
|
||||
"lk liet duidelijk blijken hoe ellendig ik mij voelde",
|
||||
"Doživljala sem veliko stresa in opažala, da sem čustva",
|
||||
"Doživljal sem veliko stresa in opažal, da sem čustva",
|
||||
),
|
||||
183: (
|
||||
"I got upset, and I was really aware of it",
|
||||
"Ik merkte dat ik erg van streek was",
|
||||
"Razburila sem se in razmišljala samo o tem",
|
||||
"Razburil sem se in razmišljal samo o tem",
|
||||
),
|
||||
}
|
|
@ -1,9 +1,11 @@
|
|||
import pandas as pd
|
||||
|
||||
from features.esm import increment_answers
|
||||
|
||||
JCQ_ORIGINAL_MAX = 4
|
||||
JCQ_ORIGINAL_MIN = 1
|
||||
|
||||
dict_JCQ_demand_control_reverse = {
|
||||
DICT_JCQ_DEMAND_CONTROL_REVERSE = {
|
||||
75: (
|
||||
"I was NOT asked",
|
||||
"Men legde mij geen overdreven",
|
||||
|
@ -40,10 +42,14 @@ def reverse_jcq_demand_control_scoring(
|
|||
df_esm_jcq_demand_control: pd.DataFrame,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
This function recodes answers in Job content questionnaire by first incrementing them by 1,
|
||||
to be in line with original (1-4) scoring.
|
||||
Then, some answers are reversed (i.e. 1 becomes 4 etc.), because the questions are negatively phrased.
|
||||
These answers are listed in dict_JCQ_demand_control_reverse and identified by their question ID.
|
||||
Reverse JCQ demand and control answers.
|
||||
|
||||
This function recodes answers in Job content questionnaire
|
||||
by first incrementing them by 1, to be in line with original (1-4) scoring.
|
||||
Then, some answers are reversed (i.e. 1 becomes 4 etc.),
|
||||
because the questions are negatively phrased.
|
||||
These answers are listed in DICT_JCQ_DEMAND_CONTROL_REVERSE
|
||||
and identified by their question ID.
|
||||
However, the existing data is checked against literal phrasing of these questions
|
||||
to protect against wrong numbering of questions (differing question IDs).
|
||||
|
||||
|
@ -55,7 +61,8 @@ def reverse_jcq_demand_control_scoring(
|
|||
Returns
|
||||
-------
|
||||
df_esm_jcq_demand_control: pd.DataFrame
|
||||
The same dataframe with a column esm_user_score containing answers recoded and reversed.
|
||||
The same dataframe with a column esm_user_score
|
||||
containing answers recoded and reversed.
|
||||
"""
|
||||
df_esm_jcq_demand_control_unique_answers = (
|
||||
df_esm_jcq_demand_control.groupby("question_id")
|
||||
|
@ -64,7 +71,7 @@ def reverse_jcq_demand_control_scoring(
|
|||
.reset_index()
|
||||
)
|
||||
# Tabulate all possible answers to each question (group by question ID).
|
||||
for q_id in dict_JCQ_demand_control_reverse.keys():
|
||||
for q_id in DICT_JCQ_DEMAND_CONTROL_REVERSE.keys():
|
||||
# Look through all answers that need to be reversed.
|
||||
possible_answers = df_esm_jcq_demand_control_unique_answers.loc[
|
||||
df_esm_jcq_demand_control_unique_answers["question_id"] == q_id,
|
||||
|
@ -72,7 +79,7 @@ def reverse_jcq_demand_control_scoring(
|
|||
]
|
||||
# These are all answers to a given question (by q_id).
|
||||
answers_matches = possible_answers.str.startswith(
|
||||
dict_JCQ_demand_control_reverse.get(q_id)
|
||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.get(q_id)
|
||||
)
|
||||
# See if they are expected, i.e. included in the dictionary.
|
||||
if ~answers_matches.all():
|
||||
|
@ -82,18 +89,16 @@ def reverse_jcq_demand_control_scoring(
|
|||
# In case there is an unexpected answer, raise an exception.
|
||||
|
||||
try:
|
||||
df_esm_jcq_demand_control = df_esm_jcq_demand_control.assign(
|
||||
esm_user_score=lambda x: x.esm_user_answer_numeric + 1
|
||||
)
|
||||
# Increment the original answer by 1
|
||||
# to keep in line with traditional scoring (JCQ_ORIGINAL_MIN - JCQ_ORIGINAL_MAX).
|
||||
df_esm_jcq_demand_control = increment_answers(df_esm_jcq_demand_control)
|
||||
# Increment the original answer by 1 to keep in line
|
||||
# with traditional scoring (from JCQ_ORIGINAL_MIN to JCQ_ORIGINAL_MAX).
|
||||
df_esm_jcq_demand_control[
|
||||
df_esm_jcq_demand_control["question_id"].isin(
|
||||
dict_JCQ_demand_control_reverse.keys()
|
||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
||||
)
|
||||
] = df_esm_jcq_demand_control[
|
||||
df_esm_jcq_demand_control["question_id"].isin(
|
||||
dict_JCQ_demand_control_reverse.keys()
|
||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
||||
)
|
||||
].assign(
|
||||
esm_user_score=lambda x: JCQ_ORIGINAL_MAX
|
||||
|
|
|
@ -3,6 +3,9 @@ import pandas as pd
|
|||
|
||||
import features.esm
|
||||
|
||||
SAM_ORIGINAL_MAX = 5
|
||||
SAM_ORIGINAL_MIN = 1
|
||||
|
||||
QUESTIONNAIRE_ID_SAM = {
|
||||
"event_stress": 87,
|
||||
"event_threat": 88,
|
||||
|
@ -20,10 +23,107 @@ GROUP_QUESTIONNAIRES_BY = [
|
|||
"device_id",
|
||||
"esm_session",
|
||||
]
|
||||
# Each questionnaire occurs only once within each esm_session on the same device within the same participant.
|
||||
# Each questionnaire occurs only once within each esm_session on the same device
|
||||
# within the same participant.
|
||||
|
||||
|
||||
DICT_SAM_QUESTION_IDS = {
|
||||
87: (
|
||||
"Was there a particular event that created tension in you?",
|
||||
"Was er een bepaalde gebeurtenis die spanning veroorzaakte?",
|
||||
"Je prišlo do kakega dogodka, ki je v vas ustvaril napetost?",
|
||||
),
|
||||
88: (
|
||||
"Did this event make you feel anxious?",
|
||||
"Voelde je je angstig door deze gebeurtenis?",
|
||||
"Ste se zaradi tega dogodka počutili tesnob",
|
||||
),
|
||||
89: (
|
||||
"Will the outcome of this event be negative?",
|
||||
"Zal de uitkomst van deze gebeurtenis negatief zijn?",
|
||||
"Bo izid tega dogodka negativen?",
|
||||
),
|
||||
90: (
|
||||
"How threatening was this event?",
|
||||
"Hoe bedreigend was deze gebeurtenis?",
|
||||
"Kako grozeč je bil ta dogodek?",
|
||||
),
|
||||
91: (
|
||||
"Is this going to have a negative impact on you?",
|
||||
"Zal dit een negatieve impact op je hebben?",
|
||||
"Ali bo to negativno vplivalo na vas?",
|
||||
),
|
||||
92: (
|
||||
"Is this going to have a positive impact on you?",
|
||||
"Zal dit een positief effect op je hebben?",
|
||||
"Ali bo to pozitivno vplivalo na vas?",
|
||||
),
|
||||
93: (
|
||||
"How eager are you to tackle this event?",
|
||||
"Hoe graag wil je deze gebeurtenis aanpakken?",
|
||||
"Kako zagnani ste bili",
|
||||
),
|
||||
94: (
|
||||
"To what extent can you become a stronger person because of this event?",
|
||||
"In welke mate kan je een sterkere persoon worden door deze gebeurtenis?",
|
||||
"V kolikšni meri lahko zaradi tega dogodka postanete močnejša oseba?",
|
||||
),
|
||||
95: (
|
||||
"To what extent are you excited thinking about the outcome of this event?",
|
||||
"In welke mate ben je enthousiast bij de gedachte",
|
||||
"V kolikšni meri vas misel na izid tega dogodka navdušuje?",
|
||||
),
|
||||
96: (
|
||||
"At what time did this event occur?",
|
||||
"Hoe laat vond deze gebeurtenis plaats?",
|
||||
"Kdaj se je ta dogodek zgodil?",
|
||||
),
|
||||
97: (
|
||||
"How long did this event last?",
|
||||
"Hoe lang duurde deze gebeurtenis?",
|
||||
"Kako dolgo je trajal ta dogodek?",
|
||||
),
|
||||
98: (
|
||||
"Was/is this event work-related?",
|
||||
"Was/is deze gebeurtenis werkgerelateerd?",
|
||||
"Je (bil) ta dogodek povezan s službo?",
|
||||
"Je bil ali je ta dogodek povezan s službo?",
|
||||
),
|
||||
99: (
|
||||
"Did this overall period create tension in you?",
|
||||
"Heeft deze globale periode spanning veroorzaakt?",
|
||||
"Je to obdobje kot celota v vas ustvarilo napetost?",
|
||||
"Je to celo obdobje v vas ustvarilo napetost?",
|
||||
),
|
||||
100: (
|
||||
"To what extent do you perceive this overall period as stressful?",
|
||||
"In welke mate ervaar je deze globale periode als stressvol?",
|
||||
"V kolikšni meri ste to obdobje dojemali kot stresno?",
|
||||
"V kolikšni meri ste celo to obdobje dojemali kot stresno?",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Extract information about stressful events.
|
||||
|
||||
Participants were asked: "Was there a particular event that created tension in you?"
|
||||
Then a subset of questions related to this event followed.
|
||||
This function goes through the follow-up questions one by one
|
||||
and preprocesses them, so that it adds new columns to the dataframe.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_esm: pd.DataFrame
|
||||
A raw dataframe of all ESM data.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df_esm_events: pd.DataFrame
|
||||
A cleaned up df of Stress Appraisal Measure items with additional columns.
|
||||
|
||||
"""
|
||||
# 0. Select only questions from Stress Appraisal Measure.
|
||||
df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
|
||||
df_esm_sam = df_esm_preprocessed[
|
||||
|
@ -78,7 +178,8 @@ def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
|
|||
|
||||
def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
This function calculates challenge and threat (two Stress Appraisal Measure subscales) means,
|
||||
This function calculates challenge and threat
|
||||
(two Stress Appraisal Measure subscales) means,
|
||||
for each ESM session (within participants and devices).
|
||||
It creates a grouped dataframe with means in two columns.
|
||||
|
||||
|
@ -90,7 +191,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
|
|||
Returns
|
||||
-------
|
||||
df_esm_event_threat_challenge_mean_wide: pd.DataFrame
|
||||
A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
|
||||
A dataframe of unique ESM sessions (by participants and devices)
|
||||
with threat and challenge means.
|
||||
"""
|
||||
# Select only threat and challenge assessments for events
|
||||
df_esm_event_threat_challenge = df_esm_sam_clean[
|
||||
|
@ -112,8 +214,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
|
|||
aggfunc="mean",
|
||||
)
|
||||
# Drop unnecessary column values.
|
||||
df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
|
||||
1
|
||||
df_esm_event_threat_challenge_mean_wide.columns = (
|
||||
df_esm_event_threat_challenge_mean_wide.columns.get_level_values(1)
|
||||
)
|
||||
df_esm_event_threat_challenge_mean_wide.columns.name = None
|
||||
df_esm_event_threat_challenge_mean_wide.rename(
|
||||
|
@ -189,10 +291,12 @@ def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
|||
|
||||
def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
This function only serves to convert the string datetime answer into a real datetime type.
|
||||
Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
|
||||
NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
|
||||
the NaTs can be interpreted to mean this.
|
||||
This function only serves to convert the string datetime answer
|
||||
into a real datetime type.
|
||||
Errors during this conversion are coerced, meaning that non-datetime answers
|
||||
are assigned Not a Time (NaT).
|
||||
NOTE: Since the only available non-datetime answer to this question was
|
||||
"0 - I do not remember", the NaTs can be interpreted to mean this.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
@ -208,9 +312,13 @@ def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
|||
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
|
||||
].assign(
|
||||
event_time=lambda x: pd.to_datetime(
|
||||
x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
|
||||
x.esm_user_answer,
|
||||
errors="coerce",
|
||||
format="%Y-%m-%d %H:%M:%S %z",
|
||||
exact=True,
|
||||
)
|
||||
)
|
||||
# Example answer: 2020-09-29 00:05:00 +0200
|
||||
return df_esm_event_time
|
||||
|
||||
|
||||
|
@ -241,9 +349,12 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
|||
== QUESTIONNAIRE_ID_SAM.get("event_duration")
|
||||
].assign(
|
||||
event_duration=lambda x: pd.to_datetime(
|
||||
x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
|
||||
x.esm_user_answer.str.slice(start=0, stop=-6),
|
||||
errors="coerce",
|
||||
format="%Y-%m-%d %H:%M:%S",
|
||||
).dt.time
|
||||
)
|
||||
# Example answer: 2020-09-29 00:05:00 +0200
|
||||
# TODO Explore the values recorded in event_duration and possibly fix mistakes.
|
||||
# For example, participants reported setting 23:50:00 instead of 00:50:00.
|
||||
|
||||
|
@ -251,7 +362,7 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
|||
# we can determine whether:
|
||||
# - this event is still going on ("1 - It is still going on")
|
||||
# - the participant couldn't remember it's duration ("0 - I do not remember")
|
||||
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
|
||||
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm
|
||||
# but only the numeric types of questions and answers.
|
||||
# Since this was of "datetime" type, convert these specific answers here again.
|
||||
df_esm_event_duration["event_duration_info"] = np.nan
|
||||
|
@ -264,4 +375,5 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
|||
return df_esm_event_duration
|
||||
|
||||
|
||||
# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
|
||||
# TODO: How many questions about the stressfulness of the period were asked
|
||||
# and how does this relate to events?
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||
from lightgbm import LGBMClassifier
|
||||
import pandas as pd
|
||||
import xgboost as xg
|
||||
from lightgbm import LGBMClassifier
|
||||
from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
|
||||
from sklearn.dummy import DummyClassifier
|
||||
|
||||
class ClassificationModels():
|
||||
|
||||
class ClassificationModels:
|
||||
def __init__(self):
|
||||
self.cmodels = self.init_classification_models()
|
||||
|
||||
|
@ -13,59 +14,110 @@ class ClassificationModels():
|
|||
|
||||
def init_classification_models(self):
|
||||
cmodels = {
|
||||
'dummy_classifier': {
|
||||
'model': DummyClassifier(strategy="most_frequent"),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
"dummy_classifier": {
|
||||
"model": DummyClassifier(strategy="most_frequent"),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
},
|
||||
'logistic_regression': {
|
||||
'model': linear_model.LogisticRegression(max_iter=1000),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
"logistic_regression": {
|
||||
"model": linear_model.LogisticRegression(max_iter=1000),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
},
|
||||
'support_vector_machine': {
|
||||
'model': svm.SVC(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
"support_vector_machine": {"model": svm.SVC(), "metrics": [0, 0, 0, 0]},
|
||||
"gaussian_naive_bayes": {
|
||||
"model": naive_bayes.GaussianNB(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
},
|
||||
'gaussian_naive_bayes': {
|
||||
'model': naive_bayes.GaussianNB(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
"stochastic_gradient_descent_classifier": {
|
||||
"model": linear_model.SGDClassifier(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
},
|
||||
'stochastic_gradient_descent_classifier': {
|
||||
'model': linear_model.SGDClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
"knn": {"model": neighbors.KNeighborsClassifier(), "metrics": [0, 0, 0, 0]},
|
||||
"decision_tree": {
|
||||
"model": tree.DecisionTreeClassifier(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
},
|
||||
'knn': {
|
||||
'model': neighbors.KNeighborsClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
"random_forest_classifier": {
|
||||
"model": ensemble.RandomForestClassifier(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
},
|
||||
'decision_tree': {
|
||||
'model': tree.DecisionTreeClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
"gradient_boosting_classifier": {
|
||||
"model": ensemble.GradientBoostingClassifier(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
},
|
||||
'random_forest_classifier': {
|
||||
'model': ensemble.RandomForestClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
"lgbm_classifier": {"model": LGBMClassifier(), "metrics": [0, 0, 0, 0]},
|
||||
"XGBoost_classifier": {
|
||||
"model": xg.sklearn.XGBClassifier(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
},
|
||||
'gradient_boosting_classifier': {
|
||||
'model': ensemble.GradientBoostingClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
'lgbm_classifier': {
|
||||
'model': LGBMClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
'XGBoost_classifier': {
|
||||
'model': xg.sklearn.XGBClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
}
|
||||
}
|
||||
|
||||
return cmodels
|
||||
|
||||
def get_total_models_scores(self, n_clusters=1):
|
||||
scores = pd.DataFrame(columns=["method", "metric", "mean"])
|
||||
for model_title, model in self.cmodels.items():
|
||||
scores_df = pd.DataFrame(columns=["method", "metric", "mean"])
|
||||
print("\n************************************\n")
|
||||
print("Current model:", model_title, end="\n")
|
||||
print("Acc:", model['metrics'][0]/n_clusters)
|
||||
print("Precision:", model['metrics'][1]/n_clusters)
|
||||
print("Recall:", model['metrics'][2]/n_clusters)
|
||||
print("F1:", model['metrics'][3]/n_clusters)
|
||||
print("Acc:", model["metrics"][0] / n_clusters)
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
pd.DataFrame(
|
||||
{
|
||||
"method": model_title,
|
||||
"metric": "test_accuracy",
|
||||
"mean": model["metrics"][0] / n_clusters,
|
||||
},
|
||||
index=[0],
|
||||
),
|
||||
],
|
||||
ignore_index=True,
|
||||
)
|
||||
print("Precision:", model["metrics"][1] / n_clusters)
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
pd.DataFrame(
|
||||
{
|
||||
"method": model_title,
|
||||
"metric": "test_precision",
|
||||
"mean": model["metrics"][1] / n_clusters,
|
||||
},
|
||||
index=[0],
|
||||
),
|
||||
],
|
||||
ignore_index=True,
|
||||
)
|
||||
print("Recall:", model["metrics"][2] / n_clusters)
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
pd.DataFrame(
|
||||
{
|
||||
"method": model_title,
|
||||
"metric": "test_recall",
|
||||
"mean": model["metrics"][2] / n_clusters,
|
||||
},
|
||||
index=[0],
|
||||
),
|
||||
],
|
||||
ignore_index=True,
|
||||
)
|
||||
print("F1:", model["metrics"][3] / n_clusters)
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
pd.DataFrame(
|
||||
{
|
||||
"method": model_title,
|
||||
"metric": "test_f1",
|
||||
"mean": model["metrics"][3] / n_clusters,
|
||||
},
|
||||
index=[0],
|
||||
),
|
||||
],
|
||||
ignore_index=True,
|
||||
)
|
||||
scores = pd.concat([scores, scores_df])
|
||||
return scores
|
||||
|
|
|
@ -49,8 +49,8 @@ class CrossValidation():
|
|||
|
||||
data_X, data_y, data_groups = data.drop(["target", "pid", "pid_index", "pid_half"], axis=1), data["target"], data["pid_half"]
|
||||
|
||||
elif self.cv_method == "5kfold":
|
||||
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], data["pid"]
|
||||
elif self.cv_method == "Stratified5kfold":
|
||||
data_X, data_y, data_groups = data.drop(["target", "pid"], axis=1), data["target"], None
|
||||
|
||||
self.X, self.y, self.groups = data_X, data_y, data_groups
|
||||
|
||||
|
@ -71,7 +71,7 @@ class CrossValidation():
|
|||
|
||||
if self.cv_method in ["logo", "half_logo"]:
|
||||
self.cv = LeaveOneGroupOut()
|
||||
elif self.cv_method == "5kfold":
|
||||
elif self.cv_method == "Stratified5kfold":
|
||||
self.cv = StratifiedKFold(n_splits=5, shuffle=True)
|
||||
|
||||
|
||||
|
@ -118,4 +118,11 @@ class CrossValidation():
|
|||
"""
|
||||
return self.X.iloc[split[0]], self.y.iloc[split[0]], self.X.iloc[split[1]], self.y.iloc[split[1]]
|
||||
|
||||
def get_groups_sets(self, split):
|
||||
|
||||
if self.groups is None:
|
||||
return None, None
|
||||
else:
|
||||
return self.groups.iloc[split[0]], self.groups.iloc[split[1]]
|
||||
|
||||
|
|
@ -1,11 +1,13 @@
|
|||
import os
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
from sklearn.feature_selection import SequentialFeatureSelector
|
||||
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression
|
||||
from sklearn.model_selection import cross_validate, StratifiedKFold, GroupKFold
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.linear_model import Lasso
|
||||
|
||||
|
@ -21,13 +23,15 @@ from sklearn.linear_model import Lasso
|
|||
|
||||
class FeatureSelection:
|
||||
|
||||
def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
|
||||
pass # TODO....
|
||||
def __init__(self, X, y, groups):
|
||||
self.X = X
|
||||
self.y = y
|
||||
self.groups = groups
|
||||
|
||||
|
||||
def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=[]):
|
||||
def select_best_feature(self, features, method="remove", ml_category="classification", ml_subcategory="bin", metric="recall", stored_features=[]):
|
||||
"""The method selects the best feature by testing the prediction on the feature set with or without the current feature.
|
||||
The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particulat
|
||||
The "remove" method removes a particular feature and predicts on the test set without it. The "add" method adds a particular
|
||||
feature to the previously established feature set (stored_features). The best feature is selected dependent on the metric
|
||||
specified as a parameter.
|
||||
|
||||
|
@ -35,7 +39,11 @@ class FeatureSelection:
|
|||
df (DataFrame): Input data on which the predictions will be made.
|
||||
features (list): List of features to select the best/worst from
|
||||
method (str, optional): remove or add features. Defaults to "remove".
|
||||
ml_type (str, optional): Either classification or regression ml problem controls the ML algorithm and metric. Defaults to "classification".
|
||||
ml_category (str, optional): Either classification or regression ml problem controls the ML algorithm and metric.
|
||||
Defaults to "classification".
|
||||
ml_subcategory (str, optional): In case of classification '_bin' for binary classification
|
||||
and 'multi' for multiclass classification. For regression an empty string '' is sufficient.
|
||||
Defaults to "bin".
|
||||
metric (str, optional): Selected metric with which the best/worst feature will be determined. Defaults to "recall".
|
||||
stored_features (list, optional): In case if method is 'add', stored features refer to the features that had been previously added. Defaults to [].
|
||||
|
||||
|
@ -49,173 +57,189 @@ class FeatureSelection:
|
|||
|
||||
best_feature = None
|
||||
|
||||
if ml_type == "classification" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
|
||||
raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
|
||||
elif ml_type == "regression" and metric not in ['r2']:
|
||||
# Validacije tipov ML in specificiranimi metrikami
|
||||
if ml_category == "classification":
|
||||
if ml_subcategory == "bin" and metric not in ['accuracy', 'precision', 'recall', 'f1']:
|
||||
raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
|
||||
elif ml_subcategory == "multi":
|
||||
ml_subcategory_error = False
|
||||
if metric != "accuracy" and "_" in metric:
|
||||
metric_s, metric_t = metric.split("_")
|
||||
if metric_s not in ['accuracy', 'precision', 'recall', 'f1'] or metric_t not in ['micro', 'macro', 'weighted']:
|
||||
ml_subcategory_error = True
|
||||
else:
|
||||
ml_subcategory_error = True
|
||||
|
||||
if ml_subcategory_error:
|
||||
raise ValueError(""""Classification metric for multi-class classification must be specified precisely.
|
||||
Available metric are: 'accuracy', 'precision', 'recall' and 'f1'.
|
||||
Only accuracy must be specified as 'accuracy'.
|
||||
For others please add appropriate suffixes: '_macro', '_micro', or '_weighted', e.g., 'f1_macro'""")
|
||||
elif ml_category == "regression" and metric not in ['r2']:
|
||||
raise ValueError("Regression metric not recognized. Please choose 'r2'")
|
||||
|
||||
for feat in features:
|
||||
if method == "remove":
|
||||
pred_features = [col for col in df.columns if feat != col] # All but feat
|
||||
pred_features = [col for col in self.X.columns if feat != col] # All but feat
|
||||
elif method == "add":
|
||||
pred_features = [feat] + stored_features # Feat with stored features
|
||||
|
||||
X, y = df.drop(columns=['target', 'pid'])[pred_features], df['target']
|
||||
X = self.X[pred_features].copy()
|
||||
|
||||
if ml_type == "classification":
|
||||
if self.groups is not None:
|
||||
cv = GroupKFold(n_splits=5)
|
||||
else:
|
||||
cv = StratifiedKFold(n_splits=5, shuffle=True)
|
||||
|
||||
# See link about scoring for multiclassfication
|
||||
# http://iamirmasoud.com/2022/06/19/understanding-micro-macro-and-weighted-averages-for-scikit-learn-metrics-in-multi-class-classification-with-example/
|
||||
if ml_category == "classification":
|
||||
nb = GaussianNB()
|
||||
model_cv = cross_validate(
|
||||
nb,
|
||||
X=X,
|
||||
y=y,
|
||||
cv=StratifiedKFold(n_splits=5, shuffle=True),
|
||||
y=self.y,
|
||||
cv=cv,
|
||||
groups=self.groups,
|
||||
n_jobs=-1,
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
scoring=(metric)
|
||||
)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
|
||||
|
||||
if metric == "accuracy":
|
||||
acc = np.mean(model_cv['test_accuracy'])
|
||||
acc_std = np.std(model_cv['test_accuracy'])
|
||||
|
||||
if not best_feature or (acc > best_metric_score):
|
||||
best_feature = feat
|
||||
best_metric_score = acc
|
||||
best_metric_score_std = acc_std
|
||||
|
||||
elif metric == "precision":
|
||||
prec = np.mean(model_cv['test_precision'])
|
||||
prec_std = np.std(model_cv['test_precision'])
|
||||
|
||||
if not best_feature or (prec > best_metric_score):
|
||||
best_feature = feat
|
||||
best_metric_score = prec
|
||||
best_metric_score_std = prec_std
|
||||
|
||||
elif metric == "recall":
|
||||
rec = np.mean(model_cv['test_recall'])
|
||||
rec_std = np.std(model_cv['test_recall'])
|
||||
|
||||
if not best_feature or (rec > best_metric_score):
|
||||
best_feature = feat
|
||||
best_metric_score = rec
|
||||
best_metric_score_std = rec_std
|
||||
|
||||
else:
|
||||
f1 = np.mean(model_cv['test_f1'])
|
||||
f1_std = np.std(model_cv['test_f1'])
|
||||
|
||||
if not best_feature or (f1 > best_metric_score):
|
||||
best_feature = feat
|
||||
best_metric_score = f1
|
||||
best_metric_score_std = f1_std
|
||||
|
||||
elif ml_type == "regression":
|
||||
elif ml_category == "regression":
|
||||
lass = Lasso()
|
||||
model_cv = cross_validate(
|
||||
lass,
|
||||
X=X,
|
||||
y=y,
|
||||
cv=StratifiedKFold(n_splits=5, shuffle=True),
|
||||
cv=cv,
|
||||
groups=self.groups,
|
||||
n_jobs=-1,
|
||||
scoring=('r2')
|
||||
)
|
||||
|
||||
if metric == "r2":
|
||||
r2 = np.mean(model_cv['test_r2'])
|
||||
r2_std = np.std(model_cv['test_r2'])
|
||||
|
||||
if not best_feature or (r2 > best_metric_score):
|
||||
best_feature = feat
|
||||
best_metric_score = r2
|
||||
best_metric_score_std = r2_std
|
||||
else:
|
||||
raise ValueError("ML type not yet implemented!")
|
||||
|
||||
# Section of metrics' scores comparison.
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
|
||||
|
||||
metric_score = np.nanmean(model_cv["test_score"])
|
||||
metric_score_std = np.nanstd(model_cv["test_score"])
|
||||
|
||||
if not best_feature or (metric_score > best_metric_score):
|
||||
best_feature = feat
|
||||
best_metric_score = metric_score
|
||||
best_metric_score_std = metric_score_std
|
||||
|
||||
return best_feature, best_metric_score, best_metric_score_std
|
||||
|
||||
|
||||
def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
|
||||
def select_features(self, n_min=20, n_max=50, k=100, method="remove", ml_type="classification_bin", metric="recall", n_tolerance=10):
|
||||
"""This method selects a set of features and returns them as a list. It returns number of features
|
||||
determined in the interval of [n_min, n_max].
|
||||
|
||||
n_features = df.shape[1] - 2 # -2 beacause pid and target are not considered
|
||||
if n_max > n_features:
|
||||
n_max = n_features
|
||||
The method consists of two steps:
|
||||
(1) The method uses sklearn kBest method which selects k best features dependent on the ml_type parameter.
|
||||
(2) The sequential features removal procedure is executed. Using the remaing features from (1).
|
||||
The best score is detected using a removal procedure. The procedure sequentially removes the features
|
||||
that attribute the least to the choosen evaluation metric. If in this sequence the score ML score is
|
||||
improved the next feature is remove otherwise there is a tolerance criteria (n_tolerance)
|
||||
with which the next n removed features are inspected whether currently best score is improved.
|
||||
|
||||
Args:
|
||||
n_min (int, optional): Minimal amount of features returned.
|
||||
n_max (int, optional): Maximal amount of features returned.
|
||||
k (int, optional): Determines the k in the k-best features method.
|
||||
If None, SelectKBest feature selection does not execute.
|
||||
ml_type(str, optional): Type of ML problem. Currently implemented options:
|
||||
'classification_bin', 'classification_multi', and 'regression_'
|
||||
method (str, optional): "remove" or "add" features. Defaults to "remove".
|
||||
n_tolerance (int, optional): If the best score is not improved in n that is specified by this parameter
|
||||
the method returns index of feature with current best score as a tipping point feature.
|
||||
|
||||
Returns:
|
||||
list: list of selected features
|
||||
"""
|
||||
|
||||
if k is not None and k <= n_max:
|
||||
raise ValueError("The k parameter needs to be greater than the n_max parameter.")
|
||||
|
||||
# Select k-best feature dependent on the type of ML task
|
||||
ml_category, ml_subcategory = ml_type.split("_")
|
||||
|
||||
if k is not None:
|
||||
if ml_category == "classification":
|
||||
if ml_subcategory== "bin":
|
||||
selector = SelectKBest(mutual_info_classif, k=k)
|
||||
elif ml_subcategory== "multi":
|
||||
selector = SelectKBest(f_classif, k=k)
|
||||
else:
|
||||
raise ValueError("Unknown ML type: cannot recognize ML classification subtype.")
|
||||
elif ml_category == "regression":
|
||||
selector = SelectKBest(f_regression, k=k)
|
||||
else:
|
||||
raise ValueError("Unknown ML type: cannot recognize ML type. Must be either classification or regression.")
|
||||
|
||||
selector.fit(self.X, self.y)
|
||||
cols_idxs = selector.get_support(indices=True)
|
||||
self.X = self.X.iloc[:,cols_idxs]
|
||||
|
||||
print("All columns (after SelectKBest method):")
|
||||
print(self.X.columns)
|
||||
|
||||
# Sequential feature addition / removal
|
||||
n_features = self.X.shape[1]
|
||||
if n_max >= n_features:
|
||||
n_max = n_features-1 # The algorithm removes at least one feature
|
||||
|
||||
if n_min > n_features:
|
||||
raise ValueError("The number of features in the dataframe must be at least as n_min-1 parameter.")
|
||||
raise ValueError("The number of remaining features in the dataframe must be at least as n_min+1 parameter.")
|
||||
|
||||
if n_max < n_min:
|
||||
raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
|
||||
|
||||
features = df.columns.tolist()
|
||||
features.remove("pid")
|
||||
features.remove("target")
|
||||
features = self.X.columns.tolist()
|
||||
feature_importance = []
|
||||
if method == "remove":
|
||||
best_score = 0
|
||||
best_feature_indx = None
|
||||
i_worse = 0
|
||||
for i in reversed(range(n_features)):
|
||||
|
||||
if i+1 == n_min:
|
||||
break
|
||||
|
||||
best_feature, best_metric_score, best_metric_score_std = \
|
||||
self.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
|
||||
feature_importance.append(tuple(i+1, best_feature, best_metric_score, best_metric_score_std))
|
||||
self.select_best_feature(features, method=method, ml_category=ml_category, ml_subcategory=ml_subcategory, metric=metric)
|
||||
|
||||
feature_importance.append((i+1, best_feature, best_metric_score, best_metric_score_std))
|
||||
|
||||
features.remove(best_feature)
|
||||
print("Features left:", i)
|
||||
|
||||
if i <= n_max:
|
||||
if best_metric_score >= best_score:
|
||||
best_score = best_metric_score
|
||||
best_feature_indx = i+1
|
||||
i_worse = 0
|
||||
else:
|
||||
i_worse += 1
|
||||
|
||||
if i_worse == n_tolerance:
|
||||
break
|
||||
|
||||
feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
|
||||
|
||||
# Selekcijski kriterij značilk v rangu max-min
|
||||
# Npr. izbira najboljšega score-a v tem rangu. Ali pa dokler se v tem rangu score zvišuje za 0.0X, ko se ne izberi tisti set značilk.
|
||||
print(feature_importance_df)
|
||||
print("best_feature_indx", best_feature_indx)
|
||||
print("best_score", best_score)
|
||||
|
||||
# Set značilk se bo izbral od i=1 do i=index_izbrane_značilke
|
||||
features_to_remove = feature_importance_df[feature_importance_df["i"] >= best_feature_indx]["name"].values.tolist()
|
||||
selected_features = [feat for feat in self.X.columns.tolist() if feat not in features_to_remove]
|
||||
|
||||
# "Tipping point" značilka mora biti v rangu max-min
|
||||
return selected_features
|
||||
|
||||
selection_area = feature_importance_df[(feature_importance_df["i"] >= n_min+1) & (feature_importance_df["i"] <= n_max)]
|
||||
selection_area.set_index(["i", "name"], inplace=True)
|
||||
diffrences = selection_area.diff()
|
||||
diffrences.dropna(how='any', inplace=True)
|
||||
|
||||
# Morda tudi komulativna sumacija? Kjer se preprosto index z najvišjo vrednostjo
|
||||
cumulative_sumation = diffrences.cumsum()
|
||||
tipping_feature_indx_1 = cumulative_sumation.idxmax()["metric"]
|
||||
|
||||
# Zelo konzervativna metoda, ki ob prvem neizboljšanjem rezultata preneha z iskanjem boljše alternative
|
||||
tipping_feature_indx_2 = None
|
||||
for indx, row in diffrences.iterrows():
|
||||
if row["metric"] > 0:
|
||||
tipping_feature_indx_2 = indx
|
||||
else:
|
||||
break
|
||||
|
||||
# Metoda, ki pusti n_not_improve značilkam, da premagajo dosedajno najboljši score
|
||||
tipping_feature_indx_3 = None
|
||||
cum_sum_score = 0
|
||||
i_worse = 0
|
||||
# TODO: morda bi bilo smisleno združiti diff, cumsum in scores stolpce ...
|
||||
for indx, row in selection_area.iterrows():
|
||||
if row["metric"] > 0:
|
||||
tipping_feature_indx_3 = indx
|
||||
cum_sum_score += row["metric"]
|
||||
i_worse = 0
|
||||
else:
|
||||
i_worse += 1
|
||||
|
||||
if i_worse == n_not_improve:
|
||||
break
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=[], print_flag=False):
|
||||
pass
|
||||
|
||||
def vizualize_feature_selection_process():
|
||||
pass
|
||||
|
||||
def execute_feature_selection_step():
|
||||
pass
|
||||
else:
|
||||
raise ValueError("Method type not recognized: only the 'remove' method is currently implemented.")
|
|
@ -11,7 +11,13 @@ from sklearn import (
|
|||
svm,
|
||||
)
|
||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
|
||||
from sklearn.metrics import confusion_matrix
|
||||
from sklearn.model_selection import (
|
||||
BaseCrossValidator,
|
||||
LeaveOneGroupOut,
|
||||
StratifiedKFold,
|
||||
cross_validate,
|
||||
)
|
||||
from xgboost import XGBClassifier, XGBRegressor
|
||||
|
||||
|
||||
|
@ -73,7 +79,40 @@ def insert_row(df, row):
|
|||
return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
|
||||
|
||||
|
||||
def prepare_regression_model_input(model_input, cv_method="logo"):
|
||||
def impute_encode_categorical_features(model_input: pd.DataFrame) -> pd.DataFrame:
|
||||
categorical_feature_col_names = [
|
||||
"gender",
|
||||
"startlanguage",
|
||||
"limesurvey_demand_control_ratio_quartile",
|
||||
]
|
||||
additional_categorical_features = [
|
||||
col
|
||||
for col in model_input.columns
|
||||
if "mostcommonactivity" in col or "homelabel" in col
|
||||
]
|
||||
categorical_feature_col_names += additional_categorical_features
|
||||
|
||||
categorical_features = model_input[categorical_feature_col_names].copy()
|
||||
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(
|
||||
lambda col: col.astype("category")
|
||||
)
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
numerical_features = model_input.drop(categorical_feature_col_names, axis=1)
|
||||
|
||||
model_input = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
return model_input
|
||||
|
||||
|
||||
def prepare_sklearn_data_format(
|
||||
model_input: pd.DataFrame, cv_method: str = "logo"
|
||||
) -> tuple:
|
||||
index_columns = [
|
||||
"local_segment",
|
||||
"local_segment_label",
|
||||
|
@ -82,13 +121,7 @@ def prepare_regression_model_input(model_input, cv_method="logo"):
|
|||
]
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
|
||||
if cv_method == "logo":
|
||||
data_x, data_y, data_groups = (
|
||||
model_input.drop(["target", "pid"], axis=1),
|
||||
model_input["target"],
|
||||
model_input["pid"],
|
||||
)
|
||||
else:
|
||||
if cv_method == "half_logo":
|
||||
model_input["pid_index"] = model_input.groupby("pid").cumcount()
|
||||
model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
|
||||
|
||||
|
@ -104,52 +137,53 @@ def prepare_regression_model_input(model_input, cv_method="logo"):
|
|||
model_input["target"],
|
||||
model_input["pid_half"],
|
||||
)
|
||||
else:
|
||||
data_x, data_y, data_groups = (
|
||||
model_input.drop(["target", "pid"], axis=1),
|
||||
model_input["target"],
|
||||
model_input["pid"],
|
||||
)
|
||||
return data_x, data_y, data_groups
|
||||
|
||||
categorical_feature_colnames = [
|
||||
"gender",
|
||||
"startlanguage",
|
||||
"limesurvey_demand_control_ratio_quartile",
|
||||
]
|
||||
additional_categorical_features = [
|
||||
col
|
||||
for col in data_x.columns
|
||||
if "mostcommonactivity" in col or "homelabel" in col
|
||||
]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||
def prepare_cross_validator(
|
||||
data_x: pd.DataFrame,
|
||||
data_y: pd.DataFrame,
|
||||
data_groups: pd.DataFrame,
|
||||
cv_method: str = "logo",
|
||||
) -> BaseCrossValidator:
|
||||
if cv_method == "logo" or cv_method == "half_logo":
|
||||
cv = LeaveOneGroupOut()
|
||||
cv.get_n_splits(
|
||||
data_x,
|
||||
data_y,
|
||||
groups=data_groups,
|
||||
)
|
||||
else:
|
||||
cv = StratifiedKFold(n_splits=5, shuffle=True)
|
||||
return cv
|
||||
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(
|
||||
lambda col: col.astype("category")
|
||||
|
||||
def aggregate_and_transpose(df: pd.DataFrame, statistics=None) -> pd.DataFrame:
|
||||
if statistics is None:
|
||||
statistics = ["max", "mean"]
|
||||
return (
|
||||
df.agg(statistics)
|
||||
.transpose()
|
||||
.reset_index()
|
||||
.rename(columns={"index": "test_metric"})
|
||||
)
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
||||
|
||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
|
||||
return train_x, data_y, data_groups
|
||||
|
||||
|
||||
def run_all_regression_models(input_csv):
|
||||
# Prepare data
|
||||
data_x, data_y, data_groups = prepare_regression_model_input(input_csv)
|
||||
|
||||
# Prepare cross validation
|
||||
logo = LeaveOneGroupOut()
|
||||
logo.get_n_splits(
|
||||
data_x,
|
||||
data_y,
|
||||
groups=data_groups,
|
||||
)
|
||||
def run_all_regression_models(
|
||||
data_x: pd.DataFrame,
|
||||
data_y: pd.DataFrame,
|
||||
data_groups: pd.DataFrame,
|
||||
cross_validator: BaseCrossValidator,
|
||||
) -> pd.DataFrame:
|
||||
metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
|
||||
test_metrics = ["test_" + metric for metric in metrics]
|
||||
scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
|
||||
scores = pd.DataFrame(columns=["method", "test_metric", "max", "nanmedian"])
|
||||
|
||||
# Validate models
|
||||
dummy_regr = DummyRegressor(strategy="mean")
|
||||
|
@ -158,7 +192,7 @@ def run_all_regression_models(input_csv):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
|
@ -166,17 +200,19 @@ def run_all_regression_models(input_csv):
|
|||
print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
|
||||
|
||||
scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "dummy"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del dummy_regr
|
||||
del dummy_regr_scores
|
||||
|
||||
lin_reg_rapids = linear_model.LinearRegression()
|
||||
lin_reg = linear_model.LinearRegression()
|
||||
lin_reg_scores = cross_validate(
|
||||
lin_reg_rapids,
|
||||
lin_reg,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
|
@ -184,9 +220,11 @@ def run_all_regression_models(input_csv):
|
|||
print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
|
||||
|
||||
scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "linear_reg"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del lin_reg
|
||||
del lin_reg_scores
|
||||
|
||||
ridge_reg = linear_model.Ridge(alpha=0.5)
|
||||
ridge_reg_scores = cross_validate(
|
||||
|
@ -194,16 +232,18 @@ def run_all_regression_models(input_csv):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
print("Ridge regression")
|
||||
|
||||
scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "ridge_reg"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del ridge_reg
|
||||
del ridge_reg_scores
|
||||
|
||||
lasso_reg = linear_model.Lasso(alpha=0.1)
|
||||
lasso_reg_score = cross_validate(
|
||||
|
@ -211,16 +251,18 @@ def run_all_regression_models(input_csv):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
print("Lasso regression")
|
||||
|
||||
scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "lasso_reg"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del lasso_reg
|
||||
del lasso_reg_score
|
||||
|
||||
bayesian_ridge_reg = linear_model.BayesianRidge()
|
||||
bayesian_ridge_reg_score = cross_validate(
|
||||
|
@ -228,16 +270,18 @@ def run_all_regression_models(input_csv):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
print("Bayesian Ridge")
|
||||
|
||||
scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "bayesian_ridge"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del bayesian_ridge_reg
|
||||
del bayesian_ridge_reg_score
|
||||
|
||||
ransac_reg = linear_model.RANSACRegressor()
|
||||
ransac_reg_score = cross_validate(
|
||||
|
@ -245,27 +289,37 @@ def run_all_regression_models(input_csv):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
print("RANSAC (outlier robust regression)")
|
||||
|
||||
scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "RANSAC"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del ransac_reg
|
||||
del ransac_reg_score
|
||||
|
||||
svr = svm.SVR()
|
||||
svr_score = cross_validate(
|
||||
svr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
|
||||
svr,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
print("Support vector regression")
|
||||
|
||||
scores_df = pd.DataFrame(svr_score)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "SVR"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del svr
|
||||
del svr_score
|
||||
|
||||
kridge = kernel_ridge.KernelRidge()
|
||||
kridge_score = cross_validate(
|
||||
|
@ -273,69 +327,130 @@ def run_all_regression_models(input_csv):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
print("Kernel Ridge regression")
|
||||
|
||||
scores_df = pd.DataFrame(kridge_score)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "kernel_ridge"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del kridge
|
||||
del kridge_score
|
||||
|
||||
gpr = gaussian_process.GaussianProcessRegressor()
|
||||
gpr_score = cross_validate(
|
||||
gpr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
|
||||
gpr,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
print("Gaussian Process Regression")
|
||||
|
||||
scores_df = pd.DataFrame(gpr_score)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "gaussian_proc"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del gpr
|
||||
del gpr_score
|
||||
|
||||
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
|
||||
rfr_score = cross_validate(
|
||||
rfr, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
|
||||
rfr,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
print("Random Forest Regression")
|
||||
|
||||
scores_df = pd.DataFrame(rfr_score)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "random_forest"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del rfr
|
||||
del rfr_score
|
||||
|
||||
xgb = XGBRegressor()
|
||||
xgb_score = cross_validate(
|
||||
xgb, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
|
||||
xgb,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
print("XGBoost Regressor")
|
||||
|
||||
scores_df = pd.DataFrame(xgb_score)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "XGBoost"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del xgb
|
||||
del xgb_score
|
||||
|
||||
ada = ensemble.AdaBoostRegressor()
|
||||
ada_score = cross_validate(
|
||||
ada, X=data_x, y=data_y, groups=data_groups, cv=logo, n_jobs=-1, scoring=metrics
|
||||
ada,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
print("ADA Boost Regressor")
|
||||
|
||||
scores_df = pd.DataFrame(ada_score)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", np.nanmedian]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df["method"] = "ADA_boost"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del ada
|
||||
del ada_score
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
||||
metrics = ["accuracy", "average_precision", "recall", "f1"]
|
||||
def confusion_matrix_scorer(clf, X, y):
|
||||
y_pred = clf.predict(X)
|
||||
cm = confusion_matrix(y, y_pred)
|
||||
return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]}
|
||||
|
||||
|
||||
def aggregate_confusion_matrix(scores_dict: dict) -> pd.DataFrame:
|
||||
scores_aggregated = aggregate_and_transpose(
|
||||
pd.DataFrame(scores_dict), statistics=["sum"]
|
||||
)
|
||||
return scores_aggregated[
|
||||
~scores_aggregated.test_metric.isin(["fit_time", "score_time"])
|
||||
]
|
||||
|
||||
|
||||
def run_all_classification_models(
|
||||
data_x: pd.DataFrame,
|
||||
data_y: pd.DataFrame,
|
||||
data_groups: pd.DataFrame,
|
||||
cross_validator: BaseCrossValidator,
|
||||
):
|
||||
data_y_value_counts = data_y.value_counts()
|
||||
if len(data_y_value_counts) == 1:
|
||||
raise (ValueError("There is only one unique value in data_y."))
|
||||
if len(data_y_value_counts) == 2:
|
||||
metrics = ["accuracy", "average_precision", "recall", "f1"]
|
||||
else:
|
||||
metrics = ["accuracy", "precision_micro", "recall_micro", "f1_micro"]
|
||||
|
||||
test_metrics = ["test_" + metric for metric in metrics]
|
||||
|
||||
scores = pd.DataFrame(columns=["method", "max", "mean"])
|
||||
scores = pd.DataFrame(columns=["method", "test_metric", "max", "mean"])
|
||||
|
||||
dummy_class = DummyClassifier(strategy="most_frequent")
|
||||
|
||||
|
@ -344,17 +459,39 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
error_score="raise",
|
||||
scoring=metrics,
|
||||
)
|
||||
dummy_confusion_matrix = cross_validate(
|
||||
dummy_class,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
error_score="raise",
|
||||
scoring=confusion_matrix_scorer,
|
||||
)
|
||||
print("Dummy")
|
||||
|
||||
scores_df = pd.DataFrame(dummy_score)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
||||
scores_df["method"] = "Dummy"
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(dummy_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "dummy_classifier"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del dummy_class
|
||||
del dummy_score
|
||||
del dummy_confusion_matrix
|
||||
|
||||
logistic_regression = linear_model.LogisticRegression()
|
||||
|
||||
|
@ -363,16 +500,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
log_reg_confusion_matrix = cross_validate(
|
||||
logistic_regression,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
)
|
||||
print("Logistic regression")
|
||||
|
||||
scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
||||
scores_df["method"] = "logistic_reg"
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(log_reg_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "logistic_regression"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del logistic_regression
|
||||
del log_reg_scores
|
||||
del log_reg_confusion_matrix
|
||||
|
||||
svc = svm.SVC()
|
||||
|
||||
|
@ -381,16 +539,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
svc_confusion_matrix = cross_validate(
|
||||
svc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
)
|
||||
print("Support Vector Machine")
|
||||
|
||||
scores_df = pd.DataFrame(svc_scores)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
||||
scores_df["method"] = "svc"
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(svc_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "SVC"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del svc
|
||||
del svc_scores
|
||||
del svc_confusion_matrix
|
||||
|
||||
gaussian_nb = naive_bayes.GaussianNB()
|
||||
|
||||
|
@ -399,16 +578,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
gaussian_nb_confusion_matrix = cross_validate(
|
||||
gaussian_nb,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
)
|
||||
print("Gaussian Naive Bayes")
|
||||
|
||||
scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(gaussian_nb_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "gaussian_naive_bayes"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del gaussian_nb
|
||||
del gaussian_nb_scores
|
||||
del gaussian_nb_confusion_matrix
|
||||
|
||||
sgdc = linear_model.SGDClassifier()
|
||||
|
||||
|
@ -417,16 +617,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
sgdc_confusion_matrix = cross_validate(
|
||||
sgdc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
)
|
||||
print("Stochastic Gradient Descent")
|
||||
|
||||
scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
||||
scores_df["method"] = "stochastic_gradient_descent"
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(sgdc_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "stochastic_gradient_descent_classifier"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del sgdc
|
||||
del sgdc_scores
|
||||
del sgdc_confusion_matrix
|
||||
|
||||
rfc = ensemble.RandomForestClassifier()
|
||||
|
||||
|
@ -435,16 +656,37 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
rfc_confusion_matrix = cross_validate(
|
||||
rfc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
)
|
||||
print("Random Forest")
|
||||
|
||||
scores_df = pd.DataFrame(rfc_scores)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
||||
scores_df["method"] = "random_forest"
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(rfc_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "random_forest_classifier"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del rfc
|
||||
del rfc_scores
|
||||
del rfc_confusion_matrix
|
||||
|
||||
xgb_classifier = XGBClassifier()
|
||||
|
||||
|
@ -453,15 +695,36 @@ def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
xgb_confusion_matrix = cross_validate(
|
||||
xgb_classifier,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
)
|
||||
print("XGBoost")
|
||||
|
||||
scores_df = pd.DataFrame(xgb_scores)[test_metrics]
|
||||
scores_df = scores_df.agg(["max", "mean"]).transpose()
|
||||
scores_df["method"] = "xgboost"
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(xgb_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "XGBoost_classifier"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del xgb_classifier
|
||||
del xgb_scores
|
||||
del xgb_confusion_matrix
|
||||
|
||||
return scores
|
||||
|
|
|
@ -33,7 +33,7 @@ class Preprocessing:
|
|||
Args:
|
||||
categorical_features (DataFrame): DataFrame including only categorical columns.
|
||||
numerical_features (_type_): DataFrame including only numerical columns.
|
||||
mode (int): Mode of the column with which DataFrame is filled. TODO: check mode results
|
||||
mode (int): Mode of the column with which DataFrame is filled.
|
||||
|
||||
Returns:
|
||||
DataFrame: Hot-One Encoded DataFrame.
|
||||
|
@ -46,7 +46,7 @@ class Preprocessing:
|
|||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
return pd.concat([numerical_features, categorical_features], axis=1)
|
||||
return pd.concat([numerical_features, categorical_features], axis=1), categorical_features.columns.tolist()
|
||||
|
||||
|
||||
def one_hot_encode_train_and_test_sets(self, categorical_columns=["gender", "startlanguage", "mostcommonactivity", "homelabel"]):
|
||||
|
@ -68,19 +68,26 @@ class Preprocessing:
|
|||
categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
|
||||
|
||||
# For train set
|
||||
|
||||
train_X_categorical_features = self.train_X[categorical_columns].copy()
|
||||
train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
|
||||
mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
|
||||
|
||||
self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
|
||||
self.train_X, train_cat_col_names = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
|
||||
encoded_categorical_features = [col for col in self.train_X.columns if col.startswith(tuple(categorical_columns))]
|
||||
|
||||
# For test set
|
||||
|
||||
test_X_categorical_features = self.test_X[categorical_columns].copy()
|
||||
test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
|
||||
|
||||
self.test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
|
||||
self.test_X, test_cat_col_names = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
|
||||
|
||||
# Create categorical columns that were not found in test set and fill them with 0
|
||||
missing_cols = [col for col in train_cat_col_names if col not in test_cat_col_names]
|
||||
self.test_X[missing_cols] = 0
|
||||
|
||||
# Sort column names alphabetically
|
||||
self.train_X = self.train_X.reindex(sorted(self.train_X.columns), axis=1)
|
||||
self.test_X = self.test_X.reindex(sorted(self.test_X.columns), axis=1)
|
||||
|
||||
|
||||
def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
|
||||
|
|
|
@ -1,29 +0,0 @@
|
|||
method,metric,max,mean
|
||||
Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
|
||||
Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
|
||||
Dummy,test_recall,0.0,0.0
|
||||
Dummy,test_f1,0.0,0.0
|
||||
logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
|
||||
logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
|
||||
logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
|
||||
logistic_reg,test_f1,0.3909774436090226,0.318943511424051
|
||||
svc,test_accuracy,0.8557046979865772,0.8548446932649828
|
||||
svc,test_average_precision,0.44514416839823046,0.4068200938341621
|
||||
svc,test_recall,0.0,0.0
|
||||
svc,test_f1,0.0,0.0
|
||||
gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
|
||||
gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
|
||||
gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
|
||||
gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
|
||||
stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
|
||||
stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
|
||||
stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
|
||||
stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
|
||||
random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
|
||||
random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
|
||||
random_forest,test_recall,0.4069767441860465,0.35356856455493185
|
||||
random_forest,test_f1,0.5691056910569107,0.5078402513053142
|
||||
xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
|
||||
xgboost,test_average_precision,0.7366643049075349,0.698622165966308
|
||||
xgboost,test_recall,0.5287356321839081,0.44346431435445066
|
||||
xgboost,test_f1,0.638888888888889,0.5633957169928393
|
|
|
@ -1,29 +0,0 @@
|
|||
method,metric,max,mean
|
||||
Dummy,test_accuracy,1.0,0.8524114578096439
|
||||
Dummy,test_average_precision,0.7,0.14758854219035614
|
||||
Dummy,test_recall,0.0,0.0
|
||||
Dummy,test_f1,0.0,0.0
|
||||
logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
|
||||
logistic_reg,test_average_precision,1.0,0.44605167668563583
|
||||
logistic_reg,test_recall,1.0,0.25353566685532386
|
||||
logistic_reg,test_f1,0.823529411764706,0.27951926390778625
|
||||
svc,test_accuracy,1.0,0.8524114578096439
|
||||
svc,test_average_precision,0.9612401707068228,0.44179454944271934
|
||||
svc,test_recall,0.0,0.0
|
||||
svc,test_f1,0.0,0.0
|
||||
gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
|
||||
gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
|
||||
gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
|
||||
gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
|
||||
stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
|
||||
stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
|
||||
stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
|
||||
stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
|
||||
random_forest,test_accuracy,1.0,0.8722158105763481
|
||||
random_forest,test_average_precision,1.0,0.49817066323226833
|
||||
random_forest,test_recall,1.0,0.18161263127840668
|
||||
random_forest,test_f1,1.0,0.2508096532365307
|
||||
xgboost,test_accuracy,1.0,0.8812627400277729
|
||||
xgboost,test_average_precision,1.0,0.5505695112208401
|
||||
xgboost,test_recall,1.0,0.2896161238315027
|
||||
xgboost,test_f1,0.9411764705882353,0.36887408735855665
|
|
|
@ -0,0 +1,7 @@
|
|||
[tool.isort]
|
||||
profile = "black"
|
||||
py_version = 311
|
||||
skip_gitignore = "true"
|
||||
|
||||
[tool.black]
|
||||
target-version = ["py311"]
|
2
rapids
2
rapids
|
@ -1 +1 @@
|
|||
Subproject commit 63f5a526fce4d288499168e1701adadb8b885d82
|
||||
Subproject commit 059774bda10545a83ab282f59eb7a329fef9ee4c
|
5
setup.py
5
setup.py
|
@ -1,8 +1,7 @@
|
|||
import os
|
||||
|
||||
import sqlalchemy.engine.url
|
||||
from dotenv import load_dotenv
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy import URL, create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
load_dotenv()
|
||||
|
@ -11,7 +10,7 @@ testing: bool = False
|
|||
|
||||
db_password = os.getenv("DB_PASSWORD")
|
||||
|
||||
db_uri = sqlalchemy.engine.url.URL(
|
||||
db_uri = URL.create(
|
||||
drivername="postgresql+psycopg2",
|
||||
username="staw_db",
|
||||
password=db_password,
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
---
|
||||
title: "Reliability of SAM threat and challenge and COPE"
|
||||
output: html_notebook
|
||||
---
|
||||
|
||||
|
||||
```{r libraries, message=FALSE, warning=FALSE, include=FALSE, cache=FALSE}
|
||||
library(conflicted)
|
||||
library(here)
|
||||
library(tidyverse)
|
||||
library(magrittr)
|
||||
library(lavaan)
|
||||
library(kableExtra)
|
||||
|
||||
conflicts_prefer(
|
||||
readr::col_factor,
|
||||
purrr::discard,
|
||||
dplyr::filter,
|
||||
dplyr::lag,
|
||||
purrr::set_names,
|
||||
tidyr::extract,
|
||||
kableExtra::group_rows
|
||||
)
|
||||
```
|
||||
|
||||
```{r style, include=FALSE, cache=FALSE}
|
||||
styler::style_file(
|
||||
here("statistical_analysis", "scale_reliability.Rmd"),
|
||||
scope = "tokens",
|
||||
indent_by = 4L
|
||||
)
|
||||
```
|
||||
|
||||
The data were preprocessed and cleaned using [expl_esm_labels.py](../exploration/expl_esm_labels.py) script and read as csv here.
|
||||
|
||||
```{r read_data}
|
||||
COL_TYPES <- cols(
|
||||
.default = col_double(),
|
||||
participant_id = col_factor(),
|
||||
username = col_factor(),
|
||||
device_id = col_factor(),
|
||||
esm_trigger = col_factor(),
|
||||
esm_instructions = col_factor(),
|
||||
double_esm_user_answer_timestamp = col_double(),
|
||||
datetime_lj = col_datetime(format = ""),
|
||||
date_lj = col_date(format = ""),
|
||||
time = col_factor(),
|
||||
esm_user_answer = col_factor()
|
||||
)
|
||||
df_SAM <- read_csv(here("data", "raw", "df_esm_SAM_threat_challenge.csv"), col_types = COL_TYPES)
|
||||
df_COPE <- read_csv(here("data", "raw", "df_esm_COPE.csv"), col_types = COL_TYPES)
|
||||
```
|
||||
|
||||
Demonstrate factor analysis for a single participant.
|
||||
|
||||
```{r}
|
||||
df_COPE %>%
|
||||
group_by(question_id, questionnaire_id) %>%
|
||||
count()
|
||||
```
|
|
@ -0,0 +1,20 @@
|
|||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: Default
|
||||
SaveWorkspace: Default
|
||||
AlwaysSaveHistory: Default
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: No
|
||||
NumSpacesForTab: 4
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: Sweave
|
||||
LaTeX: XeLaTeX
|
||||
|
||||
AutoAppendNewline: Yes
|
||||
StripTrailingWhitespace: Yes
|
||||
|
||||
PythonType: conda
|
||||
PythonVersion: 3.11.3
|
||||
PythonPath: E:/ProgramData/mambaforge/envs/straw2analysis/python.exe
|
Loading…
Reference in New Issue