Compare commits
No commits in common. "master" and "ml_pipeline" have entirely different histories.
master
...
ml_pipelin
9
.flake8
9
.flake8
|
@ -1,9 +0,0 @@
|
||||||
[flake8]
|
|
||||||
max-line-length = 88
|
|
||||||
extend-ignore =
|
|
||||||
E203,
|
|
||||||
# E501 line too long for docstrings
|
|
||||||
D501
|
|
||||||
per-file-ignores =
|
|
||||||
exploration/*.py:E501
|
|
||||||
docstring-convention = numpy
|
|
|
@ -12,15 +12,12 @@ __pycache__/
|
||||||
/data/*input*.csv
|
/data/*input*.csv
|
||||||
/data/daily*
|
/data/daily*
|
||||||
/data/intradaily*
|
/data/intradaily*
|
||||||
/data/raw
|
|
||||||
/data/stressfulness_event*
|
/data/stressfulness_event*
|
||||||
/data/30min*
|
/data/30min*
|
||||||
/presentation/*scores.csv
|
/presentation/*scores.csv
|
||||||
/presentation/Results.ods
|
/presentation/Results.ods
|
||||||
/presentation/results/
|
|
||||||
.Rproj.user
|
.Rproj.user
|
||||||
.Rhistory
|
.Rhistory
|
||||||
/presentation/*.nb.html
|
/presentation/*.nb.html
|
||||||
presentation/event_stressful_detection_half_loso.csv
|
presentation/event_stressful_detection_half_loso.csv
|
||||||
presentation/event_stressful_detection_loso.csv
|
presentation/event_stressful_detection_loso.csv
|
||||||
/statistical_analysis/scale_reliability.nb.html
|
|
||||||
|
|
|
@ -1,6 +0,0 @@
|
||||||
<component name="ProjectCodeStyleConfiguration">
|
|
||||||
<code_scheme name="Project" version="173">
|
|
||||||
<option name="RIGHT_MARGIN" value="150" />
|
|
||||||
<option name="SOFT_MARGINS" value="88" />
|
|
||||||
</code_scheme>
|
|
||||||
</component>
|
|
|
@ -1,5 +0,0 @@
|
||||||
<component name="ProjectCodeStyleConfiguration">
|
|
||||||
<state>
|
|
||||||
<option name="USE_PER_PROJECT_SETTINGS" value="true" />
|
|
||||||
</state>
|
|
||||||
</component>
|
|
|
@ -1,3 +0,0 @@
|
||||||
<component name="ProjectDictionaryState">
|
|
||||||
<dictionary name="junos" />
|
|
||||||
</component>
|
|
|
@ -1,9 +1,6 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="straw2analysis" project-jdk-type="Python SDK" />
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (straw2analysis)" project-jdk-type="Python SDK" />
|
||||||
<component name="PyCharmDSProjectLayout">
|
|
||||||
<option name="id" value="JupyterRightHiddenStructureLayout" />
|
|
||||||
</component>
|
|
||||||
<component name="PyCharmProfessionalAdvertiser">
|
<component name="PyCharmProfessionalAdvertiser">
|
||||||
<option name="shown" value="true" />
|
<option name="shown" value="true" />
|
||||||
</component>
|
</component>
|
||||||
|
@ -17,14 +14,6 @@
|
||||||
</RMarkdownRenderProfile>
|
</RMarkdownRenderProfile>
|
||||||
</value>
|
</value>
|
||||||
</entry>
|
</entry>
|
||||||
<entry key="file://$PROJECT_DIR$/statistical_analysis/scale_reliability.rmd">
|
|
||||||
<value>
|
|
||||||
<RMarkdownRenderProfile>
|
|
||||||
<option name="lastOutput" value="$PROJECT_DIR$/statistical_analysis/scale_reliability.nb.html" />
|
|
||||||
<option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/statistical_analysis" />
|
|
||||||
</RMarkdownRenderProfile>
|
|
||||||
</value>
|
|
||||||
</entry>
|
|
||||||
</map>
|
</map>
|
||||||
</option>
|
</option>
|
||||||
</component>
|
</component>
|
||||||
|
|
|
@ -1,9 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="RGraphicsSettings">
|
|
||||||
<option name="height" value="600" />
|
|
||||||
<option name="resolution" value="75" />
|
|
||||||
<option name="version" value="2" />
|
|
||||||
<option name="width" value="960" />
|
|
||||||
</component>
|
|
||||||
</project>
|
|
|
@ -1,7 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="RMarkdownGraphicsSettings">
|
|
||||||
<option name="globalResolution" value="75" />
|
|
||||||
<option name="version" value="2" />
|
|
||||||
</component>
|
|
||||||
</project>
|
|
|
@ -1,6 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="RSettings">
|
|
||||||
<option name="interpreterPath" value="C:\Program Files\R\R-4.3.1\bin\R.exe" />
|
|
||||||
</component>
|
|
||||||
</project>
|
|
|
@ -5,7 +5,7 @@
|
||||||
<excludeFolder url="file://$MODULE_DIR$/config/.ipynb_checkpoints" />
|
<excludeFolder url="file://$MODULE_DIR$/config/.ipynb_checkpoints" />
|
||||||
<excludeFolder url="file://$MODULE_DIR$/exploration/.ipynb_checkpoints" />
|
<excludeFolder url="file://$MODULE_DIR$/exploration/.ipynb_checkpoints" />
|
||||||
</content>
|
</content>
|
||||||
<orderEntry type="jdk" jdkName="straw2analysis" jdkType="Python SDK" />
|
<orderEntry type="jdk" jdkName="Python 3.9 (straw2analysis)" jdkType="Python SDK" />
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
</component>
|
</component>
|
||||||
<component name="PyDocumentationSettings">
|
<component name="PyDocumentationSettings">
|
||||||
|
|
|
@ -3,5 +3,6 @@
|
||||||
<component name="VcsDirectoryMappings">
|
<component name="VcsDirectoryMappings">
|
||||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
<mapping directory="$PROJECT_DIR$/rapids" vcs="Git" />
|
<mapping directory="$PROJECT_DIR$/rapids" vcs="Git" />
|
||||||
|
<mapping directory="$PROJECT_DIR$/rapids/calculatingfeatures" vcs="Git" />
|
||||||
</component>
|
</component>
|
||||||
</project>
|
</project>
|
|
@ -1,30 +0,0 @@
|
||||||
repos:
|
|
||||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
||||||
rev: v4.4.0
|
|
||||||
hooks:
|
|
||||||
- id: check-yaml
|
|
||||||
- id: end-of-file-fixer
|
|
||||||
- id: trailing-whitespace
|
|
||||||
- repo: https://github.com/pycqa/isort
|
|
||||||
rev: 5.12.0
|
|
||||||
hooks:
|
|
||||||
- id: isort
|
|
||||||
name: isort (python)
|
|
||||||
- repo: https://github.com/psf/black
|
|
||||||
rev: 23.3.0
|
|
||||||
hooks:
|
|
||||||
- id: black
|
|
||||||
language_version: python3
|
|
||||||
- repo: https://github.com/pycqa/flake8
|
|
||||||
rev: 6.0.0
|
|
||||||
hooks:
|
|
||||||
- id: flake8
|
|
||||||
# - repo: https://github.com/mwouts/jupytext
|
|
||||||
# rev: v1.14.7
|
|
||||||
# hooks:
|
|
||||||
# - id: jupytext
|
|
||||||
# args: [ --from, "py:percent", --to, "ipynb" ]
|
|
||||||
# additional_dependencies:
|
|
||||||
# - isort==5.12.0 # Matches hook
|
|
||||||
# - black==23.3.0
|
|
||||||
# - flake8==6.0.0
|
|
|
@ -1,12 +1,12 @@
|
||||||
name: straw2analysis
|
name: straw2analysis
|
||||||
channels:
|
channels:
|
||||||
|
- defaults
|
||||||
- conda-forge
|
- conda-forge
|
||||||
dependencies:
|
dependencies:
|
||||||
- python=3.11
|
- python=3.9
|
||||||
- black
|
- black
|
||||||
- isort
|
- isort
|
||||||
- flake8
|
- flake8
|
||||||
- flake8-docstrings
|
|
||||||
- imbalanced-learn=0.10.0
|
- imbalanced-learn=0.10.0
|
||||||
- jupyterlab
|
- jupyterlab
|
||||||
- jupytext
|
- jupytext
|
||||||
|
@ -15,7 +15,6 @@ dependencies:
|
||||||
- nodejs
|
- nodejs
|
||||||
- pandas
|
- pandas
|
||||||
- psycopg2 >= 2.9.1
|
- psycopg2 >= 2.9.1
|
||||||
- pre-commit
|
|
||||||
- python-dotenv
|
- python-dotenv
|
||||||
- pytz
|
- pytz
|
||||||
- pyprojroot
|
- pyprojroot
|
||||||
|
@ -25,4 +24,3 @@ dependencies:
|
||||||
- sqlalchemy
|
- sqlalchemy
|
||||||
- statsmodels
|
- statsmodels
|
||||||
- tabulate
|
- tabulate
|
||||||
- xgboost
|
|
||||||
|
|
|
@ -14,9 +14,15 @@
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
import os, sys
|
||||||
|
import importlib
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from rapids.src.features.utils.utils import chunk_episodes
|
# import plotly.graph_objects as go
|
||||||
|
from importlib import util
|
||||||
|
from pathlib import Path
|
||||||
|
import yaml
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
phone_data_yield = pd.read_csv(
|
phone_data_yield = pd.read_csv(
|
||||||
|
@ -30,29 +36,23 @@ time_segments_labels = pd.read_csv(
|
||||||
# %%
|
# %%
|
||||||
phone_data_yield["assigned_segments"] = phone_data_yield[
|
phone_data_yield["assigned_segments"] = phone_data_yield[
|
||||||
"assigned_segments"
|
"assigned_segments"
|
||||||
].str.replace(r"_RR\d+SS#", "#", regex=True)
|
].str.replace(r"_RR\d+SS#", "#")
|
||||||
time_segments_labels["label"] = time_segments_labels["label"].str.replace(
|
time_segments_labels["label"] = time_segments_labels["label"].str.replace(
|
||||||
r"_RR\d+SS$", "", regex=True
|
r"_RR\d+SS$", ""
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# %% tags=[]
|
# %% tags=[]
|
||||||
def filter_data_by_segment(data, time_segment_current):
|
def filter_data_by_segment(data, time_segment):
|
||||||
data.dropna(subset=["assigned_segments"], inplace=True)
|
data.dropna(subset=["assigned_segments"], inplace=True)
|
||||||
if data.shape[0] == 0: # data is empty
|
if data.shape[0] == 0: # data is empty
|
||||||
data["local_segment"] = data["timestamps_segment"] = None
|
data["local_segment"] = data["timestamps_segment"] = None
|
||||||
return data
|
return data
|
||||||
|
|
||||||
datetime_regex = (
|
datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
|
||||||
r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
|
timestamps_regex = "[0-9]{13}"
|
||||||
)
|
segment_regex = "\[({}#{},{};{},{})\]".format(
|
||||||
timestamps_regex = r"[0-9]{13}"
|
time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
|
||||||
segment_regex = r"\[({}#{},{};{},{})\]".format(
|
|
||||||
time_segment_current,
|
|
||||||
datetime_regex,
|
|
||||||
datetime_regex,
|
|
||||||
timestamps_regex,
|
|
||||||
timestamps_regex,
|
|
||||||
)
|
)
|
||||||
data["local_segment"] = data["assigned_segments"].str.extract(
|
data["local_segment"] = data["assigned_segments"].str.extract(
|
||||||
segment_regex, expand=True
|
segment_regex, expand=True
|
||||||
|
@ -147,18 +147,15 @@ def getDataForPlot(phone_data_yield_per_segment):
|
||||||
.fillna(0)
|
.fillna(0)
|
||||||
)
|
)
|
||||||
|
|
||||||
# transpose the dataframe per local start datetime of the segment
|
# transpose the dataframe per local start datetime of the segment and discard the useless index layer
|
||||||
# and discard the useless index layer
|
|
||||||
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
||||||
"local_segment_start_datetimes"
|
"local_segment_start_datetimes"
|
||||||
)[["minutes_after_segment_start", "sensor"]].apply(
|
)[["minutes_after_segment_start", "sensor"]].apply(
|
||||||
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
||||||
)
|
)
|
||||||
phone_data_yield_per_segment.index = (
|
phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
|
||||||
phone_data_yield_per_segment.index.get_level_values(
|
|
||||||
"local_segment_start_datetimes"
|
"local_segment_start_datetimes"
|
||||||
)
|
)
|
||||||
)
|
|
||||||
return phone_data_yield_per_segment
|
return phone_data_yield_per_segment
|
||||||
|
|
||||||
|
|
||||||
|
@ -230,13 +227,9 @@ phone_data_yield_per_segment.tail()
|
||||||
# # A workaround
|
# # A workaround
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
phone_data_yield_per_segment[
|
phone_data_yield_per_segment["local_segment_start_datetimes", "minutes_after_segment_start"] = phone_data_yield_per_segment[
|
||||||
"local_segment_start_datetimes", "minutes_after_segment_start"
|
|
||||||
] = phone_data_yield_per_segment[
|
|
||||||
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
||||||
].drop_duplicates(
|
].drop_duplicates(keep="first")
|
||||||
keep="first"
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
phone_data_yield_per_segment.set_index(
|
phone_data_yield_per_segment.set_index(
|
||||||
|
@ -251,9 +244,8 @@ phone_data_yield_per_segment.head()
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# # Retry
|
# # Retry
|
||||||
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
def get_data_for_plot(phone_data_yield_per_segment):
|
def getDataForPlot(phone_data_yield_per_segment):
|
||||||
# calculate the length (in minute) of per segment instance
|
# calculate the length (in minute) of per segment instance
|
||||||
phone_data_yield_per_segment["length"] = (
|
phone_data_yield_per_segment["length"] = (
|
||||||
phone_data_yield_per_segment["timestamps_segment"]
|
phone_data_yield_per_segment["timestamps_segment"]
|
||||||
|
@ -300,10 +292,7 @@ def get_data_for_plot(phone_data_yield_per_segment):
|
||||||
full_index,
|
full_index,
|
||||||
names=("local_segment_start_datetimes", "minutes_after_segment_start"),
|
names=("local_segment_start_datetimes", "minutes_after_segment_start"),
|
||||||
)
|
)
|
||||||
phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(
|
phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(subset=["local_segment_start_datetimes", "minutes_after_segment_start"],keep="first")
|
||||||
subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
|
|
||||||
keep="first",
|
|
||||||
)
|
|
||||||
phone_data_yield_per_segment = (
|
phone_data_yield_per_segment = (
|
||||||
phone_data_yield_per_segment.set_index(
|
phone_data_yield_per_segment.set_index(
|
||||||
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
||||||
|
@ -313,18 +302,15 @@ def get_data_for_plot(phone_data_yield_per_segment):
|
||||||
.fillna(0)
|
.fillna(0)
|
||||||
)
|
)
|
||||||
|
|
||||||
# transpose the dataframe per local start datetime of the segment
|
# transpose the dataframe per local start datetime of the segment and discard the useless index layer
|
||||||
# and discard the useless index layer
|
|
||||||
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
||||||
"local_segment_start_datetimes"
|
"local_segment_start_datetimes"
|
||||||
)[["minutes_after_segment_start", "sensor"]].apply(
|
)[["minutes_after_segment_start", "sensor"]].apply(
|
||||||
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
||||||
)
|
)
|
||||||
phone_data_yield_per_segment.index = (
|
phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
|
||||||
phone_data_yield_per_segment.index.get_level_values(
|
|
||||||
"local_segment_start_datetimes"
|
"local_segment_start_datetimes"
|
||||||
)
|
)
|
||||||
)
|
|
||||||
return phone_data_yield_per_segment
|
return phone_data_yield_per_segment
|
||||||
|
|
||||||
|
|
||||||
|
@ -332,6 +318,6 @@ def get_data_for_plot(phone_data_yield_per_segment):
|
||||||
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
|
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
data_for_plot_per_segment = get_data_for_plot(phone_data_yield_per_segment)
|
data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
# extension: .py
|
# extension: .py
|
||||||
# format_name: percent
|
# format_name: percent
|
||||||
# format_version: '1.3'
|
# format_version: '1.3'
|
||||||
# jupytext_version: 1.14.5
|
# jupytext_version: 1.13.0
|
||||||
# kernelspec:
|
# kernelspec:
|
||||||
# display_name: straw2analysis
|
# display_name: straw2analysis
|
||||||
# language: python
|
# language: python
|
||||||
|
@ -15,33 +15,19 @@
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
|
|
||||||
|
nb_dir = os.path.split(os.getcwd())[0]
|
||||||
|
if nb_dir not in sys.path:
|
||||||
|
sys.path.append(nb_dir)
|
||||||
import participants.query_db
|
import participants.query_db
|
||||||
from features.esm import (
|
from features.esm import *
|
||||||
QUESTIONNAIRE_IDS,
|
from features.esm_JCQ import *
|
||||||
clean_up_esm,
|
from features.esm_SAM import *
|
||||||
get_esm_data,
|
|
||||||
increment_answers,
|
|
||||||
preprocess_esm,
|
|
||||||
reassign_question_ids,
|
|
||||||
)
|
|
||||||
from features.esm_COPE import DICT_COPE_QUESTION_IDS
|
|
||||||
from features.esm_JCQ import reverse_jcq_demand_control_scoring
|
|
||||||
from features.esm_SAM import DICT_SAM_QUESTION_IDS, extract_stressful_events
|
|
||||||
|
|
||||||
# import os
|
|
||||||
# import sys
|
|
||||||
# nb_dir = os.path.split(os.getcwd())[0]
|
|
||||||
# if nb_dir not in sys.path:
|
|
||||||
# sys.path.append(nb_dir)
|
|
||||||
|
|
||||||
|
|
||||||
# %%
|
|
||||||
save_figs = False
|
|
||||||
export_data = True
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
participants_inactive_usernames = participants.query_db.get_usernames(
|
participants_inactive_usernames = participants.query_db.get_usernames(
|
||||||
|
@ -57,14 +43,8 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_PANAS = df_esm_preprocessed[
|
df_esm_PANAS = df_esm_preprocessed[
|
||||||
(
|
(df_esm_preprocessed["questionnaire_id"] == 8)
|
||||||
df_esm_preprocessed["questionnaire_id"]
|
| (df_esm_preprocessed["questionnaire_id"] == 9)
|
||||||
== QUESTIONNAIRE_IDS["PANAS_positive_affect"]
|
|
||||||
)
|
|
||||||
| (
|
|
||||||
df_esm_preprocessed["questionnaire_id"]
|
|
||||||
== QUESTIONNAIRE_IDS["PANAS_negative_affect"]
|
|
||||||
)
|
|
||||||
]
|
]
|
||||||
df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)
|
df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)
|
||||||
|
|
||||||
|
@ -85,47 +65,35 @@ df_esm_PANAS_daily_means = (
|
||||||
# %%
|
# %%
|
||||||
df_esm_PANAS_summary_participant = (
|
df_esm_PANAS_summary_participant = (
|
||||||
df_esm_PANAS_daily_means.groupby(["participant_id", "questionnaire_id"])
|
df_esm_PANAS_daily_means.groupby(["participant_id", "questionnaire_id"])
|
||||||
.esm_numeric_mean.agg(["mean", "median", "std"])
|
.agg(["mean", "median", "std"])
|
||||||
.reset_index(col_level=1)
|
.reset_index(col_level=1)
|
||||||
)
|
)
|
||||||
|
df_esm_PANAS_summary_participant.columns = df_esm_PANAS_summary_participant.columns.get_level_values(
|
||||||
|
1
|
||||||
|
)
|
||||||
df_esm_PANAS_summary_participant[
|
df_esm_PANAS_summary_participant[
|
||||||
"PANAS subscale"
|
"PANAS_subscale"
|
||||||
] = df_esm_PANAS_daily_means.questionnaire_id.astype("category").cat.rename_categories(
|
] = df_esm_PANAS_daily_means.questionnaire_id.astype("category").cat.rename_categories(
|
||||||
{8.0: "positive affect", 9.0: "negative affect"}
|
{8.0: "PA", 9.0: "NA"}
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["mean"]
|
sns.displot(
|
||||||
|
data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS_subscale", binwidth=0.2
|
||||||
# %%
|
|
||||||
df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["std"]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
df_esm_PANAS_summary_participant.query("std == 0")
|
|
||||||
|
|
||||||
# %%
|
|
||||||
fig1 = sns.displot(
|
|
||||||
data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS subscale", binwidth=0.2
|
|
||||||
)
|
)
|
||||||
fig1.set_axis_labels(x_var="participant mean", y_var="frequency")
|
|
||||||
if save_figs:
|
|
||||||
fig1.figure.savefig("PANAS_mean_participant.pdf", dpi=300)
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(
|
sns.displot(
|
||||||
data=df_esm_PANAS_summary_participant,
|
data=df_esm_PANAS_summary_participant,
|
||||||
x="median",
|
x="median",
|
||||||
hue="PANAS subscale",
|
hue="PANAS_subscale",
|
||||||
binwidth=0.2,
|
binwidth=0.2,
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
fig2 = sns.displot(
|
sns.displot(
|
||||||
data=df_esm_PANAS_summary_participant, x="std", hue="PANAS subscale", binwidth=0.05
|
data=df_esm_PANAS_summary_participant, x="std", hue="PANAS_subscale", binwidth=0.05
|
||||||
)
|
)
|
||||||
fig2.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
|
||||||
if save_figs:
|
|
||||||
fig2.figure.savefig("PANAS_std_participant.pdf", dpi=300)
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
|
df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
|
||||||
|
@ -141,14 +109,8 @@ df_SAM_all.head()
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM = df_esm_preprocessed[
|
df_esm_SAM = df_esm_preprocessed[
|
||||||
(
|
(df_esm_preprocessed["questionnaire_id"] >= 87)
|
||||||
df_esm_preprocessed["questionnaire_id"]
|
& (df_esm_preprocessed["questionnaire_id"] <= 93)
|
||||||
>= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
|
||||||
)
|
|
||||||
& (
|
|
||||||
df_esm_preprocessed["questionnaire_id"]
|
|
||||||
<= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
|
|
||||||
)
|
|
||||||
]
|
]
|
||||||
df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
|
df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
|
||||||
|
|
||||||
|
@ -156,10 +118,9 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
|
||||||
# ## Stressful events
|
# ## Stressful events
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_event = df_esm_SAM_clean[
|
df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign(
|
||||||
df_esm_SAM_clean["questionnaire_id"]
|
stressful_event=lambda x: (x.esm_user_answer_numeric > 0)
|
||||||
== QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
)
|
||||||
].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0))
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_daily_events = (
|
df_esm_SAM_daily_events = (
|
||||||
|
@ -170,22 +131,20 @@ df_esm_SAM_daily_events = (
|
||||||
)
|
)
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# Calculate the daily mean of YES (1) or NO (0) answers to the question about stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
|
# Calculate the daily mean of YES (1) or NO (0) answers to the question about a stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_event_summary_participant = (
|
df_esm_SAM_event_summary_participant = (
|
||||||
df_esm_SAM_daily_events.groupby(["participant_id"])
|
df_esm_SAM_daily_events.groupby(["participant_id"])
|
||||||
.SAM_event_ratio.agg(["mean", "median", "std"])
|
.agg(["mean", "median", "std"])
|
||||||
.reset_index(col_level=1)
|
.reset_index(col_level=1)
|
||||||
)
|
)
|
||||||
|
df_esm_SAM_event_summary_participant.columns = df_esm_SAM_event_summary_participant.columns.get_level_values(
|
||||||
|
1
|
||||||
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
fig6 = sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
|
sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
|
||||||
fig6.set_axis_labels(
|
|
||||||
x_var="participant proportion of stressful events", y_var="frequency"
|
|
||||||
)
|
|
||||||
if save_figs:
|
|
||||||
fig6.figure.savefig("SAM_events_mean_participant.pdf", dpi=300)
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
|
sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
|
||||||
|
@ -196,12 +155,7 @@ sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# * Example of threat: "Did this event make you feel anxious?"
|
# * Example of threat: "Did this event make you feel anxious?"
|
||||||
# * Example of challenge: "How eager are you to tackle this event?"
|
# * Example of challenge: "How eager are you to tackle this event?"
|
||||||
# * Possible answers:
|
# * Possible answers: 0 - Not at all, 1 - Slightly, 2 - Moderately, 3 - Considerably, 4 - Extremely
|
||||||
# 0 - Not at all,
|
|
||||||
# 1 - Slightly,
|
|
||||||
# 2 - Moderately,
|
|
||||||
# 3 - Considerably,
|
|
||||||
# 4 - Extremely
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_daily = (
|
df_esm_SAM_daily = (
|
||||||
|
@ -213,45 +167,27 @@ df_esm_SAM_daily = (
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
|
df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
|
||||||
(df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
|
(df_esm_SAM_daily["questionnaire_id"] == 88)
|
||||||
| (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
|
| (df_esm_SAM_daily["questionnaire_id"] == 89)
|
||||||
]
|
]
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_summary_participant = (
|
df_esm_SAM_summary_participant = (
|
||||||
df_esm_SAM_daily.groupby(["participant_id", "questionnaire_id"])
|
df_esm_SAM_daily.groupby(["participant_id", "questionnaire_id"])
|
||||||
.esm_numeric_mean.agg(["mean", "median", "std"])
|
.agg(["mean", "median", "std"])
|
||||||
.reset_index(col_level=1)
|
.reset_index(col_level=1)
|
||||||
)
|
)
|
||||||
|
df_esm_SAM_summary_participant.columns = df_esm_SAM_summary_participant.columns.get_level_values(
|
||||||
# %%
|
1
|
||||||
df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[
|
|
||||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
|
||||||
== QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
|
||||||
]
|
|
||||||
df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
df_esm_SAM_event_stressfulness_summary_participant.describe()["std"]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
sns.displot(
|
|
||||||
data=df_esm_SAM_event_stressfulness_summary_participant, x="mean", binwidth=0.2
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
|
df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
|
||||||
(
|
(df_esm_SAM_summary_participant["questionnaire_id"] == 88)
|
||||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
| (df_esm_SAM_summary_participant["questionnaire_id"] == 89)
|
||||||
== QUESTIONNAIRE_IDS["appraisal_threat"]
|
|
||||||
)
|
|
||||||
| (
|
|
||||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
|
||||||
== QUESTIONNAIRE_IDS["appraisal_challenge"]
|
|
||||||
)
|
|
||||||
]
|
]
|
||||||
df_esm_SAM_threat_challenge_summary_participant[
|
df_esm_SAM_threat_challenge_summary_participant[
|
||||||
"event subscale"
|
"event_subscale"
|
||||||
] = df_esm_SAM_threat_challenge_summary_participant.questionnaire_id.astype(
|
] = df_esm_SAM_threat_challenge_summary_participant.questionnaire_id.astype(
|
||||||
"category"
|
"category"
|
||||||
).cat.rename_categories(
|
).cat.rename_categories(
|
||||||
|
@ -262,84 +198,26 @@ df_esm_SAM_threat_challenge_summary_participant[
|
||||||
sns.displot(
|
sns.displot(
|
||||||
data=df_esm_SAM_threat_challenge_summary_participant,
|
data=df_esm_SAM_threat_challenge_summary_participant,
|
||||||
x="mean",
|
x="mean",
|
||||||
hue="event subscale",
|
hue="event_subscale",
|
||||||
binwidth=0.2,
|
binwidth=0.2,
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
fig3 = sns.displot(
|
sns.displot(
|
||||||
data=df_esm_SAM_threat_challenge_summary_participant,
|
data=df_esm_SAM_threat_challenge_summary_participant,
|
||||||
x="std",
|
x="std",
|
||||||
hue="event subscale",
|
hue="event_subscale",
|
||||||
binwidth=0.1,
|
binwidth=0.1,
|
||||||
)
|
)
|
||||||
fig3.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
|
||||||
if save_figs:
|
|
||||||
fig3.figure.savefig("SAM_std_participant.pdf", dpi=300)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
|
|
||||||
"mean"
|
|
||||||
]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
|
|
||||||
"std"
|
|
||||||
]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
df_esm_SAM_clean.columns
|
|
||||||
|
|
||||||
# %%
|
|
||||||
df_esm_SAM_clean.esm_status.value_counts()
|
|
||||||
|
|
||||||
# %%
|
|
||||||
if export_data:
|
|
||||||
df_esm_SAM_fixed = reassign_question_ids(df_esm_SAM_clean, DICT_SAM_QUESTION_IDS)
|
|
||||||
df_esm_SAM_fixed = increment_answers(df_esm_SAM_fixed)
|
|
||||||
df_esm_SAM_for_export = df_esm_SAM_fixed[
|
|
||||||
[
|
|
||||||
"participant_id",
|
|
||||||
"username",
|
|
||||||
"device_id",
|
|
||||||
"_id",
|
|
||||||
"esm_trigger",
|
|
||||||
"esm_session",
|
|
||||||
"esm_notification_id",
|
|
||||||
"question_id",
|
|
||||||
"questionnaire_id",
|
|
||||||
"esm_instructions",
|
|
||||||
"double_esm_user_answer_timestamp",
|
|
||||||
"datetime_lj",
|
|
||||||
"date_lj",
|
|
||||||
"time",
|
|
||||||
"esm_user_answer",
|
|
||||||
"esm_user_answer_numeric",
|
|
||||||
]
|
|
||||||
]
|
|
||||||
df_esm_SAM_for_export.sort_values(
|
|
||||||
by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
|
|
||||||
)
|
|
||||||
print(df_esm_SAM_for_export.head())
|
|
||||||
df_esm_SAM_for_export.to_csv(
|
|
||||||
"../data/raw/df_esm_SAM_threat_challenge.csv", index=False
|
|
||||||
)
|
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ## Stressfulness of period
|
# ## Stressfulness of period
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
|
df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
|
||||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
df_esm_SAM_summary_participant["questionnaire_id"] == 93
|
||||||
== QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# %%
|
|
||||||
df_esm_SAM_period_summary_participant.describe()["mean"]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
df_esm_SAM_period_summary_participant.describe()["std"]
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
sns.displot(data=df_esm_SAM_period_summary_participant, x="mean", binwidth=0.2)
|
sns.displot(data=df_esm_SAM_period_summary_participant, x="mean", binwidth=0.2)
|
||||||
|
|
||||||
|
@ -351,8 +229,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_JCQ_demand_control = df_esm_preprocessed[
|
df_esm_JCQ_demand_control = df_esm_preprocessed[
|
||||||
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"])
|
(df_esm_preprocessed["questionnaire_id"] >= 10)
|
||||||
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"])
|
& (df_esm_preprocessed["questionnaire_id"] <= 11)
|
||||||
]
|
]
|
||||||
df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)
|
df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)
|
||||||
|
|
||||||
|
@ -372,11 +250,14 @@ df_esm_JCQ_daily = (
|
||||||
)
|
)
|
||||||
df_esm_JCQ_summary_participant = (
|
df_esm_JCQ_summary_participant = (
|
||||||
df_esm_JCQ_daily.groupby(["participant_id", "questionnaire_id"])
|
df_esm_JCQ_daily.groupby(["participant_id", "questionnaire_id"])
|
||||||
.esm_score_mean.agg(["mean", "median", "std"])
|
.agg(["mean", "median", "std"])
|
||||||
.reset_index(col_level=1)
|
.reset_index(col_level=1)
|
||||||
)
|
)
|
||||||
|
df_esm_JCQ_summary_participant.columns = df_esm_JCQ_summary_participant.columns.get_level_values(
|
||||||
|
1
|
||||||
|
)
|
||||||
df_esm_JCQ_summary_participant[
|
df_esm_JCQ_summary_participant[
|
||||||
"JCQ subscale"
|
"JCQ_subscale"
|
||||||
] = df_esm_JCQ_summary_participant.questionnaire_id.astype(
|
] = df_esm_JCQ_summary_participant.questionnaire_id.astype(
|
||||||
"category"
|
"category"
|
||||||
).cat.rename_categories(
|
).cat.rename_categories(
|
||||||
|
@ -384,71 +265,11 @@ df_esm_JCQ_summary_participant[
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["mean"]
|
sns.displot(
|
||||||
|
data=df_esm_JCQ_summary_participant, x="mean", hue="JCQ_subscale", binwidth=0.1,
|
||||||
# %%
|
|
||||||
df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["std"]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
fig4 = sns.displot(
|
|
||||||
data=df_esm_JCQ_summary_participant,
|
|
||||||
x="mean",
|
|
||||||
hue="JCQ subscale",
|
|
||||||
binwidth=0.1,
|
|
||||||
)
|
)
|
||||||
fig4.set_axis_labels(x_var="participant mean", y_var="frequency")
|
|
||||||
if save_figs:
|
|
||||||
fig4.figure.savefig("JCQ_mean_participant.pdf", dpi=300)
|
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
fig5 = sns.displot(
|
sns.displot(
|
||||||
data=df_esm_JCQ_summary_participant,
|
data=df_esm_JCQ_summary_participant, x="std", hue="JCQ_subscale", binwidth=0.05,
|
||||||
x="std",
|
|
||||||
hue="JCQ subscale",
|
|
||||||
binwidth=0.05,
|
|
||||||
)
|
)
|
||||||
fig6.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
|
||||||
if save_figs:
|
|
||||||
fig5.figure.savefig("JCQ_std_participant.pdf", dpi=300)
|
|
||||||
|
|
||||||
# %% [markdown]
|
|
||||||
# # COPE Inventory
|
|
||||||
|
|
||||||
# %%
|
|
||||||
df_esm_COPE = df_esm_preprocessed[
|
|
||||||
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"])
|
|
||||||
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"])
|
|
||||||
]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
df_esm_COPE_clean = clean_up_esm(df_esm_COPE)
|
|
||||||
df_esm_COPE_clean = increment_answers(df_esm_COPE_clean)
|
|
||||||
df_esm_COPE_fixed = reassign_question_ids(df_esm_COPE_clean, DICT_COPE_QUESTION_IDS)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
if export_data:
|
|
||||||
df_esm_COPE_for_export = df_esm_COPE_fixed[
|
|
||||||
[
|
|
||||||
"participant_id",
|
|
||||||
"username",
|
|
||||||
"device_id",
|
|
||||||
"_id",
|
|
||||||
"esm_trigger",
|
|
||||||
"esm_session",
|
|
||||||
"esm_notification_id",
|
|
||||||
"question_id",
|
|
||||||
"questionnaire_id",
|
|
||||||
"esm_instructions",
|
|
||||||
"double_esm_user_answer_timestamp",
|
|
||||||
"datetime_lj",
|
|
||||||
"date_lj",
|
|
||||||
"time",
|
|
||||||
"esm_user_answer",
|
|
||||||
"esm_user_answer_numeric",
|
|
||||||
]
|
|
||||||
]
|
|
||||||
df_esm_COPE_for_export.sort_values(
|
|
||||||
by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
|
|
||||||
)
|
|
||||||
print(df_esm_COPE_for_export.head())
|
|
||||||
df_esm_COPE_for_export.to_csv("../data/raw/df_esm_COPE.csv", index=False)
|
|
||||||
|
|
|
@ -6,129 +6,457 @@
|
||||||
# extension: .py
|
# extension: .py
|
||||||
# format_name: percent
|
# format_name: percent
|
||||||
# format_version: '1.3'
|
# format_version: '1.3'
|
||||||
# jupytext_version: 1.14.5
|
# jupytext_version: 1.13.0
|
||||||
# kernelspec:
|
# kernelspec:
|
||||||
# display_name: straw2analysis
|
# display_name: straw2analysis
|
||||||
# language: python
|
# language: python
|
||||||
# name: straw2analysis
|
# name: straw2analysis
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
# from IPython.core.interactiveshell import InteractiveShell
|
# %matplotlib inline
|
||||||
from pathlib import Path
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
# matplotlib inline
|
import numpy as np
|
||||||
# import os
|
import matplotlib.pyplot as plt
|
||||||
# import sys
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from machine_learning.helper import (
|
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||||
impute_encode_categorical_features,
|
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
|
||||||
prepare_cross_validator,
|
from sklearn.dummy import DummyClassifier
|
||||||
prepare_sklearn_data_format,
|
from sklearn.impute import SimpleImputer
|
||||||
run_all_classification_models,
|
|
||||||
)
|
|
||||||
|
|
||||||
# InteractiveShell.ast_node_interactivity = "all"
|
from lightgbm import LGBMClassifier
|
||||||
|
import xgboost as xg
|
||||||
|
from IPython.core.interactiveshell import InteractiveShell
|
||||||
|
InteractiveShell.ast_node_interactivity = "all"
|
||||||
|
|
||||||
|
nb_dir = os.path.split(os.getcwd())[0]
|
||||||
|
if nb_dir not in sys.path:
|
||||||
|
sys.path.append(nb_dir)
|
||||||
|
|
||||||
|
import machine_learning.helper
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# # RAPIDS models
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## Set script's parameters
|
||||||
#
|
#
|
||||||
# nb_dir = os.path.split(os.getcwd())[0]
|
|
||||||
# if nb_dir not in sys.path:
|
|
||||||
# sys.path.append(nb_dir)
|
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||||
|
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
||||||
|
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||||
|
undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
|
||||||
|
|
||||||
# %%
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
|
||||||
# Cross-validation method (could be regarded as a hyperparameter)
|
# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
|
||||||
print("CV_METHOD: " + CV_METHOD)
|
|
||||||
N_SL = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
|
||||||
UNDERSAMPLING = False
|
|
||||||
# (bool) If True this will train and test data on balanced dataset
|
|
||||||
# (using undersampling method)
|
|
||||||
|
|
||||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||||
|
model_input.set_index(index_columns, inplace=True)
|
||||||
|
model_input['target'].value_counts()
|
||||||
|
|
||||||
SEGMENT_TYPE = "period"
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
|
||||||
SEGMENT_LENGTH = "30_minutes_before"
|
|
||||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
|
||||||
TARGET_VARIABLE = "JCQ_job_control"
|
|
||||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
|
||||||
|
|
||||||
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
|
||||||
TARGET_VARIABLE += "_"
|
|
||||||
TARGET_VARIABLE += SEGMENT_TYPE
|
|
||||||
|
|
||||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
|
||||||
|
|
||||||
model_input = pd.read_csv(PATH_FULL)
|
|
||||||
|
|
||||||
if SEGMENT_LENGTH == "daily":
|
|
||||||
DAY_LENGTH = "daily" # or "working"
|
|
||||||
print(DAY_LENGTH)
|
|
||||||
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
|
||||||
|
|
||||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
|
||||||
model_input["target"].value_counts()
|
|
||||||
|
|
||||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
|
||||||
# bins = [-10, 0, 10] # bins for z-scored targets
|
# bins = [-10, 0, 10] # bins for z-scored targets
|
||||||
BINS = [-1, 0, 4] # bins for stressfulness (0-4) target
|
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||||
print("BINS: ", BINS)
|
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
|
||||||
model_input["target"], edges = pd.cut(
|
model_input['target'].value_counts(), edges
|
||||||
model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
|
# model_input = model_input[model_input['target'] != "medium"]
|
||||||
) # ['low', 'medium', 'high']
|
model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||||
print(model_input["target"].value_counts())
|
|
||||||
REMOVE_MEDIUM = True
|
|
||||||
if ("medium" in model_input["target"]) and REMOVE_MEDIUM:
|
|
||||||
model_input = model_input[model_input["target"] != "medium"]
|
|
||||||
model_input["target"] = (
|
|
||||||
model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
model_input["target"] = model_input["target"].map(
|
|
||||||
{"low": 0, "medium": 1, "high": 2}
|
|
||||||
)
|
|
||||||
print(model_input["target"].value_counts())
|
|
||||||
|
|
||||||
|
model_input['target'].value_counts()
|
||||||
|
|
||||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
# UnderSampling
|
# UnderSampling
|
||||||
if UNDERSAMPLING:
|
if undersampling:
|
||||||
no_stress = model_input[model_input["target"] == 0]
|
no_stress = model_input[model_input['target'] == 0]
|
||||||
stress = model_input[model_input["target"] == 1]
|
stress = model_input[model_input['target'] == 1]
|
||||||
|
|
||||||
no_stress = no_stress.sample(n=len(stress))
|
no_stress = no_stress.sample(n=len(stress))
|
||||||
model_input = pd.concat([stress,no_stress], axis=0)
|
model_input = pd.concat([stress,no_stress], axis=0)
|
||||||
|
|
||||||
|
# model_input_new = pd.DataFrame(columns=model_input.columns)
|
||||||
|
# for pid in model_input["pid"].unique():
|
||||||
|
# stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
|
||||||
|
# no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
|
||||||
|
# if (len(stress) == 0):
|
||||||
|
# continue
|
||||||
|
# if (len(no_stress) == 0):
|
||||||
|
# continue
|
||||||
|
# model_input_new = pd.concat([model_input_new, stress], axis=0)
|
||||||
|
|
||||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
# no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
|
||||||
model_input_encoded = impute_encode_categorical_features(model_input)
|
# # In case there are more stress samples than no_stress, take all instances of no_stress.
|
||||||
# %%
|
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
||||||
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
# model_input = model_input_new
|
||||||
model_input_encoded, CV_METHOD
|
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
||||||
|
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
if cv_method_str == 'half_logo':
|
||||||
|
model_input['pid_index'] = model_input.groupby('pid').cumcount()
|
||||||
|
model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
|
||||||
|
|
||||||
|
model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
|
||||||
|
model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
|
||||||
|
|
||||||
|
data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
|
||||||
|
else:
|
||||||
|
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
|
||||||
|
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||||
|
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||||
|
categorical_feature_colnames += additional_categorical_features
|
||||||
|
|
||||||
|
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||||
|
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||||
|
|
||||||
|
# fillna with mode
|
||||||
|
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||||
|
|
||||||
|
# one-hot encoding
|
||||||
|
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||||
|
if not categorical_features.empty:
|
||||||
|
categorical_features = pd.get_dummies(categorical_features)
|
||||||
|
|
||||||
|
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
||||||
|
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||||
|
train_x.dtypes
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
|
||||||
|
if cv_method_str == 'logo' or cv_method_str == 'half_logo':
|
||||||
|
cv_method = LeaveOneGroupOut()
|
||||||
|
cv_method.get_n_splits(
|
||||||
|
train_x,
|
||||||
|
data_y,
|
||||||
|
groups=data_groups,
|
||||||
)
|
)
|
||||||
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
|
||||||
|
|
||||||
# %%
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
data_y.head()
|
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||||
|
|
||||||
# %%
|
# %% [markdown]
|
||||||
data_y.tail()
|
# ### Baseline: Dummy Classifier (most frequent)
|
||||||
# %%
|
|
||||||
data_y.shape
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||||
# %%
|
dummy_class = DummyClassifier(strategy="most_frequent")
|
||||||
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
|
|
||||||
# %%
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
dummy_classifier = cross_validate(
|
||||||
path_output_full = PATH_OUTPUT / (
|
dummy_class,
|
||||||
TARGET_VARIABLE
|
X=imputer.fit_transform(train_x),
|
||||||
+ "_"
|
y=data_y,
|
||||||
+ SEGMENT_LENGTH
|
groups=data_groups,
|
||||||
+ "_classification"
|
cv=cv_method,
|
||||||
+ str(BINS)
|
n_jobs=-1,
|
||||||
+ "_"
|
error_score='raise',
|
||||||
+ CV_METHOD
|
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||||
+ ".csv"
|
|
||||||
)
|
)
|
||||||
scores.to_csv(path_output_full, index=False)
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
|
||||||
|
print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
|
||||||
|
print("Precision", np.mean(dummy_classifier['test_precision']))
|
||||||
|
print("Recall", np.mean(dummy_classifier['test_recall']))
|
||||||
|
print("F1", np.mean(dummy_classifier['test_f1']))
|
||||||
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
# %% [markdown] nteract={"transient": {"deleting": false}}
|
||||||
|
# ### All models
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||||
|
final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||||
|
# %%
|
||||||
|
final_scores.index.name = "metric"
|
||||||
|
final_scores = final_scores.set_index(["method", final_scores.index])
|
||||||
|
final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Logistic Regression
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
logistic_regression = linear_model.LogisticRegression()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
log_reg_scores = cross_validate(
|
||||||
|
logistic_regression,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cv_method,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||||
|
)
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
|
||||||
|
print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
|
||||||
|
print("Precision", np.mean(log_reg_scores['test_precision']))
|
||||||
|
print("Recall", np.mean(log_reg_scores['test_recall']))
|
||||||
|
print("F1", np.mean(log_reg_scores['test_f1']))
|
||||||
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Support Vector Machine
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
svc = svm.SVC()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
svc_scores = cross_validate(
|
||||||
|
svc,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cv_method,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||||
|
)
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
|
||||||
|
print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
|
||||||
|
print("Precision", np.mean(svc_scores['test_precision']))
|
||||||
|
print("Recall", np.mean(svc_scores['test_recall']))
|
||||||
|
print("F1", np.mean(svc_scores['test_f1']))
|
||||||
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Gaussian Naive Bayes
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
gaussian_nb = naive_bayes.GaussianNB()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
gaussian_nb_scores = cross_validate(
|
||||||
|
gaussian_nb,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cv_method,
|
||||||
|
n_jobs=-1,
|
||||||
|
error_score='raise',
|
||||||
|
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||||
|
)
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
|
||||||
|
print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
|
||||||
|
print("Precision", np.mean(gaussian_nb_scores['test_precision']))
|
||||||
|
print("Recall", np.mean(gaussian_nb_scores['test_recall']))
|
||||||
|
print("F1", np.mean(gaussian_nb_scores['test_f1']))
|
||||||
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Stochastic Gradient Descent Classifier
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
sgdc = linear_model.SGDClassifier()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
sgdc_scores = cross_validate(
|
||||||
|
sgdc,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cv_method,
|
||||||
|
n_jobs=-1,
|
||||||
|
error_score='raise',
|
||||||
|
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||||
|
)
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
|
||||||
|
print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
|
||||||
|
print("Precision", np.mean(sgdc_scores['test_precision']))
|
||||||
|
print("Recall", np.mean(sgdc_scores['test_recall']))
|
||||||
|
print("F1", np.mean(sgdc_scores['test_f1']))
|
||||||
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### K-nearest neighbors
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
knn = neighbors.KNeighborsClassifier()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
knn_scores = cross_validate(
|
||||||
|
knn,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cv_method,
|
||||||
|
n_jobs=-1,
|
||||||
|
error_score='raise',
|
||||||
|
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||||
|
)
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
|
||||||
|
print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
|
||||||
|
print("Precision", np.mean(knn_scores['test_precision']))
|
||||||
|
print("Recall", np.mean(knn_scores['test_recall']))
|
||||||
|
print("F1", np.mean(knn_scores['test_f1']))
|
||||||
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Decision Tree
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
dtree = tree.DecisionTreeClassifier()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
dtree_scores = cross_validate(
|
||||||
|
dtree,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cv_method,
|
||||||
|
n_jobs=-1,
|
||||||
|
error_score='raise',
|
||||||
|
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||||
|
)
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
|
||||||
|
print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
|
||||||
|
print("Precision", np.mean(dtree_scores['test_precision']))
|
||||||
|
print("Recall", np.mean(dtree_scores['test_recall']))
|
||||||
|
print("F1", np.mean(dtree_scores['test_f1']))
|
||||||
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Random Forest Classifier
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
rfc = ensemble.RandomForestClassifier()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
rfc_scores = cross_validate(
|
||||||
|
rfc,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cv_method,
|
||||||
|
n_jobs=-1,
|
||||||
|
error_score='raise',
|
||||||
|
scoring=('accuracy', 'precision', 'recall', 'f1'),
|
||||||
|
return_estimator=True
|
||||||
|
)
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
|
||||||
|
print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
|
||||||
|
print("Precision", np.mean(rfc_scores['test_precision']))
|
||||||
|
print("Recall", np.mean(rfc_scores['test_recall']))
|
||||||
|
print("F1", np.mean(rfc_scores['test_f1']))
|
||||||
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Feature importance (RFC)
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
rfc_es_fimp = pd.DataFrame(columns=list(train_x.columns))
|
||||||
|
for idx, estimator in enumerate(rfc_scores['estimator']):
|
||||||
|
feature_importances = pd.DataFrame(estimator.feature_importances_,
|
||||||
|
index = list(train_x.columns),
|
||||||
|
columns=['importance'])
|
||||||
|
# print("\nFeatures sorted by their score for estimator {}:".format(idx))
|
||||||
|
# print(feature_importances.sort_values('importance', ascending=False).head(10))
|
||||||
|
rfc_es_fimp = pd.concat([rfc_es_fimp, feature_importances]).groupby(level=0).mean()
|
||||||
|
|
||||||
|
pd.set_option('display.max_rows', 100)
|
||||||
|
print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
|
||||||
|
|
||||||
|
rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
|
||||||
|
|
||||||
|
rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
|
||||||
|
|
||||||
|
train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Gradient Boosting Classifier
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
gbc = ensemble.GradientBoostingClassifier()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
gbc_scores = cross_validate(
|
||||||
|
gbc,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cv_method,
|
||||||
|
n_jobs=-1,
|
||||||
|
error_score='raise',
|
||||||
|
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||||
|
)
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
|
||||||
|
print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
|
||||||
|
print("Precision", np.mean(gbc_scores['test_precision']))
|
||||||
|
print("Recall", np.mean(gbc_scores['test_recall']))
|
||||||
|
print("F1", np.mean(gbc_scores['test_f1']))
|
||||||
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### LGBM Classifier
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
lgbm = LGBMClassifier()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
lgbm_scores = cross_validate(
|
||||||
|
lgbm,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cv_method,
|
||||||
|
n_jobs=-1,
|
||||||
|
error_score='raise',
|
||||||
|
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||||
|
)
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
|
||||||
|
print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
|
||||||
|
print("Precision", np.mean(lgbm_scores['test_precision']))
|
||||||
|
print("Recall", np.mean(lgbm_scores['test_recall']))
|
||||||
|
print("F1", np.mean(lgbm_scores['test_f1']))
|
||||||
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### XGBoost Classifier
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
xgb_classifier = xg.sklearn.XGBClassifier()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
xgb_classifier_scores = cross_validate(
|
||||||
|
xgb_classifier,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=cv_method,
|
||||||
|
n_jobs=-1,
|
||||||
|
error_score='raise',
|
||||||
|
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||||
|
)
|
||||||
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||||
|
print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
|
||||||
|
print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
|
||||||
|
print("Precision", np.mean(xgb_classifier_scores['test_precision']))
|
||||||
|
print("Recall", np.mean(xgb_classifier_scores['test_recall']))
|
||||||
|
print("F1", np.mean(xgb_classifier_scores['test_f1']))
|
||||||
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||||
|
|
||||||
|
|
|
@ -1,177 +0,0 @@
|
||||||
# ---
|
|
||||||
# jupyter:
|
|
||||||
# jupytext:
|
|
||||||
# formats: ipynb,py:percent
|
|
||||||
# text_representation:
|
|
||||||
# extension: .py
|
|
||||||
# format_name: percent
|
|
||||||
# format_version: '1.3'
|
|
||||||
# jupytext_version: 1.14.5
|
|
||||||
# kernelspec:
|
|
||||||
# display_name: straw2analysis
|
|
||||||
# language: python
|
|
||||||
# name: straw2analysis
|
|
||||||
# ---
|
|
||||||
|
|
||||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import seaborn as sns
|
|
||||||
from sklearn.decomposition import PCA
|
|
||||||
|
|
||||||
from machine_learning.helper import (
|
|
||||||
impute_encode_categorical_features,
|
|
||||||
prepare_cross_validator,
|
|
||||||
prepare_sklearn_data_format,
|
|
||||||
run_all_classification_models,
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
|
||||||
# Cross-validation method (could be regarded as a hyperparameter)
|
|
||||||
print("CV_METHOD: " + CV_METHOD)
|
|
||||||
N_SL = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
|
||||||
UNDERSAMPLING = False
|
|
||||||
# (bool) If True this will train and test data on balanced dataset
|
|
||||||
# (using undersampling method)
|
|
||||||
|
|
||||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
|
||||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
|
||||||
|
|
||||||
SEGMENT_TYPE = "period"
|
|
||||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
|
||||||
SEGMENT_LENGTH = "30_minutes_before"
|
|
||||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
|
||||||
|
|
||||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"
|
|
||||||
|
|
||||||
all_features_with_baseline = pd.read_csv(PATH_FULL)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
TARGETS = [
|
|
||||||
"PANAS_negative_affect_mean",
|
|
||||||
"PANAS_positive_affect_mean",
|
|
||||||
"JCQ_job_demand_mean",
|
|
||||||
"JCQ_job_control_mean",
|
|
||||||
"appraisal_stressfulness_period_mean",
|
|
||||||
]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
all_features_cleaned = pd.DataFrame()
|
|
||||||
for target in TARGETS:
|
|
||||||
PATH_FULL = (
|
|
||||||
PATH_BASE
|
|
||||||
/ SEGMENT_LENGTH
|
|
||||||
/ "features"
|
|
||||||
/ ("all_sensor_features_cleaned_straw_py_(" + target + ").csv")
|
|
||||||
)
|
|
||||||
current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
|
|
||||||
if all_features_cleaned.empty:
|
|
||||||
all_features_cleaned = current_features
|
|
||||||
else:
|
|
||||||
all_features_cleaned = all_features_cleaned.join(
|
|
||||||
current_features[("phone_esm_straw_" + target)],
|
|
||||||
how="inner",
|
|
||||||
rsuffix="_" + target,
|
|
||||||
)
|
|
||||||
print(all_features_cleaned.shape)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
pca = PCA(n_components=1)
|
|
||||||
TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]
|
|
||||||
pca.fit(all_features_cleaned[TARGETS_PREFIXED])
|
|
||||||
print(pca.explained_variance_ratio_)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
|
|
||||||
model_input["target"] = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])
|
|
||||||
|
|
||||||
# %%
|
|
||||||
sns.histplot(data=model_input, x="target")
|
|
||||||
|
|
||||||
# %%
|
|
||||||
model_input.target.quantile(0.6)
|
|
||||||
|
|
||||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
|
||||||
# bins = [-10, 0, 10] # bins for z-scored targets
|
|
||||||
BINS = [-10, 0, 10] # bins for stressfulness (0-4) target
|
|
||||||
print("BINS: ", BINS)
|
|
||||||
model_input["target"], edges = pd.cut(
|
|
||||||
model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
|
|
||||||
) # ['low', 'medium', 'high']
|
|
||||||
print(model_input["target"].value_counts())
|
|
||||||
REMOVE_MEDIUM = True
|
|
||||||
if REMOVE_MEDIUM:
|
|
||||||
if "medium" in model_input["target"]:
|
|
||||||
model_input = model_input[model_input["target"] != "medium"]
|
|
||||||
model_input["target"] = (
|
|
||||||
model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
model_input["target"] = model_input["target"].map(
|
|
||||||
{"low": 0, "medium": 1, "high": 2}
|
|
||||||
)
|
|
||||||
print(model_input["target"].value_counts())
|
|
||||||
|
|
||||||
|
|
||||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
|
||||||
# UnderSampling
|
|
||||||
if UNDERSAMPLING:
|
|
||||||
no_stress = model_input[model_input["target"] == 0]
|
|
||||||
stress = model_input[model_input["target"] == 1]
|
|
||||||
|
|
||||||
no_stress = no_stress.sample(n=len(stress))
|
|
||||||
model_input = pd.concat([stress, no_stress], axis=0)
|
|
||||||
|
|
||||||
|
|
||||||
# %%
|
|
||||||
TARGET_VARIABLE = "PANAS_negative_affect"
|
|
||||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
|
||||||
|
|
||||||
PATH_FULL_HELP = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
|
||||||
|
|
||||||
model_input_with_baseline = pd.read_csv(PATH_FULL_HELP, index_col="local_segment")
|
|
||||||
|
|
||||||
# %%
|
|
||||||
baseline_col_names = [
|
|
||||||
col for col in model_input_with_baseline.columns if col not in model_input.columns
|
|
||||||
]
|
|
||||||
print(baseline_col_names)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
model_input = model_input.join(
|
|
||||||
model_input_with_baseline[baseline_col_names], how="left"
|
|
||||||
)
|
|
||||||
model_input.reset_index(inplace=True)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
model_input_encoded = impute_encode_categorical_features(model_input)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
|
||||||
model_input_encoded, CV_METHOD
|
|
||||||
)
|
|
||||||
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
data_y.head()
|
|
||||||
|
|
||||||
# %%
|
|
||||||
data_y.tail()
|
|
||||||
# %%
|
|
||||||
data_y.shape
|
|
||||||
# %%
|
|
||||||
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
|
|
||||||
# %%
|
|
||||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
|
||||||
path_output_full = PATH_OUTPUT / (
|
|
||||||
"composite_"
|
|
||||||
+ SEGMENT_LENGTH
|
|
||||||
+ "_classification"
|
|
||||||
+ str(BINS)
|
|
||||||
+ "_"
|
|
||||||
+ CV_METHOD
|
|
||||||
+ ".csv"
|
|
||||||
)
|
|
||||||
scores.to_csv(path_output_full, index=False)
|
|
|
@ -6,7 +6,7 @@
|
||||||
# extension: .py
|
# extension: .py
|
||||||
# format_name: percent
|
# format_name: percent
|
||||||
# format_version: '1.3'
|
# format_version: '1.3'
|
||||||
# jupytext_version: 1.14.5
|
# jupytext_version: 1.13.0
|
||||||
# kernelspec:
|
# kernelspec:
|
||||||
# display_name: straw2analysis
|
# display_name: straw2analysis
|
||||||
# language: python
|
# language: python
|
||||||
|
@ -14,85 +14,80 @@
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
from pathlib import Path
|
# %matplotlib inline
|
||||||
|
import datetime
|
||||||
|
import importlib
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.cluster import KMeans
|
import seaborn as sns
|
||||||
from sklearn.impute import SimpleImputer
|
from scipy import stats
|
||||||
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_validate
|
|
||||||
|
|
||||||
|
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
|
||||||
|
from sklearn.impute import SimpleImputer
|
||||||
|
|
||||||
|
from sklearn.dummy import DummyClassifier
|
||||||
|
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||||
|
import xgboost as xg
|
||||||
|
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
|
||||||
|
from IPython.core.interactiveshell import InteractiveShell
|
||||||
|
InteractiveShell.ast_node_interactivity = "all"
|
||||||
|
|
||||||
|
nb_dir = os.path.split(os.getcwd())[0]
|
||||||
|
if nb_dir not in sys.path:
|
||||||
|
sys.path.append(nb_dir)
|
||||||
|
|
||||||
|
import machine_learning.labels
|
||||||
|
import machine_learning.model
|
||||||
from machine_learning.classification_models import ClassificationModels
|
from machine_learning.classification_models import ClassificationModels
|
||||||
|
|
||||||
# %%
|
# %% [markdown]
|
||||||
|
# # RAPIDS models
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
# ## Set script's parameters
|
# ## Set script's parameters
|
||||||
N_CLUSTERS = 4 # Number of clusters (could be regarded as a hyperparameter)
|
n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
|
||||||
CV_METHOD = "logo" # logo, halflogo, 5kfold
|
cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
||||||
# Cross-validation method (could be regarded as a hyperparameter)
|
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||||
N_SL = 1 # Number of largest/smallest accuracies (of particular CV) outputs
|
|
||||||
|
|
||||||
# %%
|
|
||||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
|
||||||
|
|
||||||
SEGMENT_TYPE = "period"
|
|
||||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
|
||||||
SEGMENT_LENGTH = "30_minutes_before"
|
|
||||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
|
||||||
TARGET_VARIABLE = "appraisal_stressfulness"
|
|
||||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
|
||||||
|
|
||||||
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
|
||||||
TARGET_VARIABLE += "_"
|
|
||||||
TARGET_VARIABLE += SEGMENT_TYPE
|
|
||||||
|
|
||||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
|
||||||
|
|
||||||
model_input = pd.read_csv(PATH_FULL)
|
|
||||||
|
|
||||||
if SEGMENT_LENGTH == "daily":
|
|
||||||
DAY_LENGTH = "daily" # or "working"
|
|
||||||
print(DAY_LENGTH)
|
|
||||||
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
index_columns = [
|
model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
|
||||||
"local_segment",
|
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||||
"local_segment_label",
|
|
||||||
"local_segment_start_datetime",
|
|
||||||
"local_segment_end_datetime",
|
|
||||||
]
|
|
||||||
|
|
||||||
CLUST_COL = "limesurvey_demand_control_ratio_quartile"
|
clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
|
||||||
print("CLUST_COL: " + CLUST_COL)
|
|
||||||
|
|
||||||
BINS = [-1, 0, 4]
|
model_input.columns[list(model_input.columns).index('age'):-1]
|
||||||
print("BINS: " + str(BINS))
|
|
||||||
|
|
||||||
model_input[CLUST_COL].describe()
|
lime_cols = [col for col in model_input if col.startswith('limesurvey')]
|
||||||
|
lime_cols
|
||||||
|
lime_col = 'limesurvey_demand_control_ratio_quartile'
|
||||||
|
clust_col = lime_col
|
||||||
|
|
||||||
|
model_input[clust_col].describe()
|
||||||
|
|
||||||
# %%
|
|
||||||
model_input["target"].value_counts()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
|
|
||||||
# Filter-out outlier rows by clust_col
|
# Filter-out outlier rows by clust_col
|
||||||
#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
||||||
|
|
||||||
uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
|
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
|
||||||
uniq = uniq.dropna()
|
uniq = uniq.dropna()
|
||||||
plt.bar(uniq["pid"], uniq[CLUST_COL])
|
plt.bar(uniq['pid'], uniq[clust_col])
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
# Get clusters by cluster col & and merge the clusters to main df
|
# Get clusters by cluster col & and merge the clusters to main df
|
||||||
km = KMeans(n_clusters=N_CLUSTERS).fit_predict(uniq.set_index("pid"))
|
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
|
||||||
np.unique(km, return_counts=True)
|
np.unique(km, return_counts=True)
|
||||||
uniq["cluster"] = km
|
uniq['cluster'] = km
|
||||||
|
uniq
|
||||||
|
|
||||||
model_input = model_input.merge(uniq[["pid", "cluster"]])
|
model_input = model_input.merge(uniq[['pid', 'cluster']])
|
||||||
|
|
||||||
# %%
|
|
||||||
model_input[["cluster", "target"]].value_counts().sort_index()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
model_input.set_index(index_columns, inplace=True)
|
model_input.set_index(index_columns, inplace=True)
|
||||||
|
@ -103,56 +98,31 @@ cm = ClassificationModels()
|
||||||
cmodels = cm.get_cmodels()
|
cmodels = cm.get_cmodels()
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
for k in range(N_CLUSTERS):
|
for k in range(n_clusters):
|
||||||
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
||||||
model_input_subset.loc[:, "target"] = pd.cut(
|
bins = [-10, -1, 1, 10] # bins for z-scored targets
|
||||||
model_input_subset.loc[:, "target"],
|
model_input_subset.loc[:, 'target'] = \
|
||||||
bins=BINS,
|
pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']
|
||||||
labels=["low", "high"],
|
model_input_subset['target'].value_counts()
|
||||||
right=True,
|
model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
|
||||||
) # ['low', 'medium', 'high']
|
model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||||
model_input_subset["target"].value_counts()
|
|
||||||
# model_input_subset = model_input_subset[model_input_subset["target"] != "medium"]
|
|
||||||
model_input_subset["target"] = (
|
|
||||||
model_input_subset["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
|
||||||
)
|
|
||||||
|
|
||||||
print(model_input_subset["target"].value_counts())
|
model_input_subset['target'].value_counts()
|
||||||
|
|
||||||
if CV_METHOD == "half_logo":
|
if cv_method_str == 'half_logo':
|
||||||
model_input_subset["pid_index"] = model_input_subset.groupby("pid").cumcount()
|
model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
|
||||||
model_input_subset["pid_count"] = model_input_subset.groupby("pid")[
|
model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
|
||||||
"pid"
|
|
||||||
].transform("count")
|
|
||||||
|
|
||||||
model_input_subset["pid_index"] = (
|
model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
|
||||||
model_input_subset["pid_index"] / model_input_subset["pid_count"] + 1
|
model_input_subset["pid_half"] = model_input_subset["pid"] + "_" + model_input_subset["pid_index"].astype(int).astype(str)
|
||||||
).round()
|
|
||||||
model_input_subset["pid_half"] = (
|
|
||||||
model_input_subset["pid"]
|
|
||||||
+ "_"
|
|
||||||
+ model_input_subset["pid_index"].astype(int).astype(str)
|
|
||||||
)
|
|
||||||
|
|
||||||
data_x, data_y, data_groups = (
|
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"]
|
||||||
model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
|
|
||||||
model_input_subset["target"],
|
|
||||||
model_input_subset["pid_half"],
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
data_x, data_y, data_groups = (
|
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]
|
||||||
model_input_subset.drop(["target", "pid"], axis=1),
|
|
||||||
model_input_subset["target"],
|
|
||||||
model_input_subset["pid"],
|
|
||||||
)
|
|
||||||
|
|
||||||
# Treat categorical features
|
# Treat categorical features
|
||||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||||
additional_categorical_features = [
|
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||||
col
|
|
||||||
for col in data_x.columns
|
|
||||||
if "mostcommonactivity" in col or "homelabel" in col
|
|
||||||
]
|
|
||||||
categorical_feature_colnames += additional_categorical_features
|
categorical_feature_colnames += additional_categorical_features
|
||||||
|
|
||||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||||
|
@ -162,9 +132,7 @@ for k in range(N_CLUSTERS):
|
||||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||||
|
|
||||||
# one-hot encoding
|
# one-hot encoding
|
||||||
categorical_features = categorical_features.apply(
|
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||||
lambda col: col.astype("category")
|
|
||||||
)
|
|
||||||
if not categorical_features.empty:
|
if not categorical_features.empty:
|
||||||
categorical_features = pd.get_dummies(categorical_features)
|
categorical_features = pd.get_dummies(categorical_features)
|
||||||
|
|
||||||
|
@ -172,10 +140,8 @@ for k in range(N_CLUSTERS):
|
||||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||||
|
|
||||||
# Establish cv method
|
# Establish cv method
|
||||||
cv_method = StratifiedKFold(
|
cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
|
||||||
n_splits=5, shuffle=True
|
if cv_method_str == 'logo' or cv_method_str == 'half_logo':
|
||||||
) # Defaults to 5 k-folds in cross_validate method
|
|
||||||
if CV_METHOD == "logo" or CV_METHOD == "half_logo":
|
|
||||||
cv_method = LeaveOneGroupOut()
|
cv_method = LeaveOneGroupOut()
|
||||||
cv_method.get_n_splits(
|
cv_method.get_n_splits(
|
||||||
train_x,
|
train_x,
|
||||||
|
@ -183,57 +149,36 @@ for k in range(N_CLUSTERS):
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
)
|
)
|
||||||
|
|
||||||
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
|
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||||
|
|
||||||
for model_title, model in cmodels.items():
|
for model_title, model in cmodels.items():
|
||||||
|
|
||||||
classifier = cross_validate(
|
classifier = cross_validate(
|
||||||
model["model"],
|
model['model'],
|
||||||
X=imputer.fit_transform(train_x),
|
X=imputer.fit_transform(train_x),
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cv_method,
|
cv=cv_method,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
error_score="raise",
|
error_score='raise',
|
||||||
scoring=("accuracy", "precision", "recall", "f1"),
|
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||||
)
|
)
|
||||||
|
|
||||||
print("\n-------------------------------------\n")
|
print("\n-------------------------------------\n")
|
||||||
print("Current cluster:", k, end="\n")
|
print("Current cluster:", k, end="\n")
|
||||||
print("Current model:", model_title, end="\n")
|
print("Current model:", model_title, end="\n")
|
||||||
print("Acc", np.mean(classifier["test_accuracy"]))
|
print("Acc", np.mean(classifier['test_accuracy']))
|
||||||
print("Precision", np.mean(classifier["test_precision"]))
|
print("Precision", np.mean(classifier['test_precision']))
|
||||||
print("Recall", np.mean(classifier["test_recall"]))
|
print("Recall", np.mean(classifier['test_recall']))
|
||||||
print("F1", np.mean(classifier["test_f1"]))
|
print("F1", np.mean(classifier['test_f1']))
|
||||||
print(
|
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||||
f"Largest {N_SL} ACC:",
|
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl]))
|
||||||
np.sort(-np.partition(-classifier["test_accuracy"], N_SL)[:N_SL])[::-1],
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
f"Smallest {N_SL} ACC:",
|
|
||||||
np.sort(np.partition(classifier["test_accuracy"], N_SL)[:N_SL]),
|
|
||||||
)
|
|
||||||
|
|
||||||
cmodels[model_title]["metrics"][0] += np.mean(classifier["test_accuracy"])
|
cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
|
||||||
cmodels[model_title]["metrics"][1] += np.mean(classifier["test_precision"])
|
cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
|
||||||
cmodels[model_title]["metrics"][2] += np.mean(classifier["test_recall"])
|
cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall'])
|
||||||
cmodels[model_title]["metrics"][3] += np.mean(classifier["test_f1"])
|
cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
# Get overall results
|
# Get overall results
|
||||||
scores = cm.get_total_models_scores(n_clusters=N_CLUSTERS)
|
cm.get_total_models_scores(n_clusters=n_clusters)
|
||||||
|
|
||||||
# %%
|
|
||||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
|
||||||
path_output_full = PATH_OUTPUT / (
|
|
||||||
TARGET_VARIABLE
|
|
||||||
+ "_"
|
|
||||||
+ SEGMENT_LENGTH
|
|
||||||
+ "_classification_"
|
|
||||||
+ CV_METHOD
|
|
||||||
+ str(BINS)
|
|
||||||
+ "_clust_"
|
|
||||||
+ CLUST_COL
|
|
||||||
+ str(N_CLUSTERS)
|
|
||||||
+ ".csv"
|
|
||||||
)
|
|
||||||
scores.to_csv(path_output_full, index=False)
|
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
# extension: .py
|
# extension: .py
|
||||||
# format_name: percent
|
# format_name: percent
|
||||||
# format_version: '1.3'
|
# format_version: '1.3'
|
||||||
# jupytext_version: 1.14.5
|
# jupytext_version: 1.13.0
|
||||||
# kernelspec:
|
# kernelspec:
|
||||||
# display_name: straw2analysis
|
# display_name: straw2analysis
|
||||||
# language: python
|
# language: python
|
||||||
|
@ -14,83 +14,92 @@
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
from pathlib import Path
|
# %matplotlib inline
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from scipy import stats
|
from scipy import stats
|
||||||
from sklearn.cluster import KMeans
|
|
||||||
from sklearn.impute import SimpleImputer
|
|
||||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
|
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.impute import SimpleImputer
|
||||||
|
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
||||||
|
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
|
||||||
|
from IPython.core.interactiveshell import InteractiveShell
|
||||||
|
InteractiveShell.ast_node_interactivity = "all"
|
||||||
|
|
||||||
|
nb_dir = os.path.split(os.getcwd())[0]
|
||||||
|
if nb_dir not in sys.path:
|
||||||
|
sys.path.append(nb_dir)
|
||||||
|
|
||||||
from machine_learning.classification_models import ClassificationModels
|
from machine_learning.classification_models import ClassificationModels
|
||||||
from machine_learning.helper import impute_encode_categorical_features
|
|
||||||
|
# %% [markdown]
|
||||||
|
# # RAPIDS models
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# # Useful method
|
||||||
|
def treat_categorical_features(input_set):
|
||||||
|
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||||
|
additional_categorical_features = [col for col in input_set.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||||
|
categorical_feature_colnames += additional_categorical_features
|
||||||
|
|
||||||
|
categorical_features = input_set[categorical_feature_colnames].copy()
|
||||||
|
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||||
|
|
||||||
|
# fillna with mode
|
||||||
|
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||||
|
|
||||||
|
# one-hot encoding
|
||||||
|
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||||
|
if not categorical_features.empty:
|
||||||
|
categorical_features = pd.get_dummies(categorical_features)
|
||||||
|
|
||||||
|
numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
|
||||||
|
|
||||||
|
return pd.concat([numerical_features, categorical_features], axis=1)
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ## Set script's parameters
|
# ## Set script's parameters
|
||||||
#
|
|
||||||
|
|
||||||
# %%
|
|
||||||
n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
|
n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
|
||||||
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||||
|
|
||||||
# %%
|
|
||||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
|
||||||
|
|
||||||
SEGMENT_TYPE = "period"
|
|
||||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
|
||||||
SEGMENT_LENGTH = "30_minutes_before"
|
|
||||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
|
||||||
TARGET_VARIABLE = "appraisal_stressfulness"
|
|
||||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
|
||||||
|
|
||||||
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
|
||||||
TARGET_VARIABLE += "_"
|
|
||||||
TARGET_VARIABLE += SEGMENT_TYPE
|
|
||||||
|
|
||||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
|
||||||
|
|
||||||
model_input = pd.read_csv(PATH_FULL)
|
|
||||||
|
|
||||||
if SEGMENT_LENGTH == "daily":
|
|
||||||
DAY_LENGTH = "daily" # or "working"
|
|
||||||
print(DAY_LENGTH)
|
|
||||||
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
CLUST_COL = "limesurvey_demand_control_ratio"
|
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
|
||||||
print("CLUST_COL: " + CLUST_COL)
|
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||||
|
|
||||||
BINS = [-1, 0, 4]
|
clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
|
||||||
print("BINS: " + str(BINS))
|
|
||||||
|
|
||||||
index_columns = [
|
model_input.columns[list(model_input.columns).index('age'):-1]
|
||||||
"local_segment",
|
|
||||||
"local_segment_label",
|
|
||||||
"local_segment_start_datetime",
|
|
||||||
"local_segment_end_datetime",
|
|
||||||
]
|
|
||||||
|
|
||||||
model_input[CLUST_COL].describe()
|
lime_cols = [col for col in model_input if col.startswith('limesurvey')]
|
||||||
|
lime_cols
|
||||||
|
lime_col = 'limesurvey_demand_control_ratio'
|
||||||
|
clust_col = lime_col
|
||||||
|
|
||||||
|
model_input[clust_col].describe()
|
||||||
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
|
|
||||||
# Filter-out outlier rows by clust_col
|
# Filter-out outlier rows by clust_col
|
||||||
model_input = model_input[(np.abs(stats.zscore(model_input[CLUST_COL])) < 3)]
|
model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
||||||
|
|
||||||
uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
|
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
|
||||||
plt.bar(uniq["pid"], uniq[CLUST_COL])
|
plt.bar(uniq['pid'], uniq[clust_col])
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
# Get clusters by cluster col & and merge the clusters to main df
|
# Get clusters by cluster col & and merge the clusters to main df
|
||||||
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index("pid"))
|
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
|
||||||
np.unique(km, return_counts=True)
|
np.unique(km, return_counts=True)
|
||||||
uniq["cluster"] = km
|
uniq['cluster'] = km
|
||||||
print(uniq)
|
uniq
|
||||||
|
|
||||||
model_input = model_input.merge(uniq[["pid", "cluster"]])
|
model_input = model_input.merge(uniq[['pid', 'cluster']])
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
model_input.set_index(index_columns, inplace=True)
|
model_input.set_index(index_columns, inplace=True)
|
||||||
|
@ -100,58 +109,44 @@ model_input.set_index(index_columns, inplace=True)
|
||||||
cm = ClassificationModels()
|
cm = ClassificationModels()
|
||||||
cmodels = cm.get_cmodels()
|
cmodels = cm.get_cmodels()
|
||||||
|
|
||||||
# %%
|
|
||||||
model_input["target"].value_counts()
|
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
for k in range(n_clusters):
|
for k in range(n_clusters):
|
||||||
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
||||||
|
|
||||||
# Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, seperated by z-score of 0.
|
# Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, seperated by z-score of 0.
|
||||||
# model_input_subset['numerical_target'] = model_input_subset['target']
|
model_input_subset['numerical_target'] = model_input_subset['target']
|
||||||
|
bins = [-10, 0, 10] # bins for z-scored targets
|
||||||
|
model_input_subset.loc[:, 'target'] = \
|
||||||
|
pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
|
||||||
|
|
||||||
model_input_subset.loc[:, "target"] = pd.cut(
|
p15 = np.percentile(model_input_subset['numerical_target'], 15)
|
||||||
model_input_subset.loc[:, "target"], bins=BINS, labels=[0, 1], right=True
|
p85 = np.percentile(model_input_subset['numerical_target'], 85)
|
||||||
)
|
|
||||||
|
|
||||||
# p15 = np.percentile(model_input_subset['numerical_target'], 15)
|
|
||||||
# p85 = np.percentile(model_input_subset['numerical_target'], 85)
|
|
||||||
|
|
||||||
# Treat categorical features
|
# Treat categorical features
|
||||||
model_input_subset = impute_encode_categorical_features(model_input_subset)
|
model_input_subset = treat_categorical_features(model_input_subset)
|
||||||
|
|
||||||
# Split to train, validate, and test subsets
|
# Split to train, validate, and test subsets
|
||||||
# train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
|
train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
|
||||||
# test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
|
test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
|
||||||
train_set, test_set = train_test_split(
|
|
||||||
model_input_subset,
|
|
||||||
test_size=0.3,
|
|
||||||
stratify=model_input_subset["pid"],
|
|
||||||
random_state=42,
|
|
||||||
)
|
|
||||||
|
|
||||||
print(train_set["target"].value_counts())
|
train_set['target'].value_counts()
|
||||||
print(test_set["target"].value_counts())
|
test_set['target'].value_counts()
|
||||||
|
|
||||||
train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
|
train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
|
||||||
|
|
||||||
validate_x, test_x, validate_y, test_y = train_test_split(
|
validate_x, test_x, validate_y, test_y = \
|
||||||
test_set.drop(["target", "pid"], axis=1),
|
train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)
|
||||||
test_set["target"],
|
|
||||||
test_size=0.50,
|
|
||||||
random_state=42,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Impute missing values
|
# Impute missing values
|
||||||
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
|
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||||
|
|
||||||
train_x = imputer.fit_transform(train_x)
|
train_x = imputer.fit_transform(train_x)
|
||||||
validate_x = imputer.fit_transform(validate_x)
|
validate_x = imputer.fit_transform(validate_x)
|
||||||
test_x = imputer.fit_transform(test_x)
|
test_x = imputer.fit_transform(test_x)
|
||||||
|
|
||||||
for model_title, model in cmodels.items():
|
for model_title, model in cmodels.items():
|
||||||
model["model"].fit(train_x, train_y)
|
model['model'].fit(train_x, train_y)
|
||||||
y_pred = model["model"].predict(validate_x)
|
y_pred = model['model'].predict(validate_x)
|
||||||
|
|
||||||
acc = accuracy_score(validate_y, y_pred)
|
acc = accuracy_score(validate_y, y_pred)
|
||||||
prec = precision_score(validate_y, y_pred)
|
prec = precision_score(validate_y, y_pred)
|
||||||
|
@ -166,29 +161,11 @@ for k in range(n_clusters):
|
||||||
print("Recall", rec)
|
print("Recall", rec)
|
||||||
print("F1", f1)
|
print("F1", f1)
|
||||||
|
|
||||||
cmodels[model_title]["metrics"][0] += acc
|
cmodels[model_title]['metrics'][0] += acc
|
||||||
cmodels[model_title]["metrics"][1] += prec
|
cmodels[model_title]['metrics'][1] += prec
|
||||||
cmodels[model_title]["metrics"][2] += rec
|
cmodels[model_title]['metrics'][2] += rec
|
||||||
cmodels[model_title]["metrics"][3] += f1
|
cmodels[model_title]['metrics'][3] += f1
|
||||||
|
|
||||||
# %% jupyter={"source_hidden": true}
|
# %% jupyter={"source_hidden": true}
|
||||||
# Get overall results
|
# Get overall results
|
||||||
scores = cm.get_total_models_scores(n_clusters=n_clusters)
|
cm.get_total_models_scores(n_clusters=n_clusters)
|
||||||
|
|
||||||
# %%
|
|
||||||
print(scores)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
|
||||||
path_output_full = PATH_OUTPUT / (
|
|
||||||
TARGET_VARIABLE
|
|
||||||
+ "_"
|
|
||||||
+ SEGMENT_LENGTH
|
|
||||||
+ "_classification"
|
|
||||||
+ str(BINS)
|
|
||||||
+ "_CLUST_"
|
|
||||||
+ CLUST_COL
|
|
||||||
+ +str(n_clusters)
|
|
||||||
+ ".csv"
|
|
||||||
)
|
|
||||||
scores.to_csv(path_output_full, index=False)
|
|
||||||
|
|
|
@ -6,61 +6,350 @@
|
||||||
# extension: .py
|
# extension: .py
|
||||||
# format_name: percent
|
# format_name: percent
|
||||||
# format_version: '1.3'
|
# format_version: '1.3'
|
||||||
# jupytext_version: 1.14.5
|
# jupytext_version: 1.13.0
|
||||||
# kernelspec:
|
# kernelspec:
|
||||||
# display_name: straw2analysis
|
# display_name: straw2analysis
|
||||||
# language: python
|
# language: python
|
||||||
# name: straw2analysis
|
# name: straw2analysis
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
# %%
|
# %% jupyter={"source_hidden": true}
|
||||||
|
# %matplotlib inline
|
||||||
|
import datetime
|
||||||
|
import importlib
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
from machine_learning.helper import (
|
import yaml
|
||||||
impute_encode_categorical_features,
|
from pyprojroot import here
|
||||||
prepare_cross_validator,
|
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
|
||||||
prepare_sklearn_data_format,
|
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
|
||||||
run_all_regression_models,
|
from sklearn.metrics import mean_squared_error, r2_score
|
||||||
)
|
from sklearn.impute import SimpleImputer
|
||||||
|
from sklearn.dummy import DummyRegressor
|
||||||
|
import xgboost as xg
|
||||||
|
from IPython.core.interactiveshell import InteractiveShell
|
||||||
|
InteractiveShell.ast_node_interactivity = "all"
|
||||||
|
|
||||||
nb_dir = os.path.split(os.getcwd())[0]
|
nb_dir = os.path.split(os.getcwd())[0]
|
||||||
if nb_dir not in sys.path:
|
if nb_dir not in sys.path:
|
||||||
sys.path.append(nb_dir)
|
sys.path.append(nb_dir)
|
||||||
|
|
||||||
# %%
|
import machine_learning.features_sensor
|
||||||
model_input = pd.read_csv(
|
import machine_learning.labels
|
||||||
"../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
|
import machine_learning.model
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# # RAPIDS models
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ## PANAS negative affect
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||||
|
#if "pid" in model_input.columns:
|
||||||
|
# index_columns.append("pid")
|
||||||
|
model_input.set_index(index_columns, inplace=True)
|
||||||
|
|
||||||
|
cv_method = 'half_logo' # logo, half_logo, 5kfold
|
||||||
|
if cv_method == 'logo':
|
||||||
|
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
|
||||||
|
else:
|
||||||
|
model_input['pid_index'] = model_input.groupby('pid').cumcount()
|
||||||
|
model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
|
||||||
|
|
||||||
|
model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
|
||||||
|
model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
|
||||||
|
|
||||||
|
data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||||
|
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||||
|
categorical_feature_colnames += additional_categorical_features
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
# fillna with mode
|
||||||
|
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
# one-hot encoding
|
||||||
|
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||||
|
if not categorical_features.empty:
|
||||||
|
categorical_features = pd.get_dummies(categorical_features)
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
train_x.dtypes
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
logo = LeaveOneGroupOut()
|
||||||
|
logo.get_n_splits(
|
||||||
|
train_x,
|
||||||
|
data_y,
|
||||||
|
groups=data_groups,
|
||||||
)
|
)
|
||||||
|
|
||||||
# %%
|
# Defaults to 5 k folds in cross_validate method
|
||||||
model_input = model_input[model_input["local_segment"].str.contains("daily")]
|
if cv_method != 'logo' and cv_method != 'half_logo':
|
||||||
|
logo = None
|
||||||
|
|
||||||
# %%
|
# %% jupyter={"source_hidden": true}
|
||||||
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
sum(data_y.isna())
|
||||||
|
|
||||||
model_input_encoded = impute_encode_categorical_features(model_input)
|
# %% [markdown]
|
||||||
# %%
|
# ### Baseline: Dummy Regression (mean)
|
||||||
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
dummy_regr = DummyRegressor(strategy="mean")
|
||||||
model_input_encoded, CV_METHOD
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
dummy_regressor = cross_validate(
|
||||||
|
dummy_regr,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=logo,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||||
)
|
)
|
||||||
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
print("Negative Mean Squared Error", np.median(dummy_regressor['test_neg_mean_squared_error']))
|
||||||
# %%
|
print("Negative Mean Absolute Error", np.median(dummy_regressor['test_neg_mean_absolute_error']))
|
||||||
data_y.head()
|
print("Negative Root Mean Squared Error", np.median(dummy_regressor['test_neg_root_mean_squared_error']))
|
||||||
|
print("R2", np.median(dummy_regressor['test_r2']))
|
||||||
|
|
||||||
# %%
|
# %% [markdown]
|
||||||
data_y.tail()
|
# ### Linear Regression
|
||||||
|
|
||||||
# %%
|
# %% jupyter={"source_hidden": true}
|
||||||
data_y.shape
|
lin_reg_rapids = linear_model.LinearRegression()
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
|
||||||
|
|
||||||
# %%
|
# %% jupyter={"source_hidden": true}
|
||||||
scores = run_all_regression_models(data_x, data_y, data_groups, cross_validator)
|
lin_reg_scores = cross_validate(
|
||||||
|
lin_reg_rapids,
|
||||||
# %%
|
X=imputer.fit_transform(train_x),
|
||||||
scores.to_csv(
|
y=data_y,
|
||||||
"../presentation/JCQ_supervisor_support_regression_" + CV_METHOD + ".csv",
|
groups=data_groups,
|
||||||
index=False,
|
cv=logo,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||||
)
|
)
|
||||||
|
print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
|
||||||
|
print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
|
||||||
|
print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
|
||||||
|
print("R2", np.median(lin_reg_scores['test_r2']))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### XGBRegressor Linear Regression
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10)
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
xgb_reg_scores = cross_validate(
|
||||||
|
xgb_r,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=logo,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||||
|
)
|
||||||
|
print("Negative Mean Squared Error", np.median(xgb_reg_scores['test_neg_mean_squared_error']))
|
||||||
|
print("Negative Mean Absolute Error", np.median(xgb_reg_scores['test_neg_mean_absolute_error']))
|
||||||
|
print("Negative Root Mean Squared Error", np.median(xgb_reg_scores['test_neg_root_mean_squared_error']))
|
||||||
|
print("R2", np.median(xgb_reg_scores['test_r2']))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### XGBRegressor Pseudo Huber Error Regression
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10)
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
xgb_psuedo_huber_reg_scores = cross_validate(
|
||||||
|
xgb_psuedo_huber_r,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=logo,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||||
|
)
|
||||||
|
print("Negative Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error']))
|
||||||
|
print("Negative Mean Absolute Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error']))
|
||||||
|
print("Negative Root Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error']))
|
||||||
|
print("R2", np.median(xgb_psuedo_huber_reg_scores['test_r2']))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Ridge regression
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
ridge_reg = linear_model.Ridge(alpha=.5)
|
||||||
|
|
||||||
|
# %% tags=[] jupyter={"source_hidden": true}
|
||||||
|
ridge_reg_scores = cross_validate(
|
||||||
|
ridge_reg,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=logo,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||||
|
)
|
||||||
|
print("Negative Mean Squared Error", np.median(ridge_reg_scores['test_neg_mean_squared_error']))
|
||||||
|
print("Negative Mean Absolute Error", np.median(ridge_reg_scores['test_neg_mean_absolute_error']))
|
||||||
|
print("Negative Root Mean Squared Error", np.median(ridge_reg_scores['test_neg_root_mean_squared_error']))
|
||||||
|
print("R2", np.median(ridge_reg_scores['test_r2']))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Lasso
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
lasso_reg = linear_model.Lasso(alpha=0.1)
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
lasso_reg_score = cross_validate(
|
||||||
|
lasso_reg,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=logo,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||||
|
)
|
||||||
|
print("Negative Mean Squared Error", np.median(lasso_reg_score['test_neg_mean_squared_error']))
|
||||||
|
print("Negative Mean Absolute Error", np.median(lasso_reg_score['test_neg_mean_absolute_error']))
|
||||||
|
print("Negative Root Mean Squared Error", np.median(lasso_reg_score['test_neg_root_mean_squared_error']))
|
||||||
|
print("R2", np.median(lasso_reg_score['test_r2']))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Bayesian Ridge
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
bayesian_ridge_reg = linear_model.BayesianRidge()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
bayesian_ridge_reg_score = cross_validate(
|
||||||
|
bayesian_ridge_reg,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=logo,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||||
|
)
|
||||||
|
print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
|
||||||
|
print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
|
||||||
|
print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
|
||||||
|
print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### RANSAC (outlier robust regression)
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
ransac_reg = linear_model.RANSACRegressor()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
ransac_reg_scores = cross_validate(
|
||||||
|
ransac_reg,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=logo,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||||
|
)
|
||||||
|
print("Negative Mean Squared Error", np.median(ransac_reg_scores['test_neg_mean_squared_error']))
|
||||||
|
print("Negative Mean Absolute Error", np.median(ransac_reg_scores['test_neg_mean_absolute_error']))
|
||||||
|
print("Negative Root Mean Squared Error", np.median(ransac_reg_scores['test_neg_root_mean_squared_error']))
|
||||||
|
print("R2", np.median(ransac_reg_scores['test_r2']))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Support vector regression
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
svr = svm.SVR()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
svr_scores = cross_validate(
|
||||||
|
svr,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=logo,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||||
|
)
|
||||||
|
print("Negative Mean Squared Error", np.median(svr_scores['test_neg_mean_squared_error']))
|
||||||
|
print("Negative Mean Absolute Error", np.median(svr_scores['test_neg_mean_absolute_error']))
|
||||||
|
print("Negative Root Mean Squared Error", np.median(svr_scores['test_neg_root_mean_squared_error']))
|
||||||
|
print("R2", np.median(svr_scores['test_r2']))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Kernel Ridge regression
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
kridge = kernel_ridge.KernelRidge()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
kridge_scores = cross_validate(
|
||||||
|
kridge,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=logo,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||||
|
)
|
||||||
|
print("Negative Mean Squared Error", np.median(kridge_scores['test_neg_mean_squared_error']))
|
||||||
|
print("Negative Mean Absolute Error", np.median(kridge_scores['test_neg_mean_absolute_error']))
|
||||||
|
print("Negative Root Mean Squared Error", np.median(kridge_scores['test_neg_root_mean_squared_error']))
|
||||||
|
print("R2", np.median(kridge_scores['test_r2']))
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Gaussian Process Regression
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
gpr = gaussian_process.GaussianProcessRegressor()
|
||||||
|
|
||||||
|
# %% jupyter={"source_hidden": true}
|
||||||
|
|
||||||
|
gpr_scores = cross_validate(
|
||||||
|
gpr,
|
||||||
|
X=imputer.fit_transform(train_x),
|
||||||
|
y=data_y,
|
||||||
|
groups=data_groups,
|
||||||
|
cv=logo,
|
||||||
|
n_jobs=-1,
|
||||||
|
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||||
|
)
|
||||||
|
print("Negative Mean Squared Error", np.median(gpr_scores['test_neg_mean_squared_error']))
|
||||||
|
print("Negative Mean Absolute Error", np.median(gpr_scores['test_neg_mean_absolute_error']))
|
||||||
|
print("Negative Root Mean Squared Error", np.median(gpr_scores['test_neg_root_mean_squared_error']))
|
||||||
|
print("R2", np.median(gpr_scores['test_r2']))
|
||||||
|
|
||||||
|
# %%
|
||||||
|
|
|
@ -1,217 +0,0 @@
|
||||||
# ---
|
|
||||||
# jupyter:
|
|
||||||
# jupytext:
|
|
||||||
# text_representation:
|
|
||||||
# extension: .py
|
|
||||||
# format_name: percent
|
|
||||||
# format_version: '1.3'
|
|
||||||
# jupytext_version: 1.14.5
|
|
||||||
# kernelspec:
|
|
||||||
# display_name: straw2analysis
|
|
||||||
# language: python
|
|
||||||
# name: straw2analysis
|
|
||||||
# ---
|
|
||||||
|
|
||||||
# %%
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
from features.esm_JCQ import DICT_JCQ_DEMAND_CONTROL_REVERSE
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_questions = pd.read_csv(
|
|
||||||
"E:/STRAWbaseline/survey637813+question_text.csv", header=None
|
|
||||||
).T
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_questions
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(
|
|
||||||
r"\.\s", expand=True, n=1
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_questions
|
|
||||||
|
|
||||||
# %%
|
|
||||||
demand_reverse_lime_rows = (
|
|
||||||
limesurvey_questions["text"].str.startswith(" [Od mene se ne zahteva,")
|
|
||||||
| limesurvey_questions["text"].str.startswith(" [Imam dovolj časa, da končam")
|
|
||||||
| limesurvey_questions["text"].str.startswith(
|
|
||||||
" [Pri svojem delu se ne srečujem s konfliktnimi"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
control_reverse_lime_rows = limesurvey_questions["text"].str.startswith(
|
|
||||||
" [Moje delo vključuje veliko ponavljajočega"
|
|
||||||
) | limesurvey_questions["text"].str.startswith(
|
|
||||||
" [Pri svojem delu imam zelo malo svobode"
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows]
|
|
||||||
demand_reverse_lime.loc[:, "qid"] = demand_reverse_lime["code"].str.extract(
|
|
||||||
r"\[(\d+)\]"
|
|
||||||
)
|
|
||||||
control_reverse_lime = limesurvey_questions[control_reverse_lime_rows]
|
|
||||||
control_reverse_lime.loc[:, "qid"] = control_reverse_lime["code"].str.extract(
|
|
||||||
r"\[(\d+)\]"
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_questions.loc[89, "text"]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_questions[limesurvey_questions["code"].str.startswith("JobEisen")]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
demand_reverse_lime
|
|
||||||
|
|
||||||
# %%
|
|
||||||
control_reverse_lime
|
|
||||||
|
|
||||||
# %%
|
|
||||||
participant_info = pd.read_csv(
|
|
||||||
"C:/Users/junos/Documents/FWO-ARRS/Analysis/straw2analysis/rapids/data/raw/p031/participant_baseline_raw.csv",
|
|
||||||
parse_dates=["date_of_birth"],
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
participant_info_t = participant_info.T
|
|
||||||
|
|
||||||
# %%
|
|
||||||
rows_baseline = participant_info_t.index
|
|
||||||
|
|
||||||
# %%
|
|
||||||
rows_demand = rows_baseline.str.startswith("JobEisen") & ~rows_baseline.str.endswith(
|
|
||||||
"Time"
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
rows_baseline[rows_demand]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_control = (
|
|
||||||
participant_info_t[rows_demand]
|
|
||||||
.reset_index()
|
|
||||||
.rename(columns={"index": "question", 0: "score_original"})
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_control
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_control["qid"] = (
|
|
||||||
limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_control["score"] = limesurvey_control["score_original"]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_control["qid"][0]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
rows_demand_reverse = limesurvey_control["qid"].isin(
|
|
||||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
|
||||||
)
|
|
||||||
limesurvey_control.loc[rows_demand_reverse, "score"] = (
|
|
||||||
4 + 1 - limesurvey_control.loc[rows_demand_reverse, "score_original"]
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
JCQ_DEMAND = "JobEisen"
|
|
||||||
JCQ_CONTROL = "JobControle"
|
|
||||||
dict_JCQ_demand_control_reverse = {
|
|
||||||
JCQ_DEMAND: {
|
|
||||||
3: " [Od mene se ne zahteva,",
|
|
||||||
4: " [Imam dovolj časa, da končam",
|
|
||||||
5: " [Pri svojem delu se ne srečujem s konfliktnimi",
|
|
||||||
},
|
|
||||||
JCQ_CONTROL: {
|
|
||||||
2: " |Moje delo vključuje veliko ponavljajočega",
|
|
||||||
6: " [Pri svojem delu imam zelo malo svobode",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_control
|
|
||||||
|
|
||||||
# %%
|
|
||||||
test = pd.DataFrame(
|
|
||||||
data={"question": "one", "score_original": 3, "score": 3, "qid": 10}, index=[0]
|
|
||||||
)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
pd.concat([test, limesurvey_control]).reset_index()
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_control["score"].sum()
|
|
||||||
|
|
||||||
# %%
|
|
||||||
rows_demand_reverse
|
|
||||||
|
|
||||||
# %%
|
|
||||||
dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
|
|
||||||
|
|
||||||
# %%
|
|
||||||
limesurvey_control
|
|
||||||
|
|
||||||
# %%
|
|
||||||
DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
|
|
||||||
DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
|
|
||||||
|
|
||||||
JCQ_NORMS = {
|
|
||||||
"F": {
|
|
||||||
0: DEMAND_CONTROL_RATIO_MIN,
|
|
||||||
1: 0.45,
|
|
||||||
2: 0.52,
|
|
||||||
3: 0.62,
|
|
||||||
4: DEMAND_CONTROL_RATIO_MAX,
|
|
||||||
},
|
|
||||||
"M": {
|
|
||||||
0: DEMAND_CONTROL_RATIO_MIN,
|
|
||||||
1: 0.41,
|
|
||||||
2: 0.48,
|
|
||||||
3: 0.56,
|
|
||||||
4: DEMAND_CONTROL_RATIO_MAX,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
# %%
|
|
||||||
JCQ_NORMS[participant_info.loc[0, "gender"]][0]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
participant_info_t.index.str.startswith("JobControle")
|
|
||||||
|
|
||||||
# %%
|
|
||||||
columns_baseline = participant_info.columns
|
|
||||||
|
|
||||||
# %%
|
|
||||||
columns_demand = columns_baseline.str.startswith(
|
|
||||||
"JobControle"
|
|
||||||
) & ~columns_baseline.str.endswith("Time")
|
|
||||||
|
|
||||||
# %%
|
|
||||||
columns_baseline[columns_demand]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
participant_control = participant_info.loc[:, columns_demand]
|
|
||||||
|
|
||||||
# %%
|
|
||||||
participant_control["id"] = participant_control.index
|
|
||||||
|
|
||||||
# %%
|
|
||||||
participant_control
|
|
||||||
|
|
||||||
# %%
|
|
||||||
pd.wide_to_long(
|
|
||||||
participant_control,
|
|
||||||
stubnames="JobControle",
|
|
||||||
i="id",
|
|
||||||
j="qid",
|
|
||||||
sep="[",
|
|
||||||
suffix="(\\d+)]",
|
|
||||||
)
|
|
193
features/esm.py
193
features/esm.py
|
@ -20,47 +20,11 @@ ANSWER_DAY_OFF = "DayOff3421"
|
||||||
ANSWER_SET_EVENING = "DayFinishedSetEvening"
|
ANSWER_SET_EVENING = "DayFinishedSetEvening"
|
||||||
|
|
||||||
MAX_MORNING_LENGTH = 3
|
MAX_MORNING_LENGTH = 3
|
||||||
# When the participant was not yet at work at the time of the first (morning) EMA,
|
# When the participants was not yet at work at the time of the first (morning) EMA,
|
||||||
# only three items were answered.
|
# only three items were answered.
|
||||||
# Two sleep related items and one indicating NOT starting work yet.
|
# Two sleep related items and one indicating NOT starting work yet.
|
||||||
# Daytime EMAs are all longer, in fact they always consist of at least 6 items.
|
# Daytime EMAs are all longer, in fact they always consist of at least 6 items.
|
||||||
|
|
||||||
QUESTIONNAIRE_IDS = {
|
|
||||||
"sleep_quality": 1,
|
|
||||||
"PANAS_positive_affect": 8,
|
|
||||||
"PANAS_negative_affect": 9,
|
|
||||||
"JCQ_job_demand": 10,
|
|
||||||
"JCQ_job_control": 11,
|
|
||||||
"JCQ_supervisor_support": 12,
|
|
||||||
"JCQ_coworker_support": 13,
|
|
||||||
"PFITS_supervisor": 14,
|
|
||||||
"PFITS_coworkers": 15,
|
|
||||||
"UWES_vigor": 16,
|
|
||||||
"UWES_dedication": 17,
|
|
||||||
"UWES_absorption": 18,
|
|
||||||
"COPE_active": 19,
|
|
||||||
"COPE_support": 20,
|
|
||||||
"COPE_emotions": 21,
|
|
||||||
"balance_life_work": 22,
|
|
||||||
"balance_work_life": 23,
|
|
||||||
"recovery_experience_detachment": 24,
|
|
||||||
"recovery_experience_relaxation": 25,
|
|
||||||
"symptoms": 26,
|
|
||||||
"appraisal_stressfulness_event": 87,
|
|
||||||
"appraisal_threat": 88,
|
|
||||||
"appraisal_challenge": 89,
|
|
||||||
"appraisal_event_time": 90,
|
|
||||||
"appraisal_event_duration": 91,
|
|
||||||
"appraisal_event_work_related": 92,
|
|
||||||
"appraisal_stressfulness_period": 93,
|
|
||||||
"late_work": 94,
|
|
||||||
"work_hours": 95,
|
|
||||||
"left_work": 96,
|
|
||||||
"activities": 97,
|
|
||||||
"coffee_breaks": 98,
|
|
||||||
"at_work_yet": 99,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def get_esm_data(usernames: Collection) -> pd.DataFrame:
|
def get_esm_data(usernames: Collection) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
|
@ -88,8 +52,6 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame:
|
||||||
|
|
||||||
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Convert timestamps and expand JSON column.
|
|
||||||
|
|
||||||
Convert timestamps into human-readable datetimes and dates
|
Convert timestamps into human-readable datetimes and dates
|
||||||
and expand the JSON column into several Pandas DF columns.
|
and expand the JSON column into several Pandas DF columns.
|
||||||
|
|
||||||
|
@ -101,8 +63,7 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_esm_preprocessed: pd.DataFrame
|
df_esm_preprocessed: pd.DataFrame
|
||||||
A dataframe with added columns: datetime in Ljubljana timezone
|
A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
|
||||||
and all fields from ESM_JSON column.
|
|
||||||
"""
|
"""
|
||||||
df_esm = helper.get_date_from_timestamp(df_esm)
|
df_esm = helper.get_date_from_timestamp(df_esm)
|
||||||
|
|
||||||
|
@ -115,39 +76,31 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||||
def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
For each distinct EMA session, determine how the participant responded to it.
|
For each distinct EMA session, determine how the participant responded to it.
|
||||||
|
Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE
|
||||||
Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
|
|
||||||
and SESSION_STATUS_COMPLETE
|
|
||||||
|
|
||||||
This is done in three steps.
|
This is done in three steps.
|
||||||
|
|
||||||
First, the esm_status is considered.
|
First, the esm_status is considered.
|
||||||
If any of the ESMs in a session has a status *other than* "answered",
|
If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
|
||||||
then this session is taken as unfinished.
|
|
||||||
|
|
||||||
Second, the sessions which do not represent full questionnaires are identified.
|
Second, the sessions which do not represent full questionnaires are identified.
|
||||||
These are sessions where participants only marked they are finished with the day
|
These are sessions where participants only marked they are finished with the day or have not yet started working.
|
||||||
or have not yet started working.
|
|
||||||
|
|
||||||
Third, the sessions with only one item are marked with their trigger.
|
Third, the sessions with only one item are marked with their trigger.
|
||||||
We never offered questionnaires with single items,
|
We never offered questionnaires with single items, so we can be sure these are unfinished.
|
||||||
so we can be sure these are unfinished.
|
|
||||||
|
|
||||||
Finally, all sessions that remain are marked as completed.
|
Finally, all sessions that remain are marked as completed.
|
||||||
By going through different possibilities in expl_esm_adherence.ipynb,
|
By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.
|
||||||
this turned out to be a reasonable option.
|
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
df_esm_preprocessed: pd.DataFrame
|
df_esm_preprocessed: pd.DataFrame
|
||||||
A preprocessed dataframe of esm data,
|
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
||||||
which must include the session ID (esm_session).
|
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_session_counts: pd.Dataframe
|
df_session_counts: pd.Dataframe
|
||||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
|
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
|
||||||
with their statuses and the number of items.
|
|
||||||
"""
|
"""
|
||||||
sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
|
sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
|
||||||
|
|
||||||
|
@ -202,22 +155,17 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat
|
||||||
|
|
||||||
def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Classify EMA sessions into morning, workday, or evening.
|
For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening.)
|
||||||
|
|
||||||
For each EMA session, determine the time of the first user answer
|
|
||||||
and its time type (morning, workday, or evening).
|
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
df_esm_preprocessed: pd.DataFrame
|
df_esm_preprocessed: pd.DataFrame
|
||||||
A preprocessed dataframe of esm data,
|
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
||||||
which must include the session ID (esm_session).
|
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_session_time: pd.DataFrame
|
df_session_time: pd.DataFrame
|
||||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
|
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
|
||||||
with their time type and timestamp of first answer.
|
|
||||||
"""
|
"""
|
||||||
df_session_time = (
|
df_session_time = (
|
||||||
df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
|
df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
|
||||||
|
@ -231,17 +179,13 @@ def classify_sessions_by_completion_time(
|
||||||
df_esm_preprocessed: pd.DataFrame,
|
df_esm_preprocessed: pd.DataFrame,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Classify sessions and correct the time type.
|
The point of this function is to not only classify sessions by using the previously defined functions.
|
||||||
|
|
||||||
The point of this function is to not only classify sessions
|
|
||||||
by using the previously defined functions.
|
|
||||||
It also serves to "correct" the time type of some EMA sessions.
|
It also serves to "correct" the time type of some EMA sessions.
|
||||||
|
|
||||||
A morning questionnaire could seamlessly transition into a daytime questionnaire,
|
A morning questionnaire could seamlessly transition into a daytime questionnaire,
|
||||||
if the participant was already at work.
|
if the participant was already at work.
|
||||||
In this case, the "time" label changed mid-session.
|
In this case, the "time" label changed mid-session.
|
||||||
Because of the way classify_sessions_by_time works,
|
Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
|
||||||
this questionnaire was classified as "morning".
|
|
||||||
But for all intents and purposes, it can be treated as a "daytime" EMA.
|
But for all intents and purposes, it can be treated as a "daytime" EMA.
|
||||||
|
|
||||||
The way this scenario is differentiated from a true "morning" questionnaire,
|
The way this scenario is differentiated from a true "morning" questionnaire,
|
||||||
|
@ -250,16 +194,13 @@ def classify_sessions_by_completion_time(
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
df_esm_preprocessed: pd.DataFrame
|
df_esm_preprocessed: pd.DataFrame
|
||||||
A preprocessed dataframe of esm data,
|
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
||||||
which must include the session ID (esm_session).
|
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_session_counts_time: pd.DataFrame
|
df_session_counts_time: pd.DataFrame
|
||||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
|
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
|
||||||
the number of items,
|
their time type (with some morning EMAs reclassified) and timestamp of first answer.
|
||||||
their time type (with some morning EMAs reclassified)
|
|
||||||
and timestamp of first answer.
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
|
df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
|
||||||
|
@ -278,8 +219,7 @@ def classify_sessions_by_completion_time(
|
||||||
|
|
||||||
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Eliminate invalid ESM responses.
|
This function eliminates invalid ESM responses.
|
||||||
|
|
||||||
It removes unanswered ESMs and those that indicate end of work and similar.
|
It removes unanswered ESMs and those that indicate end of work and similar.
|
||||||
It also extracts a numeric answer from strings such as "4 - I strongly agree".
|
It also extracts a numeric answer from strings such as "4 - I strongly agree".
|
||||||
|
|
||||||
|
@ -316,100 +256,3 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return df_esm_clean
|
return df_esm_clean
|
||||||
|
|
||||||
|
|
||||||
def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1):
|
|
||||||
"""
|
|
||||||
Increment answers to keep in line with original scoring.
|
|
||||||
|
|
||||||
We always used 0 for the lowest value of user answer.
|
|
||||||
Some scales originally used other scoring, such as starting from 1.
|
|
||||||
This restores original scoring so that the values are comparable to references.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
df_esm_clean: pd.DataFrame
|
|
||||||
A cleaned ESM dataframe, which must also include esm_user_answer_numeric.
|
|
||||||
increment_by:
|
|
||||||
A number to add to the user answer.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
df_esm_clean: pd.DataFrame
|
|
||||||
The same df with addition of a column 'esm_user_answer_numeric'.
|
|
||||||
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
df_esm_clean = df_esm_clean.assign(
|
|
||||||
esm_user_score=lambda x: x.esm_user_answer_numeric + increment_by
|
|
||||||
)
|
|
||||||
except AttributeError as e:
|
|
||||||
print("Please, clean the dataframe first using features.esm.clean_up_esm.")
|
|
||||||
print(e)
|
|
||||||
return df_esm_clean
|
|
||||||
|
|
||||||
|
|
||||||
def reassign_question_ids(
|
|
||||||
df_esm_cleaned: pd.DataFrame, question_ids_content: dict
|
|
||||||
) -> pd.DataFrame:
|
|
||||||
"""
|
|
||||||
Fix question IDs to match their actual content.
|
|
||||||
|
|
||||||
Unfortunately, when altering the protocol to adapt to COVID pandemic,
|
|
||||||
we did not retain original question IDs.
|
|
||||||
This means that for participants before 2021, they are different
|
|
||||||
from for the rest of them.
|
|
||||||
This function searches for question IDs by matching their strings.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
df_esm_cleaned: pd.DataFrame
|
|
||||||
A cleaned up dataframe, which must also include esm_user_answer_numeric.
|
|
||||||
question_ids_content: dict
|
|
||||||
A dictionary, linking question IDs with their content ("instructions").
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
df_esm_fixed: pd.DataFrame
|
|
||||||
The same dataframe but with fixed question IDs.
|
|
||||||
"""
|
|
||||||
df_esm_unique_questions = (
|
|
||||||
df_esm_cleaned.groupby("question_id")
|
|
||||||
.esm_instructions.value_counts()
|
|
||||||
.rename()
|
|
||||||
.reset_index()
|
|
||||||
)
|
|
||||||
# Tabulate all possible answers to each question (group by question ID).
|
|
||||||
|
|
||||||
# First, check that we anticipated all esm instructions.
|
|
||||||
for q_id in question_ids_content.keys():
|
|
||||||
# Look for all questions ("instructions") occurring in the dataframe.
|
|
||||||
actual_questions = df_esm_unique_questions.loc[
|
|
||||||
df_esm_unique_questions["question_id"] == q_id,
|
|
||||||
"esm_instructions",
|
|
||||||
]
|
|
||||||
# These are all answers to a given question (by q_id).
|
|
||||||
questions_matches = actual_questions.str.startswith(
|
|
||||||
question_ids_content.get(q_id)
|
|
||||||
)
|
|
||||||
# See if they are expected, i.e. included in the dictionary.
|
|
||||||
if ~actual_questions.all():
|
|
||||||
print("One of the questions that occur in the data was undefined.")
|
|
||||||
print("This were the questions found in the data: ")
|
|
||||||
raise KeyError(actual_questions[~questions_matches])
|
|
||||||
# In case there is an unexpected answer, raise an exception.
|
|
||||||
|
|
||||||
# Next, replace question IDs.
|
|
||||||
df_esm_fixed = df_esm_cleaned.copy()
|
|
||||||
df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].apply(
|
|
||||||
lambda x: next(
|
|
||||||
(
|
|
||||||
key
|
|
||||||
for key, values in question_ids_content.items()
|
|
||||||
if x.startswith(values)
|
|
||||||
),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return df_esm_fixed
|
|
||||||
|
|
|
@ -1,125 +0,0 @@
|
||||||
COPE_ORIGINAL_MAX = 4
|
|
||||||
COPE_ORIGINAL_MIN = 1
|
|
||||||
|
|
||||||
DICT_COPE_QUESTION_IDS = {
|
|
||||||
164: (
|
|
||||||
"I took additional action to try to get rid of the problem",
|
|
||||||
"Ik deed extra mijn best om er iets aan te doen",
|
|
||||||
"Vložila sem dodaten napor, da bi rešila problem",
|
|
||||||
"Vložil sem dodaten napor, da bi rešil problem",
|
|
||||||
),
|
|
||||||
165: (
|
|
||||||
"I concentrated my efforts on doing something about it",
|
|
||||||
"Ik probeerde de situatie te verbeteren",
|
|
||||||
"Svoje sile sem usmerila v reševanje nastale situacije",
|
|
||||||
"Svoje sile sem usmeril v reševanje nastale situacije",
|
|
||||||
),
|
|
||||||
166: (
|
|
||||||
"I did what had to be done, one step at a time",
|
|
||||||
"Ik deed stap voor stap wat nodig was",
|
|
||||||
"Naredila sem, kar je bilo potrebno – korak za korakom",
|
|
||||||
"Naredil sem, kar je bilo potrebno – korak za korakom",
|
|
||||||
),
|
|
||||||
167: (
|
|
||||||
"I took direct action to get around the problem",
|
|
||||||
"Ik handelde vlug om het probleem te verhelpen",
|
|
||||||
"Nekaj sem naredila, da sem zaobšla problem",
|
|
||||||
"Nekaj sem naredil, da sem zaobšel problem",
|
|
||||||
),
|
|
||||||
168: (
|
|
||||||
"I tried to come up with a strategy about what to do",
|
|
||||||
"Ik probeerde te verzinnen wat ik er aan kon doen",
|
|
||||||
"Skušala sem najti ustrezen način za rešitev situacije",
|
|
||||||
"Skušal sem najti ustrezen način za rešitev situacije",
|
|
||||||
),
|
|
||||||
169: (
|
|
||||||
"I made a plan of action",
|
|
||||||
"Ik maakte een plan",
|
|
||||||
"Naredila sem načrt za delovanje",
|
|
||||||
"Naredil sem načrt za delovanje",
|
|
||||||
),
|
|
||||||
170: (
|
|
||||||
"I thought hard about what steps to take",
|
|
||||||
"Ik dacht hard na over wat ik moest doen",
|
|
||||||
"Dobro sem premislila, katere korake moram narediti, da rešim problem",
|
|
||||||
"Dobro sem premislil, katere korake moram narediti, da rešim problem",
|
|
||||||
),
|
|
||||||
171: (
|
|
||||||
"I thought about how I might best handle the problem",
|
|
||||||
"lk dacht na over hoe ik het probleem het best kon aanpakken",
|
|
||||||
"Razmišljala sem, kaj bi bilo najbolje narediti s problemom",
|
|
||||||
"Razmišljal sem, kaj bi bilo najbolje narediti s problemom",
|
|
||||||
),
|
|
||||||
172: (
|
|
||||||
"I asked people who have had similar experiences what they did",
|
|
||||||
"Ik vroeg aan mensen met dergelijke ervaringen hoe zij reageerden",
|
|
||||||
"Vprašala sem posameznike s podobnimi izkušnjami, kaj so storili",
|
|
||||||
"Vprašal sem posameznike s podobnimi izkušnjami, kaj so storili",
|
|
||||||
),
|
|
||||||
173: (
|
|
||||||
"I tried to get advice from someone about what to do",
|
|
||||||
"lk vroeg advies aan iemand",
|
|
||||||
"Pri drugih sem poskušala dobiti nasvet, kaj naj storim",
|
|
||||||
"Pri drugih sem poskušal dobiti nasvet, kaj naj storim",
|
|
||||||
),
|
|
||||||
174: (
|
|
||||||
"I talked to someone to find out more about the situation",
|
|
||||||
"Ik sprak met iemand om meer te weten te komen over de situatie",
|
|
||||||
"Z nekom sem se pogovorila, da bi izvedela še kaj o svojem problemu",
|
|
||||||
"Z nekom sem se pogovoril, da bi izvedel še kaj o svojem problemu",
|
|
||||||
),
|
|
||||||
175: (
|
|
||||||
"I talked to someone who could do something concrete about the problem",
|
|
||||||
"Ik sprak met iemand die iets aan het probleem kon doen",
|
|
||||||
"Pogovorila sem se s kom, ki bi lahko naredil kaj konkretnega",
|
|
||||||
"Pogovoril sem se s kom, ki bi lahko naredil kaj konkretnega",
|
|
||||||
),
|
|
||||||
176: (
|
|
||||||
"I talked to someone about how I felt",
|
|
||||||
"Ik sprak met iemand over hoe ik mij voelde",
|
|
||||||
"Z nekom sem se pogovorila o tem, kako sem se počutila",
|
|
||||||
"Z nekom sem se pogovoril o tem, kako sem se počutil",
|
|
||||||
),
|
|
||||||
177: (
|
|
||||||
"I tried to get emotional support from friends or relatives",
|
|
||||||
"Ik zocht steun bij vrienden of familie",
|
|
||||||
"Skušala sem dobiti čustveno podporo prijateljev ali sorodnikov",
|
|
||||||
"Skušal sem dobiti čustveno podporo prijateljev ali sorodnikov",
|
|
||||||
),
|
|
||||||
178: (
|
|
||||||
"I discussed my feelings with someone",
|
|
||||||
"lk besprak mijn gevoelens met iemand",
|
|
||||||
"O svojih občutkih sem se z nekom pogovorila",
|
|
||||||
"O svojih občutkih sem se z nekom pogovoril",
|
|
||||||
),
|
|
||||||
179: (
|
|
||||||
"I got sympathy and understanding from someone",
|
|
||||||
"Ik vroeg medeleven en begrip van iemand",
|
|
||||||
"Poiskala sem naklonjenost in razumevanje drugih",
|
|
||||||
"Poiskal sem naklonjenost in razumevanje drugih",
|
|
||||||
),
|
|
||||||
180: (
|
|
||||||
"I got upset and let my emotions out",
|
|
||||||
"Ik raakte van streek",
|
|
||||||
"Razburila sem se in to tudi pokazala",
|
|
||||||
"Razburil sem se in to tudi pokazal",
|
|
||||||
),
|
|
||||||
181: (
|
|
||||||
"I let my feelings out",
|
|
||||||
"Ik toonde mijn gevoelens",
|
|
||||||
"Svojim čustvom sem dala prosto pot",
|
|
||||||
"Svojim čustvom sem dal prosto pot",
|
|
||||||
),
|
|
||||||
182: (
|
|
||||||
"I felt a lot of emotional distress and I found myself expressing",
|
|
||||||
"lk liet duidelijk blijken hoe ellendig ik mij voelde",
|
|
||||||
"Doživljala sem veliko stresa in opažala, da sem čustva",
|
|
||||||
"Doživljal sem veliko stresa in opažal, da sem čustva",
|
|
||||||
),
|
|
||||||
183: (
|
|
||||||
"I got upset, and I was really aware of it",
|
|
||||||
"Ik merkte dat ik erg van streek was",
|
|
||||||
"Razburila sem se in razmišljala samo o tem",
|
|
||||||
"Razburil sem se in razmišljal samo o tem",
|
|
||||||
),
|
|
||||||
}
|
|
|
@ -1,11 +1,9 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from features.esm import increment_answers
|
|
||||||
|
|
||||||
JCQ_ORIGINAL_MAX = 4
|
JCQ_ORIGINAL_MAX = 4
|
||||||
JCQ_ORIGINAL_MIN = 1
|
JCQ_ORIGINAL_MIN = 1
|
||||||
|
|
||||||
DICT_JCQ_DEMAND_CONTROL_REVERSE = {
|
dict_JCQ_demand_control_reverse = {
|
||||||
75: (
|
75: (
|
||||||
"I was NOT asked",
|
"I was NOT asked",
|
||||||
"Men legde mij geen overdreven",
|
"Men legde mij geen overdreven",
|
||||||
|
@ -42,14 +40,10 @@ def reverse_jcq_demand_control_scoring(
|
||||||
df_esm_jcq_demand_control: pd.DataFrame,
|
df_esm_jcq_demand_control: pd.DataFrame,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Reverse JCQ demand and control answers.
|
This function recodes answers in Job content questionnaire by first incrementing them by 1,
|
||||||
|
to be in line with original (1-4) scoring.
|
||||||
This function recodes answers in Job content questionnaire
|
Then, some answers are reversed (i.e. 1 becomes 4 etc.), because the questions are negatively phrased.
|
||||||
by first incrementing them by 1, to be in line with original (1-4) scoring.
|
These answers are listed in dict_JCQ_demand_control_reverse and identified by their question ID.
|
||||||
Then, some answers are reversed (i.e. 1 becomes 4 etc.),
|
|
||||||
because the questions are negatively phrased.
|
|
||||||
These answers are listed in DICT_JCQ_DEMAND_CONTROL_REVERSE
|
|
||||||
and identified by their question ID.
|
|
||||||
However, the existing data is checked against literal phrasing of these questions
|
However, the existing data is checked against literal phrasing of these questions
|
||||||
to protect against wrong numbering of questions (differing question IDs).
|
to protect against wrong numbering of questions (differing question IDs).
|
||||||
|
|
||||||
|
@ -61,8 +55,7 @@ def reverse_jcq_demand_control_scoring(
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_esm_jcq_demand_control: pd.DataFrame
|
df_esm_jcq_demand_control: pd.DataFrame
|
||||||
The same dataframe with a column esm_user_score
|
The same dataframe with a column esm_user_score containing answers recoded and reversed.
|
||||||
containing answers recoded and reversed.
|
|
||||||
"""
|
"""
|
||||||
df_esm_jcq_demand_control_unique_answers = (
|
df_esm_jcq_demand_control_unique_answers = (
|
||||||
df_esm_jcq_demand_control.groupby("question_id")
|
df_esm_jcq_demand_control.groupby("question_id")
|
||||||
|
@ -71,7 +64,7 @@ def reverse_jcq_demand_control_scoring(
|
||||||
.reset_index()
|
.reset_index()
|
||||||
)
|
)
|
||||||
# Tabulate all possible answers to each question (group by question ID).
|
# Tabulate all possible answers to each question (group by question ID).
|
||||||
for q_id in DICT_JCQ_DEMAND_CONTROL_REVERSE.keys():
|
for q_id in dict_JCQ_demand_control_reverse.keys():
|
||||||
# Look through all answers that need to be reversed.
|
# Look through all answers that need to be reversed.
|
||||||
possible_answers = df_esm_jcq_demand_control_unique_answers.loc[
|
possible_answers = df_esm_jcq_demand_control_unique_answers.loc[
|
||||||
df_esm_jcq_demand_control_unique_answers["question_id"] == q_id,
|
df_esm_jcq_demand_control_unique_answers["question_id"] == q_id,
|
||||||
|
@ -79,7 +72,7 @@ def reverse_jcq_demand_control_scoring(
|
||||||
]
|
]
|
||||||
# These are all answers to a given question (by q_id).
|
# These are all answers to a given question (by q_id).
|
||||||
answers_matches = possible_answers.str.startswith(
|
answers_matches = possible_answers.str.startswith(
|
||||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.get(q_id)
|
dict_JCQ_demand_control_reverse.get(q_id)
|
||||||
)
|
)
|
||||||
# See if they are expected, i.e. included in the dictionary.
|
# See if they are expected, i.e. included in the dictionary.
|
||||||
if ~answers_matches.all():
|
if ~answers_matches.all():
|
||||||
|
@ -89,16 +82,18 @@ def reverse_jcq_demand_control_scoring(
|
||||||
# In case there is an unexpected answer, raise an exception.
|
# In case there is an unexpected answer, raise an exception.
|
||||||
|
|
||||||
try:
|
try:
|
||||||
df_esm_jcq_demand_control = increment_answers(df_esm_jcq_demand_control)
|
df_esm_jcq_demand_control = df_esm_jcq_demand_control.assign(
|
||||||
# Increment the original answer by 1 to keep in line
|
esm_user_score=lambda x: x.esm_user_answer_numeric + 1
|
||||||
# with traditional scoring (from JCQ_ORIGINAL_MIN to JCQ_ORIGINAL_MAX).
|
)
|
||||||
|
# Increment the original answer by 1
|
||||||
|
# to keep in line with traditional scoring (JCQ_ORIGINAL_MIN - JCQ_ORIGINAL_MAX).
|
||||||
df_esm_jcq_demand_control[
|
df_esm_jcq_demand_control[
|
||||||
df_esm_jcq_demand_control["question_id"].isin(
|
df_esm_jcq_demand_control["question_id"].isin(
|
||||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
dict_JCQ_demand_control_reverse.keys()
|
||||||
)
|
)
|
||||||
] = df_esm_jcq_demand_control[
|
] = df_esm_jcq_demand_control[
|
||||||
df_esm_jcq_demand_control["question_id"].isin(
|
df_esm_jcq_demand_control["question_id"].isin(
|
||||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
dict_JCQ_demand_control_reverse.keys()
|
||||||
)
|
)
|
||||||
].assign(
|
].assign(
|
||||||
esm_user_score=lambda x: JCQ_ORIGINAL_MAX
|
esm_user_score=lambda x: JCQ_ORIGINAL_MAX
|
||||||
|
|
|
@ -3,9 +3,6 @@ import pandas as pd
|
||||||
|
|
||||||
import features.esm
|
import features.esm
|
||||||
|
|
||||||
SAM_ORIGINAL_MAX = 5
|
|
||||||
SAM_ORIGINAL_MIN = 1
|
|
||||||
|
|
||||||
QUESTIONNAIRE_ID_SAM = {
|
QUESTIONNAIRE_ID_SAM = {
|
||||||
"event_stress": 87,
|
"event_stress": 87,
|
||||||
"event_threat": 88,
|
"event_threat": 88,
|
||||||
|
@ -23,107 +20,10 @@ GROUP_QUESTIONNAIRES_BY = [
|
||||||
"device_id",
|
"device_id",
|
||||||
"esm_session",
|
"esm_session",
|
||||||
]
|
]
|
||||||
# Each questionnaire occurs only once within each esm_session on the same device
|
# Each questionnaire occurs only once within each esm_session on the same device within the same participant.
|
||||||
# within the same participant.
|
|
||||||
|
|
||||||
|
|
||||||
DICT_SAM_QUESTION_IDS = {
|
|
||||||
87: (
|
|
||||||
"Was there a particular event that created tension in you?",
|
|
||||||
"Was er een bepaalde gebeurtenis die spanning veroorzaakte?",
|
|
||||||
"Je prišlo do kakega dogodka, ki je v vas ustvaril napetost?",
|
|
||||||
),
|
|
||||||
88: (
|
|
||||||
"Did this event make you feel anxious?",
|
|
||||||
"Voelde je je angstig door deze gebeurtenis?",
|
|
||||||
"Ste se zaradi tega dogodka počutili tesnob",
|
|
||||||
),
|
|
||||||
89: (
|
|
||||||
"Will the outcome of this event be negative?",
|
|
||||||
"Zal de uitkomst van deze gebeurtenis negatief zijn?",
|
|
||||||
"Bo izid tega dogodka negativen?",
|
|
||||||
),
|
|
||||||
90: (
|
|
||||||
"How threatening was this event?",
|
|
||||||
"Hoe bedreigend was deze gebeurtenis?",
|
|
||||||
"Kako grozeč je bil ta dogodek?",
|
|
||||||
),
|
|
||||||
91: (
|
|
||||||
"Is this going to have a negative impact on you?",
|
|
||||||
"Zal dit een negatieve impact op je hebben?",
|
|
||||||
"Ali bo to negativno vplivalo na vas?",
|
|
||||||
),
|
|
||||||
92: (
|
|
||||||
"Is this going to have a positive impact on you?",
|
|
||||||
"Zal dit een positief effect op je hebben?",
|
|
||||||
"Ali bo to pozitivno vplivalo na vas?",
|
|
||||||
),
|
|
||||||
93: (
|
|
||||||
"How eager are you to tackle this event?",
|
|
||||||
"Hoe graag wil je deze gebeurtenis aanpakken?",
|
|
||||||
"Kako zagnani ste bili",
|
|
||||||
),
|
|
||||||
94: (
|
|
||||||
"To what extent can you become a stronger person because of this event?",
|
|
||||||
"In welke mate kan je een sterkere persoon worden door deze gebeurtenis?",
|
|
||||||
"V kolikšni meri lahko zaradi tega dogodka postanete močnejša oseba?",
|
|
||||||
),
|
|
||||||
95: (
|
|
||||||
"To what extent are you excited thinking about the outcome of this event?",
|
|
||||||
"In welke mate ben je enthousiast bij de gedachte",
|
|
||||||
"V kolikšni meri vas misel na izid tega dogodka navdušuje?",
|
|
||||||
),
|
|
||||||
96: (
|
|
||||||
"At what time did this event occur?",
|
|
||||||
"Hoe laat vond deze gebeurtenis plaats?",
|
|
||||||
"Kdaj se je ta dogodek zgodil?",
|
|
||||||
),
|
|
||||||
97: (
|
|
||||||
"How long did this event last?",
|
|
||||||
"Hoe lang duurde deze gebeurtenis?",
|
|
||||||
"Kako dolgo je trajal ta dogodek?",
|
|
||||||
),
|
|
||||||
98: (
|
|
||||||
"Was/is this event work-related?",
|
|
||||||
"Was/is deze gebeurtenis werkgerelateerd?",
|
|
||||||
"Je (bil) ta dogodek povezan s službo?",
|
|
||||||
"Je bil ali je ta dogodek povezan s službo?",
|
|
||||||
),
|
|
||||||
99: (
|
|
||||||
"Did this overall period create tension in you?",
|
|
||||||
"Heeft deze globale periode spanning veroorzaakt?",
|
|
||||||
"Je to obdobje kot celota v vas ustvarilo napetost?",
|
|
||||||
"Je to celo obdobje v vas ustvarilo napetost?",
|
|
||||||
),
|
|
||||||
100: (
|
|
||||||
"To what extent do you perceive this overall period as stressful?",
|
|
||||||
"In welke mate ervaar je deze globale periode als stressvol?",
|
|
||||||
"V kolikšni meri ste to obdobje dojemali kot stresno?",
|
|
||||||
"V kolikšni meri ste celo to obdobje dojemali kot stresno?",
|
|
||||||
),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
|
def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
|
||||||
Extract information about stressful events.
|
|
||||||
|
|
||||||
Participants were asked: "Was there a particular event that created tension in you?"
|
|
||||||
Then a subset of questions related to this event followed.
|
|
||||||
This function goes through the follow-up questions one by one
|
|
||||||
and preprocesses them, so that it adds new columns to the dataframe.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
df_esm: pd.DataFrame
|
|
||||||
A raw dataframe of all ESM data.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
df_esm_events: pd.DataFrame
|
|
||||||
A cleaned up df of Stress Appraisal Measure items with additional columns.
|
|
||||||
|
|
||||||
"""
|
|
||||||
# 0. Select only questions from Stress Appraisal Measure.
|
# 0. Select only questions from Stress Appraisal Measure.
|
||||||
df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
|
df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
|
||||||
df_esm_sam = df_esm_preprocessed[
|
df_esm_sam = df_esm_preprocessed[
|
||||||
|
@ -178,8 +78,7 @@ def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
|
||||||
def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
This function calculates challenge and threat
|
This function calculates challenge and threat (two Stress Appraisal Measure subscales) means,
|
||||||
(two Stress Appraisal Measure subscales) means,
|
|
||||||
for each ESM session (within participants and devices).
|
for each ESM session (within participants and devices).
|
||||||
It creates a grouped dataframe with means in two columns.
|
It creates a grouped dataframe with means in two columns.
|
||||||
|
|
||||||
|
@ -191,8 +90,7 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
df_esm_event_threat_challenge_mean_wide: pd.DataFrame
|
df_esm_event_threat_challenge_mean_wide: pd.DataFrame
|
||||||
A dataframe of unique ESM sessions (by participants and devices)
|
A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
|
||||||
with threat and challenge means.
|
|
||||||
"""
|
"""
|
||||||
# Select only threat and challenge assessments for events
|
# Select only threat and challenge assessments for events
|
||||||
df_esm_event_threat_challenge = df_esm_sam_clean[
|
df_esm_event_threat_challenge = df_esm_sam_clean[
|
||||||
|
@ -214,8 +112,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
|
||||||
aggfunc="mean",
|
aggfunc="mean",
|
||||||
)
|
)
|
||||||
# Drop unnecessary column values.
|
# Drop unnecessary column values.
|
||||||
df_esm_event_threat_challenge_mean_wide.columns = (
|
df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
|
||||||
df_esm_event_threat_challenge_mean_wide.columns.get_level_values(1)
|
1
|
||||||
)
|
)
|
||||||
df_esm_event_threat_challenge_mean_wide.columns.name = None
|
df_esm_event_threat_challenge_mean_wide.columns.name = None
|
||||||
df_esm_event_threat_challenge_mean_wide.rename(
|
df_esm_event_threat_challenge_mean_wide.rename(
|
||||||
|
@ -291,12 +189,10 @@ def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
|
||||||
def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
This function only serves to convert the string datetime answer
|
This function only serves to convert the string datetime answer into a real datetime type.
|
||||||
into a real datetime type.
|
Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
|
||||||
Errors during this conversion are coerced, meaning that non-datetime answers
|
NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
|
||||||
are assigned Not a Time (NaT).
|
the NaTs can be interpreted to mean this.
|
||||||
NOTE: Since the only available non-datetime answer to this question was
|
|
||||||
"0 - I do not remember", the NaTs can be interpreted to mean this.
|
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
@ -312,13 +208,9 @@ def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
|
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
|
||||||
].assign(
|
].assign(
|
||||||
event_time=lambda x: pd.to_datetime(
|
event_time=lambda x: pd.to_datetime(
|
||||||
x.esm_user_answer,
|
x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
|
||||||
errors="coerce",
|
|
||||||
format="%Y-%m-%d %H:%M:%S %z",
|
|
||||||
exact=True,
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
# Example answer: 2020-09-29 00:05:00 +0200
|
|
||||||
return df_esm_event_time
|
return df_esm_event_time
|
||||||
|
|
||||||
|
|
||||||
|
@ -349,12 +241,9 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
== QUESTIONNAIRE_ID_SAM.get("event_duration")
|
== QUESTIONNAIRE_ID_SAM.get("event_duration")
|
||||||
].assign(
|
].assign(
|
||||||
event_duration=lambda x: pd.to_datetime(
|
event_duration=lambda x: pd.to_datetime(
|
||||||
x.esm_user_answer.str.slice(start=0, stop=-6),
|
x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
|
||||||
errors="coerce",
|
|
||||||
format="%Y-%m-%d %H:%M:%S",
|
|
||||||
).dt.time
|
).dt.time
|
||||||
)
|
)
|
||||||
# Example answer: 2020-09-29 00:05:00 +0200
|
|
||||||
# TODO Explore the values recorded in event_duration and possibly fix mistakes.
|
# TODO Explore the values recorded in event_duration and possibly fix mistakes.
|
||||||
# For example, participants reported setting 23:50:00 instead of 00:50:00.
|
# For example, participants reported setting 23:50:00 instead of 00:50:00.
|
||||||
|
|
||||||
|
@ -362,7 +251,7 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
# we can determine whether:
|
# we can determine whether:
|
||||||
# - this event is still going on ("1 - It is still going on")
|
# - this event is still going on ("1 - It is still going on")
|
||||||
# - the participant couldn't remember it's duration ("0 - I do not remember")
|
# - the participant couldn't remember it's duration ("0 - I do not remember")
|
||||||
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm
|
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
|
||||||
# but only the numeric types of questions and answers.
|
# but only the numeric types of questions and answers.
|
||||||
# Since this was of "datetime" type, convert these specific answers here again.
|
# Since this was of "datetime" type, convert these specific answers here again.
|
||||||
df_esm_event_duration["event_duration_info"] = np.nan
|
df_esm_event_duration["event_duration_info"] = np.nan
|
||||||
|
@ -375,5 +264,4 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||||
return df_esm_event_duration
|
return df_esm_event_duration
|
||||||
|
|
||||||
|
|
||||||
# TODO: How many questions about the stressfulness of the period were asked
|
# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
|
||||||
# and how does this relate to events?
|
|
||||||
|
|
|
@ -1,11 +1,10 @@
|
||||||
import pandas as pd
|
|
||||||
import xgboost as xg
|
|
||||||
from lightgbm import LGBMClassifier
|
|
||||||
from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
|
|
||||||
from sklearn.dummy import DummyClassifier
|
from sklearn.dummy import DummyClassifier
|
||||||
|
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||||
|
from lightgbm import LGBMClassifier
|
||||||
|
import xgboost as xg
|
||||||
|
|
||||||
|
class ClassificationModels():
|
||||||
|
|
||||||
class ClassificationModels:
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.cmodels = self.init_classification_models()
|
self.cmodels = self.init_classification_models()
|
||||||
|
|
||||||
|
@ -14,110 +13,59 @@ class ClassificationModels:
|
||||||
|
|
||||||
def init_classification_models(self):
|
def init_classification_models(self):
|
||||||
cmodels = {
|
cmodels = {
|
||||||
"dummy_classifier": {
|
'dummy_classifier': {
|
||||||
"model": DummyClassifier(strategy="most_frequent"),
|
'model': DummyClassifier(strategy="most_frequent"),
|
||||||
"metrics": [0, 0, 0, 0],
|
'metrics': [0, 0, 0, 0]
|
||||||
},
|
},
|
||||||
"logistic_regression": {
|
'logistic_regression': {
|
||||||
"model": linear_model.LogisticRegression(max_iter=1000),
|
'model': linear_model.LogisticRegression(max_iter=1000),
|
||||||
"metrics": [0, 0, 0, 0],
|
'metrics': [0, 0, 0, 0]
|
||||||
},
|
},
|
||||||
"support_vector_machine": {"model": svm.SVC(), "metrics": [0, 0, 0, 0]},
|
'support_vector_machine': {
|
||||||
"gaussian_naive_bayes": {
|
'model': svm.SVC(),
|
||||||
"model": naive_bayes.GaussianNB(),
|
'metrics': [0, 0, 0, 0]
|
||||||
"metrics": [0, 0, 0, 0],
|
|
||||||
},
|
},
|
||||||
"stochastic_gradient_descent_classifier": {
|
'gaussian_naive_bayes': {
|
||||||
"model": linear_model.SGDClassifier(),
|
'model': naive_bayes.GaussianNB(),
|
||||||
"metrics": [0, 0, 0, 0],
|
'metrics': [0, 0, 0, 0]
|
||||||
},
|
},
|
||||||
"knn": {"model": neighbors.KNeighborsClassifier(), "metrics": [0, 0, 0, 0]},
|
'stochastic_gradient_descent_classifier': {
|
||||||
"decision_tree": {
|
'model': linear_model.SGDClassifier(),
|
||||||
"model": tree.DecisionTreeClassifier(),
|
'metrics': [0, 0, 0, 0]
|
||||||
"metrics": [0, 0, 0, 0],
|
|
||||||
},
|
},
|
||||||
"random_forest_classifier": {
|
'knn': {
|
||||||
"model": ensemble.RandomForestClassifier(),
|
'model': neighbors.KNeighborsClassifier(),
|
||||||
"metrics": [0, 0, 0, 0],
|
'metrics': [0, 0, 0, 0]
|
||||||
},
|
},
|
||||||
"gradient_boosting_classifier": {
|
'decision_tree': {
|
||||||
"model": ensemble.GradientBoostingClassifier(),
|
'model': tree.DecisionTreeClassifier(),
|
||||||
"metrics": [0, 0, 0, 0],
|
'metrics': [0, 0, 0, 0]
|
||||||
},
|
},
|
||||||
"lgbm_classifier": {"model": LGBMClassifier(), "metrics": [0, 0, 0, 0]},
|
'random_forest_classifier': {
|
||||||
"XGBoost_classifier": {
|
'model': ensemble.RandomForestClassifier(),
|
||||||
"model": xg.sklearn.XGBClassifier(),
|
'metrics': [0, 0, 0, 0]
|
||||||
"metrics": [0, 0, 0, 0],
|
|
||||||
},
|
},
|
||||||
|
'gradient_boosting_classifier': {
|
||||||
|
'model': ensemble.GradientBoostingClassifier(),
|
||||||
|
'metrics': [0, 0, 0, 0]
|
||||||
|
},
|
||||||
|
'lgbm_classifier': {
|
||||||
|
'model': LGBMClassifier(),
|
||||||
|
'metrics': [0, 0, 0, 0]
|
||||||
|
},
|
||||||
|
'XGBoost_classifier': {
|
||||||
|
'model': xg.sklearn.XGBClassifier(),
|
||||||
|
'metrics': [0, 0, 0, 0]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return cmodels
|
return cmodels
|
||||||
|
|
||||||
def get_total_models_scores(self, n_clusters=1):
|
def get_total_models_scores(self, n_clusters=1):
|
||||||
scores = pd.DataFrame(columns=["method", "metric", "mean"])
|
|
||||||
for model_title, model in self.cmodels.items():
|
for model_title, model in self.cmodels.items():
|
||||||
scores_df = pd.DataFrame(columns=["method", "metric", "mean"])
|
|
||||||
print("\n************************************\n")
|
print("\n************************************\n")
|
||||||
print("Current model:", model_title, end="\n")
|
print("Current model:", model_title, end="\n")
|
||||||
print("Acc:", model["metrics"][0] / n_clusters)
|
print("Acc:", model['metrics'][0]/n_clusters)
|
||||||
scores_df = pd.concat(
|
print("Precision:", model['metrics'][1]/n_clusters)
|
||||||
[
|
print("Recall:", model['metrics'][2]/n_clusters)
|
||||||
scores_df,
|
print("F1:", model['metrics'][3]/n_clusters)
|
||||||
pd.DataFrame(
|
|
||||||
{
|
|
||||||
"method": model_title,
|
|
||||||
"metric": "test_accuracy",
|
|
||||||
"mean": model["metrics"][0] / n_clusters,
|
|
||||||
},
|
|
||||||
index=[0],
|
|
||||||
),
|
|
||||||
],
|
|
||||||
ignore_index=True,
|
|
||||||
)
|
|
||||||
print("Precision:", model["metrics"][1] / n_clusters)
|
|
||||||
scores_df = pd.concat(
|
|
||||||
[
|
|
||||||
scores_df,
|
|
||||||
pd.DataFrame(
|
|
||||||
{
|
|
||||||
"method": model_title,
|
|
||||||
"metric": "test_precision",
|
|
||||||
"mean": model["metrics"][1] / n_clusters,
|
|
||||||
},
|
|
||||||
index=[0],
|
|
||||||
),
|
|
||||||
],
|
|
||||||
ignore_index=True,
|
|
||||||
)
|
|
||||||
print("Recall:", model["metrics"][2] / n_clusters)
|
|
||||||
scores_df = pd.concat(
|
|
||||||
[
|
|
||||||
scores_df,
|
|
||||||
pd.DataFrame(
|
|
||||||
{
|
|
||||||
"method": model_title,
|
|
||||||
"metric": "test_recall",
|
|
||||||
"mean": model["metrics"][2] / n_clusters,
|
|
||||||
},
|
|
||||||
index=[0],
|
|
||||||
),
|
|
||||||
],
|
|
||||||
ignore_index=True,
|
|
||||||
)
|
|
||||||
print("F1:", model["metrics"][3] / n_clusters)
|
|
||||||
scores_df = pd.concat(
|
|
||||||
[
|
|
||||||
scores_df,
|
|
||||||
pd.DataFrame(
|
|
||||||
{
|
|
||||||
"method": model_title,
|
|
||||||
"metric": "test_f1",
|
|
||||||
"mean": model["metrics"][3] / n_clusters,
|
|
||||||
},
|
|
||||||
index=[0],
|
|
||||||
),
|
|
||||||
],
|
|
||||||
ignore_index=True,
|
|
||||||
)
|
|
||||||
scores = pd.concat([scores, scores_df])
|
|
||||||
return scores
|
|
|
@ -1,24 +1,15 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
|
||||||
|
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
|
||||||
|
from sklearn.metrics import mean_squared_error, r2_score
|
||||||
|
from sklearn.impute import SimpleImputer
|
||||||
|
from sklearn.dummy import DummyRegressor, DummyClassifier
|
||||||
|
|
||||||
|
from xgboost import XGBRegressor, XGBClassifier
|
||||||
|
import xgboost as xg
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn import (
|
import numpy as np
|
||||||
ensemble,
|
|
||||||
gaussian_process,
|
|
||||||
kernel_ridge,
|
|
||||||
linear_model,
|
|
||||||
naive_bayes,
|
|
||||||
svm,
|
|
||||||
)
|
|
||||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
|
||||||
from sklearn.metrics import confusion_matrix
|
|
||||||
from sklearn.model_selection import (
|
|
||||||
BaseCrossValidator,
|
|
||||||
LeaveOneGroupOut,
|
|
||||||
StratifiedKFold,
|
|
||||||
cross_validate,
|
|
||||||
)
|
|
||||||
from xgboost import XGBClassifier, XGBRegressor
|
|
||||||
|
|
||||||
|
|
||||||
def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
|
def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
@ -74,116 +65,52 @@ def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> P
|
||||||
full_path = folder / export_filename
|
full_path = folder / export_filename
|
||||||
return full_path
|
return full_path
|
||||||
|
|
||||||
|
|
||||||
def insert_row(df, row):
|
def insert_row(df, row):
|
||||||
return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
|
return pd.concat([df, pd.DataFrame([row], columns=df.columns)], ignore_index=True)
|
||||||
|
|
||||||
|
def prepare_regression_model_input(input_csv):
|
||||||
|
|
||||||
def impute_encode_categorical_features(model_input: pd.DataFrame) -> pd.DataFrame:
|
model_input = pd.read_csv(input_csv)
|
||||||
categorical_feature_col_names = [
|
|
||||||
"gender",
|
|
||||||
"startlanguage",
|
|
||||||
"limesurvey_demand_control_ratio_quartile",
|
|
||||||
]
|
|
||||||
additional_categorical_features = [
|
|
||||||
col
|
|
||||||
for col in model_input.columns
|
|
||||||
if "mostcommonactivity" in col or "homelabel" in col
|
|
||||||
]
|
|
||||||
categorical_feature_col_names += additional_categorical_features
|
|
||||||
|
|
||||||
categorical_features = model_input[categorical_feature_col_names].copy()
|
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||||
|
model_input.set_index(index_columns, inplace=True)
|
||||||
|
|
||||||
|
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
|
||||||
|
|
||||||
|
categorical_feature_colnames = ["gender", "startlanguage", "limesurvey_demand_control_ratio_quartile"]
|
||||||
|
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||||
|
categorical_feature_colnames += additional_categorical_features
|
||||||
|
#TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
|
||||||
|
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||||
# fillna with mode
|
# fillna with mode
|
||||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||||
# one-hot encoding
|
# one-hot encoding
|
||||||
categorical_features = categorical_features.apply(
|
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||||
lambda col: col.astype("category")
|
|
||||||
)
|
|
||||||
if not categorical_features.empty:
|
if not categorical_features.empty:
|
||||||
categorical_features = pd.get_dummies(categorical_features)
|
categorical_features = pd.get_dummies(categorical_features)
|
||||||
|
|
||||||
numerical_features = model_input.drop(categorical_feature_col_names, axis=1)
|
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
||||||
|
|
||||||
model_input = pd.concat([numerical_features, categorical_features], axis=1)
|
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||||
return model_input
|
|
||||||
|
return train_x, data_y, data_groups
|
||||||
|
|
||||||
|
|
||||||
def prepare_sklearn_data_format(
|
def run_all_regression_models(input_csv):
|
||||||
model_input: pd.DataFrame, cv_method: str = "logo"
|
# Prepare data
|
||||||
) -> tuple:
|
data_x, data_y, data_groups = prepare_regression_model_input(input_csv)
|
||||||
index_columns = [
|
|
||||||
"local_segment",
|
|
||||||
"local_segment_label",
|
|
||||||
"local_segment_start_datetime",
|
|
||||||
"local_segment_end_datetime",
|
|
||||||
]
|
|
||||||
model_input.set_index(index_columns, inplace=True)
|
|
||||||
|
|
||||||
if cv_method == "half_logo":
|
# Prepare cross validation
|
||||||
model_input["pid_index"] = model_input.groupby("pid").cumcount()
|
logo = LeaveOneGroupOut()
|
||||||
model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
|
logo.get_n_splits(
|
||||||
|
|
||||||
model_input["pid_index"] = (
|
|
||||||
model_input["pid_index"] / model_input["pid_count"] + 1
|
|
||||||
).round()
|
|
||||||
model_input["pid_half"] = (
|
|
||||||
model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
|
|
||||||
)
|
|
||||||
|
|
||||||
data_x, data_y, data_groups = (
|
|
||||||
model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
|
|
||||||
model_input["target"],
|
|
||||||
model_input["pid_half"],
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
data_x, data_y, data_groups = (
|
|
||||||
model_input.drop(["target", "pid"], axis=1),
|
|
||||||
model_input["target"],
|
|
||||||
model_input["pid"],
|
|
||||||
)
|
|
||||||
return data_x, data_y, data_groups
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_cross_validator(
|
|
||||||
data_x: pd.DataFrame,
|
|
||||||
data_y: pd.DataFrame,
|
|
||||||
data_groups: pd.DataFrame,
|
|
||||||
cv_method: str = "logo",
|
|
||||||
) -> BaseCrossValidator:
|
|
||||||
if cv_method == "logo" or cv_method == "half_logo":
|
|
||||||
cv = LeaveOneGroupOut()
|
|
||||||
cv.get_n_splits(
|
|
||||||
data_x,
|
data_x,
|
||||||
data_y,
|
data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
)
|
)
|
||||||
else:
|
metrics = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']
|
||||||
cv = StratifiedKFold(n_splits=5, shuffle=True)
|
|
||||||
return cv
|
|
||||||
|
|
||||||
|
|
||||||
def aggregate_and_transpose(df: pd.DataFrame, statistics=None) -> pd.DataFrame:
|
|
||||||
if statistics is None:
|
|
||||||
statistics = ["max", "mean"]
|
|
||||||
return (
|
|
||||||
df.agg(statistics)
|
|
||||||
.transpose()
|
|
||||||
.reset_index()
|
|
||||||
.rename(columns={"index": "test_metric"})
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def run_all_regression_models(
|
|
||||||
data_x: pd.DataFrame,
|
|
||||||
data_y: pd.DataFrame,
|
|
||||||
data_groups: pd.DataFrame,
|
|
||||||
cross_validator: BaseCrossValidator,
|
|
||||||
) -> pd.DataFrame:
|
|
||||||
metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
|
|
||||||
test_metrics = ["test_" + metric for metric in metrics]
|
test_metrics = ["test_" + metric for metric in metrics]
|
||||||
scores = pd.DataFrame(columns=["method", "test_metric", "max", "nanmedian"])
|
scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
|
||||||
|
|
||||||
# Validate models
|
# Validate models
|
||||||
dummy_regr = DummyRegressor(strategy="mean")
|
dummy_regr = DummyRegressor(strategy="mean")
|
||||||
|
@ -192,58 +119,53 @@ def run_all_regression_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("Dummy model:")
|
print("Dummy model:")
|
||||||
print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
|
print("R^2: ", np.nanmedian(dummy_regr_scores['test_r2']))
|
||||||
|
|
||||||
scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
|
scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "dummy"
|
scores_df["method"] = "dummy"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del dummy_regr
|
|
||||||
del dummy_regr_scores
|
|
||||||
|
|
||||||
lin_reg = linear_model.LinearRegression()
|
lin_reg_rapids = linear_model.LinearRegression()
|
||||||
lin_reg_scores = cross_validate(
|
lin_reg_scores = cross_validate(
|
||||||
lin_reg,
|
lin_reg_rapids,
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("Linear regression:")
|
print("Linear regression:")
|
||||||
print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
|
print("R^2: ", np.nanmedian(lin_reg_scores['test_r2']))
|
||||||
|
|
||||||
scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
|
scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "linear_reg"
|
scores_df["method"] = "linear_reg"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del lin_reg
|
|
||||||
del lin_reg_scores
|
|
||||||
|
|
||||||
ridge_reg = linear_model.Ridge(alpha=0.5)
|
ridge_reg = linear_model.Ridge(alpha=.5)
|
||||||
ridge_reg_scores = cross_validate(
|
ridge_reg_scores = cross_validate(
|
||||||
ridge_reg,
|
ridge_reg,
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("Ridge regression")
|
print("Ridge regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
|
scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "ridge_reg"
|
scores_df["method"] = "ridge_reg"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del ridge_reg
|
|
||||||
del ridge_reg_scores
|
|
||||||
|
|
||||||
lasso_reg = linear_model.Lasso(alpha=0.1)
|
lasso_reg = linear_model.Lasso(alpha=0.1)
|
||||||
lasso_reg_score = cross_validate(
|
lasso_reg_score = cross_validate(
|
||||||
|
@ -251,18 +173,16 @@ def run_all_regression_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("Lasso regression")
|
print("Lasso regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
|
scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "lasso_reg"
|
scores_df["method"] = "lasso_reg"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del lasso_reg
|
|
||||||
del lasso_reg_score
|
|
||||||
|
|
||||||
bayesian_ridge_reg = linear_model.BayesianRidge()
|
bayesian_ridge_reg = linear_model.BayesianRidge()
|
||||||
bayesian_ridge_reg_score = cross_validate(
|
bayesian_ridge_reg_score = cross_validate(
|
||||||
|
@ -270,18 +190,16 @@ def run_all_regression_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("Bayesian Ridge")
|
print("Bayesian Ridge")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
|
scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "bayesian_ridge"
|
scores_df["method"] = "bayesian_ridge"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del bayesian_ridge_reg
|
|
||||||
del bayesian_ridge_reg_score
|
|
||||||
|
|
||||||
ransac_reg = linear_model.RANSACRegressor()
|
ransac_reg = linear_model.RANSACRegressor()
|
||||||
ransac_reg_score = cross_validate(
|
ransac_reg_score = cross_validate(
|
||||||
|
@ -289,18 +207,16 @@ def run_all_regression_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("RANSAC (outlier robust regression)")
|
print("RANSAC (outlier robust regression)")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
|
scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "RANSAC"
|
scores_df["method"] = "RANSAC"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del ransac_reg
|
|
||||||
del ransac_reg_score
|
|
||||||
|
|
||||||
svr = svm.SVR()
|
svr = svm.SVR()
|
||||||
svr_score = cross_validate(
|
svr_score = cross_validate(
|
||||||
|
@ -308,18 +224,16 @@ def run_all_regression_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("Support vector regression")
|
print("Support vector regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(svr_score)[test_metrics]
|
scores_df = pd.DataFrame(svr_score)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "SVR"
|
scores_df["method"] = "SVR"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del svr
|
|
||||||
del svr_score
|
|
||||||
|
|
||||||
kridge = kernel_ridge.KernelRidge()
|
kridge = kernel_ridge.KernelRidge()
|
||||||
kridge_score = cross_validate(
|
kridge_score = cross_validate(
|
||||||
|
@ -327,18 +241,16 @@ def run_all_regression_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("Kernel Ridge regression")
|
print("Kernel Ridge regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(kridge_score)[test_metrics]
|
scores_df = pd.DataFrame(kridge_score)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "kernel_ridge"
|
scores_df["method"] = "kernel_ridge"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del kridge
|
|
||||||
del kridge_score
|
|
||||||
|
|
||||||
gpr = gaussian_process.GaussianProcessRegressor()
|
gpr = gaussian_process.GaussianProcessRegressor()
|
||||||
gpr_score = cross_validate(
|
gpr_score = cross_validate(
|
||||||
|
@ -346,18 +258,16 @@ def run_all_regression_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("Gaussian Process Regression")
|
print("Gaussian Process Regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(gpr_score)[test_metrics]
|
scores_df = pd.DataFrame(gpr_score)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "gaussian_proc"
|
scores_df["method"] = "gaussian_proc"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del gpr
|
|
||||||
del gpr_score
|
|
||||||
|
|
||||||
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
|
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
|
||||||
rfr_score = cross_validate(
|
rfr_score = cross_validate(
|
||||||
|
@ -365,18 +275,16 @@ def run_all_regression_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("Random Forest Regression")
|
print("Random Forest Regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(rfr_score)[test_metrics]
|
scores_df = pd.DataFrame(rfr_score)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "random_forest"
|
scores_df["method"] = "random_forest"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del rfr
|
|
||||||
del rfr_score
|
|
||||||
|
|
||||||
xgb = XGBRegressor()
|
xgb = XGBRegressor()
|
||||||
xgb_score = cross_validate(
|
xgb_score = cross_validate(
|
||||||
|
@ -384,18 +292,16 @@ def run_all_regression_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("XGBoost Regressor")
|
print("XGBoost Regressor")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(xgb_score)[test_metrics]
|
scores_df = pd.DataFrame(xgb_score)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "XGBoost"
|
scores_df["method"] = "XGBoost"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del xgb
|
|
||||||
del xgb_score
|
|
||||||
|
|
||||||
ada = ensemble.AdaBoostRegressor()
|
ada = ensemble.AdaBoostRegressor()
|
||||||
ada_score = cross_validate(
|
ada_score = cross_validate(
|
||||||
|
@ -403,54 +309,25 @@ def run_all_regression_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=logo,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
)
|
||||||
print("ADA Boost Regressor")
|
print("ADA Boost Regressor")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(ada_score)[test_metrics]
|
scores_df = pd.DataFrame(ada_score)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||||
scores_df["method"] = "ADA_boost"
|
scores_df["method"] = "ADA_boost"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del ada
|
|
||||||
del ada_score
|
|
||||||
|
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
|
|
||||||
def confusion_matrix_scorer(clf, X, y):
|
def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
||||||
y_pred = clf.predict(X)
|
metrics = ['accuracy', 'average_precision', 'recall', 'f1']
|
||||||
cm = confusion_matrix(y, y_pred)
|
|
||||||
return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]}
|
|
||||||
|
|
||||||
|
|
||||||
def aggregate_confusion_matrix(scores_dict: dict) -> pd.DataFrame:
|
|
||||||
scores_aggregated = aggregate_and_transpose(
|
|
||||||
pd.DataFrame(scores_dict), statistics=["sum"]
|
|
||||||
)
|
|
||||||
return scores_aggregated[
|
|
||||||
~scores_aggregated.test_metric.isin(["fit_time", "score_time"])
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def run_all_classification_models(
|
|
||||||
data_x: pd.DataFrame,
|
|
||||||
data_y: pd.DataFrame,
|
|
||||||
data_groups: pd.DataFrame,
|
|
||||||
cross_validator: BaseCrossValidator,
|
|
||||||
):
|
|
||||||
data_y_value_counts = data_y.value_counts()
|
|
||||||
if len(data_y_value_counts) == 1:
|
|
||||||
raise (ValueError("There is only one unique value in data_y."))
|
|
||||||
if len(data_y_value_counts) == 2:
|
|
||||||
metrics = ["accuracy", "average_precision", "recall", "f1"]
|
|
||||||
else:
|
|
||||||
metrics = ["accuracy", "precision_micro", "recall_micro", "f1_micro"]
|
|
||||||
|
|
||||||
test_metrics = ["test_" + metric for metric in metrics]
|
test_metrics = ["test_" + metric for metric in metrics]
|
||||||
|
|
||||||
scores = pd.DataFrame(columns=["method", "test_metric", "max", "mean"])
|
scores = pd.DataFrame(columns=["method", "max", "mean"])
|
||||||
|
|
||||||
dummy_class = DummyClassifier(strategy="most_frequent")
|
dummy_class = DummyClassifier(strategy="most_frequent")
|
||||||
|
|
||||||
|
@ -459,39 +336,17 @@ def run_all_classification_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=cv_method,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
error_score="raise",
|
error_score='raise',
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
|
||||||
dummy_confusion_matrix = cross_validate(
|
|
||||||
dummy_class,
|
|
||||||
X=data_x,
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cross_validator,
|
|
||||||
n_jobs=-1,
|
|
||||||
error_score="raise",
|
|
||||||
scoring=confusion_matrix_scorer,
|
|
||||||
)
|
)
|
||||||
print("Dummy")
|
print("Dummy")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(dummy_score)[test_metrics]
|
scores_df = pd.DataFrame(dummy_score)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||||
scores_df = pd.concat(
|
scores_df["method"] = "Dummy"
|
||||||
[
|
|
||||||
scores_df,
|
|
||||||
aggregate_confusion_matrix(dummy_confusion_matrix).rename(
|
|
||||||
columns={"sum": "mean"}
|
|
||||||
# Note: the column is misleadingly renamed to get concise output.
|
|
||||||
),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
scores_df["method"] = "dummy_classifier"
|
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del dummy_class
|
|
||||||
del dummy_score
|
|
||||||
del dummy_confusion_matrix
|
|
||||||
|
|
||||||
logistic_regression = linear_model.LogisticRegression()
|
logistic_regression = linear_model.LogisticRegression()
|
||||||
|
|
||||||
|
@ -500,37 +355,16 @@ def run_all_classification_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=cv_method,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
|
||||||
log_reg_confusion_matrix = cross_validate(
|
|
||||||
logistic_regression,
|
|
||||||
X=data_x,
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cross_validator,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=confusion_matrix_scorer,
|
|
||||||
)
|
)
|
||||||
print("Logistic regression")
|
print("Logistic regression")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
|
scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||||
scores_df = pd.concat(
|
scores_df["method"] = "logistic_reg"
|
||||||
[
|
|
||||||
scores_df,
|
|
||||||
aggregate_confusion_matrix(log_reg_confusion_matrix).rename(
|
|
||||||
columns={"sum": "mean"}
|
|
||||||
# Note: the column is misleadingly renamed to get concise output.
|
|
||||||
),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
scores_df["method"] = "logistic_regression"
|
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del logistic_regression
|
|
||||||
del log_reg_scores
|
|
||||||
del log_reg_confusion_matrix
|
|
||||||
|
|
||||||
svc = svm.SVC()
|
svc = svm.SVC()
|
||||||
|
|
||||||
|
@ -539,37 +373,16 @@ def run_all_classification_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=cv_method,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
|
||||||
svc_confusion_matrix = cross_validate(
|
|
||||||
svc,
|
|
||||||
X=data_x,
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cross_validator,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=confusion_matrix_scorer,
|
|
||||||
)
|
)
|
||||||
print("Support Vector Machine")
|
print("Support Vector Machine")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(svc_scores)[test_metrics]
|
scores_df = pd.DataFrame(svc_scores)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||||
scores_df = pd.concat(
|
scores_df["method"] = "svc"
|
||||||
[
|
|
||||||
scores_df,
|
|
||||||
aggregate_confusion_matrix(svc_confusion_matrix).rename(
|
|
||||||
columns={"sum": "mean"}
|
|
||||||
# Note: the column is misleadingly renamed to get concise output.
|
|
||||||
),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
scores_df["method"] = "SVC"
|
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del svc
|
|
||||||
del svc_scores
|
|
||||||
del svc_confusion_matrix
|
|
||||||
|
|
||||||
gaussian_nb = naive_bayes.GaussianNB()
|
gaussian_nb = naive_bayes.GaussianNB()
|
||||||
|
|
||||||
|
@ -578,37 +391,16 @@ def run_all_classification_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=cv_method,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
|
||||||
gaussian_nb_confusion_matrix = cross_validate(
|
|
||||||
gaussian_nb,
|
|
||||||
X=data_x,
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cross_validator,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=confusion_matrix_scorer,
|
|
||||||
)
|
)
|
||||||
print("Gaussian Naive Bayes")
|
print("Gaussian Naive Bayes")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
|
scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||||
scores_df = pd.concat(
|
|
||||||
[
|
|
||||||
scores_df,
|
|
||||||
aggregate_confusion_matrix(gaussian_nb_confusion_matrix).rename(
|
|
||||||
columns={"sum": "mean"}
|
|
||||||
# Note: the column is misleadingly renamed to get concise output.
|
|
||||||
),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
scores_df["method"] = "gaussian_naive_bayes"
|
scores_df["method"] = "gaussian_naive_bayes"
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del gaussian_nb
|
|
||||||
del gaussian_nb_scores
|
|
||||||
del gaussian_nb_confusion_matrix
|
|
||||||
|
|
||||||
sgdc = linear_model.SGDClassifier()
|
sgdc = linear_model.SGDClassifier()
|
||||||
|
|
||||||
|
@ -617,37 +409,16 @@ def run_all_classification_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=cv_method,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
|
||||||
sgdc_confusion_matrix = cross_validate(
|
|
||||||
sgdc,
|
|
||||||
X=data_x,
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cross_validator,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=confusion_matrix_scorer,
|
|
||||||
)
|
)
|
||||||
print("Stochastic Gradient Descent")
|
print("Stochastic Gradient Descent")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
|
scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||||
scores_df = pd.concat(
|
scores_df["method"] = "stochastic_gradient_descent"
|
||||||
[
|
|
||||||
scores_df,
|
|
||||||
aggregate_confusion_matrix(sgdc_confusion_matrix).rename(
|
|
||||||
columns={"sum": "mean"}
|
|
||||||
# Note: the column is misleadingly renamed to get concise output.
|
|
||||||
),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
scores_df["method"] = "stochastic_gradient_descent_classifier"
|
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del sgdc
|
|
||||||
del sgdc_scores
|
|
||||||
del sgdc_confusion_matrix
|
|
||||||
|
|
||||||
rfc = ensemble.RandomForestClassifier()
|
rfc = ensemble.RandomForestClassifier()
|
||||||
|
|
||||||
|
@ -656,37 +427,16 @@ def run_all_classification_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=cv_method,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
|
||||||
rfc_confusion_matrix = cross_validate(
|
|
||||||
rfc,
|
|
||||||
X=data_x,
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cross_validator,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=confusion_matrix_scorer,
|
|
||||||
)
|
)
|
||||||
print("Random Forest")
|
print("Random Forest")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(rfc_scores)[test_metrics]
|
scores_df = pd.DataFrame(rfc_scores)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||||
scores_df = pd.concat(
|
scores_df["method"] = "random_forest"
|
||||||
[
|
|
||||||
scores_df,
|
|
||||||
aggregate_confusion_matrix(rfc_confusion_matrix).rename(
|
|
||||||
columns={"sum": "mean"}
|
|
||||||
# Note: the column is misleadingly renamed to get concise output.
|
|
||||||
),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
scores_df["method"] = "random_forest_classifier"
|
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del rfc
|
|
||||||
del rfc_scores
|
|
||||||
del rfc_confusion_matrix
|
|
||||||
|
|
||||||
xgb_classifier = XGBClassifier()
|
xgb_classifier = XGBClassifier()
|
||||||
|
|
||||||
|
@ -695,36 +445,15 @@ def run_all_classification_models(
|
||||||
X=data_x,
|
X=data_x,
|
||||||
y=data_y,
|
y=data_y,
|
||||||
groups=data_groups,
|
groups=data_groups,
|
||||||
cv=cross_validator,
|
cv=cv_method,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
scoring=metrics,
|
scoring=metrics
|
||||||
)
|
|
||||||
xgb_confusion_matrix = cross_validate(
|
|
||||||
xgb_classifier,
|
|
||||||
X=data_x,
|
|
||||||
y=data_y,
|
|
||||||
groups=data_groups,
|
|
||||||
cv=cross_validator,
|
|
||||||
n_jobs=-1,
|
|
||||||
scoring=confusion_matrix_scorer,
|
|
||||||
)
|
)
|
||||||
print("XGBoost")
|
print("XGBoost")
|
||||||
|
|
||||||
scores_df = pd.DataFrame(xgb_scores)[test_metrics]
|
scores_df = pd.DataFrame(xgb_scores)[test_metrics]
|
||||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||||
scores_df = pd.concat(
|
scores_df["method"] = "xgboost"
|
||||||
[
|
|
||||||
scores_df,
|
|
||||||
aggregate_confusion_matrix(xgb_confusion_matrix).rename(
|
|
||||||
columns={"sum": "mean"}
|
|
||||||
# Note: the column is misleadingly renamed to get concise output.
|
|
||||||
),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
scores_df["method"] = "XGBoost_classifier"
|
|
||||||
scores = pd.concat([scores, scores_df])
|
scores = pd.concat([scores, scores_df])
|
||||||
del xgb_classifier
|
|
||||||
del xgb_scores
|
|
||||||
del xgb_confusion_matrix
|
|
||||||
|
|
||||||
return scores
|
return scores
|
||||||
|
|
|
@ -34,114 +34,18 @@ df_app_categories <- tbl(con, "app_categories") %>%
|
||||||
head(df_app_categories)
|
head(df_app_categories)
|
||||||
table(df_app_categories$play_store_genre)
|
table(df_app_categories$play_store_genre)
|
||||||
|
|
||||||
df_app_categories %>%
|
|
||||||
filter(play_store_genre == "not_found") %>%
|
|
||||||
group_by(play_store_response) %>%
|
|
||||||
count()
|
|
||||||
# All "not_found" have an HTTP status of 404.
|
|
||||||
|
|
||||||
df_app_categories %>%
|
|
||||||
filter(play_store_genre == "not_found") %>%
|
|
||||||
group_by(package_name) %>%
|
|
||||||
count() %>%
|
|
||||||
arrange(desc(n))
|
|
||||||
# All "not_found" apps are unique.
|
|
||||||
|
|
||||||
# Exclude phone manufacturers, custom ROM names and similar.
|
|
||||||
manufacturers <- c(
|
|
||||||
"samsung",
|
|
||||||
"oneplus",
|
|
||||||
"huawei",
|
|
||||||
"xiaomi",
|
|
||||||
"lge",
|
|
||||||
"motorola",
|
|
||||||
"miui",
|
|
||||||
"lenovo",
|
|
||||||
"oppo",
|
|
||||||
"mediatek"
|
|
||||||
)
|
|
||||||
custom_rom <- c("coloros", "lineageos", "myos", "cyanogenmod", "foundation.e")
|
|
||||||
other <- c("android", "wssyncmldm")
|
|
||||||
|
|
||||||
grep_pattern <- paste(c(manufacturers, custom_rom, other), collapse = "|")
|
|
||||||
|
|
||||||
rows_os_manufacturer <- grepl(grep_pattern, df_app_categories$package_name)
|
|
||||||
|
|
||||||
# Explore what remains after excluding above.
|
|
||||||
df_app_categories[!rows_os_manufacturer, ] %>%
|
|
||||||
filter(play_store_genre == "not_found")
|
|
||||||
|
|
||||||
# Also check the relationship between is_system_app and System category.
|
|
||||||
tbl(con, "applications") %>%
|
|
||||||
filter(is_system_app, play_store_genre != "System") %>%
|
|
||||||
count()
|
|
||||||
# They are perfectly correlated.
|
|
||||||
|
|
||||||
# Manually classify apps
|
|
||||||
df_app_categories[df_app_categories$play_store_genre == "not_found",] <-
|
|
||||||
df_app_categories %>%
|
|
||||||
filter(play_store_genre == "not_found") %>%
|
|
||||||
mutate(
|
|
||||||
play_store_genre =
|
|
||||||
case_when(
|
|
||||||
str_detect(str_to_lower(package_name), grep_pattern) ~ "System",
|
|
||||||
str_detect(str_to_lower(package_name), "straw") ~ "STRAW",
|
|
||||||
str_detect(str_to_lower(package_name), "chromium") ~ "Communication", # Same as chrome.
|
|
||||||
str_detect(str_to_lower(package_name), "skype") ~ "Communication", # Skype Lite not classified.
|
|
||||||
str_detect(str_to_lower(package_name), "imsservice") ~ "Communication", # IP Multimedia Subsystem
|
|
||||||
str_detect(str_to_lower(package_name), paste(c("covid", "empatica"), collapse = "|")) ~ "Medical",
|
|
||||||
str_detect(str_to_lower(package_name), paste(c("libri", "tachiyomi"), collapse = "|")) ~ "Books & Reference",
|
|
||||||
str_detect(str_to_lower(package_name), paste(c("bricks", "chess"), collapse = "|")) ~ "Casual",
|
|
||||||
str_detect(str_to_lower(package_name), "weather") ~ "Weather",
|
|
||||||
str_detect(str_to_lower(package_name), "excel") ~ "Productivity",
|
|
||||||
str_detect(str_to_lower(package_name), paste(c("qr", "barcode", "archimedes", "mixplorer", "winrar", "filemanager", "shot", "faceunlock", "signin", "milink"), collapse = "|")) ~ "Tools",
|
|
||||||
str_detect(str_to_lower(package_name), "stupeflix") ~ "Photography",
|
|
||||||
str_detect(str_to_lower(package_name), "anyme") ~ "Entertainment",
|
|
||||||
str_detect(str_to_lower(package_name), "vanced") ~ "Video Players & Editors",
|
|
||||||
str_detect(str_to_lower(package_name), paste(c("music", "radio", "dolby"), collapse = "|")) ~ "Music & Audio",
|
|
||||||
str_detect(str_to_lower(package_name), paste(c("tensorflow", "object_detection"), collapse = "|")) ~ "Education",
|
|
||||||
.default = play_store_genre
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Explore what remains after classifying above.
|
|
||||||
df_app_categories %>%
|
|
||||||
filter(play_store_genre == "not_found")
|
|
||||||
|
|
||||||
# After this, 13 applications remain, which I will classify as "Other".
|
|
||||||
|
|
||||||
# Correct some mistakes
|
# Correct some mistakes
|
||||||
# And classify 'not_found'
|
df_app_categories %<>% mutate(
|
||||||
df_app_categories %<>%
|
|
||||||
mutate(
|
|
||||||
play_store_genre = {
|
play_store_genre = {
|
||||||
function(x) {
|
function(x) {
|
||||||
case_when(
|
case_when(
|
||||||
x == "Education,Education" ~ "Education",
|
x == "Education,Education" ~ "Education",
|
||||||
x == "EducationEducation" ~ "Education",
|
x == "EducationEducation" ~ "Education",
|
||||||
x == "not_found" ~ "Other",
|
x == "not_found" ~ "System",
|
||||||
.default = x
|
.default = x
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}(play_store_genre)
|
}(play_store_genre)
|
||||||
) %>%
|
|
||||||
select(-package_name) %>%
|
|
||||||
rename(
|
|
||||||
genre = play_store_genre,
|
|
||||||
package_name = package_hash
|
|
||||||
)
|
|
||||||
|
|
||||||
table(df_app_categories$genre)
|
|
||||||
|
|
||||||
df_app_categories %>%
|
|
||||||
group_by(genre) %>%
|
|
||||||
count() %>%
|
|
||||||
arrange(desc(n)) %>%
|
|
||||||
write_csv("play_store_categories_count.csv")
|
|
||||||
|
|
||||||
write_csv(
|
|
||||||
x = select(df_app_categories, c(package_name, genre)),
|
|
||||||
file = "play_store_application_genre_catalogue.csv"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
dbDisconnect(con)
|
dbDisconnect(con)
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
method,metric,max,mean
|
||||||
|
Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
|
||||||
|
Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
|
||||||
|
Dummy,test_recall,0.0,0.0
|
||||||
|
Dummy,test_f1,0.0,0.0
|
||||||
|
logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
|
||||||
|
logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
|
||||||
|
logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
|
||||||
|
logistic_reg,test_f1,0.3909774436090226,0.318943511424051
|
||||||
|
svc,test_accuracy,0.8557046979865772,0.8548446932649828
|
||||||
|
svc,test_average_precision,0.44514416839823046,0.4068200938341621
|
||||||
|
svc,test_recall,0.0,0.0
|
||||||
|
svc,test_f1,0.0,0.0
|
||||||
|
gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
|
||||||
|
gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
|
||||||
|
gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
|
||||||
|
gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
|
||||||
|
stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
|
||||||
|
stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
|
||||||
|
stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
|
||||||
|
stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
|
||||||
|
random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
|
||||||
|
random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
|
||||||
|
random_forest,test_recall,0.4069767441860465,0.35356856455493185
|
||||||
|
random_forest,test_f1,0.5691056910569107,0.5078402513053142
|
||||||
|
xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
|
||||||
|
xgboost,test_average_precision,0.7366643049075349,0.698622165966308
|
||||||
|
xgboost,test_recall,0.5287356321839081,0.44346431435445066
|
||||||
|
xgboost,test_f1,0.638888888888889,0.5633957169928393
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
method,metric,max,mean
|
||||||
|
Dummy,test_accuracy,1.0,0.8524114578096439
|
||||||
|
Dummy,test_average_precision,0.7,0.14758854219035614
|
||||||
|
Dummy,test_recall,0.0,0.0
|
||||||
|
Dummy,test_f1,0.0,0.0
|
||||||
|
logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
|
||||||
|
logistic_reg,test_average_precision,1.0,0.44605167668563583
|
||||||
|
logistic_reg,test_recall,1.0,0.25353566685532386
|
||||||
|
logistic_reg,test_f1,0.823529411764706,0.27951926390778625
|
||||||
|
svc,test_accuracy,1.0,0.8524114578096439
|
||||||
|
svc,test_average_precision,0.9612401707068228,0.44179454944271934
|
||||||
|
svc,test_recall,0.0,0.0
|
||||||
|
svc,test_f1,0.0,0.0
|
||||||
|
gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
|
||||||
|
gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
|
||||||
|
gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
|
||||||
|
gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
|
||||||
|
stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
|
||||||
|
stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
|
||||||
|
stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
|
||||||
|
stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
|
||||||
|
random_forest,test_accuracy,1.0,0.8722158105763481
|
||||||
|
random_forest,test_average_precision,1.0,0.49817066323226833
|
||||||
|
random_forest,test_recall,1.0,0.18161263127840668
|
||||||
|
random_forest,test_f1,1.0,0.2508096532365307
|
||||||
|
xgboost,test_accuracy,1.0,0.8812627400277729
|
||||||
|
xgboost,test_average_precision,1.0,0.5505695112208401
|
||||||
|
xgboost,test_recall,1.0,0.2896161238315027
|
||||||
|
xgboost,test_f1,0.9411764705882353,0.36887408735855665
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,45 +0,0 @@
|
||||||
genre,n
|
|
||||||
System,261
|
|
||||||
Tools,96
|
|
||||||
Productivity,71
|
|
||||||
Health & Fitness,60
|
|
||||||
Finance,54
|
|
||||||
Communication,39
|
|
||||||
Music & Audio,39
|
|
||||||
Shopping,38
|
|
||||||
Lifestyle,33
|
|
||||||
Education,28
|
|
||||||
News & Magazines,24
|
|
||||||
Maps & Navigation,23
|
|
||||||
Entertainment,21
|
|
||||||
Business,18
|
|
||||||
Travel & Local,18
|
|
||||||
Books & Reference,16
|
|
||||||
Social,16
|
|
||||||
Weather,16
|
|
||||||
Food & Drink,14
|
|
||||||
Sports,14
|
|
||||||
Other,13
|
|
||||||
Photography,13
|
|
||||||
Puzzle,13
|
|
||||||
Video Players & Editors,12
|
|
||||||
Card,9
|
|
||||||
Casual,9
|
|
||||||
Personalization,8
|
|
||||||
Medical,7
|
|
||||||
Board,5
|
|
||||||
Strategy,4
|
|
||||||
House & Home,3
|
|
||||||
Trivia,3
|
|
||||||
Word,3
|
|
||||||
Adventure,2
|
|
||||||
Art & Design,2
|
|
||||||
Auto & Vehicles,2
|
|
||||||
Dating,2
|
|
||||||
Role Playing,2
|
|
||||||
STRAW,2
|
|
||||||
Simulation,2
|
|
||||||
"Board,Brain Games",1
|
|
||||||
"Entertainment,Music & Video",1
|
|
||||||
Parenting,1
|
|
||||||
Racing,1
|
|
|
|
@ -1,7 +0,0 @@
|
||||||
[tool.isort]
|
|
||||||
profile = "black"
|
|
||||||
py_version = 311
|
|
||||||
skip_gitignore = "true"
|
|
||||||
|
|
||||||
[tool.black]
|
|
||||||
target-version = ["py311"]
|
|
2
rapids
2
rapids
|
@ -1 +1 @@
|
||||||
Subproject commit 059774bda10545a83ab282f59eb7a329fef9ee4c
|
Subproject commit 7b8538ce5152bb6e978cae37fcb7099941e95364
|
5
setup.py
5
setup.py
|
@ -1,7 +1,8 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
import sqlalchemy.engine.url
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from sqlalchemy import URL, create_engine
|
from sqlalchemy import create_engine
|
||||||
from sqlalchemy.orm import sessionmaker
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
@ -10,7 +11,7 @@ testing: bool = False
|
||||||
|
|
||||||
db_password = os.getenv("DB_PASSWORD")
|
db_password = os.getenv("DB_PASSWORD")
|
||||||
|
|
||||||
db_uri = URL.create(
|
db_uri = sqlalchemy.engine.url.URL(
|
||||||
drivername="postgresql+psycopg2",
|
drivername="postgresql+psycopg2",
|
||||||
username="staw_db",
|
username="staw_db",
|
||||||
password=db_password,
|
password=db_password,
|
||||||
|
|
|
@ -1,60 +0,0 @@
|
||||||
---
|
|
||||||
title: "Reliability of SAM threat and challenge and COPE"
|
|
||||||
output: html_notebook
|
|
||||||
---
|
|
||||||
|
|
||||||
|
|
||||||
```{r libraries, message=FALSE, warning=FALSE, include=FALSE, cache=FALSE}
|
|
||||||
library(conflicted)
|
|
||||||
library(here)
|
|
||||||
library(tidyverse)
|
|
||||||
library(magrittr)
|
|
||||||
library(lavaan)
|
|
||||||
library(kableExtra)
|
|
||||||
|
|
||||||
conflicts_prefer(
|
|
||||||
readr::col_factor,
|
|
||||||
purrr::discard,
|
|
||||||
dplyr::filter,
|
|
||||||
dplyr::lag,
|
|
||||||
purrr::set_names,
|
|
||||||
tidyr::extract,
|
|
||||||
kableExtra::group_rows
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
```{r style, include=FALSE, cache=FALSE}
|
|
||||||
styler::style_file(
|
|
||||||
here("statistical_analysis", "scale_reliability.Rmd"),
|
|
||||||
scope = "tokens",
|
|
||||||
indent_by = 4L
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
The data were preprocessed and cleaned using [expl_esm_labels.py](../exploration/expl_esm_labels.py) script and read as csv here.
|
|
||||||
|
|
||||||
```{r read_data}
|
|
||||||
COL_TYPES <- cols(
|
|
||||||
.default = col_double(),
|
|
||||||
participant_id = col_factor(),
|
|
||||||
username = col_factor(),
|
|
||||||
device_id = col_factor(),
|
|
||||||
esm_trigger = col_factor(),
|
|
||||||
esm_instructions = col_factor(),
|
|
||||||
double_esm_user_answer_timestamp = col_double(),
|
|
||||||
datetime_lj = col_datetime(format = ""),
|
|
||||||
date_lj = col_date(format = ""),
|
|
||||||
time = col_factor(),
|
|
||||||
esm_user_answer = col_factor()
|
|
||||||
)
|
|
||||||
df_SAM <- read_csv(here("data", "raw", "df_esm_SAM_threat_challenge.csv"), col_types = COL_TYPES)
|
|
||||||
df_COPE <- read_csv(here("data", "raw", "df_esm_COPE.csv"), col_types = COL_TYPES)
|
|
||||||
```
|
|
||||||
|
|
||||||
Demonstrate factor analysis for a single participant.
|
|
||||||
|
|
||||||
```{r}
|
|
||||||
df_COPE %>%
|
|
||||||
group_by(question_id, questionnaire_id) %>%
|
|
||||||
count()
|
|
||||||
```
|
|
|
@ -1,20 +0,0 @@
|
||||||
Version: 1.0
|
|
||||||
|
|
||||||
RestoreWorkspace: Default
|
|
||||||
SaveWorkspace: Default
|
|
||||||
AlwaysSaveHistory: Default
|
|
||||||
|
|
||||||
EnableCodeIndexing: Yes
|
|
||||||
UseSpacesForTab: No
|
|
||||||
NumSpacesForTab: 4
|
|
||||||
Encoding: UTF-8
|
|
||||||
|
|
||||||
RnwWeave: Sweave
|
|
||||||
LaTeX: XeLaTeX
|
|
||||||
|
|
||||||
AutoAppendNewline: Yes
|
|
||||||
StripTrailingWhitespace: Yes
|
|
||||||
|
|
||||||
PythonType: conda
|
|
||||||
PythonVersion: 3.11.3
|
|
||||||
PythonPath: E:/ProgramData/mambaforge/envs/straw2analysis/python.exe
|
|
Loading…
Reference in New Issue