Compare commits
No commits in common. "master" and "ml_pipeline" have entirely different histories.
master
...
ml_pipelin
9
.flake8
9
.flake8
|
@ -1,9 +0,0 @@
|
|||
[flake8]
|
||||
max-line-length = 88
|
||||
extend-ignore =
|
||||
E203,
|
||||
# E501 line too long for docstrings
|
||||
D501
|
||||
per-file-ignores =
|
||||
exploration/*.py:E501
|
||||
docstring-convention = numpy
|
|
@ -12,15 +12,12 @@ __pycache__/
|
|||
/data/*input*.csv
|
||||
/data/daily*
|
||||
/data/intradaily*
|
||||
/data/raw
|
||||
/data/stressfulness_event*
|
||||
/data/30min*
|
||||
/presentation/*scores.csv
|
||||
/presentation/Results.ods
|
||||
/presentation/results/
|
||||
.Rproj.user
|
||||
.Rhistory
|
||||
/presentation/*.nb.html
|
||||
presentation/event_stressful_detection_half_loso.csv
|
||||
presentation/event_stressful_detection_loso.csv
|
||||
/statistical_analysis/scale_reliability.nb.html
|
||||
|
|
|
@ -1,6 +0,0 @@
|
|||
<component name="ProjectCodeStyleConfiguration">
|
||||
<code_scheme name="Project" version="173">
|
||||
<option name="RIGHT_MARGIN" value="150" />
|
||||
<option name="SOFT_MARGINS" value="88" />
|
||||
</code_scheme>
|
||||
</component>
|
|
@ -1,5 +0,0 @@
|
|||
<component name="ProjectCodeStyleConfiguration">
|
||||
<state>
|
||||
<option name="USE_PER_PROJECT_SETTINGS" value="true" />
|
||||
</state>
|
||||
</component>
|
|
@ -1,3 +0,0 @@
|
|||
<component name="ProjectDictionaryState">
|
||||
<dictionary name="junos" />
|
||||
</component>
|
|
@ -1,9 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="straw2analysis" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmDSProjectLayout">
|
||||
<option name="id" value="JupyterRightHiddenStructureLayout" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (straw2analysis)" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
|
@ -17,15 +14,7 @@
|
|||
</RMarkdownRenderProfile>
|
||||
</value>
|
||||
</entry>
|
||||
<entry key="file://$PROJECT_DIR$/statistical_analysis/scale_reliability.rmd">
|
||||
<value>
|
||||
<RMarkdownRenderProfile>
|
||||
<option name="lastOutput" value="$PROJECT_DIR$/statistical_analysis/scale_reliability.nb.html" />
|
||||
<option name="outputDirectoryUrl" value="file://$PROJECT_DIR$/statistical_analysis" />
|
||||
</RMarkdownRenderProfile>
|
||||
</value>
|
||||
</entry>
|
||||
</map>
|
||||
</option>
|
||||
</component>
|
||||
</project>
|
||||
</project>
|
|
@ -1,9 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="RGraphicsSettings">
|
||||
<option name="height" value="600" />
|
||||
<option name="resolution" value="75" />
|
||||
<option name="version" value="2" />
|
||||
<option name="width" value="960" />
|
||||
</component>
|
||||
</project>
|
|
@ -1,7 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="RMarkdownGraphicsSettings">
|
||||
<option name="globalResolution" value="75" />
|
||||
<option name="version" value="2" />
|
||||
</component>
|
||||
</project>
|
|
@ -1,6 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="RSettings">
|
||||
<option name="interpreterPath" value="C:\Program Files\R\R-4.3.1\bin\R.exe" />
|
||||
</component>
|
||||
</project>
|
|
@ -5,7 +5,7 @@
|
|||
<excludeFolder url="file://$MODULE_DIR$/config/.ipynb_checkpoints" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/exploration/.ipynb_checkpoints" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="straw2analysis" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.9 (straw2analysis)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyDocumentationSettings">
|
||||
|
|
|
@ -3,5 +3,6 @@
|
|||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
<mapping directory="$PROJECT_DIR$/rapids" vcs="Git" />
|
||||
<mapping directory="$PROJECT_DIR$/rapids/calculatingfeatures" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
|
@ -1,30 +0,0 @@
|
|||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.4.0
|
||||
hooks:
|
||||
- id: check-yaml
|
||||
- id: end-of-file-fixer
|
||||
- id: trailing-whitespace
|
||||
- repo: https://github.com/pycqa/isort
|
||||
rev: 5.12.0
|
||||
hooks:
|
||||
- id: isort
|
||||
name: isort (python)
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 23.3.0
|
||||
hooks:
|
||||
- id: black
|
||||
language_version: python3
|
||||
- repo: https://github.com/pycqa/flake8
|
||||
rev: 6.0.0
|
||||
hooks:
|
||||
- id: flake8
|
||||
# - repo: https://github.com/mwouts/jupytext
|
||||
# rev: v1.14.7
|
||||
# hooks:
|
||||
# - id: jupytext
|
||||
# args: [ --from, "py:percent", --to, "ipynb" ]
|
||||
# additional_dependencies:
|
||||
# - isort==5.12.0 # Matches hook
|
||||
# - black==23.3.0
|
||||
# - flake8==6.0.0
|
|
@ -1,12 +1,12 @@
|
|||
name: straw2analysis
|
||||
channels:
|
||||
- defaults
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.11
|
||||
- python=3.9
|
||||
- black
|
||||
- isort
|
||||
- flake8
|
||||
- flake8-docstrings
|
||||
- imbalanced-learn=0.10.0
|
||||
- jupyterlab
|
||||
- jupytext
|
||||
|
@ -15,7 +15,6 @@ dependencies:
|
|||
- nodejs
|
||||
- pandas
|
||||
- psycopg2 >= 2.9.1
|
||||
- pre-commit
|
||||
- python-dotenv
|
||||
- pytz
|
||||
- pyprojroot
|
||||
|
@ -24,5 +23,4 @@ dependencies:
|
|||
- scikit-learn
|
||||
- sqlalchemy
|
||||
- statsmodels
|
||||
- tabulate
|
||||
- xgboost
|
||||
- tabulate
|
|
@ -14,9 +14,15 @@
|
|||
# ---
|
||||
|
||||
# %%
|
||||
import os, sys
|
||||
import importlib
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from rapids.src.features.utils.utils import chunk_episodes
|
||||
# import plotly.graph_objects as go
|
||||
from importlib import util
|
||||
from pathlib import Path
|
||||
import yaml
|
||||
|
||||
# %%
|
||||
phone_data_yield = pd.read_csv(
|
||||
|
@ -30,29 +36,23 @@ time_segments_labels = pd.read_csv(
|
|||
# %%
|
||||
phone_data_yield["assigned_segments"] = phone_data_yield[
|
||||
"assigned_segments"
|
||||
].str.replace(r"_RR\d+SS#", "#", regex=True)
|
||||
].str.replace(r"_RR\d+SS#", "#")
|
||||
time_segments_labels["label"] = time_segments_labels["label"].str.replace(
|
||||
r"_RR\d+SS$", "", regex=True
|
||||
r"_RR\d+SS$", ""
|
||||
)
|
||||
|
||||
|
||||
# %% tags=[]
|
||||
def filter_data_by_segment(data, time_segment_current):
|
||||
def filter_data_by_segment(data, time_segment):
|
||||
data.dropna(subset=["assigned_segments"], inplace=True)
|
||||
if data.shape[0] == 0: # data is empty
|
||||
data["local_segment"] = data["timestamps_segment"] = None
|
||||
return data
|
||||
|
||||
datetime_regex = (
|
||||
r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
|
||||
)
|
||||
timestamps_regex = r"[0-9]{13}"
|
||||
segment_regex = r"\[({}#{},{};{},{})\]".format(
|
||||
time_segment_current,
|
||||
datetime_regex,
|
||||
datetime_regex,
|
||||
timestamps_regex,
|
||||
timestamps_regex,
|
||||
datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
|
||||
timestamps_regex = "[0-9]{13}"
|
||||
segment_regex = "\[({}#{},{};{},{})\]".format(
|
||||
time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex
|
||||
)
|
||||
data["local_segment"] = data["assigned_segments"].str.extract(
|
||||
segment_regex, expand=True
|
||||
|
@ -147,17 +147,14 @@ def getDataForPlot(phone_data_yield_per_segment):
|
|||
.fillna(0)
|
||||
)
|
||||
|
||||
# transpose the dataframe per local start datetime of the segment
|
||||
# and discard the useless index layer
|
||||
# transpose the dataframe per local start datetime of the segment and discard the useless index layer
|
||||
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
||||
"local_segment_start_datetimes"
|
||||
)[["minutes_after_segment_start", "sensor"]].apply(
|
||||
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
||||
)
|
||||
phone_data_yield_per_segment.index = (
|
||||
phone_data_yield_per_segment.index.get_level_values(
|
||||
"local_segment_start_datetimes"
|
||||
)
|
||||
phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
|
||||
"local_segment_start_datetimes"
|
||||
)
|
||||
return phone_data_yield_per_segment
|
||||
|
||||
|
@ -230,13 +227,9 @@ phone_data_yield_per_segment.tail()
|
|||
# # A workaround
|
||||
|
||||
# %%
|
||||
phone_data_yield_per_segment[
|
||||
"local_segment_start_datetimes", "minutes_after_segment_start"
|
||||
] = phone_data_yield_per_segment[
|
||||
phone_data_yield_per_segment["local_segment_start_datetimes", "minutes_after_segment_start"] = phone_data_yield_per_segment[
|
||||
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
||||
].drop_duplicates(
|
||||
keep="first"
|
||||
)
|
||||
].drop_duplicates(keep="first")
|
||||
|
||||
# %%
|
||||
phone_data_yield_per_segment.set_index(
|
||||
|
@ -251,9 +244,8 @@ phone_data_yield_per_segment.head()
|
|||
# %% [markdown]
|
||||
# # Retry
|
||||
|
||||
|
||||
# %%
|
||||
def get_data_for_plot(phone_data_yield_per_segment):
|
||||
def getDataForPlot(phone_data_yield_per_segment):
|
||||
# calculate the length (in minute) of per segment instance
|
||||
phone_data_yield_per_segment["length"] = (
|
||||
phone_data_yield_per_segment["timestamps_segment"]
|
||||
|
@ -300,10 +292,7 @@ def get_data_for_plot(phone_data_yield_per_segment):
|
|||
full_index,
|
||||
names=("local_segment_start_datetimes", "minutes_after_segment_start"),
|
||||
)
|
||||
phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(
|
||||
subset=["local_segment_start_datetimes", "minutes_after_segment_start"],
|
||||
keep="first",
|
||||
)
|
||||
phone_data_yield_per_segment = phone_data_yield_per_segment.drop_duplicates(subset=["local_segment_start_datetimes", "minutes_after_segment_start"],keep="first")
|
||||
phone_data_yield_per_segment = (
|
||||
phone_data_yield_per_segment.set_index(
|
||||
["local_segment_start_datetimes", "minutes_after_segment_start"]
|
||||
|
@ -313,17 +302,14 @@ def get_data_for_plot(phone_data_yield_per_segment):
|
|||
.fillna(0)
|
||||
)
|
||||
|
||||
# transpose the dataframe per local start datetime of the segment
|
||||
# and discard the useless index layer
|
||||
# transpose the dataframe per local start datetime of the segment and discard the useless index layer
|
||||
phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(
|
||||
"local_segment_start_datetimes"
|
||||
)[["minutes_after_segment_start", "sensor"]].apply(
|
||||
lambda x: x.set_index("minutes_after_segment_start").transpose()
|
||||
)
|
||||
phone_data_yield_per_segment.index = (
|
||||
phone_data_yield_per_segment.index.get_level_values(
|
||||
"local_segment_start_datetimes"
|
||||
)
|
||||
phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values(
|
||||
"local_segment_start_datetimes"
|
||||
)
|
||||
return phone_data_yield_per_segment
|
||||
|
||||
|
@ -332,6 +318,6 @@ def get_data_for_plot(phone_data_yield_per_segment):
|
|||
phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment)
|
||||
|
||||
# %%
|
||||
data_for_plot_per_segment = get_data_for_plot(phone_data_yield_per_segment)
|
||||
data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment)
|
||||
|
||||
# %%
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.14.5
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
|
@ -15,33 +15,19 @@
|
|||
# ---
|
||||
|
||||
# %%
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
import seaborn as sns
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
import participants.query_db
|
||||
from features.esm import (
|
||||
QUESTIONNAIRE_IDS,
|
||||
clean_up_esm,
|
||||
get_esm_data,
|
||||
increment_answers,
|
||||
preprocess_esm,
|
||||
reassign_question_ids,
|
||||
)
|
||||
from features.esm_COPE import DICT_COPE_QUESTION_IDS
|
||||
from features.esm_JCQ import reverse_jcq_demand_control_scoring
|
||||
from features.esm_SAM import DICT_SAM_QUESTION_IDS, extract_stressful_events
|
||||
|
||||
# import os
|
||||
# import sys
|
||||
# nb_dir = os.path.split(os.getcwd())[0]
|
||||
# if nb_dir not in sys.path:
|
||||
# sys.path.append(nb_dir)
|
||||
|
||||
|
||||
# %%
|
||||
save_figs = False
|
||||
export_data = True
|
||||
from features.esm import *
|
||||
from features.esm_JCQ import *
|
||||
from features.esm_SAM import *
|
||||
|
||||
# %%
|
||||
participants_inactive_usernames = participants.query_db.get_usernames(
|
||||
|
@ -57,14 +43,8 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)
|
|||
|
||||
# %%
|
||||
df_esm_PANAS = df_esm_preprocessed[
|
||||
(
|
||||
df_esm_preprocessed["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["PANAS_positive_affect"]
|
||||
)
|
||||
| (
|
||||
df_esm_preprocessed["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["PANAS_negative_affect"]
|
||||
)
|
||||
(df_esm_preprocessed["questionnaire_id"] == 8)
|
||||
| (df_esm_preprocessed["questionnaire_id"] == 9)
|
||||
]
|
||||
df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)
|
||||
|
||||
|
@ -85,47 +65,35 @@ df_esm_PANAS_daily_means = (
|
|||
# %%
|
||||
df_esm_PANAS_summary_participant = (
|
||||
df_esm_PANAS_daily_means.groupby(["participant_id", "questionnaire_id"])
|
||||
.esm_numeric_mean.agg(["mean", "median", "std"])
|
||||
.agg(["mean", "median", "std"])
|
||||
.reset_index(col_level=1)
|
||||
)
|
||||
df_esm_PANAS_summary_participant.columns = df_esm_PANAS_summary_participant.columns.get_level_values(
|
||||
1
|
||||
)
|
||||
df_esm_PANAS_summary_participant[
|
||||
"PANAS subscale"
|
||||
"PANAS_subscale"
|
||||
] = df_esm_PANAS_daily_means.questionnaire_id.astype("category").cat.rename_categories(
|
||||
{8.0: "positive affect", 9.0: "negative affect"}
|
||||
{8.0: "PA", 9.0: "NA"}
|
||||
)
|
||||
|
||||
# %%
|
||||
df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["mean"]
|
||||
|
||||
# %%
|
||||
df_esm_PANAS_summary_participant.groupby("PANAS subscale").describe()["std"]
|
||||
|
||||
# %%
|
||||
df_esm_PANAS_summary_participant.query("std == 0")
|
||||
|
||||
# %%
|
||||
fig1 = sns.displot(
|
||||
data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS subscale", binwidth=0.2
|
||||
sns.displot(
|
||||
data=df_esm_PANAS_summary_participant, x="mean", hue="PANAS_subscale", binwidth=0.2
|
||||
)
|
||||
fig1.set_axis_labels(x_var="participant mean", y_var="frequency")
|
||||
if save_figs:
|
||||
fig1.figure.savefig("PANAS_mean_participant.pdf", dpi=300)
|
||||
|
||||
# %%
|
||||
sns.displot(
|
||||
data=df_esm_PANAS_summary_participant,
|
||||
x="median",
|
||||
hue="PANAS subscale",
|
||||
hue="PANAS_subscale",
|
||||
binwidth=0.2,
|
||||
)
|
||||
|
||||
# %%
|
||||
fig2 = sns.displot(
|
||||
data=df_esm_PANAS_summary_participant, x="std", hue="PANAS subscale", binwidth=0.05
|
||||
sns.displot(
|
||||
data=df_esm_PANAS_summary_participant, x="std", hue="PANAS_subscale", binwidth=0.05
|
||||
)
|
||||
fig2.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
||||
if save_figs:
|
||||
fig2.figure.savefig("PANAS_std_participant.pdf", dpi=300)
|
||||
|
||||
# %%
|
||||
df_esm_PANAS_summary_participant[df_esm_PANAS_summary_participant["std"] < 0.1]
|
||||
|
@ -141,14 +109,8 @@ df_SAM_all.head()
|
|||
|
||||
# %%
|
||||
df_esm_SAM = df_esm_preprocessed[
|
||||
(
|
||||
df_esm_preprocessed["questionnaire_id"]
|
||||
>= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
||||
)
|
||||
& (
|
||||
df_esm_preprocessed["questionnaire_id"]
|
||||
<= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
|
||||
)
|
||||
(df_esm_preprocessed["questionnaire_id"] >= 87)
|
||||
& (df_esm_preprocessed["questionnaire_id"] <= 93)
|
||||
]
|
||||
df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
|
||||
|
||||
|
@ -156,10 +118,9 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
|
|||
# ## Stressful events
|
||||
|
||||
# %%
|
||||
df_esm_SAM_event = df_esm_SAM_clean[
|
||||
df_esm_SAM_clean["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
||||
].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0))
|
||||
df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign(
|
||||
stressful_event=lambda x: (x.esm_user_answer_numeric > 0)
|
||||
)
|
||||
|
||||
# %%
|
||||
df_esm_SAM_daily_events = (
|
||||
|
@ -170,22 +131,20 @@ df_esm_SAM_daily_events = (
|
|||
)
|
||||
|
||||
# %% [markdown]
|
||||
# Calculate the daily mean of YES (1) or NO (0) answers to the question about stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
|
||||
# Calculate the daily mean of YES (1) or NO (0) answers to the question about a stressful events. This is then the daily ratio of EMA sessions that included a stressful event.
|
||||
|
||||
# %%
|
||||
df_esm_SAM_event_summary_participant = (
|
||||
df_esm_SAM_daily_events.groupby(["participant_id"])
|
||||
.SAM_event_ratio.agg(["mean", "median", "std"])
|
||||
.agg(["mean", "median", "std"])
|
||||
.reset_index(col_level=1)
|
||||
)
|
||||
df_esm_SAM_event_summary_participant.columns = df_esm_SAM_event_summary_participant.columns.get_level_values(
|
||||
1
|
||||
)
|
||||
|
||||
# %%
|
||||
fig6 = sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
|
||||
fig6.set_axis_labels(
|
||||
x_var="participant proportion of stressful events", y_var="frequency"
|
||||
)
|
||||
if save_figs:
|
||||
fig6.figure.savefig("SAM_events_mean_participant.pdf", dpi=300)
|
||||
sns.displot(data=df_esm_SAM_event_summary_participant, x="mean", binwidth=0.1)
|
||||
|
||||
# %%
|
||||
sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
|
||||
|
@ -196,12 +155,7 @@ sns.displot(data=df_esm_SAM_event_summary_participant, x="std", binwidth=0.05)
|
|||
# %% [markdown]
|
||||
# * Example of threat: "Did this event make you feel anxious?"
|
||||
# * Example of challenge: "How eager are you to tackle this event?"
|
||||
# * Possible answers:
|
||||
# 0 - Not at all,
|
||||
# 1 - Slightly,
|
||||
# 2 - Moderately,
|
||||
# 3 - Considerably,
|
||||
# 4 - Extremely
|
||||
# * Possible answers: 0 - Not at all, 1 - Slightly, 2 - Moderately, 3 - Considerably, 4 - Extremely
|
||||
|
||||
# %%
|
||||
df_esm_SAM_daily = (
|
||||
|
@ -213,45 +167,27 @@ df_esm_SAM_daily = (
|
|||
|
||||
# %%
|
||||
df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
|
||||
(df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
|
||||
| (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
|
||||
(df_esm_SAM_daily["questionnaire_id"] == 88)
|
||||
| (df_esm_SAM_daily["questionnaire_id"] == 89)
|
||||
]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_summary_participant = (
|
||||
df_esm_SAM_daily.groupby(["participant_id", "questionnaire_id"])
|
||||
.esm_numeric_mean.agg(["mean", "median", "std"])
|
||||
.agg(["mean", "median", "std"])
|
||||
.reset_index(col_level=1)
|
||||
)
|
||||
|
||||
# %%
|
||||
df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[
|
||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
|
||||
]
|
||||
df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_event_stressfulness_summary_participant.describe()["std"]
|
||||
|
||||
# %%
|
||||
sns.displot(
|
||||
data=df_esm_SAM_event_stressfulness_summary_participant, x="mean", binwidth=0.2
|
||||
df_esm_SAM_summary_participant.columns = df_esm_SAM_summary_participant.columns.get_level_values(
|
||||
1
|
||||
)
|
||||
|
||||
# %%
|
||||
df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
|
||||
(
|
||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["appraisal_threat"]
|
||||
)
|
||||
| (
|
||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["appraisal_challenge"]
|
||||
)
|
||||
(df_esm_SAM_summary_participant["questionnaire_id"] == 88)
|
||||
| (df_esm_SAM_summary_participant["questionnaire_id"] == 89)
|
||||
]
|
||||
df_esm_SAM_threat_challenge_summary_participant[
|
||||
"event subscale"
|
||||
"event_subscale"
|
||||
] = df_esm_SAM_threat_challenge_summary_participant.questionnaire_id.astype(
|
||||
"category"
|
||||
).cat.rename_categories(
|
||||
|
@ -262,84 +198,26 @@ df_esm_SAM_threat_challenge_summary_participant[
|
|||
sns.displot(
|
||||
data=df_esm_SAM_threat_challenge_summary_participant,
|
||||
x="mean",
|
||||
hue="event subscale",
|
||||
hue="event_subscale",
|
||||
binwidth=0.2,
|
||||
)
|
||||
|
||||
# %%
|
||||
fig3 = sns.displot(
|
||||
sns.displot(
|
||||
data=df_esm_SAM_threat_challenge_summary_participant,
|
||||
x="std",
|
||||
hue="event subscale",
|
||||
hue="event_subscale",
|
||||
binwidth=0.1,
|
||||
)
|
||||
fig3.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
||||
if save_figs:
|
||||
fig3.figure.savefig("SAM_std_participant.pdf", dpi=300)
|
||||
|
||||
# %%
|
||||
df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
|
||||
"mean"
|
||||
]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").describe()[
|
||||
"std"
|
||||
]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_clean.columns
|
||||
|
||||
# %%
|
||||
df_esm_SAM_clean.esm_status.value_counts()
|
||||
|
||||
# %%
|
||||
if export_data:
|
||||
df_esm_SAM_fixed = reassign_question_ids(df_esm_SAM_clean, DICT_SAM_QUESTION_IDS)
|
||||
df_esm_SAM_fixed = increment_answers(df_esm_SAM_fixed)
|
||||
df_esm_SAM_for_export = df_esm_SAM_fixed[
|
||||
[
|
||||
"participant_id",
|
||||
"username",
|
||||
"device_id",
|
||||
"_id",
|
||||
"esm_trigger",
|
||||
"esm_session",
|
||||
"esm_notification_id",
|
||||
"question_id",
|
||||
"questionnaire_id",
|
||||
"esm_instructions",
|
||||
"double_esm_user_answer_timestamp",
|
||||
"datetime_lj",
|
||||
"date_lj",
|
||||
"time",
|
||||
"esm_user_answer",
|
||||
"esm_user_answer_numeric",
|
||||
]
|
||||
]
|
||||
df_esm_SAM_for_export.sort_values(
|
||||
by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
|
||||
)
|
||||
print(df_esm_SAM_for_export.head())
|
||||
df_esm_SAM_for_export.to_csv(
|
||||
"../data/raw/df_esm_SAM_threat_challenge.csv", index=False
|
||||
)
|
||||
|
||||
# %% [markdown]
|
||||
# ## Stressfulness of period
|
||||
|
||||
# %%
|
||||
df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
|
||||
df_esm_SAM_summary_participant["questionnaire_id"]
|
||||
== QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
|
||||
df_esm_SAM_summary_participant["questionnaire_id"] == 93
|
||||
]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_period_summary_participant.describe()["mean"]
|
||||
|
||||
# %%
|
||||
df_esm_SAM_period_summary_participant.describe()["std"]
|
||||
|
||||
# %%
|
||||
sns.displot(data=df_esm_SAM_period_summary_participant, x="mean", binwidth=0.2)
|
||||
|
||||
|
@ -351,8 +229,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1)
|
|||
|
||||
# %%
|
||||
df_esm_JCQ_demand_control = df_esm_preprocessed[
|
||||
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"])
|
||||
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"])
|
||||
(df_esm_preprocessed["questionnaire_id"] >= 10)
|
||||
& (df_esm_preprocessed["questionnaire_id"] <= 11)
|
||||
]
|
||||
df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)
|
||||
|
||||
|
@ -372,11 +250,14 @@ df_esm_JCQ_daily = (
|
|||
)
|
||||
df_esm_JCQ_summary_participant = (
|
||||
df_esm_JCQ_daily.groupby(["participant_id", "questionnaire_id"])
|
||||
.esm_score_mean.agg(["mean", "median", "std"])
|
||||
.agg(["mean", "median", "std"])
|
||||
.reset_index(col_level=1)
|
||||
)
|
||||
df_esm_JCQ_summary_participant.columns = df_esm_JCQ_summary_participant.columns.get_level_values(
|
||||
1
|
||||
)
|
||||
df_esm_JCQ_summary_participant[
|
||||
"JCQ subscale"
|
||||
"JCQ_subscale"
|
||||
] = df_esm_JCQ_summary_participant.questionnaire_id.astype(
|
||||
"category"
|
||||
).cat.rename_categories(
|
||||
|
@ -384,71 +265,11 @@ df_esm_JCQ_summary_participant[
|
|||
)
|
||||
|
||||
# %%
|
||||
df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["mean"]
|
||||
|
||||
# %%
|
||||
df_esm_JCQ_summary_participant.groupby("JCQ subscale").describe()["std"]
|
||||
|
||||
# %%
|
||||
fig4 = sns.displot(
|
||||
data=df_esm_JCQ_summary_participant,
|
||||
x="mean",
|
||||
hue="JCQ subscale",
|
||||
binwidth=0.1,
|
||||
sns.displot(
|
||||
data=df_esm_JCQ_summary_participant, x="mean", hue="JCQ_subscale", binwidth=0.1,
|
||||
)
|
||||
fig4.set_axis_labels(x_var="participant mean", y_var="frequency")
|
||||
if save_figs:
|
||||
fig4.figure.savefig("JCQ_mean_participant.pdf", dpi=300)
|
||||
|
||||
# %%
|
||||
fig5 = sns.displot(
|
||||
data=df_esm_JCQ_summary_participant,
|
||||
x="std",
|
||||
hue="JCQ subscale",
|
||||
binwidth=0.05,
|
||||
sns.displot(
|
||||
data=df_esm_JCQ_summary_participant, x="std", hue="JCQ_subscale", binwidth=0.05,
|
||||
)
|
||||
fig6.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
|
||||
if save_figs:
|
||||
fig5.figure.savefig("JCQ_std_participant.pdf", dpi=300)
|
||||
|
||||
# %% [markdown]
|
||||
# # COPE Inventory
|
||||
|
||||
# %%
|
||||
df_esm_COPE = df_esm_preprocessed[
|
||||
(df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"])
|
||||
& (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"])
|
||||
]
|
||||
|
||||
# %%
|
||||
df_esm_COPE_clean = clean_up_esm(df_esm_COPE)
|
||||
df_esm_COPE_clean = increment_answers(df_esm_COPE_clean)
|
||||
df_esm_COPE_fixed = reassign_question_ids(df_esm_COPE_clean, DICT_COPE_QUESTION_IDS)
|
||||
|
||||
# %%
|
||||
if export_data:
|
||||
df_esm_COPE_for_export = df_esm_COPE_fixed[
|
||||
[
|
||||
"participant_id",
|
||||
"username",
|
||||
"device_id",
|
||||
"_id",
|
||||
"esm_trigger",
|
||||
"esm_session",
|
||||
"esm_notification_id",
|
||||
"question_id",
|
||||
"questionnaire_id",
|
||||
"esm_instructions",
|
||||
"double_esm_user_answer_timestamp",
|
||||
"datetime_lj",
|
||||
"date_lj",
|
||||
"time",
|
||||
"esm_user_answer",
|
||||
"esm_user_answer_numeric",
|
||||
]
|
||||
]
|
||||
df_esm_COPE_for_export.sort_values(
|
||||
by=["participant_id", "device_id", "_id"], ignore_index=True, inplace=True
|
||||
)
|
||||
print(df_esm_COPE_for_export.head())
|
||||
df_esm_COPE_for_export.to_csv("../data/raw/df_esm_COPE.csv", index=False)
|
||||
|
|
|
@ -6,129 +6,457 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.14.5
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
# from IPython.core.interactiveshell import InteractiveShell
|
||||
from pathlib import Path
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
# %matplotlib inline
|
||||
import os
|
||||
import sys
|
||||
|
||||
# matplotlib inline
|
||||
# import os
|
||||
# import sys
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
from machine_learning.helper import (
|
||||
impute_encode_categorical_features,
|
||||
prepare_cross_validator,
|
||||
prepare_sklearn_data_format,
|
||||
run_all_classification_models,
|
||||
)
|
||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
|
||||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn.impute import SimpleImputer
|
||||
|
||||
# InteractiveShell.ast_node_interactivity = "all"
|
||||
from lightgbm import LGBMClassifier
|
||||
import xgboost as xg
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
import machine_learning.helper
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# ## Set script's parameters
|
||||
#
|
||||
# nb_dir = os.path.split(os.getcwd())[0]
|
||||
# if nb_dir not in sys.path:
|
||||
# sys.path.append(nb_dir)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
||||
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
|
||||
|
||||
# %%
|
||||
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
||||
# Cross-validation method (could be regarded as a hyperparameter)
|
||||
print("CV_METHOD: " + CV_METHOD)
|
||||
N_SL = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
UNDERSAMPLING = False
|
||||
# (bool) If True this will train and test data on balanced dataset
|
||||
# (using undersampling method)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
|
||||
# model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
model_input['target'].value_counts()
|
||||
|
||||
SEGMENT_TYPE = "period"
|
||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||
SEGMENT_LENGTH = "30_minutes_before"
|
||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||
TARGET_VARIABLE = "JCQ_job_control"
|
||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||
|
||||
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
||||
TARGET_VARIABLE += "_"
|
||||
TARGET_VARIABLE += SEGMENT_TYPE
|
||||
|
||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||
|
||||
model_input = pd.read_csv(PATH_FULL)
|
||||
|
||||
if SEGMENT_LENGTH == "daily":
|
||||
DAY_LENGTH = "daily" # or "working"
|
||||
print(DAY_LENGTH)
|
||||
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
model_input["target"].value_counts()
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
# bins = [-10, 0, 10] # bins for z-scored targets
|
||||
BINS = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||
print("BINS: ", BINS)
|
||||
model_input["target"], edges = pd.cut(
|
||||
model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
|
||||
) # ['low', 'medium', 'high']
|
||||
print(model_input["target"].value_counts())
|
||||
REMOVE_MEDIUM = True
# Encode the binned target as integers.
# The previous guard used `"medium" in model_input["target"]`, which tests the
# Series *index labels*, not its values. With the default integer index it was
# never true, so two-class bins fell through to the three-class mapping and
# "high" was encoded as 2 instead of 1.
if REMOVE_MEDIUM:
    # Drop the middle class (a no-op when the bins only define low/high) and
    # encode what is left as a binary target: "low" -> 0, anything else -> 1.
    # .copy() keeps the column assignment below off a filtered view.
    model_input = model_input[model_input["target"] != "medium"].copy()
    # NOTE(review): values outside BINS are NaN after pd.cut and turn into the
    # string "nan" under astype(str), so they are encoded as 1 — confirm.
    model_input["target"] = (
        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
    )
else:
    # Keep all three classes as ordinal codes.
    model_input["target"] = model_input["target"].map(
        {"low": 0, "medium": 1, "high": 2}
    )
print(model_input["target"].value_counts())
|
||||
bins = [-1, 0, 4] # bins for stressfulness (0-4) target
|
||||
model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
|
||||
model_input['target'].value_counts(), edges
|
||||
# model_input = model_input[model_input['target'] != "medium"]
|
||||
model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||
|
||||
model_input['target'].value_counts()
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
# UnderSampling
|
||||
if UNDERSAMPLING:
|
||||
no_stress = model_input[model_input["target"] == 0]
|
||||
stress = model_input[model_input["target"] == 1]
|
||||
|
||||
if undersampling:
|
||||
no_stress = model_input[model_input['target'] == 0]
|
||||
stress = model_input[model_input['target'] == 1]
|
||||
|
||||
no_stress = no_stress.sample(n=len(stress))
|
||||
model_input = pd.concat([stress, no_stress], axis=0)
|
||||
model_input = pd.concat([stress,no_stress], axis=0)
|
||||
|
||||
# model_input_new = pd.DataFrame(columns=model_input.columns)
|
||||
# for pid in model_input["pid"].unique():
|
||||
# stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
|
||||
# no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
|
||||
# if (len(stress) == 0):
|
||||
# continue
|
||||
# if (len(no_stress) == 0):
|
||||
# continue
|
||||
# model_input_new = pd.concat([model_input_new, stress], axis=0)
|
||||
|
||||
# no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
|
||||
# # In case there are more stress samples than no_stress, take all instances of no_stress.
|
||||
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
||||
# model_input = model_input_new
|
||||
# model_input_new = pd.concat([model_input_new, no_stress], axis=0)
|
||||
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
model_input_encoded = impute_encode_categorical_features(model_input)
|
||||
# %%
|
||||
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
||||
model_input_encoded, CV_METHOD
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
if cv_method_str == 'half_logo':
|
||||
model_input['pid_index'] = model_input.groupby('pid').cumcount()
|
||||
model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
|
||||
|
||||
model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
|
||||
model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
|
||||
|
||||
data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
|
||||
else:
|
||||
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
train_x.dtypes
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
|
||||
if cv_method_str == 'logo' or cv_method_str == 'half_logo':
|
||||
cv_method = LeaveOneGroupOut()
|
||||
cv_method.get_n_splits(
|
||||
train_x,
|
||||
data_y,
|
||||
groups=data_groups,
|
||||
)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||
|
||||
# %% [markdown]
|
||||
# ### Baseline: Dummy Classifier (most frequent)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
dummy_class = DummyClassifier(strategy="most_frequent")
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
dummy_classifier = cross_validate(
|
||||
dummy_class,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
|
||||
print("Precision", np.mean(dummy_classifier['test_precision']))
|
||||
print("Recall", np.mean(dummy_classifier['test_recall']))
|
||||
print("F1", np.mean(dummy_classifier['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %%
|
||||
data_y.head()
|
||||
# %% [markdown] nteract={"transient": {"deleting": false}}
|
||||
# ### All models
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
||||
# %%
|
||||
data_y.tail()
|
||||
# %%
|
||||
data_y.shape
|
||||
# %%
|
||||
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
|
||||
# %%
|
||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||
path_output_full = PATH_OUTPUT / (
|
||||
TARGET_VARIABLE
|
||||
+ "_"
|
||||
+ SEGMENT_LENGTH
|
||||
+ "_classification"
|
||||
+ str(BINS)
|
||||
+ "_"
|
||||
+ CV_METHOD
|
||||
+ ".csv"
|
||||
final_scores.index.name = "metric"
|
||||
final_scores = final_scores.set_index(["method", final_scores.index])
|
||||
final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
|
||||
|
||||
# %% [markdown]
|
||||
# ### Logistic Regression
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
logistic_regression = linear_model.LogisticRegression()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
log_reg_scores = cross_validate(
|
||||
logistic_regression,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
scores.to_csv(path_output_full, index=False)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
|
||||
print("Precision", np.mean(log_reg_scores['test_precision']))
|
||||
print("Recall", np.mean(log_reg_scores['test_recall']))
|
||||
print("F1", np.mean(log_reg_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Support Vector Machine
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
svc = svm.SVC()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
svc_scores = cross_validate(
|
||||
svc,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
|
||||
print("Precision", np.mean(svc_scores['test_precision']))
|
||||
print("Recall", np.mean(svc_scores['test_recall']))
|
||||
print("F1", np.mean(svc_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Gaussian Naive Bayes
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
gaussian_nb = naive_bayes.GaussianNB()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
gaussian_nb_scores = cross_validate(
|
||||
gaussian_nb,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
|
||||
print("Precision", np.mean(gaussian_nb_scores['test_precision']))
|
||||
print("Recall", np.mean(gaussian_nb_scores['test_recall']))
|
||||
print("F1", np.mean(gaussian_nb_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Stochastic Gradient Descent Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
sgdc = linear_model.SGDClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
sgdc_scores = cross_validate(
|
||||
sgdc,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
|
||||
print("Precision", np.mean(sgdc_scores['test_precision']))
|
||||
print("Recall", np.mean(sgdc_scores['test_recall']))
|
||||
print("F1", np.mean(sgdc_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### K-nearest neighbors
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
knn = neighbors.KNeighborsClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
knn_scores = cross_validate(
|
||||
knn,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
|
||||
print("Precision", np.mean(knn_scores['test_precision']))
|
||||
print("Recall", np.mean(knn_scores['test_recall']))
|
||||
print("F1", np.mean(knn_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Decision Tree
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
dtree = tree.DecisionTreeClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
dtree_scores = cross_validate(
|
||||
dtree,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
|
||||
print("Precision", np.mean(dtree_scores['test_precision']))
|
||||
print("Recall", np.mean(dtree_scores['test_recall']))
|
||||
print("F1", np.mean(dtree_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Random Forest Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
rfc = ensemble.RandomForestClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
rfc_scores = cross_validate(
|
||||
rfc,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1'),
|
||||
return_estimator=True
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
|
||||
print("Precision", np.mean(rfc_scores['test_precision']))
|
||||
print("Recall", np.mean(rfc_scores['test_recall']))
|
||||
print("F1", np.mean(rfc_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Feature importance (RFC)
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
# Average the random-forest feature importances over all cross-validation folds.
# Stacking the per-fold vectors and taking one mean weights every fold equally.
# The earlier loop re-averaged the running mean with each new fold, which
# weighted later folds more heavily, and its empty seed frame added one all-NaN
# column per feature to the result.
fold_importances = np.vstack(
    [estimator.feature_importances_ for estimator in rfc_scores["estimator"]]
)
rfc_es_fimp = pd.DataFrame(
    {"importance": fold_importances.mean(axis=0)},
    index=list(train_x.columns),
).sort_index()  # same alphabetical feature order the groupby used to produce
|
||||
|
||||
pd.set_option('display.max_rows', 100)
|
||||
print(rfc_es_fimp.sort_values('importance', ascending=False).head(30))
|
||||
|
||||
rfc_es_fimp.sort_values('importance', ascending=False).head(30).plot.bar()
|
||||
|
||||
rfc_es_fimp.sort_values('importance', ascending=False).tail(30).plot.bar()
|
||||
|
||||
train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
|
||||
|
||||
# %% [markdown]
|
||||
# ### Gradient Boosting Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
gbc = ensemble.GradientBoostingClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
gbc_scores = cross_validate(
|
||||
gbc,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
|
||||
print("Precision", np.mean(gbc_scores['test_precision']))
|
||||
print("Recall", np.mean(gbc_scores['test_recall']))
|
||||
print("F1", np.mean(gbc_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### LGBM Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
lgbm = LGBMClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
lgbm_scores = cross_validate(
|
||||
lgbm,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
|
||||
print("Precision", np.mean(lgbm_scores['test_precision']))
|
||||
print("Recall", np.mean(lgbm_scores['test_recall']))
|
||||
print("F1", np.mean(lgbm_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
# %% [markdown]
|
||||
# ### XGBoost Classifier
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
xgb_classifier = xg.sklearn.XGBClassifier()
|
||||
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
xgb_classifier_scores = cross_validate(
|
||||
xgb_classifier,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
||||
print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
|
||||
print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
|
||||
print("Precision", np.mean(xgb_classifier_scores['test_precision']))
|
||||
print("Recall", np.mean(xgb_classifier_scores['test_recall']))
|
||||
print("F1", np.mean(xgb_classifier_scores['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
|
|
|
@ -1,177 +0,0 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# formats: ipynb,py:percent
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.14.5
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
from sklearn.decomposition import PCA
|
||||
|
||||
from machine_learning.helper import (
|
||||
impute_encode_categorical_features,
|
||||
prepare_cross_validator,
|
||||
prepare_sklearn_data_format,
|
||||
run_all_classification_models,
|
||||
)
|
||||
|
||||
# %%
|
||||
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
||||
# Cross-validation method (could be regarded as a hyperparameter)
|
||||
print("CV_METHOD: " + CV_METHOD)
|
||||
N_SL = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
UNDERSAMPLING = False
|
||||
# (bool) If True this will train and test data on balanced dataset
|
||||
# (using undersampling method)
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||
|
||||
SEGMENT_TYPE = "period"
|
||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||
SEGMENT_LENGTH = "30_minutes_before"
|
||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||
|
||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / "features" / "all_sensor_features.csv"
|
||||
|
||||
all_features_with_baseline = pd.read_csv(PATH_FULL)
|
||||
|
||||
# %%
|
||||
TARGETS = [
|
||||
"PANAS_negative_affect_mean",
|
||||
"PANAS_positive_affect_mean",
|
||||
"JCQ_job_demand_mean",
|
||||
"JCQ_job_control_mean",
|
||||
"appraisal_stressfulness_period_mean",
|
||||
]
|
||||
|
||||
# %%
|
||||
all_features_cleaned = pd.DataFrame()
|
||||
for target in TARGETS:
|
||||
PATH_FULL = (
|
||||
PATH_BASE
|
||||
/ SEGMENT_LENGTH
|
||||
/ "features"
|
||||
/ ("all_sensor_features_cleaned_straw_py_(" + target + ").csv")
|
||||
)
|
||||
current_features = pd.read_csv(PATH_FULL, index_col="local_segment")
|
||||
if all_features_cleaned.empty:
|
||||
all_features_cleaned = current_features
|
||||
else:
|
||||
all_features_cleaned = all_features_cleaned.join(
|
||||
current_features[("phone_esm_straw_" + target)],
|
||||
how="inner",
|
||||
rsuffix="_" + target,
|
||||
)
|
||||
print(all_features_cleaned.shape)
|
||||
|
||||
# %%
|
||||
pca = PCA(n_components=1)
|
||||
TARGETS_PREFIXED = ["phone_esm_straw_" + target for target in TARGETS]
|
||||
pca.fit(all_features_cleaned[TARGETS_PREFIXED])
|
||||
print(pca.explained_variance_ratio_)
|
||||
|
||||
# %%
|
||||
model_input = all_features_cleaned.drop(columns=TARGETS_PREFIXED)
|
||||
model_input["target"] = pca.fit_transform(all_features_cleaned[TARGETS_PREFIXED])
|
||||
|
||||
# %%
|
||||
sns.histplot(data=model_input, x="target")
|
||||
|
||||
# %%
|
||||
model_input.target.quantile(0.6)
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
# bins = [-10, 0, 10] # bins for z-scored targets
|
||||
BINS = [-10, 0, 10] # bins for the PCA-derived composite target, split at 0 into low/high
|
||||
print("BINS: ", BINS)
|
||||
model_input["target"], edges = pd.cut(
|
||||
model_input.target, bins=BINS, labels=["low", "high"], retbins=True, right=True
|
||||
) # ['low', 'medium', 'high']
|
||||
print(model_input["target"].value_counts())
|
||||
REMOVE_MEDIUM = True
# Encode the binned composite target as integers.
# The previous inner guard used `"medium" in model_input["target"]`, which tests
# the Series *index labels*, not its values, so "medium" rows were never
# detected. Filtering unconditionally is equivalent to "filter if present".
if REMOVE_MEDIUM:
    # Drop the middle class (a no-op when the bins only define low/high) and
    # encode what is left as a binary target: "low" -> 0, anything else -> 1.
    # .copy() keeps the column assignment below off a filtered view.
    model_input = model_input[model_input["target"] != "medium"].copy()
    # NOTE(review): values outside BINS are NaN after pd.cut and turn into the
    # string "nan" under astype(str), so they are encoded as 1 — confirm.
    model_input["target"] = (
        model_input["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
    )
else:
    # Keep all three classes as ordinal codes.
    model_input["target"] = model_input["target"].map(
        {"low": 0, "medium": 1, "high": 2}
    )
print(model_input["target"].value_counts())
|
||||
|
||||
|
||||
# %% jupyter={"outputs_hidden": false, "source_hidden": false}
|
||||
# UnderSampling
|
||||
if UNDERSAMPLING:
    # Balance the two classes by randomly undersampling down to the size of the
    # smaller one.
    no_stress = model_input[model_input["target"] == 0]
    stress = model_input[model_input["target"] == 1]

    # sample() without replacement raises ValueError when asked for more rows
    # than exist, so cap at the minority size instead of assuming that
    # "no_stress" is always the larger class.
    n_per_class = min(len(stress), len(no_stress))
    no_stress = no_stress.sample(n=n_per_class)
    if len(stress) > n_per_class:
        stress = stress.sample(n=n_per_class)
    model_input = pd.concat([stress, no_stress], axis=0)
|
||||
|
||||
|
||||
# %%
|
||||
TARGET_VARIABLE = "PANAS_negative_affect"
|
||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||
|
||||
PATH_FULL_HELP = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||
|
||||
model_input_with_baseline = pd.read_csv(PATH_FULL_HELP, index_col="local_segment")
|
||||
|
||||
# %%
|
||||
baseline_col_names = [
|
||||
col for col in model_input_with_baseline.columns if col not in model_input.columns
|
||||
]
|
||||
print(baseline_col_names)
|
||||
|
||||
# %%
|
||||
model_input = model_input.join(
|
||||
model_input_with_baseline[baseline_col_names], how="left"
|
||||
)
|
||||
model_input.reset_index(inplace=True)
|
||||
|
||||
# %%
|
||||
model_input_encoded = impute_encode_categorical_features(model_input)
|
||||
|
||||
# %%
|
||||
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
||||
model_input_encoded, CV_METHOD
|
||||
)
|
||||
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
||||
|
||||
# %%
|
||||
data_y.head()
|
||||
|
||||
# %%
|
||||
data_y.tail()
|
||||
# %%
|
||||
data_y.shape
|
||||
# %%
|
||||
scores = run_all_classification_models(data_x, data_y, data_groups, cross_validator)
|
||||
# %%
|
||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||
path_output_full = PATH_OUTPUT / (
|
||||
"composite_"
|
||||
+ SEGMENT_LENGTH
|
||||
+ "_classification"
|
||||
+ str(BINS)
|
||||
+ "_"
|
||||
+ CV_METHOD
|
||||
+ ".csv"
|
||||
)
|
||||
scores.to_csv(path_output_full, index=False)
|
|
@ -6,7 +6,7 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.14.5
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
|
@ -14,85 +14,80 @@
|
|||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
from pathlib import Path
|
||||
# %matplotlib inline
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_validate
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
|
||||
from sklearn.impute import SimpleImputer
|
||||
|
||||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||
import xgboost as xg
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
import machine_learning.labels
|
||||
import machine_learning.model
|
||||
from machine_learning.classification_models import ClassificationModels
|
||||
|
||||
# %%
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# ## Set script's parameters
|
||||
N_CLUSTERS = 4 # Number of clusters (could be regarded as a hyperparameter)
|
||||
CV_METHOD = "logo" # logo, halflogo, 5kfold
|
||||
# Cross-validation method (could be regarded as a hyperparameter)
|
||||
N_SL = 1 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
|
||||
# %%
|
||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||
|
||||
SEGMENT_TYPE = "period"
|
||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||
SEGMENT_LENGTH = "30_minutes_before"
|
||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||
TARGET_VARIABLE = "appraisal_stressfulness"
|
||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||
|
||||
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
||||
TARGET_VARIABLE += "_"
|
||||
TARGET_VARIABLE += SEGMENT_TYPE
|
||||
|
||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||
|
||||
model_input = pd.read_csv(PATH_FULL)
|
||||
|
||||
if SEGMENT_LENGTH == "daily":
|
||||
DAY_LENGTH = "daily" # or "working"
|
||||
print(DAY_LENGTH)
|
||||
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
||||
n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
|
||||
cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
|
||||
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
index_columns = [
|
||||
"local_segment",
|
||||
"local_segment_label",
|
||||
"local_segment_start_datetime",
|
||||
"local_segment_end_datetime",
|
||||
]
|
||||
model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
|
||||
CLUST_COL = "limesurvey_demand_control_ratio_quartile"
|
||||
print("CLUST_COL: " + CLUST_COL)
|
||||
clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
|
||||
|
||||
BINS = [-1, 0, 4]
|
||||
print("BINS: " + str(BINS))
|
||||
model_input.columns[list(model_input.columns).index('age'):-1]
|
||||
|
||||
model_input[CLUST_COL].describe()
|
||||
lime_cols = [col for col in model_input if col.startswith('limesurvey')]
|
||||
lime_cols
|
||||
lime_col = 'limesurvey_demand_control_ratio_quartile'
|
||||
clust_col = lime_col
|
||||
|
||||
model_input[clust_col].describe()
|
||||
|
||||
# %%
|
||||
model_input["target"].value_counts()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Filter-out outlier rows by clust_col
|
||||
# model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
||||
|
||||
uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
|
||||
# Filter-out outlier rows by clust_col
|
||||
#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
||||
|
||||
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
|
||||
uniq = uniq.dropna()
|
||||
plt.bar(uniq["pid"], uniq[CLUST_COL])
|
||||
plt.bar(uniq['pid'], uniq[clust_col])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Get clusters by cluster col & and merge the clusters to main df
|
||||
km = KMeans(n_clusters=N_CLUSTERS).fit_predict(uniq.set_index("pid"))
|
||||
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
|
||||
np.unique(km, return_counts=True)
|
||||
uniq["cluster"] = km
|
||||
uniq['cluster'] = km
|
||||
uniq
|
||||
|
||||
model_input = model_input.merge(uniq[["pid", "cluster"]])
|
||||
|
||||
# %%
|
||||
model_input[["cluster", "target"]].value_counts().sort_index()
|
||||
model_input = model_input.merge(uniq[['pid', 'cluster']])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
|
@ -103,56 +98,31 @@ cm = ClassificationModels()
|
|||
cmodels = cm.get_cmodels()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
for k in range(N_CLUSTERS):
|
||||
for k in range(n_clusters):
|
||||
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
||||
model_input_subset.loc[:, "target"] = pd.cut(
|
||||
model_input_subset.loc[:, "target"],
|
||||
bins=BINS,
|
||||
labels=["low", "high"],
|
||||
right=True,
|
||||
) # ['low', 'medium', 'high']
|
||||
model_input_subset["target"].value_counts()
|
||||
# model_input_subset = model_input_subset[model_input_subset["target"] != "medium"]
|
||||
model_input_subset["target"] = (
|
||||
model_input_subset["target"].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||
)
|
||||
bins = [-10, -1, 1, 10] # bins for z-scored targets
|
||||
model_input_subset.loc[:, 'target'] = \
|
||||
pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']
|
||||
model_input_subset['target'].value_counts()
|
||||
model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
|
||||
model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
|
||||
|
||||
print(model_input_subset["target"].value_counts())
|
||||
model_input_subset['target'].value_counts()
|
||||
|
||||
if cv_method_str == 'half_logo':
|
||||
model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
|
||||
model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
|
||||
|
||||
if CV_METHOD == "half_logo":
|
||||
model_input_subset["pid_index"] = model_input_subset.groupby("pid").cumcount()
|
||||
model_input_subset["pid_count"] = model_input_subset.groupby("pid")[
|
||||
"pid"
|
||||
].transform("count")
|
||||
model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
|
||||
model_input_subset["pid_half"] = model_input_subset["pid"] + "_" + model_input_subset["pid_index"].astype(int).astype(str)
|
||||
|
||||
model_input_subset["pid_index"] = (
|
||||
model_input_subset["pid_index"] / model_input_subset["pid_count"] + 1
|
||||
).round()
|
||||
model_input_subset["pid_half"] = (
|
||||
model_input_subset["pid"]
|
||||
+ "_"
|
||||
+ model_input_subset["pid_index"].astype(int).astype(str)
|
||||
)
|
||||
|
||||
data_x, data_y, data_groups = (
|
||||
model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
|
||||
model_input_subset["target"],
|
||||
model_input_subset["pid_half"],
|
||||
)
|
||||
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input_subset["target"], model_input_subset["pid_half"]
|
||||
else:
|
||||
data_x, data_y, data_groups = (
|
||||
model_input_subset.drop(["target", "pid"], axis=1),
|
||||
model_input_subset["target"],
|
||||
model_input_subset["pid"],
|
||||
)
|
||||
data_x, data_y, data_groups = model_input_subset.drop(["target", "pid"], axis=1), model_input_subset["target"], model_input_subset["pid"]
|
||||
|
||||
# Treat categorical features
|
||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||
additional_categorical_features = [
|
||||
col
|
||||
for col in data_x.columns
|
||||
if "mostcommonactivity" in col or "homelabel" in col
|
||||
]
|
||||
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||
|
@ -162,9 +132,7 @@ for k in range(N_CLUSTERS):
|
|||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(
|
||||
lambda col: col.astype("category")
|
||||
)
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
|
@ -172,10 +140,8 @@ for k in range(N_CLUSTERS):
|
|||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
|
||||
# Establish cv method
|
||||
cv_method = StratifiedKFold(
|
||||
n_splits=5, shuffle=True
|
||||
) # Defaults to 5 k-folds in cross_validate method
|
||||
if CV_METHOD == "logo" or CV_METHOD == "half_logo":
|
||||
cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
|
||||
if cv_method_str == 'logo' or cv_method_str == 'half_logo':
|
||||
cv_method = LeaveOneGroupOut()
|
||||
cv_method.get_n_splits(
|
||||
train_x,
|
||||
|
@ -183,57 +149,36 @@ for k in range(N_CLUSTERS):
|
|||
groups=data_groups,
|
||||
)
|
||||
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||
|
||||
for model_title, model in cmodels.items():
|
||||
|
||||
classifier = cross_validate(
|
||||
model["model"],
|
||||
model['model'],
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score="raise",
|
||||
scoring=("accuracy", "precision", "recall", "f1"),
|
||||
error_score='raise',
|
||||
scoring=('accuracy', 'precision', 'recall', 'f1')
|
||||
)
|
||||
|
||||
|
||||
print("\n-------------------------------------\n")
|
||||
print("Current cluster:", k, end="\n")
|
||||
print("Current model:", model_title, end="\n")
|
||||
print("Acc", np.mean(classifier["test_accuracy"]))
|
||||
print("Precision", np.mean(classifier["test_precision"]))
|
||||
print("Recall", np.mean(classifier["test_recall"]))
|
||||
print("F1", np.mean(classifier["test_f1"]))
|
||||
print(
|
||||
f"Largest {N_SL} ACC:",
|
||||
np.sort(-np.partition(-classifier["test_accuracy"], N_SL)[:N_SL])[::-1],
|
||||
)
|
||||
print(
|
||||
f"Smallest {N_SL} ACC:",
|
||||
np.sort(np.partition(classifier["test_accuracy"], N_SL)[:N_SL]),
|
||||
)
|
||||
|
||||
cmodels[model_title]["metrics"][0] += np.mean(classifier["test_accuracy"])
|
||||
cmodels[model_title]["metrics"][1] += np.mean(classifier["test_precision"])
|
||||
cmodels[model_title]["metrics"][2] += np.mean(classifier["test_recall"])
|
||||
cmodels[model_title]["metrics"][3] += np.mean(classifier["test_f1"])
|
||||
print("Acc", np.mean(classifier['test_accuracy']))
|
||||
print("Precision", np.mean(classifier['test_precision']))
|
||||
print("Recall", np.mean(classifier['test_recall']))
|
||||
print("F1", np.mean(classifier['test_f1']))
|
||||
print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
|
||||
print(f"Smallest {n_sl} ACC:", np.sort(np.partition(classifier['test_accuracy'], n_sl)[:n_sl]))
|
||||
|
||||
cmodels[model_title]['metrics'][0] += np.mean(classifier['test_accuracy'])
|
||||
cmodels[model_title]['metrics'][1] += np.mean(classifier['test_precision'])
|
||||
cmodels[model_title]['metrics'][2] += np.mean(classifier['test_recall'])
|
||||
cmodels[model_title]['metrics'][3] += np.mean(classifier['test_f1'])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Get overall results
|
||||
scores = cm.get_total_models_scores(n_clusters=N_CLUSTERS)
|
||||
|
||||
# %%
|
||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||
path_output_full = PATH_OUTPUT / (
|
||||
TARGET_VARIABLE
|
||||
+ "_"
|
||||
+ SEGMENT_LENGTH
|
||||
+ "_classification_"
|
||||
+ CV_METHOD
|
||||
+ str(BINS)
|
||||
+ "_clust_"
|
||||
+ CLUST_COL
|
||||
+ str(N_CLUSTERS)
|
||||
+ ".csv"
|
||||
)
|
||||
scores.to_csv(path_output_full, index=False)
|
||||
cm.get_total_models_scores(n_clusters=n_clusters)
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.14.5
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
|
@ -14,83 +14,92 @@
|
|||
# ---
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
from pathlib import Path
|
||||
# %matplotlib inline
|
||||
import os
|
||||
import sys
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
from scipy import stats
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
from machine_learning.classification_models import ClassificationModels
|
||||
from machine_learning.helper import impute_encode_categorical_features
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# # Useful method
|
||||
def treat_categorical_features(input_set):
|
||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||
additional_categorical_features = [col for col in input_set.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
categorical_features = input_set[categorical_feature_colnames].copy()
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
|
||||
|
||||
return pd.concat([numerical_features, categorical_features], axis=1)
|
||||
|
||||
# %% [markdown]
|
||||
# ## Set script's parameters
|
||||
#
|
||||
|
||||
# %%
|
||||
n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
|
||||
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
|
||||
# %%
|
||||
PATH_BASE = Path("E:/STRAWresults/20230415")
|
||||
|
||||
SEGMENT_TYPE = "period"
|
||||
print("SEGMENT_TYPE: " + SEGMENT_TYPE)
|
||||
SEGMENT_LENGTH = "30_minutes_before"
|
||||
print("SEGMENT_LENGTH: " + SEGMENT_LENGTH)
|
||||
TARGET_VARIABLE = "appraisal_stressfulness"
|
||||
print("TARGET_VARIABLE: " + TARGET_VARIABLE)
|
||||
|
||||
if ("appraisal" in TARGET_VARIABLE) and ("stressfulness" in TARGET_VARIABLE):
|
||||
TARGET_VARIABLE += "_"
|
||||
TARGET_VARIABLE += SEGMENT_TYPE
|
||||
|
||||
PATH_FULL = PATH_BASE / SEGMENT_LENGTH / ("input_" + TARGET_VARIABLE + "_mean.csv")
|
||||
|
||||
model_input = pd.read_csv(PATH_FULL)
|
||||
|
||||
if SEGMENT_LENGTH == "daily":
|
||||
DAY_LENGTH = "daily" # or "working"
|
||||
print(DAY_LENGTH)
|
||||
model_input = model_input[model_input["local_segment"].str.contains(DAY_LENGTH)]
|
||||
n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
|
||||
n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
CLUST_COL = "limesurvey_demand_control_ratio"
|
||||
print("CLUST_COL: " + CLUST_COL)
|
||||
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
|
||||
BINS = [-1, 0, 4]
|
||||
print("BINS: " + str(BINS))
|
||||
clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
|
||||
|
||||
index_columns = [
|
||||
"local_segment",
|
||||
"local_segment_label",
|
||||
"local_segment_start_datetime",
|
||||
"local_segment_end_datetime",
|
||||
]
|
||||
model_input.columns[list(model_input.columns).index('age'):-1]
|
||||
|
||||
model_input[CLUST_COL].describe()
|
||||
lime_cols = [col for col in model_input if col.startswith('limesurvey')]
|
||||
lime_cols
|
||||
lime_col = 'limesurvey_demand_control_ratio'
|
||||
clust_col = lime_col
|
||||
|
||||
model_input[clust_col].describe()
|
||||
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Filter-out outlier rows by clust_col
|
||||
model_input = model_input[(np.abs(stats.zscore(model_input[CLUST_COL])) < 3)]
|
||||
|
||||
uniq = model_input[[CLUST_COL, "pid"]].drop_duplicates().reset_index(drop=True)
|
||||
plt.bar(uniq["pid"], uniq[CLUST_COL])
|
||||
# Filter-out outlier rows by clust_col
|
||||
model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
|
||||
|
||||
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
|
||||
plt.bar(uniq['pid'], uniq[clust_col])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Get clusters by cluster col & and merge the clusters to main df
|
||||
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index("pid"))
|
||||
km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
|
||||
np.unique(km, return_counts=True)
|
||||
uniq["cluster"] = km
|
||||
print(uniq)
|
||||
uniq['cluster'] = km
|
||||
uniq
|
||||
|
||||
model_input = model_input.merge(uniq[["pid", "cluster"]])
|
||||
model_input = model_input.merge(uniq[['pid', 'cluster']])
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
|
@ -100,64 +109,50 @@ model_input.set_index(index_columns, inplace=True)
|
|||
cm = ClassificationModels()
|
||||
cmodels = cm.get_cmodels()
|
||||
|
||||
# %%
|
||||
model_input["target"].value_counts()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
for k in range(n_clusters):
|
||||
model_input_subset = model_input[model_input["cluster"] == k].copy()
|
||||
|
||||
|
||||
# Takes 10th percentile and above 90th percentile as the test set -> the rest for the training set. Only two classes, seperated by z-score of 0.
|
||||
# model_input_subset['numerical_target'] = model_input_subset['target']
|
||||
|
||||
model_input_subset.loc[:, "target"] = pd.cut(
|
||||
model_input_subset.loc[:, "target"], bins=BINS, labels=[0, 1], right=True
|
||||
)
|
||||
|
||||
# p15 = np.percentile(model_input_subset['numerical_target'], 15)
|
||||
# p85 = np.percentile(model_input_subset['numerical_target'], 85)
|
||||
|
||||
model_input_subset['numerical_target'] = model_input_subset['target']
|
||||
bins = [-10, 0, 10] # bins for z-scored targets
|
||||
model_input_subset.loc[:, 'target'] = \
|
||||
pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
|
||||
|
||||
p15 = np.percentile(model_input_subset['numerical_target'], 15)
|
||||
p85 = np.percentile(model_input_subset['numerical_target'], 85)
|
||||
|
||||
# Treat categorical features
|
||||
model_input_subset = impute_encode_categorical_features(model_input_subset)
|
||||
|
||||
model_input_subset = treat_categorical_features(model_input_subset)
|
||||
|
||||
# Split to train, validate, and test subsets
|
||||
# train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
|
||||
# test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
|
||||
train_set, test_set = train_test_split(
|
||||
model_input_subset,
|
||||
test_size=0.3,
|
||||
stratify=model_input_subset["pid"],
|
||||
random_state=42,
|
||||
)
|
||||
|
||||
print(train_set["target"].value_counts())
|
||||
print(test_set["target"].value_counts())
|
||||
train_set = model_input_subset[(model_input_subset['numerical_target'] > p15) & (model_input_subset['numerical_target'] < p85)].drop(['numerical_target'], axis=1)
|
||||
test_set = model_input_subset[(model_input_subset['numerical_target'] <= p15) | (model_input_subset['numerical_target'] >= p85)].drop(['numerical_target'], axis=1)
|
||||
|
||||
train_set['target'].value_counts()
|
||||
test_set['target'].value_counts()
|
||||
|
||||
train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
|
||||
|
||||
validate_x, test_x, validate_y, test_y = train_test_split(
|
||||
test_set.drop(["target", "pid"], axis=1),
|
||||
test_set["target"],
|
||||
test_size=0.50,
|
||||
random_state=42,
|
||||
)
|
||||
|
||||
|
||||
validate_x, test_x, validate_y, test_y = \
|
||||
train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)
|
||||
|
||||
# Impute missing values
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
|
||||
|
||||
train_x = imputer.fit_transform(train_x)
|
||||
validate_x = imputer.fit_transform(validate_x)
|
||||
test_x = imputer.fit_transform(test_x)
|
||||
|
||||
for model_title, model in cmodels.items():
|
||||
model["model"].fit(train_x, train_y)
|
||||
y_pred = model["model"].predict(validate_x)
|
||||
|
||||
model['model'].fit(train_x, train_y)
|
||||
y_pred = model['model'].predict(validate_x)
|
||||
|
||||
acc = accuracy_score(validate_y, y_pred)
|
||||
prec = precision_score(validate_y, y_pred)
|
||||
rec = recall_score(validate_y, y_pred)
|
||||
f1 = f1_score(validate_y, y_pred)
|
||||
|
||||
|
||||
print("\n-------------------------------------\n")
|
||||
print("Current cluster:", k, end="\n")
|
||||
print("Current model:", model_title, end="\n")
|
||||
|
@ -165,30 +160,12 @@ for k in range(n_clusters):
|
|||
print("Precision", prec)
|
||||
print("Recall", rec)
|
||||
print("F1", f1)
|
||||
|
||||
cmodels[model_title]["metrics"][0] += acc
|
||||
cmodels[model_title]["metrics"][1] += prec
|
||||
cmodels[model_title]["metrics"][2] += rec
|
||||
cmodels[model_title]["metrics"][3] += f1
|
||||
|
||||
cmodels[model_title]['metrics'][0] += acc
|
||||
cmodels[model_title]['metrics'][1] += prec
|
||||
cmodels[model_title]['metrics'][2] += rec
|
||||
cmodels[model_title]['metrics'][3] += f1
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# Get overall results
|
||||
scores = cm.get_total_models_scores(n_clusters=n_clusters)
|
||||
|
||||
# %%
|
||||
print(scores)
|
||||
|
||||
# %%
|
||||
PATH_OUTPUT = Path("..") / Path("presentation/results")
|
||||
path_output_full = PATH_OUTPUT / (
|
||||
TARGET_VARIABLE
|
||||
+ "_"
|
||||
+ SEGMENT_LENGTH
|
||||
+ "_classification"
|
||||
+ str(BINS)
|
||||
+ "_CLUST_"
|
||||
+ CLUST_COL
|
||||
+ +str(n_clusters)
|
||||
+ ".csv"
|
||||
)
|
||||
scores.to_csv(path_output_full, index=False)
|
||||
cm.get_total_models_scores(n_clusters=n_clusters)
|
||||
|
|
|
@ -6,61 +6,350 @@
|
|||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.14.5
|
||||
# jupytext_version: 1.13.0
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %%
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# %matplotlib inline
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
from machine_learning.helper import (
|
||||
impute_encode_categorical_features,
|
||||
prepare_cross_validator,
|
||||
prepare_sklearn_data_format,
|
||||
run_all_regression_models,
|
||||
)
|
||||
import seaborn as sns
|
||||
import yaml
|
||||
from pyprojroot import here
|
||||
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.dummy import DummyRegressor
|
||||
import xgboost as xg
|
||||
from IPython.core.interactiveshell import InteractiveShell
|
||||
InteractiveShell.ast_node_interactivity = "all"
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
|
||||
# %%
|
||||
model_input = pd.read_csv(
|
||||
"../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv"
|
||||
import machine_learning.features_sensor
|
||||
import machine_learning.labels
|
||||
import machine_learning.model
|
||||
|
||||
# %% [markdown]
|
||||
# # RAPIDS models
|
||||
|
||||
# %% [markdown]
|
||||
# ## PANAS negative affect
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
#if "pid" in model_input.columns:
|
||||
# index_columns.append("pid")
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
|
||||
cv_method = 'half_logo' # logo, half_logo, 5kfold
|
||||
if cv_method == 'logo':
|
||||
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
|
||||
else:
|
||||
model_input['pid_index'] = model_input.groupby('pid').cumcount()
|
||||
model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
|
||||
|
||||
model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
|
||||
model_input["pid_half"] = model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
|
||||
|
||||
data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
categorical_feature_colnames = ["gender", "startlanguage"]
|
||||
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
train_x.dtypes
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
logo = LeaveOneGroupOut()
|
||||
logo.get_n_splits(
|
||||
train_x,
|
||||
data_y,
|
||||
groups=data_groups,
|
||||
)
|
||||
|
||||
# %%
|
||||
model_input = model_input[model_input["local_segment"].str.contains("daily")]
|
||||
# Defaults to 5 k folds in cross_validate method
|
||||
if cv_method != 'logo' and cv_method != 'half_logo':
|
||||
logo = None
|
||||
|
||||
# %%
|
||||
CV_METHOD = "logo" # logo, half_logo, 5kfold
|
||||
# %% jupyter={"source_hidden": true}
|
||||
sum(data_y.isna())
|
||||
|
||||
model_input_encoded = impute_encode_categorical_features(model_input)
|
||||
# %%
|
||||
data_x, data_y, data_groups = prepare_sklearn_data_format(
|
||||
model_input_encoded, CV_METHOD
|
||||
# %% [markdown]
|
||||
# ### Baseline: Dummy Regression (mean)
|
||||
dummy_regr = DummyRegressor(strategy="mean")
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
dummy_regressor = cross_validate(
|
||||
dummy_regr,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
cross_validator = prepare_cross_validator(data_x, data_y, data_groups, CV_METHOD)
|
||||
# %%
|
||||
data_y.head()
|
||||
print("Negative Mean Squared Error", np.median(dummy_regressor['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(dummy_regressor['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(dummy_regressor['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(dummy_regressor['test_r2']))
|
||||
|
||||
# %%
|
||||
data_y.tail()
|
||||
# %% [markdown]
|
||||
# ### Linear Regression
|
||||
|
||||
# %%
|
||||
data_y.shape
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lin_reg_rapids = linear_model.LinearRegression()
|
||||
# %% jupyter={"source_hidden": true}
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
|
||||
|
||||
# %%
|
||||
scores = run_all_regression_models(data_x, data_y, data_groups, cross_validator)
|
||||
|
||||
# %%
|
||||
scores.to_csv(
|
||||
"../presentation/JCQ_supervisor_support_regression_" + CV_METHOD + ".csv",
|
||||
index=False,
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lin_reg_scores = cross_validate(
|
||||
lin_reg_rapids,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(lin_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### XGBRegressor Linear Regression
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10)
|
||||
# %% jupyter={"source_hidden": true}
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_reg_scores = cross_validate(
|
||||
xgb_r,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(xgb_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(xgb_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(xgb_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(xgb_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### XGBRegressor Pseudo Huber Error Regression
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10)
|
||||
# %% jupyter={"source_hidden": true}
|
||||
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
xgb_psuedo_huber_reg_scores = cross_validate(
|
||||
xgb_psuedo_huber_r,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(xgb_psuedo_huber_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Ridge regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
ridge_reg = linear_model.Ridge(alpha=.5)
|
||||
|
||||
# %% tags=[] jupyter={"source_hidden": true}
|
||||
ridge_reg_scores = cross_validate(
|
||||
ridge_reg,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(ridge_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(ridge_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(ridge_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(ridge_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Lasso
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lasso_reg = linear_model.Lasso(alpha=0.1)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
lasso_reg_score = cross_validate(
|
||||
lasso_reg,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(lasso_reg_score['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(lasso_reg_score['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(lasso_reg_score['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(lasso_reg_score['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Bayesian Ridge
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
bayesian_ridge_reg = linear_model.BayesianRidge()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
bayesian_ridge_reg_score = cross_validate(
|
||||
bayesian_ridge_reg,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### RANSAC (outlier robust regression)
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
ransac_reg = linear_model.RANSACRegressor()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
ransac_reg_scores = cross_validate(
|
||||
ransac_reg,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(ransac_reg_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(ransac_reg_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(ransac_reg_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(ransac_reg_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Support vector regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
svr = svm.SVR()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
svr_scores = cross_validate(
|
||||
svr,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(svr_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(svr_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(svr_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(svr_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Kernel Ridge regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
kridge = kernel_ridge.KernelRidge()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
kridge_scores = cross_validate(
|
||||
kridge,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(kridge_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(kridge_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(kridge_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(kridge_scores['test_r2']))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Gaussian Process Regression
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
gpr = gaussian_process.GaussianProcessRegressor()
|
||||
|
||||
# %% jupyter={"source_hidden": true}
|
||||
|
||||
gpr_scores = cross_validate(
|
||||
gpr,
|
||||
X=imputer.fit_transform(train_x),
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
|
||||
)
|
||||
print("Negative Mean Squared Error", np.median(gpr_scores['test_neg_mean_squared_error']))
|
||||
print("Negative Mean Absolute Error", np.median(gpr_scores['test_neg_mean_absolute_error']))
|
||||
print("Negative Root Mean Squared Error", np.median(gpr_scores['test_neg_root_mean_squared_error']))
|
||||
print("R2", np.median(gpr_scores['test_r2']))
|
||||
|
||||
# %%
|
||||
|
|
|
@ -1,217 +0,0 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: percent
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 1.14.5
|
||||
# kernelspec:
|
||||
# display_name: straw2analysis
|
||||
# language: python
|
||||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
# %%
|
||||
import pandas as pd
|
||||
|
||||
from features.esm_JCQ import DICT_JCQ_DEMAND_CONTROL_REVERSE
|
||||
|
||||
# %%
|
||||
limesurvey_questions = pd.read_csv(
|
||||
"E:/STRAWbaseline/survey637813+question_text.csv", header=None
|
||||
).T
|
||||
|
||||
# %%
|
||||
limesurvey_questions
|
||||
|
||||
# %%
|
||||
limesurvey_questions[["code", "text"]] = limesurvey_questions[0].str.split(
|
||||
r"\.\s", expand=True, n=1
|
||||
)
|
||||
|
||||
# %%
|
||||
limesurvey_questions
|
||||
|
||||
# %%
|
||||
demand_reverse_lime_rows = (
|
||||
limesurvey_questions["text"].str.startswith(" [Od mene se ne zahteva,")
|
||||
| limesurvey_questions["text"].str.startswith(" [Imam dovolj časa, da končam")
|
||||
| limesurvey_questions["text"].str.startswith(
|
||||
" [Pri svojem delu se ne srečujem s konfliktnimi"
|
||||
)
|
||||
)
|
||||
control_reverse_lime_rows = limesurvey_questions["text"].str.startswith(
|
||||
" [Moje delo vključuje veliko ponavljajočega"
|
||||
) | limesurvey_questions["text"].str.startswith(
|
||||
" [Pri svojem delu imam zelo malo svobode"
|
||||
)
|
||||
|
||||
# %%
|
||||
# Take explicit copies of the selected rows: a boolean-mask selection is a slice
# of limesurvey_questions, and adding a column to such a slice is chained
# assignment (pandas emits SettingWithCopyWarning).
demand_reverse_lime = limesurvey_questions[demand_reverse_lime_rows].copy()
# qid is the number enclosed in square brackets in the question code.
# expand=False returns a Series, which can be assigned as a single column.
demand_reverse_lime["qid"] = demand_reverse_lime["code"].str.extract(
    r"\[(\d+)\]", expand=False
)
control_reverse_lime = limesurvey_questions[control_reverse_lime_rows].copy()
control_reverse_lime["qid"] = control_reverse_lime["code"].str.extract(
    r"\[(\d+)\]", expand=False
)
|
||||
|
||||
# %%
|
||||
limesurvey_questions.loc[89, "text"]
|
||||
|
||||
# %%
|
||||
limesurvey_questions[limesurvey_questions["code"].str.startswith("JobEisen")]
|
||||
|
||||
# %%
|
||||
demand_reverse_lime
|
||||
|
||||
# %%
|
||||
control_reverse_lime
|
||||
|
||||
# %%
|
||||
participant_info = pd.read_csv(
|
||||
"C:/Users/junos/Documents/FWO-ARRS/Analysis/straw2analysis/rapids/data/raw/p031/participant_baseline_raw.csv",
|
||||
parse_dates=["date_of_birth"],
|
||||
)
|
||||
|
||||
# %%
|
||||
participant_info_t = participant_info.T
|
||||
|
||||
# %%
|
||||
rows_baseline = participant_info_t.index
|
||||
|
||||
# %%
|
||||
rows_demand = rows_baseline.str.startswith("JobEisen") & ~rows_baseline.str.endswith(
|
||||
"Time"
|
||||
)
|
||||
|
||||
# %%
|
||||
rows_baseline[rows_demand]
|
||||
|
||||
# %%
|
||||
limesurvey_control = (
|
||||
participant_info_t[rows_demand]
|
||||
.reset_index()
|
||||
.rename(columns={"index": "question", 0: "score_original"})
|
||||
)
|
||||
|
||||
# %%
|
||||
limesurvey_control
|
||||
|
||||
# %%
|
||||
limesurvey_control["qid"] = (
|
||||
limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
|
||||
)
|
||||
|
||||
# %%
|
||||
limesurvey_control["question"].str.extract(r"\[(\d+)\]").astype(int)
|
||||
|
||||
# %%
|
||||
limesurvey_control["score"] = limesurvey_control["score_original"]
|
||||
|
||||
# %%
|
||||
limesurvey_control["qid"][0]
|
||||
|
||||
# %%
|
||||
rows_demand_reverse = limesurvey_control["qid"].isin(
|
||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
||||
)
|
||||
limesurvey_control.loc[rows_demand_reverse, "score"] = (
|
||||
4 + 1 - limesurvey_control.loc[rows_demand_reverse, "score_original"]
|
||||
)
|
||||
|
||||
# %%
|
||||
# Name prefixes of the JCQ demand and control items in the baseline survey export.
JCQ_DEMAND = "JobEisen"
JCQ_CONTROL = "JobControle"
# Reverse-scored JCQ items. Each value is the beginning of the item text as it
# appears in the LimeSurvey export (a leading space followed by "[").
# NOTE(review): the integer keys are presumably the question IDs (the number in
# square brackets in the column name) - confirm against the survey export.
dict_JCQ_demand_control_reverse = {
    JCQ_DEMAND: {
        3: " [Od mene se ne zahteva,",
        4: " [Imam dovolj časa, da končam",
        5: " [Pri svojem delu se ne srečujem s konfliktnimi",
    },
    JCQ_CONTROL: {
        # Fixed: this prefix started with " |" instead of " [", so it could not
        # match the question text the way the other prefixes do.
        2: " [Moje delo vključuje veliko ponavljajočega",
        6: " [Pri svojem delu imam zelo malo svobode",
    },
}
|
||||
|
||||
# %%
|
||||
limesurvey_control
|
||||
|
||||
# %%
|
||||
test = pd.DataFrame(
|
||||
data={"question": "one", "score_original": 3, "score": 3, "qid": 10}, index=[0]
|
||||
)
|
||||
|
||||
# %%
|
||||
pd.concat([test, limesurvey_control]).reset_index()
|
||||
|
||||
# %%
|
||||
limesurvey_control["score"].sum()
|
||||
|
||||
# %%
|
||||
rows_demand_reverse
|
||||
|
||||
# %%
|
||||
dict_JCQ_demand_control_reverse[JCQ_DEMAND].keys()
|
||||
|
||||
# %%
|
||||
limesurvey_control
|
||||
|
||||
# %%
|
||||
DEMAND_CONTROL_RATIO_MIN = 5 / (9 * 4)
|
||||
DEMAND_CONTROL_RATIO_MAX = (4 * 5) / 9
|
||||
|
||||
JCQ_NORMS = {
|
||||
"F": {
|
||||
0: DEMAND_CONTROL_RATIO_MIN,
|
||||
1: 0.45,
|
||||
2: 0.52,
|
||||
3: 0.62,
|
||||
4: DEMAND_CONTROL_RATIO_MAX,
|
||||
},
|
||||
"M": {
|
||||
0: DEMAND_CONTROL_RATIO_MIN,
|
||||
1: 0.41,
|
||||
2: 0.48,
|
||||
3: 0.56,
|
||||
4: DEMAND_CONTROL_RATIO_MAX,
|
||||
},
|
||||
}
|
||||
|
||||
# %%
|
||||
JCQ_NORMS[participant_info.loc[0, "gender"]][0]
|
||||
|
||||
# %%
|
||||
participant_info_t.index.str.startswith("JobControle")
|
||||
|
||||
# %%
|
||||
columns_baseline = participant_info.columns
|
||||
|
||||
# %%
|
||||
columns_demand = columns_baseline.str.startswith(
|
||||
"JobControle"
|
||||
) & ~columns_baseline.str.endswith("Time")
|
||||
|
||||
# %%
|
||||
columns_baseline[columns_demand]
|
||||
|
||||
# %%
|
||||
participant_control = participant_info.loc[:, columns_demand]
|
||||
|
||||
# %%
|
||||
participant_control["id"] = participant_control.index
|
||||
|
||||
# %%
|
||||
participant_control
|
||||
|
||||
# %%
|
||||
pd.wide_to_long(
|
||||
participant_control,
|
||||
stubnames="JobControle",
|
||||
i="id",
|
||||
j="qid",
|
||||
sep="[",
|
||||
suffix="(\\d+)]",
|
||||
)
|
195
features/esm.py
195
features/esm.py
|
@ -20,47 +20,11 @@ ANSWER_DAY_OFF = "DayOff3421"
|
|||
ANSWER_SET_EVENING = "DayFinishedSetEvening"
|
||||
|
||||
MAX_MORNING_LENGTH = 3
|
||||
# When the participant was not yet at work at the time of the first (morning) EMA,
|
||||
# When the participant was not yet at work at the time of the first (morning) EMA,
|
||||
# only three items were answered.
|
||||
# Two sleep related items and one indicating NOT starting work yet.
|
||||
# Daytime EMAs are all longer, in fact they always consist of at least 6 items.
|
||||
|
||||
QUESTIONNAIRE_IDS = {
|
||||
"sleep_quality": 1,
|
||||
"PANAS_positive_affect": 8,
|
||||
"PANAS_negative_affect": 9,
|
||||
"JCQ_job_demand": 10,
|
||||
"JCQ_job_control": 11,
|
||||
"JCQ_supervisor_support": 12,
|
||||
"JCQ_coworker_support": 13,
|
||||
"PFITS_supervisor": 14,
|
||||
"PFITS_coworkers": 15,
|
||||
"UWES_vigor": 16,
|
||||
"UWES_dedication": 17,
|
||||
"UWES_absorption": 18,
|
||||
"COPE_active": 19,
|
||||
"COPE_support": 20,
|
||||
"COPE_emotions": 21,
|
||||
"balance_life_work": 22,
|
||||
"balance_work_life": 23,
|
||||
"recovery_experience_detachment": 24,
|
||||
"recovery_experience_relaxation": 25,
|
||||
"symptoms": 26,
|
||||
"appraisal_stressfulness_event": 87,
|
||||
"appraisal_threat": 88,
|
||||
"appraisal_challenge": 89,
|
||||
"appraisal_event_time": 90,
|
||||
"appraisal_event_duration": 91,
|
||||
"appraisal_event_work_related": 92,
|
||||
"appraisal_stressfulness_period": 93,
|
||||
"late_work": 94,
|
||||
"work_hours": 95,
|
||||
"left_work": 96,
|
||||
"activities": 97,
|
||||
"coffee_breaks": 98,
|
||||
"at_work_yet": 99,
|
||||
}
|
||||
|
||||
|
||||
def get_esm_data(usernames: Collection) -> pd.DataFrame:
|
||||
"""
|
||||
|
@ -88,10 +52,8 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame:
|
|||
|
||||
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Convert timestamps and expand JSON column.
|
||||
|
||||
Convert timestamps into human-readable datetimes and dates
|
||||
and expand the JSON column into several Pandas DF columns.
|
||||
and expand the JSON column into several Pandas DF columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
@ -101,8 +63,7 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
|||
Returns
|
||||
-------
|
||||
df_esm_preprocessed: pd.DataFrame
|
||||
A dataframe with added columns: datetime in Ljubljana timezone
|
||||
and all fields from ESM_JSON column.
|
||||
A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
|
||||
"""
|
||||
df_esm = helper.get_date_from_timestamp(df_esm)
|
||||
|
||||
|
@ -115,39 +76,31 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
|||
def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
For each distinct EMA session, determine how the participant responded to it.
|
||||
|
||||
Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
|
||||
and SESSION_STATUS_COMPLETE
|
||||
Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE
|
||||
|
||||
This is done in three steps.
|
||||
|
||||
First, the esm_status is considered.
|
||||
If any of the ESMs in a session has a status *other than* "answered",
|
||||
then this session is taken as unfinished.
|
||||
If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
|
||||
|
||||
Second, the sessions which do not represent full questionnaires are identified.
|
||||
These are sessions where participants only marked they are finished with the day
|
||||
or have not yet started working.
|
||||
These are sessions where participants only marked they are finished with the day or have not yet started working.
|
||||
|
||||
Third, the sessions with only one item are marked with their trigger.
|
||||
We never offered questionnaires with single items,
|
||||
so we can be sure these are unfinished.
|
||||
We never offered questionnaires with single items, so we can be sure these are unfinished.
|
||||
|
||||
Finally, all sessions that remain are marked as completed.
|
||||
By going through different possibilities in expl_esm_adherence.ipynb,
|
||||
this turned out to be a reasonable option.
|
||||
By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_esm_preprocessed: pd.DataFrame
|
||||
A preprocessed dataframe of esm data,
|
||||
which must include the session ID (esm_session).
|
||||
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
||||
|
||||
Returns
|
||||
-------
|
||||
df_session_counts: pd.DataFrame
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
|
||||
with their statuses and the number of items.
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
|
||||
"""
|
||||
sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
|
||||
|
||||
|
@ -202,22 +155,17 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat
|
|||
|
||||
def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Classify EMA sessions into morning, workday, or evening.
|
||||
|
||||
For each EMA session, determine the time of the first user answer
|
||||
and its time type (morning, workday, or evening).
|
||||
For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening.)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_esm_preprocessed: pd.DataFrame
|
||||
A preprocessed dataframe of esm data,
|
||||
which must include the session ID (esm_session).
|
||||
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
||||
|
||||
Returns
|
||||
-------
|
||||
df_session_time: pd.DataFrame
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
|
||||
with their time type and timestamp of first answer.
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
|
||||
"""
|
||||
df_session_time = (
|
||||
df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
|
||||
|
@ -231,17 +179,13 @@ def classify_sessions_by_completion_time(
|
|||
df_esm_preprocessed: pd.DataFrame,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Classify sessions and correct the time type.
|
||||
|
||||
The point of this function is to not only classify sessions
|
||||
by using the previously defined functions.
|
||||
The point of this function is to not only classify sessions by using the previously defined functions.
|
||||
It also serves to "correct" the time type of some EMA sessions.
|
||||
|
||||
A morning questionnaire could seamlessly transition into a daytime questionnaire,
|
||||
if the participant was already at work.
|
||||
In this case, the "time" label changed mid-session.
|
||||
Because of the way classify_sessions_by_time works,
|
||||
this questionnaire was classified as "morning".
|
||||
Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
|
||||
But for all intents and purposes, it can be treated as a "daytime" EMA.
|
||||
|
||||
The way this scenario is differentiated from a true "morning" questionnaire,
|
||||
|
@ -250,16 +194,13 @@ def classify_sessions_by_completion_time(
|
|||
Parameters
|
||||
----------
|
||||
df_esm_preprocessed: pd.DataFrame
|
||||
A preprocessed dataframe of esm data,
|
||||
which must include the session ID (esm_session).
|
||||
A preprocessed dataframe of esm data, which must include the session ID (esm_session).
|
||||
|
||||
Returns
|
||||
-------
|
||||
df_session_counts_time: pd.DataFrame
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
|
||||
the number of items,
|
||||
their time type (with some morning EMAs reclassified)
|
||||
and timestamp of first answer.
|
||||
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
|
||||
their time type (with some morning EMAs reclassified) and timestamp of first answer.
|
||||
|
||||
"""
|
||||
df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
|
||||
|
@ -278,8 +219,7 @@ def classify_sessions_by_completion_time(
|
|||
|
||||
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Eliminate invalid ESM responses.
|
||||
|
||||
This function eliminates invalid ESM responses.
|
||||
It removes unanswered ESMs and those that indicate end of work and similar.
|
||||
It also extracts a numeric answer from strings such as "4 - I strongly agree".
|
||||
|
||||
|
@ -316,100 +256,3 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
|||
)
|
||||
)
|
||||
return df_esm_clean
|
||||
|
||||
|
||||
def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1) -> pd.DataFrame:
    """
    Increment answers to keep in line with original scoring.

    We always used 0 for the lowest value of user answer.
    Some scales originally used other scoring, such as starting from 1.
    This restores original scoring so that the values are comparable to references.

    Parameters
    ----------
    df_esm_clean: pd.DataFrame
        A cleaned ESM dataframe, which must also include esm_user_answer_numeric.
    increment_by:
        A number to add to the user answer.

    Returns
    -------
    df_esm_clean: pd.DataFrame
        A new dataframe with an added column 'esm_user_score',
        equal to esm_user_answer_numeric + increment_by.
        If esm_user_answer_numeric is missing, a hint and the error are printed
        and the input dataframe is returned unchanged.
    """
    try:
        # assign() returns a new dataframe; the caller's dataframe is not modified.
        df_esm_clean = df_esm_clean.assign(
            esm_user_score=lambda x: x.esm_user_answer_numeric + increment_by
        )
    except AttributeError as e:
        # Attribute access on a missing column raises AttributeError.
        # This is deliberately best effort: report it and fall through.
        print("Please, clean the dataframe first using features.esm.clean_up_esm.")
        print(e)
    return df_esm_clean
|
||||
|
||||
|
||||
def reassign_question_ids(
    df_esm_cleaned: pd.DataFrame, question_ids_content: dict
) -> pd.DataFrame:
    """
    Fix question IDs to match their actual content.

    Unfortunately, when altering the protocol to adapt to COVID pandemic,
    we did not retain original question IDs.
    This means that for participants before 2021, they are different
    from for the rest of them.
    This function searches for question IDs by matching their strings.

    Parameters
    ----------
    df_esm_cleaned: pd.DataFrame
        A cleaned up dataframe, which must include
        the columns question_id and esm_instructions.
    question_ids_content: dict
        A dictionary, linking question IDs with their content ("instructions").
        Each value is a string or a tuple of strings;
        an instruction matches if it starts with (one of) them.

    Returns
    -------
    df_esm_fixed: pd.DataFrame
        A copy of the dataframe with question_id replaced by the matching key
        of question_ids_content, or None where no entry matches.

    Raises
    ------
    KeyError
        If an instruction recorded under a question ID from question_ids_content
        does not start with any of the strings expected for that ID.
    """
    # Tabulate all distinct instructions occurring under each question ID.
    # The counts themselves are not used, only the (ID, instruction) pairs.
    df_esm_unique_questions = (
        df_esm_cleaned.groupby("question_id")
        .esm_instructions.value_counts()
        .rename("count")
        .reset_index()
    )

    # First, check that we anticipated all esm instructions.
    for q_id, expected_beginnings in question_ids_content.items():
        # Look for all questions ("instructions") occurring in the dataframe
        # under this question ID.
        actual_questions = df_esm_unique_questions.loc[
            df_esm_unique_questions["question_id"] == q_id,
            "esm_instructions",
        ]
        # See if they are expected, i.e. included in the dictionary.
        questions_matches = actual_questions.str.startswith(expected_beginnings)
        # The check must be on the boolean mask, not on the instruction strings.
        # An ID absent from the data gives an empty mask, which passes.
        if not questions_matches.all():
            # In case there is an unexpected instruction, raise an exception.
            print("One of the questions that occur in the data was undefined.")
            print("This were the questions found in the data: ")
            raise KeyError(actual_questions[~questions_matches])

    # Next, replace question IDs with the first key whose content matches.
    df_esm_fixed = df_esm_cleaned.copy()
    df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].apply(
        lambda x: next(
            (
                key
                for key, values in question_ids_content.items()
                if x.startswith(values)
            ),
            None,
        )
    )

    return df_esm_fixed
|
||||
|
|
|
@ -1,125 +0,0 @@
|
|||
# Presumably the upper and lower bounds of the original COPE answer scale
# (named by analogy with the JCQ_ORIGINAL_* constants) -- TODO confirm.
COPE_ORIGINAL_MAX = 4
COPE_ORIGINAL_MIN = 1

# Known wordings of every COPE item, keyed by question ID (164-183).
#
# Each value is a tuple of four strings: the English wording, the Dutch wording
# and what appear to be the feminine and masculine Slovenian wordings.
# Some entries are only the opening part of a sentence, so they are presumably
# meant to be used as a prefix tuple for ``str.startswith`` -- verify against
# the caller.
#
# NOTE(review): the Dutch strings for 171, 173, 178 and 182 begin with a
# lowercase "l" (ell) instead of a capital "I". They are kept exactly as found,
# because changing them would change what they match; confirm which spelling
# actually occurs in the recorded data before touching them.
DICT_COPE_QUESTION_IDS = {
    164: (
        "I took additional action to try to get rid of the problem",
        "Ik deed extra mijn best om er iets aan te doen",
        "Vložila sem dodaten napor, da bi rešila problem",
        "Vložil sem dodaten napor, da bi rešil problem",
    ),
    165: (
        "I concentrated my efforts on doing something about it",
        "Ik probeerde de situatie te verbeteren",
        "Svoje sile sem usmerila v reševanje nastale situacije",
        "Svoje sile sem usmeril v reševanje nastale situacije",
    ),
    166: (
        "I did what had to be done, one step at a time",
        "Ik deed stap voor stap wat nodig was",
        "Naredila sem, kar je bilo potrebno – korak za korakom",
        "Naredil sem, kar je bilo potrebno – korak za korakom",
    ),
    167: (
        "I took direct action to get around the problem",
        "Ik handelde vlug om het probleem te verhelpen",
        "Nekaj sem naredila, da sem zaobšla problem",
        "Nekaj sem naredil, da sem zaobšel problem",
    ),
    168: (
        "I tried to come up with a strategy about what to do",
        "Ik probeerde te verzinnen wat ik er aan kon doen",
        "Skušala sem najti ustrezen način za rešitev situacije",
        "Skušal sem najti ustrezen način za rešitev situacije",
    ),
    169: (
        "I made a plan of action",
        "Ik maakte een plan",
        "Naredila sem načrt za delovanje",
        "Naredil sem načrt za delovanje",
    ),
    170: (
        "I thought hard about what steps to take",
        "Ik dacht hard na over wat ik moest doen",
        "Dobro sem premislila, katere korake moram narediti, da rešim problem",
        "Dobro sem premislil, katere korake moram narediti, da rešim problem",
    ),
    171: (
        "I thought about how I might best handle the problem",
        "lk dacht na over hoe ik het probleem het best kon aanpakken",
        "Razmišljala sem, kaj bi bilo najbolje narediti s problemom",
        "Razmišljal sem, kaj bi bilo najbolje narediti s problemom",
    ),
    172: (
        "I asked people who have had similar experiences what they did",
        "Ik vroeg aan mensen met dergelijke ervaringen hoe zij reageerden",
        "Vprašala sem posameznike s podobnimi izkušnjami, kaj so storili",
        "Vprašal sem posameznike s podobnimi izkušnjami, kaj so storili",
    ),
    173: (
        "I tried to get advice from someone about what to do",
        "lk vroeg advies aan iemand",
        "Pri drugih sem poskušala dobiti nasvet, kaj naj storim",
        "Pri drugih sem poskušal dobiti nasvet, kaj naj storim",
    ),
    174: (
        "I talked to someone to find out more about the situation",
        "Ik sprak met iemand om meer te weten te komen over de situatie",
        "Z nekom sem se pogovorila, da bi izvedela še kaj o svojem problemu",
        "Z nekom sem se pogovoril, da bi izvedel še kaj o svojem problemu",
    ),
    175: (
        "I talked to someone who could do something concrete about the problem",
        "Ik sprak met iemand die iets aan het probleem kon doen",
        "Pogovorila sem se s kom, ki bi lahko naredil kaj konkretnega",
        "Pogovoril sem se s kom, ki bi lahko naredil kaj konkretnega",
    ),
    176: (
        "I talked to someone about how I felt",
        "Ik sprak met iemand over hoe ik mij voelde",
        "Z nekom sem se pogovorila o tem, kako sem se počutila",
        "Z nekom sem se pogovoril o tem, kako sem se počutil",
    ),
    177: (
        "I tried to get emotional support from friends or relatives",
        "Ik zocht steun bij vrienden of familie",
        "Skušala sem dobiti čustveno podporo prijateljev ali sorodnikov",
        "Skušal sem dobiti čustveno podporo prijateljev ali sorodnikov",
    ),
    178: (
        "I discussed my feelings with someone",
        "lk besprak mijn gevoelens met iemand",
        "O svojih občutkih sem se z nekom pogovorila",
        "O svojih občutkih sem se z nekom pogovoril",
    ),
    179: (
        "I got sympathy and understanding from someone",
        "Ik vroeg medeleven en begrip van iemand",
        "Poiskala sem naklonjenost in razumevanje drugih",
        "Poiskal sem naklonjenost in razumevanje drugih",
    ),
    180: (
        "I got upset and let my emotions out",
        "Ik raakte van streek",
        "Razburila sem se in to tudi pokazala",
        "Razburil sem se in to tudi pokazal",
    ),
    181: (
        "I let my feelings out",
        "Ik toonde mijn gevoelens",
        "Svojim čustvom sem dala prosto pot",
        "Svojim čustvom sem dal prosto pot",
    ),
    182: (
        "I felt a lot of emotional distress and I found myself expressing",
        "lk liet duidelijk blijken hoe ellendig ik mij voelde",
        "Doživljala sem veliko stresa in opažala, da sem čustva",
        "Doživljal sem veliko stresa in opažal, da sem čustva",
    ),
    183: (
        "I got upset, and I was really aware of it",
        "Ik merkte dat ik erg van streek was",
        "Razburila sem se in razmišljala samo o tem",
        "Razburil sem se in razmišljal samo o tem",
    ),
}
|
|
@ -1,11 +1,9 @@
|
|||
import pandas as pd
|
||||
|
||||
from features.esm import increment_answers
|
||||
|
||||
JCQ_ORIGINAL_MAX = 4
|
||||
JCQ_ORIGINAL_MIN = 1
|
||||
|
||||
DICT_JCQ_DEMAND_CONTROL_REVERSE = {
|
||||
dict_JCQ_demand_control_reverse = {
|
||||
75: (
|
||||
"I was NOT asked",
|
||||
"Men legde mij geen overdreven",
|
||||
|
@ -42,14 +40,10 @@ def reverse_jcq_demand_control_scoring(
|
|||
df_esm_jcq_demand_control: pd.DataFrame,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Reverse JCQ demand and control answers.
|
||||
|
||||
This function recodes answers in Job content questionnaire
|
||||
by first incrementing them by 1, to be in line with original (1-4) scoring.
|
||||
Then, some answers are reversed (i.e. 1 becomes 4 etc.),
|
||||
because the questions are negatively phrased.
|
||||
These answers are listed in DICT_JCQ_DEMAND_CONTROL_REVERSE
|
||||
and identified by their question ID.
|
||||
This function recodes answers in Job content questionnaire by first incrementing them by 1,
|
||||
to be in line with original (1-4) scoring.
|
||||
Then, some answers are reversed (i.e. 1 becomes 4 etc.), because the questions are negatively phrased.
|
||||
These answers are listed in dict_JCQ_demand_control_reverse and identified by their question ID.
|
||||
However, the existing data is checked against literal phrasing of these questions
|
||||
to protect against wrong numbering of questions (differing question IDs).
|
||||
|
||||
|
@ -61,8 +55,7 @@ def reverse_jcq_demand_control_scoring(
|
|||
Returns
|
||||
-------
|
||||
df_esm_jcq_demand_control: pd.DataFrame
|
||||
The same dataframe with a column esm_user_score
|
||||
containing answers recoded and reversed.
|
||||
The same dataframe with a column esm_user_score containing answers recoded and reversed.
|
||||
"""
|
||||
df_esm_jcq_demand_control_unique_answers = (
|
||||
df_esm_jcq_demand_control.groupby("question_id")
|
||||
|
@ -71,7 +64,7 @@ def reverse_jcq_demand_control_scoring(
|
|||
.reset_index()
|
||||
)
|
||||
# Tabulate all possible answers to each question (group by question ID).
|
||||
for q_id in DICT_JCQ_DEMAND_CONTROL_REVERSE.keys():
|
||||
for q_id in dict_JCQ_demand_control_reverse.keys():
|
||||
# Look through all answers that need to be reversed.
|
||||
possible_answers = df_esm_jcq_demand_control_unique_answers.loc[
|
||||
df_esm_jcq_demand_control_unique_answers["question_id"] == q_id,
|
||||
|
@ -79,7 +72,7 @@ def reverse_jcq_demand_control_scoring(
|
|||
]
|
||||
# These are all answers to a given question (by q_id).
|
||||
answers_matches = possible_answers.str.startswith(
|
||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.get(q_id)
|
||||
dict_JCQ_demand_control_reverse.get(q_id)
|
||||
)
|
||||
# See if they are expected, i.e. included in the dictionary.
|
||||
if ~answers_matches.all():
|
||||
|
@ -89,16 +82,18 @@ def reverse_jcq_demand_control_scoring(
|
|||
# In case there is an unexpected answer, raise an exception.
|
||||
|
||||
try:
|
||||
df_esm_jcq_demand_control = increment_answers(df_esm_jcq_demand_control)
|
||||
# Increment the original answer by 1 to keep in line
|
||||
# with traditional scoring (from JCQ_ORIGINAL_MIN to JCQ_ORIGINAL_MAX).
|
||||
df_esm_jcq_demand_control = df_esm_jcq_demand_control.assign(
|
||||
esm_user_score=lambda x: x.esm_user_answer_numeric + 1
|
||||
)
|
||||
# Increment the original answer by 1
|
||||
# to keep in line with traditional scoring (JCQ_ORIGINAL_MIN - JCQ_ORIGINAL_MAX).
|
||||
df_esm_jcq_demand_control[
|
||||
df_esm_jcq_demand_control["question_id"].isin(
|
||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
||||
dict_JCQ_demand_control_reverse.keys()
|
||||
)
|
||||
] = df_esm_jcq_demand_control[
|
||||
df_esm_jcq_demand_control["question_id"].isin(
|
||||
DICT_JCQ_DEMAND_CONTROL_REVERSE.keys()
|
||||
dict_JCQ_demand_control_reverse.keys()
|
||||
)
|
||||
].assign(
|
||||
esm_user_score=lambda x: JCQ_ORIGINAL_MAX
|
||||
|
|
|
@ -3,9 +3,6 @@ import pandas as pd
|
|||
|
||||
import features.esm
|
||||
|
||||
SAM_ORIGINAL_MAX = 5
|
||||
SAM_ORIGINAL_MIN = 1
|
||||
|
||||
QUESTIONNAIRE_ID_SAM = {
|
||||
"event_stress": 87,
|
||||
"event_threat": 88,
|
||||
|
@ -23,107 +20,10 @@ GROUP_QUESTIONNAIRES_BY = [
|
|||
"device_id",
|
||||
"esm_session",
|
||||
]
|
||||
# Each questionnaire occurs only once within each esm_session on the same device
|
||||
# within the same participant.
|
||||
|
||||
|
||||
# Known wordings of the Stress Appraisal Measure questions, keyed by
# question ID (87-100).
#
# Each value is a tuple holding the English, Dutch and Slovenian wording of the
# question; IDs 98-100 carry a second Slovenian variant. Several entries are
# only the beginning of the full sentence, so the tuples are presumably meant
# to be passed to ``str.startswith`` as a set of accepted prefixes -- verify
# against the caller.
DICT_SAM_QUESTION_IDS = {
    87: (
        "Was there a particular event that created tension in you?",
        "Was er een bepaalde gebeurtenis die spanning veroorzaakte?",
        "Je prišlo do kakega dogodka, ki je v vas ustvaril napetost?",
    ),
    88: (
        "Did this event make you feel anxious?",
        "Voelde je je angstig door deze gebeurtenis?",
        "Ste se zaradi tega dogodka počutili tesnob",
    ),
    89: (
        "Will the outcome of this event be negative?",
        "Zal de uitkomst van deze gebeurtenis negatief zijn?",
        "Bo izid tega dogodka negativen?",
    ),
    90: (
        "How threatening was this event?",
        "Hoe bedreigend was deze gebeurtenis?",
        "Kako grozeč je bil ta dogodek?",
    ),
    91: (
        "Is this going to have a negative impact on you?",
        "Zal dit een negatieve impact op je hebben?",
        "Ali bo to negativno vplivalo na vas?",
    ),
    92: (
        "Is this going to have a positive impact on you?",
        "Zal dit een positief effect op je hebben?",
        "Ali bo to pozitivno vplivalo na vas?",
    ),
    93: (
        "How eager are you to tackle this event?",
        "Hoe graag wil je deze gebeurtenis aanpakken?",
        "Kako zagnani ste bili",
    ),
    94: (
        "To what extent can you become a stronger person because of this event?",
        "In welke mate kan je een sterkere persoon worden door deze gebeurtenis?",
        "V kolikšni meri lahko zaradi tega dogodka postanete močnejša oseba?",
    ),
    95: (
        "To what extent are you excited thinking about the outcome of this event?",
        "In welke mate ben je enthousiast bij de gedachte",
        "V kolikšni meri vas misel na izid tega dogodka navdušuje?",
    ),
    96: (
        "At what time did this event occur?",
        "Hoe laat vond deze gebeurtenis plaats?",
        "Kdaj se je ta dogodek zgodil?",
    ),
    97: (
        "How long did this event last?",
        "Hoe lang duurde deze gebeurtenis?",
        "Kako dolgo je trajal ta dogodek?",
    ),
    98: (
        "Was/is this event work-related?",
        "Was/is deze gebeurtenis werkgerelateerd?",
        "Je (bil) ta dogodek povezan s službo?",
        "Je bil ali je ta dogodek povezan s službo?",
    ),
    99: (
        "Did this overall period create tension in you?",
        "Heeft deze globale periode spanning veroorzaakt?",
        "Je to obdobje kot celota v vas ustvarilo napetost?",
        "Je to celo obdobje v vas ustvarilo napetost?",
    ),
    100: (
        "To what extent do you perceive this overall period as stressful?",
        "In welke mate ervaar je deze globale periode als stressvol?",
        "V kolikšni meri ste to obdobje dojemali kot stresno?",
        "V kolikšni meri ste celo to obdobje dojemali kot stresno?",
    ),
}
|
||||
# Each questionnaire occurs only once within each esm_session on the same device within the same participant.
|
||||
|
||||
|
||||
def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Extract information about stressful events.
|
||||
|
||||
Participants were asked: "Was there a particular event that created tension in you?"
|
||||
Then a subset of questions related to this event followed.
|
||||
This function goes through the follow-up questions one by one
|
||||
and preprocesses them, so that it adds new columns to the dataframe.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df_esm: pd.DataFrame
|
||||
A raw dataframe of all ESM data.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df_esm_events: pd.DataFrame
|
||||
A cleaned up df of Stress Appraisal Measure items with additional columns.
|
||||
|
||||
"""
|
||||
# 0. Select only questions from Stress Appraisal Measure.
|
||||
df_esm_preprocessed = features.esm.preprocess_esm(df_esm)
|
||||
df_esm_sam = df_esm_preprocessed[
|
||||
|
@ -178,8 +78,7 @@ def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
|
|||
|
||||
def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
This function calculates challenge and threat
|
||||
(two Stress Appraisal Measure subscales) means,
|
||||
This function calculates challenge and threat (two Stress Appraisal Measure subscales) means,
|
||||
for each ESM session (within participants and devices).
|
||||
It creates a grouped dataframe with means in two columns.
|
||||
|
||||
|
@ -191,8 +90,7 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
|
|||
Returns
|
||||
-------
|
||||
df_esm_event_threat_challenge_mean_wide: pd.DataFrame
|
||||
A dataframe of unique ESM sessions (by participants and devices)
|
||||
with threat and challenge means.
|
||||
A dataframe of unique ESM sessions (by participants and devices) with threat and challenge means.
|
||||
"""
|
||||
# Select only threat and challenge assessments for events
|
||||
df_esm_event_threat_challenge = df_esm_sam_clean[
|
||||
|
@ -214,8 +112,8 @@ def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataF
|
|||
aggfunc="mean",
|
||||
)
|
||||
# Drop unnecessary column values.
|
||||
df_esm_event_threat_challenge_mean_wide.columns = (
|
||||
df_esm_event_threat_challenge_mean_wide.columns.get_level_values(1)
|
||||
df_esm_event_threat_challenge_mean_wide.columns = df_esm_event_threat_challenge_mean_wide.columns.get_level_values(
|
||||
1
|
||||
)
|
||||
df_esm_event_threat_challenge_mean_wide.columns.name = None
|
||||
df_esm_event_threat_challenge_mean_wide.rename(
|
||||
|
@ -291,12 +189,10 @@ def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
|||
|
||||
def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
This function only serves to convert the string datetime answer
|
||||
into a real datetime type.
|
||||
Errors during this conversion are coerced, meaning that non-datetime answers
|
||||
are assigned Not a Time (NaT).
|
||||
NOTE: Since the only available non-datetime answer to this question was
|
||||
"0 - I do not remember", the NaTs can be interpreted to mean this.
|
||||
This function only serves to convert the string datetime answer into a real datetime type.
|
||||
Errors during this conversion are coerced, meaning that non-datetime answers are assigned Not a Time (NaT).
|
||||
NOTE: Since the only available non-datetime answer to this question was "0 - I do not remember",
|
||||
the NaTs can be interpreted to mean this.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
@ -312,13 +208,9 @@ def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
|||
df_esm_sam_clean["questionnaire_id"] == QUESTIONNAIRE_ID_SAM.get("event_time")
|
||||
].assign(
|
||||
event_time=lambda x: pd.to_datetime(
|
||||
x.esm_user_answer,
|
||||
errors="coerce",
|
||||
format="%Y-%m-%d %H:%M:%S %z",
|
||||
exact=True,
|
||||
x.esm_user_answer, errors="coerce", infer_datetime_format=True, exact=True
|
||||
)
|
||||
)
|
||||
# Example answer: 2020-09-29 00:05:00 +0200
|
||||
return df_esm_event_time
|
||||
|
||||
|
||||
|
@ -349,12 +241,9 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
|||
== QUESTIONNAIRE_ID_SAM.get("event_duration")
|
||||
].assign(
|
||||
event_duration=lambda x: pd.to_datetime(
|
||||
x.esm_user_answer.str.slice(start=0, stop=-6),
|
||||
errors="coerce",
|
||||
format="%Y-%m-%d %H:%M:%S",
|
||||
x.esm_user_answer.str.slice(start=0, stop=-6), errors="coerce"
|
||||
).dt.time
|
||||
)
|
||||
# Example answer: 2020-09-29 00:05:00 +0200
|
||||
# TODO Explore the values recorded in event_duration and possibly fix mistakes.
|
||||
# For example, participants reported setting 23:50:00 instead of 00:50:00.
|
||||
|
||||
|
@ -362,7 +251,7 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
|||
# we can determine whether:
|
||||
# - this event is still going on ("1 - It is still going on")
|
||||
# - the participant couldn't remember its duration ("0 - I do not remember")
|
||||
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm
|
||||
# Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm,
|
||||
# but only the numeric types of questions and answers.
|
||||
# Since this was of "datetime" type, convert these specific answers here again.
|
||||
df_esm_event_duration["event_duration_info"] = np.nan
|
||||
|
@ -375,5 +264,4 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
|
|||
return df_esm_event_duration
|
||||
|
||||
|
||||
# TODO: How many questions about the stressfulness of the period were asked
|
||||
# and how does this relate to events?
|
||||
# TODO: How many questions about the stressfulness of the period were asked and how does this relate to events?
|
||||
|
|
|
@ -1,123 +1,71 @@
|
|||
import pandas as pd
|
||||
import xgboost as xg
|
||||
from lightgbm import LGBMClassifier
|
||||
from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
|
||||
from sklearn.dummy import DummyClassifier
|
||||
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
|
||||
from lightgbm import LGBMClassifier
|
||||
import xgboost as xg
|
||||
|
||||
|
||||
class ClassificationModels:
|
||||
class ClassificationModels():
|
||||
|
||||
def __init__(self):
|
||||
self.cmodels = self.init_classification_models()
|
||||
|
||||
|
||||
def get_cmodels(self):
|
||||
return self.cmodels
|
||||
|
||||
def init_classification_models(self):
|
||||
cmodels = {
|
||||
"dummy_classifier": {
|
||||
"model": DummyClassifier(strategy="most_frequent"),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
'dummy_classifier': {
|
||||
'model': DummyClassifier(strategy="most_frequent"),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
"logistic_regression": {
|
||||
"model": linear_model.LogisticRegression(max_iter=1000),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
'logistic_regression': {
|
||||
'model': linear_model.LogisticRegression(max_iter=1000),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
"support_vector_machine": {"model": svm.SVC(), "metrics": [0, 0, 0, 0]},
|
||||
"gaussian_naive_bayes": {
|
||||
"model": naive_bayes.GaussianNB(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
'support_vector_machine': {
|
||||
'model': svm.SVC(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
"stochastic_gradient_descent_classifier": {
|
||||
"model": linear_model.SGDClassifier(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
'gaussian_naive_bayes': {
|
||||
'model': naive_bayes.GaussianNB(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
"knn": {"model": neighbors.KNeighborsClassifier(), "metrics": [0, 0, 0, 0]},
|
||||
"decision_tree": {
|
||||
"model": tree.DecisionTreeClassifier(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
'stochastic_gradient_descent_classifier': {
|
||||
'model': linear_model.SGDClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
"random_forest_classifier": {
|
||||
"model": ensemble.RandomForestClassifier(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
'knn': {
|
||||
'model': neighbors.KNeighborsClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
"gradient_boosting_classifier": {
|
||||
"model": ensemble.GradientBoostingClassifier(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
'decision_tree': {
|
||||
'model': tree.DecisionTreeClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
"lgbm_classifier": {"model": LGBMClassifier(), "metrics": [0, 0, 0, 0]},
|
||||
"XGBoost_classifier": {
|
||||
"model": xg.sklearn.XGBClassifier(),
|
||||
"metrics": [0, 0, 0, 0],
|
||||
'random_forest_classifier': {
|
||||
'model': ensemble.RandomForestClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
'gradient_boosting_classifier': {
|
||||
'model': ensemble.GradientBoostingClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
'lgbm_classifier': {
|
||||
'model': LGBMClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
},
|
||||
'XGBoost_classifier': {
|
||||
'model': xg.sklearn.XGBClassifier(),
|
||||
'metrics': [0, 0, 0, 0]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return cmodels
|
||||
|
||||
|
||||
def get_total_models_scores(self, n_clusters=1):
|
||||
scores = pd.DataFrame(columns=["method", "metric", "mean"])
|
||||
for model_title, model in self.cmodels.items():
|
||||
scores_df = pd.DataFrame(columns=["method", "metric", "mean"])
|
||||
print("\n************************************\n")
|
||||
print("Current model:", model_title, end="\n")
|
||||
print("Acc:", model["metrics"][0] / n_clusters)
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
pd.DataFrame(
|
||||
{
|
||||
"method": model_title,
|
||||
"metric": "test_accuracy",
|
||||
"mean": model["metrics"][0] / n_clusters,
|
||||
},
|
||||
index=[0],
|
||||
),
|
||||
],
|
||||
ignore_index=True,
|
||||
)
|
||||
print("Precision:", model["metrics"][1] / n_clusters)
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
pd.DataFrame(
|
||||
{
|
||||
"method": model_title,
|
||||
"metric": "test_precision",
|
||||
"mean": model["metrics"][1] / n_clusters,
|
||||
},
|
||||
index=[0],
|
||||
),
|
||||
],
|
||||
ignore_index=True,
|
||||
)
|
||||
print("Recall:", model["metrics"][2] / n_clusters)
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
pd.DataFrame(
|
||||
{
|
||||
"method": model_title,
|
||||
"metric": "test_recall",
|
||||
"mean": model["metrics"][2] / n_clusters,
|
||||
},
|
||||
index=[0],
|
||||
),
|
||||
],
|
||||
ignore_index=True,
|
||||
)
|
||||
print("F1:", model["metrics"][3] / n_clusters)
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
pd.DataFrame(
|
||||
{
|
||||
"method": model_title,
|
||||
"metric": "test_f1",
|
||||
"mean": model["metrics"][3] / n_clusters,
|
||||
},
|
||||
index=[0],
|
||||
),
|
||||
],
|
||||
ignore_index=True,
|
||||
)
|
||||
scores = pd.concat([scores, scores_df])
|
||||
return scores
|
||||
print("Acc:", model['metrics'][0]/n_clusters)
|
||||
print("Precision:", model['metrics'][1]/n_clusters)
|
||||
print("Recall:", model['metrics'][2]/n_clusters)
|
||||
print("F1:", model['metrics'][3]/n_clusters)
|
|
@ -1,24 +1,15 @@
|
|||
from pathlib import Path
|
||||
from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.dummy import DummyRegressor, DummyClassifier
|
||||
|
||||
from xgboost import XGBRegressor, XGBClassifier
|
||||
import xgboost as xg
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn import (
|
||||
ensemble,
|
||||
gaussian_process,
|
||||
kernel_ridge,
|
||||
linear_model,
|
||||
naive_bayes,
|
||||
svm,
|
||||
)
|
||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
||||
from sklearn.metrics import confusion_matrix
|
||||
from sklearn.model_selection import (
|
||||
BaseCrossValidator,
|
||||
LeaveOneGroupOut,
|
||||
StratifiedKFold,
|
||||
cross_validate,
|
||||
)
|
||||
from xgboost import XGBClassifier, XGBRegressor
|
||||
import numpy as np
|
||||
|
||||
|
||||
def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
|
||||
|
@ -74,116 +65,52 @@ def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> P
|
|||
full_path = folder / export_filename
|
||||
return full_path
|
||||
|
||||
|
||||
def insert_row(df, row):
    """
    Return a copy of a dataframe with one extra row appended at the end.

    The input dataframe is not modified. The index of the result is rebuilt
    from scratch (0, 1, 2, ...), so any existing index labels are discarded.

    Parameters
    ----------
    df: pd.DataFrame
        The dataframe to extend. Its columns define the columns of the new row.
    row: list-like or dict
        Values of the new row. It is wrapped in a one-element list and handed to
        the pd.DataFrame constructor together with the columns of df.

    Returns
    -------
    pd.DataFrame
        A new dataframe containing all rows of df followed by the new row.
    """
    # Build a single-row frame that shares the column layout of df.
    new_row = pd.DataFrame([row], columns=df.columns)
    return pd.concat([df, new_row], ignore_index=True)
|
||||
|
||||
def prepare_regression_model_input(input_csv):
|
||||
|
||||
def impute_encode_categorical_features(model_input: pd.DataFrame) -> pd.DataFrame:
|
||||
categorical_feature_col_names = [
|
||||
"gender",
|
||||
"startlanguage",
|
||||
"limesurvey_demand_control_ratio_quartile",
|
||||
]
|
||||
additional_categorical_features = [
|
||||
col
|
||||
for col in model_input.columns
|
||||
if "mostcommonactivity" in col or "homelabel" in col
|
||||
]
|
||||
categorical_feature_col_names += additional_categorical_features
|
||||
model_input = pd.read_csv(input_csv)
|
||||
|
||||
categorical_features = model_input[categorical_feature_col_names].copy()
|
||||
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
|
||||
data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
|
||||
|
||||
categorical_feature_colnames = ["gender", "startlanguage", "limesurvey_demand_control_ratio_quartile"]
|
||||
additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
|
||||
categorical_feature_colnames += additional_categorical_features
|
||||
#TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
|
||||
categorical_features = data_x[categorical_feature_colnames].copy()
|
||||
mode_categorical_features = categorical_features.mode().iloc[0]
|
||||
# fillna with mode
|
||||
categorical_features = categorical_features.fillna(mode_categorical_features)
|
||||
# one-hot encoding
|
||||
categorical_features = categorical_features.apply(
|
||||
lambda col: col.astype("category")
|
||||
)
|
||||
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
|
||||
if not categorical_features.empty:
|
||||
categorical_features = pd.get_dummies(categorical_features)
|
||||
|
||||
numerical_features = model_input.drop(categorical_feature_col_names, axis=1)
|
||||
numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
|
||||
|
||||
model_input = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
return model_input
|
||||
train_x = pd.concat([numerical_features, categorical_features], axis=1)
|
||||
|
||||
return train_x, data_y, data_groups
|
||||
|
||||
|
||||
def prepare_sklearn_data_format(
|
||||
model_input: pd.DataFrame, cv_method: str = "logo"
|
||||
) -> tuple:
|
||||
index_columns = [
|
||||
"local_segment",
|
||||
"local_segment_label",
|
||||
"local_segment_start_datetime",
|
||||
"local_segment_end_datetime",
|
||||
]
|
||||
model_input.set_index(index_columns, inplace=True)
|
||||
def run_all_regression_models(input_csv):
|
||||
# Prepare data
|
||||
data_x, data_y, data_groups = prepare_regression_model_input(input_csv)
|
||||
|
||||
if cv_method == "half_logo":
|
||||
model_input["pid_index"] = model_input.groupby("pid").cumcount()
|
||||
model_input["pid_count"] = model_input.groupby("pid")["pid"].transform("count")
|
||||
|
||||
model_input["pid_index"] = (
|
||||
model_input["pid_index"] / model_input["pid_count"] + 1
|
||||
).round()
|
||||
model_input["pid_half"] = (
|
||||
model_input["pid"] + "_" + model_input["pid_index"].astype(int).astype(str)
|
||||
)
|
||||
|
||||
data_x, data_y, data_groups = (
|
||||
model_input.drop(["target", "pid", "pid_index", "pid_half"], axis=1),
|
||||
model_input["target"],
|
||||
model_input["pid_half"],
|
||||
)
|
||||
else:
|
||||
data_x, data_y, data_groups = (
|
||||
model_input.drop(["target", "pid"], axis=1),
|
||||
model_input["target"],
|
||||
model_input["pid"],
|
||||
)
|
||||
return data_x, data_y, data_groups
|
||||
|
||||
|
||||
def prepare_cross_validator(
    data_x: pd.DataFrame,
    data_y: pd.DataFrame,
    data_groups: pd.DataFrame,
    cv_method: str = "logo",
    n_splits: int = 5,
    random_state=None,
) -> BaseCrossValidator:
    """
    Build the cross-validation splitter that corresponds to cv_method.

    Parameters
    ----------
    data_x: pd.DataFrame
        Model features. Only used for the leave-one-group-out methods.
    data_y: pd.DataFrame
        Model target. Only used for the leave-one-group-out methods.
    data_groups: pd.DataFrame
        Group label of every row. Only used for the leave-one-group-out methods.
    cv_method: str
        "logo" and "half_logo" both select LeaveOneGroupOut.
        Any other value selects a shuffled StratifiedKFold.
    n_splits: int
        Number of folds of the StratifiedKFold splitter (default 5, as before).
        Ignored for the leave-one-group-out methods.
    random_state: int, RandomState instance or None
        Seed for the shuffling done by StratifiedKFold. The default, None,
        keeps the previous unseeded behaviour; pass an integer to make
        the folds reproducible. Ignored for the leave-one-group-out methods.

    Returns
    -------
    cv: BaseCrossValidator
        The splitter, ready to be passed as cv to scikit-learn functions.
    """
    if cv_method in ("logo", "half_logo"):
        cv = LeaveOneGroupOut()
        # The returned number of splits is discarded. The call is kept because
        # removing it could change when invalid groups are reported
        # (presumably it serves as an early sanity check -- TODO confirm).
        cv.get_n_splits(
            data_x,
            data_y,
            groups=data_groups,
        )
    else:
        # NOTE(review): every unrecognised cv_method silently ends up here.
        cv = StratifiedKFold(
            n_splits=n_splits, shuffle=True, random_state=random_state
        )
    return cv
|
||||
|
||||
|
||||
def aggregate_and_transpose(df: pd.DataFrame, statistics=None) -> pd.DataFrame:
|
||||
if statistics is None:
|
||||
statistics = ["max", "mean"]
|
||||
return (
|
||||
df.agg(statistics)
|
||||
.transpose()
|
||||
.reset_index()
|
||||
.rename(columns={"index": "test_metric"})
|
||||
# Prepare cross validation
|
||||
logo = LeaveOneGroupOut()
|
||||
logo.get_n_splits(
|
||||
data_x,
|
||||
data_y,
|
||||
groups=data_groups,
|
||||
)
|
||||
|
||||
|
||||
def run_all_regression_models(
|
||||
data_x: pd.DataFrame,
|
||||
data_y: pd.DataFrame,
|
||||
data_groups: pd.DataFrame,
|
||||
cross_validator: BaseCrossValidator,
|
||||
) -> pd.DataFrame:
|
||||
metrics = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]
|
||||
metrics = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']
|
||||
test_metrics = ["test_" + metric for metric in metrics]
|
||||
scores = pd.DataFrame(columns=["method", "test_metric", "max", "nanmedian"])
|
||||
scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
|
||||
|
||||
# Validate models
|
||||
dummy_regr = DummyRegressor(strategy="mean")
|
||||
|
@ -192,58 +119,53 @@ def run_all_regression_models(
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Dummy model:")
|
||||
print("R^2: ", np.nanmedian(dummy_regr_scores["test_r2"]))
|
||||
|
||||
print("R^2: ", np.nanmedian(dummy_regr_scores['test_r2']))
|
||||
|
||||
scores_df = pd.DataFrame(dummy_regr_scores)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "dummy"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del dummy_regr
|
||||
del dummy_regr_scores
|
||||
|
||||
lin_reg = linear_model.LinearRegression()
|
||||
lin_reg_rapids = linear_model.LinearRegression()
|
||||
lin_reg_scores = cross_validate(
|
||||
lin_reg,
|
||||
lin_reg_rapids,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Linear regression:")
|
||||
print("R^2: ", np.nanmedian(lin_reg_scores["test_r2"]))
|
||||
print("R^2: ", np.nanmedian(lin_reg_scores['test_r2']))
|
||||
|
||||
scores_df = pd.DataFrame(lin_reg_scores)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "linear_reg"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del lin_reg
|
||||
del lin_reg_scores
|
||||
|
||||
ridge_reg = linear_model.Ridge(alpha=0.5)
|
||||
ridge_reg = linear_model.Ridge(alpha=.5)
|
||||
ridge_reg_scores = cross_validate(
|
||||
ridge_reg,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Ridge regression")
|
||||
|
||||
scores_df = pd.DataFrame(ridge_reg_scores)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "ridge_reg"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del ridge_reg
|
||||
del ridge_reg_scores
|
||||
|
||||
|
||||
lasso_reg = linear_model.Lasso(alpha=0.1)
|
||||
lasso_reg_score = cross_validate(
|
||||
|
@ -251,18 +173,16 @@ def run_all_regression_models(
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Lasso regression")
|
||||
|
||||
scores_df = pd.DataFrame(lasso_reg_score)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "lasso_reg"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del lasso_reg
|
||||
del lasso_reg_score
|
||||
|
||||
bayesian_ridge_reg = linear_model.BayesianRidge()
|
||||
bayesian_ridge_reg_score = cross_validate(
|
||||
|
@ -270,18 +190,16 @@ def run_all_regression_models(
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Bayesian Ridge")
|
||||
|
||||
scores_df = pd.DataFrame(bayesian_ridge_reg_score)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "bayesian_ridge"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del bayesian_ridge_reg
|
||||
del bayesian_ridge_reg_score
|
||||
|
||||
ransac_reg = linear_model.RANSACRegressor()
|
||||
ransac_reg_score = cross_validate(
|
||||
|
@ -289,18 +207,16 @@ def run_all_regression_models(
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("RANSAC (outlier robust regression)")
|
||||
|
||||
scores_df = pd.DataFrame(ransac_reg_score)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "RANSAC"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del ransac_reg
|
||||
del ransac_reg_score
|
||||
|
||||
svr = svm.SVR()
|
||||
svr_score = cross_validate(
|
||||
|
@ -308,18 +224,16 @@ def run_all_regression_models(
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Support vector regression")
|
||||
|
||||
|
||||
scores_df = pd.DataFrame(svr_score)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "SVR"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del svr
|
||||
del svr_score
|
||||
|
||||
kridge = kernel_ridge.KernelRidge()
|
||||
kridge_score = cross_validate(
|
||||
|
@ -327,18 +241,16 @@ def run_all_regression_models(
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Kernel Ridge regression")
|
||||
|
||||
|
||||
scores_df = pd.DataFrame(kridge_score)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "kernel_ridge"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del kridge
|
||||
del kridge_score
|
||||
|
||||
gpr = gaussian_process.GaussianProcessRegressor()
|
||||
gpr_score = cross_validate(
|
||||
|
@ -346,18 +258,16 @@ def run_all_regression_models(
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Gaussian Process Regression")
|
||||
|
||||
scores_df = pd.DataFrame(gpr_score)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "gaussian_proc"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del gpr
|
||||
del gpr_score
|
||||
|
||||
rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
|
||||
rfr_score = cross_validate(
|
||||
|
@ -365,18 +275,16 @@ def run_all_regression_models(
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Random Forest Regression")
|
||||
|
||||
scores_df = pd.DataFrame(rfr_score)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "random_forest"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del rfr
|
||||
del rfr_score
|
||||
|
||||
xgb = XGBRegressor()
|
||||
xgb_score = cross_validate(
|
||||
|
@ -384,18 +292,16 @@ def run_all_regression_models(
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("XGBoost Regressor")
|
||||
|
||||
scores_df = pd.DataFrame(xgb_score)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "XGBoost"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del xgb
|
||||
del xgb_score
|
||||
|
||||
ada = ensemble.AdaBoostRegressor()
|
||||
ada_score = cross_validate(
|
||||
|
@ -403,328 +309,151 @@ def run_all_regression_models(
|
|||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
cv=logo,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
scoring=metrics
|
||||
)
|
||||
print("ADA Boost Regressor")
|
||||
|
||||
scores_df = pd.DataFrame(ada_score)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", np.nanmedian])
|
||||
scores_df = scores_df.agg(['max', np.nanmedian]).transpose()
|
||||
scores_df["method"] = "ADA_boost"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del ada
|
||||
del ada_score
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
def confusion_matrix_scorer(clf, X, y):
|
||||
y_pred = clf.predict(X)
|
||||
cm = confusion_matrix(y, y_pred)
|
||||
return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]}
|
||||
|
||||
|
||||
def aggregate_confusion_matrix(scores_dict: dict) -> pd.DataFrame:
|
||||
scores_aggregated = aggregate_and_transpose(
|
||||
pd.DataFrame(scores_dict), statistics=["sum"]
|
||||
)
|
||||
return scores_aggregated[
|
||||
~scores_aggregated.test_metric.isin(["fit_time", "score_time"])
|
||||
]
|
||||
|
||||
|
||||
def run_all_classification_models(
|
||||
data_x: pd.DataFrame,
|
||||
data_y: pd.DataFrame,
|
||||
data_groups: pd.DataFrame,
|
||||
cross_validator: BaseCrossValidator,
|
||||
):
|
||||
data_y_value_counts = data_y.value_counts()
|
||||
if len(data_y_value_counts) == 1:
|
||||
raise (ValueError("There is only one unique value in data_y."))
|
||||
if len(data_y_value_counts) == 2:
|
||||
metrics = ["accuracy", "average_precision", "recall", "f1"]
|
||||
else:
|
||||
metrics = ["accuracy", "precision_micro", "recall_micro", "f1_micro"]
|
||||
|
||||
def run_all_classification_models(data_x, data_y, data_groups, cv_method):
|
||||
metrics = ['accuracy', 'average_precision', 'recall', 'f1']
|
||||
test_metrics = ["test_" + metric for metric in metrics]
|
||||
|
||||
scores = pd.DataFrame(columns=["method", "test_metric", "max", "mean"])
|
||||
scores = pd.DataFrame(columns=["method", "max", "mean"])
|
||||
|
||||
dummy_class = DummyClassifier(strategy="most_frequent")
|
||||
|
||||
dummy_score = cross_validate(
|
||||
dummy_class,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
error_score="raise",
|
||||
scoring=metrics,
|
||||
)
|
||||
dummy_confusion_matrix = cross_validate(
|
||||
dummy_class,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
error_score="raise",
|
||||
scoring=confusion_matrix_scorer,
|
||||
dummy_class,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
error_score='raise',
|
||||
scoring=metrics
|
||||
)
|
||||
print("Dummy")
|
||||
|
||||
scores_df = pd.DataFrame(dummy_score)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(dummy_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "dummy_classifier"
|
||||
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||
scores_df["method"] = "Dummy"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del dummy_class
|
||||
del dummy_score
|
||||
del dummy_confusion_matrix
|
||||
|
||||
logistic_regression = linear_model.LogisticRegression()
|
||||
|
||||
log_reg_scores = cross_validate(
|
||||
logistic_regression,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
log_reg_confusion_matrix = cross_validate(
|
||||
logistic_regression,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
logistic_regression,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Logistic regression")
|
||||
|
||||
scores_df = pd.DataFrame(log_reg_scores)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(log_reg_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "logistic_regression"
|
||||
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||
scores_df["method"] = "logistic_reg"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del logistic_regression
|
||||
del log_reg_scores
|
||||
del log_reg_confusion_matrix
|
||||
|
||||
svc = svm.SVC()
|
||||
|
||||
svc_scores = cross_validate(
|
||||
svc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
svc_confusion_matrix = cross_validate(
|
||||
svc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
svc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Support Vector Machine")
|
||||
|
||||
scores_df = pd.DataFrame(svc_scores)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(svc_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "SVC"
|
||||
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||
scores_df["method"] = "svc"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del svc
|
||||
del svc_scores
|
||||
del svc_confusion_matrix
|
||||
|
||||
gaussian_nb = naive_bayes.GaussianNB()
|
||||
|
||||
|
||||
gaussian_nb_scores = cross_validate(
|
||||
gaussian_nb,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
gaussian_nb_confusion_matrix = cross_validate(
|
||||
gaussian_nb,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
gaussian_nb,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Gaussian Naive Bayes")
|
||||
|
||||
scores_df = pd.DataFrame(gaussian_nb_scores)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(gaussian_nb_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||
scores_df["method"] = "gaussian_naive_bayes"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del gaussian_nb
|
||||
del gaussian_nb_scores
|
||||
del gaussian_nb_confusion_matrix
|
||||
|
||||
sgdc = linear_model.SGDClassifier()
|
||||
|
||||
sgdc_scores = cross_validate(
|
||||
sgdc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
sgdc_confusion_matrix = cross_validate(
|
||||
sgdc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
sgdc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Stochastic Gradient Descent")
|
||||
|
||||
scores_df = pd.DataFrame(sgdc_scores)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(sgdc_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "stochastic_gradient_descent_classifier"
|
||||
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||
scores_df["method"] = "stochastic_gradient_descent"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del sgdc
|
||||
del sgdc_scores
|
||||
del sgdc_confusion_matrix
|
||||
|
||||
rfc = ensemble.RandomForestClassifier()
|
||||
|
||||
rfc_scores = cross_validate(
|
||||
rfc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
rfc_confusion_matrix = cross_validate(
|
||||
rfc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
rfc,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=metrics
|
||||
)
|
||||
print("Random Forest")
|
||||
|
||||
scores_df = pd.DataFrame(rfc_scores)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(rfc_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "random_forest_classifier"
|
||||
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||
scores_df["method"] = "random_forest"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del rfc
|
||||
del rfc_scores
|
||||
del rfc_confusion_matrix
|
||||
|
||||
xgb_classifier = XGBClassifier()
|
||||
|
||||
xgb_scores = cross_validate(
|
||||
xgb_classifier,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=metrics,
|
||||
)
|
||||
xgb_confusion_matrix = cross_validate(
|
||||
xgb_classifier,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cross_validator,
|
||||
n_jobs=-1,
|
||||
scoring=confusion_matrix_scorer,
|
||||
xgb_classifier,
|
||||
X=data_x,
|
||||
y=data_y,
|
||||
groups=data_groups,
|
||||
cv=cv_method,
|
||||
n_jobs=-1,
|
||||
scoring=metrics
|
||||
)
|
||||
print("XGBoost")
|
||||
|
||||
scores_df = pd.DataFrame(xgb_scores)[test_metrics]
|
||||
scores_df = aggregate_and_transpose(scores_df, statistics=["max", "mean"])
|
||||
scores_df = pd.concat(
|
||||
[
|
||||
scores_df,
|
||||
aggregate_confusion_matrix(xgb_confusion_matrix).rename(
|
||||
columns={"sum": "mean"}
|
||||
# Note: the column is misleadingly renamed to get concise output.
|
||||
),
|
||||
]
|
||||
)
|
||||
scores_df["method"] = "XGBoost_classifier"
|
||||
scores_df = scores_df.agg(['max', 'mean']).transpose()
|
||||
scores_df["method"] = "xgboost"
|
||||
scores = pd.concat([scores, scores_df])
|
||||
del xgb_classifier
|
||||
del xgb_scores
|
||||
del xgb_confusion_matrix
|
||||
|
||||
return scores
|
||||
|
|
|
@ -34,114 +34,18 @@ df_app_categories <- tbl(con, "app_categories") %>%
|
|||
head(df_app_categories)
|
||||
table(df_app_categories$play_store_genre)
|
||||
|
||||
df_app_categories %>%
|
||||
filter(play_store_genre == "not_found") %>%
|
||||
group_by(play_store_response) %>%
|
||||
count()
|
||||
# All "not_found" have an HTTP status of 404.
|
||||
|
||||
df_app_categories %>%
|
||||
filter(play_store_genre == "not_found") %>%
|
||||
group_by(package_name) %>%
|
||||
count() %>%
|
||||
arrange(desc(n))
|
||||
# All "not_found" apps are unique.
|
||||
|
||||
# Exclude phone manufacturers, custom ROM names and similar.
|
||||
manufacturers <- c(
|
||||
"samsung",
|
||||
"oneplus",
|
||||
"huawei",
|
||||
"xiaomi",
|
||||
"lge",
|
||||
"motorola",
|
||||
"miui",
|
||||
"lenovo",
|
||||
"oppo",
|
||||
"mediatek"
|
||||
)
|
||||
custom_rom <- c("coloros", "lineageos", "myos", "cyanogenmod", "foundation.e")
|
||||
other <- c("android", "wssyncmldm")
|
||||
|
||||
grep_pattern <- paste(c(manufacturers, custom_rom, other), collapse = "|")
|
||||
|
||||
rows_os_manufacturer <- grepl(grep_pattern, df_app_categories$package_name)
|
||||
|
||||
# Explore what remains after excluding above.
|
||||
df_app_categories[!rows_os_manufacturer, ] %>%
|
||||
filter(play_store_genre == "not_found")
|
||||
|
||||
# Also check the relationship between is_system_app and System category.
|
||||
tbl(con, "applications") %>%
|
||||
filter(is_system_app, play_store_genre != "System") %>%
|
||||
count()
|
||||
# They are perfectly correlated.
|
||||
|
||||
# Manually classify apps
|
||||
df_app_categories[df_app_categories$play_store_genre == "not_found",] <-
|
||||
df_app_categories %>%
|
||||
filter(play_store_genre == "not_found") %>%
|
||||
mutate(
|
||||
play_store_genre =
|
||||
case_when(
|
||||
str_detect(str_to_lower(package_name), grep_pattern) ~ "System",
|
||||
str_detect(str_to_lower(package_name), "straw") ~ "STRAW",
|
||||
str_detect(str_to_lower(package_name), "chromium") ~ "Communication", # Same as chrome.
|
||||
str_detect(str_to_lower(package_name), "skype") ~ "Communication", # Skype Lite not classified.
|
||||
str_detect(str_to_lower(package_name), "imsservice") ~ "Communication", # IP Multimedia Subsystem
|
||||
str_detect(str_to_lower(package_name), paste(c("covid", "empatica"), collapse = "|")) ~ "Medical",
|
||||
str_detect(str_to_lower(package_name), paste(c("libri", "tachiyomi"), collapse = "|")) ~ "Books & Reference",
|
||||
str_detect(str_to_lower(package_name), paste(c("bricks", "chess"), collapse = "|")) ~ "Casual",
|
||||
str_detect(str_to_lower(package_name), "weather") ~ "Weather",
|
||||
str_detect(str_to_lower(package_name), "excel") ~ "Productivity",
|
||||
str_detect(str_to_lower(package_name), paste(c("qr", "barcode", "archimedes", "mixplorer", "winrar", "filemanager", "shot", "faceunlock", "signin", "milink"), collapse = "|")) ~ "Tools",
|
||||
str_detect(str_to_lower(package_name), "stupeflix") ~ "Photography",
|
||||
str_detect(str_to_lower(package_name), "anyme") ~ "Entertainment",
|
||||
str_detect(str_to_lower(package_name), "vanced") ~ "Video Players & Editors",
|
||||
str_detect(str_to_lower(package_name), paste(c("music", "radio", "dolby"), collapse = "|")) ~ "Music & Audio",
|
||||
str_detect(str_to_lower(package_name), paste(c("tensorflow", "object_detection"), collapse = "|")) ~ "Education",
|
||||
.default = play_store_genre
|
||||
)
|
||||
)
|
||||
|
||||
# Explore what remains after classifying above.
|
||||
df_app_categories %>%
|
||||
filter(play_store_genre == "not_found")
|
||||
|
||||
# After this, 13 applications remain, which I will classify as "Other".
|
||||
|
||||
# Correct some mistakes
|
||||
# And classify 'not_found'
|
||||
df_app_categories %<>%
|
||||
mutate(
|
||||
play_store_genre = {
|
||||
function(x) {
|
||||
case_when(
|
||||
x == "Education,Education" ~ "Education",
|
||||
x == "EducationEducation" ~ "Education",
|
||||
x == "not_found" ~ "Other",
|
||||
.default = x
|
||||
)
|
||||
}
|
||||
}(play_store_genre)
|
||||
) %>%
|
||||
select(-package_name) %>%
|
||||
rename(
|
||||
genre = play_store_genre,
|
||||
package_name = package_hash
|
||||
)
|
||||
|
||||
table(df_app_categories$genre)
|
||||
|
||||
df_app_categories %>%
|
||||
group_by(genre) %>%
|
||||
count() %>%
|
||||
arrange(desc(n)) %>%
|
||||
write_csv("play_store_categories_count.csv")
|
||||
|
||||
write_csv(
|
||||
x = select(df_app_categories, c(package_name, genre)),
|
||||
file = "play_store_application_genre_catalogue.csv"
|
||||
df_app_categories %<>% mutate(
|
||||
play_store_genre = {
|
||||
function(x) {
|
||||
case_when(
|
||||
x == "Education,Education" ~ "Education",
|
||||
x == "EducationEducation" ~ "Education",
|
||||
x == "not_found" ~ "System",
|
||||
.default = x
|
||||
)
|
||||
}
|
||||
}(play_store_genre)
|
||||
)
|
||||
|
||||
dbDisconnect(con)
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
method,metric,max,mean
|
||||
Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
|
||||
Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
|
||||
Dummy,test_recall,0.0,0.0
|
||||
Dummy,test_f1,0.0,0.0
|
||||
logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
|
||||
logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
|
||||
logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
|
||||
logistic_reg,test_f1,0.3909774436090226,0.318943511424051
|
||||
svc,test_accuracy,0.8557046979865772,0.8548446932649828
|
||||
svc,test_average_precision,0.44514416839823046,0.4068200938341621
|
||||
svc,test_recall,0.0,0.0
|
||||
svc,test_f1,0.0,0.0
|
||||
gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
|
||||
gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
|
||||
gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
|
||||
gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
|
||||
stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
|
||||
stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
|
||||
stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
|
||||
stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
|
||||
random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
|
||||
random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
|
||||
random_forest,test_recall,0.4069767441860465,0.35356856455493185
|
||||
random_forest,test_f1,0.5691056910569107,0.5078402513053142
|
||||
xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
|
||||
xgboost,test_average_precision,0.7366643049075349,0.698622165966308
|
||||
xgboost,test_recall,0.5287356321839081,0.44346431435445066
|
||||
xgboost,test_f1,0.638888888888889,0.5633957169928393
|
|
|
@ -0,0 +1,29 @@
|
|||
method,metric,max,mean
|
||||
Dummy,test_accuracy,1.0,0.8524114578096439
|
||||
Dummy,test_average_precision,0.7,0.14758854219035614
|
||||
Dummy,test_recall,0.0,0.0
|
||||
Dummy,test_f1,0.0,0.0
|
||||
logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
|
||||
logistic_reg,test_average_precision,1.0,0.44605167668563583
|
||||
logistic_reg,test_recall,1.0,0.25353566685532386
|
||||
logistic_reg,test_f1,0.823529411764706,0.27951926390778625
|
||||
svc,test_accuracy,1.0,0.8524114578096439
|
||||
svc,test_average_precision,0.9612401707068228,0.44179454944271934
|
||||
svc,test_recall,0.0,0.0
|
||||
svc,test_f1,0.0,0.0
|
||||
gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
|
||||
gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
|
||||
gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
|
||||
gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
|
||||
stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
|
||||
stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
|
||||
stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
|
||||
stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
|
||||
random_forest,test_accuracy,1.0,0.8722158105763481
|
||||
random_forest,test_average_precision,1.0,0.49817066323226833
|
||||
random_forest,test_recall,1.0,0.18161263127840668
|
||||
random_forest,test_f1,1.0,0.2508096532365307
|
||||
xgboost,test_accuracy,1.0,0.8812627400277729
|
||||
xgboost,test_average_precision,1.0,0.5505695112208401
|
||||
xgboost,test_recall,1.0,0.2896161238315027
|
||||
xgboost,test_f1,0.9411764705882353,0.36887408735855665
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,45 +0,0 @@
|
|||
genre,n
|
||||
System,261
|
||||
Tools,96
|
||||
Productivity,71
|
||||
Health & Fitness,60
|
||||
Finance,54
|
||||
Communication,39
|
||||
Music & Audio,39
|
||||
Shopping,38
|
||||
Lifestyle,33
|
||||
Education,28
|
||||
News & Magazines,24
|
||||
Maps & Navigation,23
|
||||
Entertainment,21
|
||||
Business,18
|
||||
Travel & Local,18
|
||||
Books & Reference,16
|
||||
Social,16
|
||||
Weather,16
|
||||
Food & Drink,14
|
||||
Sports,14
|
||||
Other,13
|
||||
Photography,13
|
||||
Puzzle,13
|
||||
Video Players & Editors,12
|
||||
Card,9
|
||||
Casual,9
|
||||
Personalization,8
|
||||
Medical,7
|
||||
Board,5
|
||||
Strategy,4
|
||||
House & Home,3
|
||||
Trivia,3
|
||||
Word,3
|
||||
Adventure,2
|
||||
Art & Design,2
|
||||
Auto & Vehicles,2
|
||||
Dating,2
|
||||
Role Playing,2
|
||||
STRAW,2
|
||||
Simulation,2
|
||||
"Board,Brain Games",1
|
||||
"Entertainment,Music & Video",1
|
||||
Parenting,1
|
||||
Racing,1
|
|
|
@ -1,7 +0,0 @@
|
|||
[tool.isort]
|
||||
profile = "black"
|
||||
py_version = 311
|
||||
skip_gitignore = "true"
|
||||
|
||||
[tool.black]
|
||||
target-version = ["py311"]
|
2
rapids
2
rapids
|
@ -1 +1 @@
|
|||
Subproject commit 059774bda10545a83ab282f59eb7a329fef9ee4c
|
||||
Subproject commit 7b8538ce5152bb6e978cae37fcb7099941e95364
|
5
setup.py
5
setup.py
|
@ -1,7 +1,8 @@
|
|||
import os
|
||||
|
||||
import sqlalchemy.engine.url
|
||||
from dotenv import load_dotenv
|
||||
from sqlalchemy import URL, create_engine
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
load_dotenv()
|
||||
|
@ -10,7 +11,7 @@ testing: bool = False
|
|||
|
||||
db_password = os.getenv("DB_PASSWORD")
|
||||
|
||||
db_uri = URL.create(
|
||||
db_uri = sqlalchemy.engine.url.URL(
|
||||
drivername="postgresql+psycopg2",
|
||||
username="staw_db",
|
||||
password=db_password,
|
||||
|
|
|
@ -1,60 +0,0 @@
|
|||
---
|
||||
title: "Reliability of SAM threat and challenge and COPE"
|
||||
output: html_notebook
|
||||
---
|
||||
|
||||
|
||||
```{r libraries, message=FALSE, warning=FALSE, include=FALSE, cache=FALSE}
|
||||
library(conflicted)
|
||||
library(here)
|
||||
library(tidyverse)
|
||||
library(magrittr)
|
||||
library(lavaan)
|
||||
library(kableExtra)
|
||||
|
||||
conflicts_prefer(
|
||||
readr::col_factor,
|
||||
purrr::discard,
|
||||
dplyr::filter,
|
||||
dplyr::lag,
|
||||
purrr::set_names,
|
||||
tidyr::extract,
|
||||
kableExtra::group_rows
|
||||
)
|
||||
```
|
||||
|
||||
```{r style, include=FALSE, cache=FALSE}
|
||||
styler::style_file(
|
||||
here("statistical_analysis", "scale_reliability.Rmd"),
|
||||
scope = "tokens",
|
||||
indent_by = 4L
|
||||
)
|
||||
```
|
||||
|
||||
The data were preprocessed and cleaned using [expl_esm_labels.py](../exploration/expl_esm_labels.py) script and read as csv here.
|
||||
|
||||
```{r read_data}
|
||||
COL_TYPES <- cols(
|
||||
.default = col_double(),
|
||||
participant_id = col_factor(),
|
||||
username = col_factor(),
|
||||
device_id = col_factor(),
|
||||
esm_trigger = col_factor(),
|
||||
esm_instructions = col_factor(),
|
||||
double_esm_user_answer_timestamp = col_double(),
|
||||
datetime_lj = col_datetime(format = ""),
|
||||
date_lj = col_date(format = ""),
|
||||
time = col_factor(),
|
||||
esm_user_answer = col_factor()
|
||||
)
|
||||
df_SAM <- read_csv(here("data", "raw", "df_esm_SAM_threat_challenge.csv"), col_types = COL_TYPES)
|
||||
df_COPE <- read_csv(here("data", "raw", "df_esm_COPE.csv"), col_types = COL_TYPES)
|
||||
```
|
||||
|
||||
Demonstrate factor analysis for a single participant.
|
||||
|
||||
```{r}
|
||||
df_COPE %>%
|
||||
group_by(question_id, questionnaire_id) %>%
|
||||
count()
|
||||
```
|
|
@ -1,20 +0,0 @@
|
|||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: Default
|
||||
SaveWorkspace: Default
|
||||
AlwaysSaveHistory: Default
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: No
|
||||
NumSpacesForTab: 4
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: Sweave
|
||||
LaTeX: XeLaTeX
|
||||
|
||||
AutoAppendNewline: Yes
|
||||
StripTrailingWhitespace: Yes
|
||||
|
||||
PythonType: conda
|
||||
PythonVersion: 3.11.3
|
||||
PythonPath: E:/ProgramData/mambaforge/envs/straw2analysis/python.exe
|
Loading…
Reference in New Issue