From ae2ca63bc4912ef55a29897203563b5b2fe5c607 Mon Sep 17 00:00:00 2001
From: junos <junos.lukan@ijs.si>
Date: Mon, 3 Jul 2023 17:09:15 +0200
Subject: [PATCH] Define QUESTIONNAIRE IDs and use them.

Clean up docstrings.
---
 .flake8                        |  5 +-
 config/environment.yml         |  1 -
 exploration/expl_esm_labels.py | 54 +++++++++++++------
 features/esm.py                | 98 +++++++++++++++++++++++++++-------
 4 files changed, 121 insertions(+), 37 deletions(-)

diff --git a/.flake8 b/.flake8
index 0d24263..5138cbd 100644
--- a/.flake8
+++ b/.flake8
@@ -1,6 +1,9 @@
 [flake8]
 max-line-length = 88
-extend-ignore = E203
+extend-ignore =
+    E203,
+    # Line too long in docstrings. NOTE(review): this ignores D501, not E501 - confirm.
+    D501
 per-file-ignores =
     exploration/*.py:E501
 docstring-convention = numpy
diff --git a/config/environment.yml b/config/environment.yml
index 084c5c7..9806941 100644
--- a/config/environment.yml
+++ b/config/environment.yml
@@ -16,7 +16,6 @@ dependencies:
   - pandas
   - psycopg2 >= 2.9.1
   - pre-commit
-  - pydocstyle
   - python-dotenv
   - pytz
   - pyprojroot
diff --git a/exploration/expl_esm_labels.py b/exploration/expl_esm_labels.py
index d45371b..f8f3a53 100644
--- a/exploration/expl_esm_labels.py
+++ b/exploration/expl_esm_labels.py
@@ -20,7 +20,7 @@ import datetime
 import seaborn as sns
 
 import participants.query_db
-from features.esm import clean_up_esm, get_esm_data, preprocess_esm
+from features.esm import QUESTIONNAIRE_IDS, clean_up_esm, get_esm_data, preprocess_esm
 from features.esm_JCQ import reverse_jcq_demand_control_scoring
 from features.esm_SAM import extract_stressful_events
 
@@ -48,8 +48,14 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)
 
 # %%
 df_esm_PANAS = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] == 8)
-    | (df_esm_preprocessed["questionnaire_id"] == 9)
+    (
+        df_esm_preprocessed["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["PANAS_positive_affect"]
+    )
+    | (
+        df_esm_preprocessed["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["PANAS_negative_affect"]
+    )
 ]
 df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)
 
@@ -126,8 +132,14 @@ df_SAM_all.head()
 
 # %%
 df_esm_SAM = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 87)
-    & (df_esm_preprocessed["questionnaire_id"] <= 93)
+    (
+        df_esm_preprocessed["questionnaire_id"]
+        >= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
+    )
+    & (
+        df_esm_preprocessed["questionnaire_id"]
+        <= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
+    )
 ]
 df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
 
@@ -135,9 +147,10 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
 # ## Stressful events
 
 # %%
-df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign(
-    stressful_event=lambda x: (x.esm_user_answer_numeric > 0)
-)
+df_esm_SAM_event = df_esm_SAM_clean[
+    df_esm_SAM_clean["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
+].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0))
 
 # %%
 df_esm_SAM_daily_events = (
@@ -191,8 +204,8 @@ df_esm_SAM_daily = (
 
 # %%
 df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
-    (df_esm_SAM_daily["questionnaire_id"] == 88)
-    | (df_esm_SAM_daily["questionnaire_id"] == 89)
+    (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
+    | (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
 ]
 
 # %%
@@ -204,7 +217,8 @@ df_esm_SAM_summary_participant = (
 
 # %%
 df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[
-    df_esm_SAM_summary_participant["questionnaire_id"] == 87
+    df_esm_SAM_summary_participant["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
 ]
 df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"]
 
@@ -218,8 +232,8 @@ sns.displot(
 
 # %%
 df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
-    (df_esm_SAM_summary_participant["questionnaire_id"] == 88)
-    | (df_esm_SAM_summary_participant["questionnaire_id"] == 89)
+    (df_esm_SAM_summary_participant["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
+    | (df_esm_SAM_summary_participant["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
 ]
 df_esm_SAM_threat_challenge_summary_participant[
     "event subscale"
@@ -263,7 +277,8 @@ df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").descri
 
 # %%
 df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
-    df_esm_SAM_summary_participant["questionnaire_id"] == 93
+    df_esm_SAM_summary_participant["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
 ]
 
 # %%
@@ -283,8 +298,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1)
 
 # %%
 df_esm_JCQ_demand_control = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 10)
-    & (df_esm_preprocessed["questionnaire_id"] <= 11)
+    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"])
+    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"])
 ]
 df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)
 
@@ -343,4 +358,11 @@ fig6.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
 if save_figs:
     fig5.figure.savefig("JCQ_std_participant.pdf", dpi=300)
 
+# %% [markdown]
+# # COPE Inventory
+
 # %%
+df_esm_COPE = df_esm_preprocessed[
+    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"])
+    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"])
+]
diff --git a/features/esm.py b/features/esm.py
index 066542b..3d27d77 100644
--- a/features/esm.py
+++ b/features/esm.py
@@ -20,11 +20,47 @@ ANSWER_DAY_OFF = "DayOff3421"
 ANSWER_SET_EVENING = "DayFinishedSetEvening"
 
 MAX_MORNING_LENGTH = 3
-# When the participants was not yet at work at the time of the first (morning) EMA,
+# When the participant was not yet at work at the time of the first (morning) EMA,
 # only three items were answered.
 # Two sleep related items and one indicating NOT starting work yet.
 # Daytime EMAs are all longer, in fact they always consist of at least 6 items.
 
+QUESTIONNAIRE_IDS = {  # Questionnaire name -> value of the "questionnaire_id" column.
+    "sleep_quality": 1,
+    "PANAS_positive_affect": 8,
+    "PANAS_negative_affect": 9,
+    "JCQ_job_demand": 10,
+    "JCQ_job_control": 11,
+    "JCQ_supervisor_support": 12,
+    "JCQ_coworker_support": 13,
+    "PFITS_supervisor": 14,
+    "PFITS_coworkers": 15,
+    "UWES_vigor": 16,
+    "UWES_dedication": 17,
+    "UWES_absorption": 18,
+    "COPE_active": 19,
+    "COPE_support": 20,
+    "COPE_emotions": 21,
+    "balance_life_work": 22,
+    "balance_work_life": 23,
+    "recovery_experience_detachment": 24,
+    "recovery_experience_relaxation": 25,
+    "symptoms": 26,
+    "appraisal_stressfulness_event": 87,  # No IDs between 26 and 87 are named here.
+    "appraisal_threat": 88,
+    "appraisal_challenge": 89,
+    "appraisal_event_time": 90,
+    "appraisal_event_duration": 91,
+    "appraisal_event_work_related": 92,
+    "appraisal_stressfulness_period": 93,
+    "late_work": 94,
+    "work_hours": 95,
+    "left_work": 96,
+    "activities": 97,
+    "coffee_breaks": 98,
+    "at_work_yet": 99,
+}
+
 
 def get_esm_data(usernames: Collection) -> pd.DataFrame:
     """
@@ -52,8 +88,10 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame:
 
 def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
     """
+    Convert timestamps and expand JSON column.
+
     Convert timestamps into human-readable datetimes and dates
-    and expand the JSON column into several Pandas DF columns.
+        and expand the JSON column into several Pandas DF columns.
 
     Parameters
     ----------
@@ -63,7 +101,8 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
     Returns
     -------
     df_esm_preprocessed: pd.DataFrame
-        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
+        A dataframe with added columns: datetime in Ljubljana timezone
+            and all fields from ESM_JSON column.
     """
     df_esm = helper.get_date_from_timestamp(df_esm)
 
@@ -76,31 +115,39 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
 def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
     """
     For each distinct EMA session, determine how the participant responded to it.
-    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE
+
+    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
+        and SESSION_STATUS_COMPLETE.
 
     This is done in three steps.
 
     First, the esm_status is considered.
-    If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
+    If any of the ESMs in a session has a status *other than* "answered",
+        then this session is taken as unfinished.
 
     Second, the sessions which do not represent full questionnaires are identified.
-    These are sessions where participants only marked they are finished with the day or have not yet started working.
+    These are sessions where participants only marked they are finished with the day
+        or have not yet started working.
 
     Third, the sessions with only one item are marked with their trigger.
-    We never offered questionnaires with single items, so we can be sure these are unfinished.
+    We never offered questionnaires with single items,
+        so we can be sure these are unfinished.
 
     Finally, all sessions that remain are marked as completed.
-    By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.
+    By going through different possibilities in expl_esm_adherence.ipynb,
+        this turned out to be a reasonable option.
 
     Parameters
     ----------
     df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).
 
     Returns
     -------
     df_session_counts: pd.Dataframe
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
+            with their statuses and the number of items.
     """
     sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
 
@@ -155,17 +202,22 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat
 
 def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
     """
-    For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening.)
+    Classify EMA sessions into morning, workday, or evening.
+
+    For each EMA session, determine the time of the first user answer
+        and its time type (morning, workday, or evening).
 
     Parameters
     ----------
     df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).
 
     Returns
     -------
     df_session_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
+            with their time type and timestamp of first answer.
     """
     df_session_time = (
         df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
@@ -179,13 +231,17 @@ def classify_sessions_by_completion_time(
     df_esm_preprocessed: pd.DataFrame,
 ) -> pd.DataFrame:
     """
-    The point of this function is to not only classify sessions by using the previously defined functions.
+    Classify sessions and correct the time type.
+
+    The point of this function is to not only classify sessions
+        by using the previously defined functions.
     It also serves to "correct" the time type of some EMA sessions.
 
     A morning questionnaire could seamlessly transition into a daytime questionnaire,
         if the participant was already at work.
     In this case, the "time" label changed mid-session.
-    Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
+    Because of the way classify_sessions_by_time works,
+        this questionnaire was classified as "morning".
     But for all intents and purposes, it can be treated as a "daytime" EMA.
 
     The way this scenario is differentiated from a true "morning" questionnaire,
@@ -194,13 +250,16 @@ def classify_sessions_by_completion_time(
     Parameters
     ----------
     df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).
 
     Returns
     -------
     df_session_counts_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
-            their time type (with some morning EMAs reclassified) and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
+            the number of items,
+            their time type (with some morning EMAs reclassified)
+            and timestamp of first answer.
 
     """
     df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
@@ -219,7 +278,8 @@ def classify_sessions_by_completion_time(
 
 def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
     """
-    This function eliminates invalid ESM responses.
+    Eliminate invalid ESM responses.
+
     It removes unanswered ESMs and those that indicate end of work and similar.
     It also extracts a numeric answer from strings such as "4 - I strongly agree".