Define QUESTIONNAIRE IDs and use them.

Clean up docstrings.
2023-07-03 17:09:15 +02:00 · 2023-07-03 17:09:15 +02:00 · ae2ca63bc4
parent 577f1330da
commit ae2ca63bc4
4 changed files with 121 additions and 37 deletions
--- a/.flake8
+++ b/.flake8
@ -1,6 +1,9 @@
 [flake8]
 max-line-length = 88
-extend-ignore = E203
+extend-ignore =
+    E203,
+    # E501 line too long for docstrings
+    D501
 per-file-ignores =
    exploration/*.py:E501
 docstring-convention = numpy
--- a/config/environment.yml
+++ b/config/environment.yml
@ -16,7 +16,6 @@ dependencies:
  - pandas
  - psycopg2 >= 2.9.1
  - pre-commit
-  - pydocstyle
  - python-dotenv
  - pytz
  - pyprojroot
--- a/exploration/expl_esm_labels.py
+++ b/exploration/expl_esm_labels.py
@ -20,7 +20,7 @@ import datetime
 import seaborn as sns

 import participants.query_db
-from features.esm import clean_up_esm, get_esm_data, preprocess_esm
+from features.esm import QUESTIONNAIRE_IDS, clean_up_esm, get_esm_data, preprocess_esm
 from features.esm_JCQ import reverse_jcq_demand_control_scoring
 from features.esm_SAM import extract_stressful_events

@ -48,8 +48,14 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)

 # %%
 df_esm_PANAS = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] == 8)
-    | (df_esm_preprocessed["questionnaire_id"] == 9)
+    (
+        df_esm_preprocessed["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["PANAS_positive_affect"]
+    )
+    | (
+        df_esm_preprocessed["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["PANAS_negative_affect"]
+    )
 ]
 df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)

@ -126,8 +132,14 @@ df_SAM_all.head()

 # %%
 df_esm_SAM = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 87)
-    & (df_esm_preprocessed["questionnaire_id"] <= 93)
+    (
+        df_esm_preprocessed["questionnaire_id"]
+        >= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
+    )
+    & (
+        df_esm_preprocessed["questionnaire_id"]
+        <= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
+    )
 ]
 df_esm_SAM_clean = clean_up_esm(df_esm_SAM)

@ -135,9 +147,10 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
 # ## Stressful events

 # %%
-df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign(
-    stressful_event=lambda x: (x.esm_user_answer_numeric > 0)
-)
+df_esm_SAM_event = df_esm_SAM_clean[
+    df_esm_SAM_clean["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
+].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0))

 # %%
 df_esm_SAM_daily_events = (
@ -191,8 +204,8 @@ df_esm_SAM_daily = (

 # %%
 df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
-    (df_esm_SAM_daily["questionnaire_id"] == 88)
-    | (df_esm_SAM_daily["questionnaire_id"] == 89)
+    (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
+    | (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
 ]

 # %%
@ -204,7 +217,8 @@ df_esm_SAM_summary_participant = (

 # %%
 df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[
-    df_esm_SAM_summary_participant["questionnaire_id"] == 87
+    df_esm_SAM_summary_participant["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
 ]
 df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"]

@ -218,8 +232,8 @@ sns.displot(

 # %%
 df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
-    (df_esm_SAM_summary_participant["questionnaire_id"] == 88)
-    | (df_esm_SAM_summary_participant["questionnaire_id"] == 89)
+    (df_esm_SAM_summary_participant["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
+    | (df_esm_SAM_summary_participant["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
 ]
 df_esm_SAM_threat_challenge_summary_participant[
    "event subscale"
@ -263,7 +277,8 @@ df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").descri

 # %%
 df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
-    df_esm_SAM_summary_participant["questionnaire_id"] == 93
+    df_esm_SAM_summary_participant["questionnaire_id"]
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
 ]

 # %%
@ -283,8 +298,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1)

 # %%
 df_esm_JCQ_demand_control = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 10)
-    & (df_esm_preprocessed["questionnaire_id"] <= 11)
+    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"])
+    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"])
 ]
 df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)

@ -343,4 +358,11 @@ fig6.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
 if save_figs:
    fig5.figure.savefig("JCQ_std_participant.pdf", dpi=300)

+# %% [markdown]
+# # COPE Inventory
+
 # %%
+df_esm_COPE = df_esm_preprocessed[
+    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"])
+    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"])
+]
--- a/features/esm.py
+++ b/features/esm.py
@ -20,11 +20,47 @@ ANSWER_DAY_OFF = "DayOff3421"
 ANSWER_SET_EVENING = "DayFinishedSetEvening"

 MAX_MORNING_LENGTH = 3
-# When the participants was not yet at work at the time of the first (morning) EMA,
+# When the participant was not yet at work at the time of the first (morning) EMA,
 # only three items were answered.
 # Two sleep related items and one indicating NOT starting work yet.
 # Daytime EMAs are all longer, in fact they always consist of at least 6 items.

+QUESTIONNAIRE_IDS = {
+    "sleep_quality": 1,
+    "PANAS_positive_affect": 8,
+    "PANAS_negative_affect": 9,
+    "JCQ_job_demand": 10,
+    "JCQ_job_control": 11,
+    "JCQ_supervisor_support": 12,
+    "JCQ_coworker_support": 13,
+    "PFITS_supervisor": 14,
+    "PFITS_coworkers": 15,
+    "UWES_vigor": 16,
+    "UWES_dedication": 17,
+    "UWES_absorption": 18,
+    "COPE_active": 19,
+    "COPE_support": 20,
+    "COPE_emotions": 21,
+    "balance_life_work": 22,
+    "balance_work_life": 23,
+    "recovery_experience_detachment": 24,
+    "recovery_experience_relaxation": 25,
+    "symptoms": 26,
+    "appraisal_stressfulness_event": 87,
+    "appraisal_threat": 88,
+    "appraisal_challenge": 89,
+    "appraisal_event_time": 90,
+    "appraisal_event_duration": 91,
+    "appraisal_event_work_related": 92,
+    "appraisal_stressfulness_period": 93,
+    "late_work": 94,
+    "work_hours": 95,
+    "left_work": 96,
+    "activities": 97,
+    "coffee_breaks": 98,
+    "at_work_yet": 99,
+}
+

 def get_esm_data(usernames: Collection) -> pd.DataFrame:
    """
@ -52,6 +88,8 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame:

 def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
+    Convert timestamps and expand JSON column.
+
    Convert timestamps into human-readable datetimes and dates
        and expand the JSON column into several Pandas DF columns.

@ -63,7 +101,8 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
-        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
+        A dataframe with added columns: datetime in Ljubljana timezone
+            and all fields from ESM_JSON column.
    """
    df_esm = helper.get_date_from_timestamp(df_esm)

@ -76,31 +115,39 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
 def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.
-    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE
+
+    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
+        and SESSION_STATUS_COMPLETE.

    This is done in three steps.

    First, the esm_status is considered.
-    If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
+    If any of the ESMs in a session has a status *other than* "answered",
+        then this session is taken as unfinished.

    Second, the sessions which do not represent full questionnaires are identified.
-    These are sessions where participants only marked they are finished with the day or have not yet started working.
+    These are sessions where participants only marked they are finished with the day
+        or have not yet started working.

    Third, the sessions with only one item are marked with their trigger.
-    We never offered questionnaires with single items, so we can be sure these are unfinished.
+    We never offered questionnaires with single items,
+        so we can be sure these are unfinished.

    Finally, all sessions that remain are marked as completed.
-    By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.
+    By going through different possibilities in expl_esm_adherence.ipynb,
+        this turned out to be a reasonable option.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).

    Returns
    -------
     df_session_counts: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
+            with their statuses and the number of items.
    """
    sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)

@ -155,17 +202,22 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat

 def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening.)
+    Classify EMA sessions into morning, workday, or evening.
+
+    For each EMA session, determine the time of the first user answer
+        and its time type (morning, workday, or evening).

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).

    Returns
    -------
    df_session_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
+            with their time type and timestamp of first answer.
    """
    df_session_time = (
        df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
@ -179,13 +231,17 @@ def classify_sessions_by_completion_time(
    df_esm_preprocessed: pd.DataFrame,
 ) -> pd.DataFrame:
    """
-    The point of this function is to not only classify sessions by using the previously defined functions.
+    Classify sessions and correct the time type.
+
+    The point of this function is to not only classify sessions
+        by using the previously defined functions.
    It also serves to "correct" the time type of some EMA sessions.

    A morning questionnaire could seamlessly transition into a daytime questionnaire,
        if the participant was already at work.
    In this case, the "time" label changed mid-session.
-    Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
+    Because of the way classify_sessions_by_time works,
+        this questionnaire was classified as "morning".
    But for all intents and purposes, it can be treated as a "daytime" EMA.

    The way this scenario is differentiated from a true "morning" questionnaire,
@ -194,13 +250,16 @@ def classify_sessions_by_completion_time(
    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
+            which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
-            their time type (with some morning EMAs reclassified) and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
+            the number of items,
+            their time type (with some morning EMAs reclassified)
+            and timestamp of first answer.

    """
    df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
@ -219,7 +278,8 @@ def classify_sessions_by_completion_time(

 def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    This function eliminates invalid ESM responses.
+    Eliminate invalid ESM responses.
+
    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".