Define QUESTIONNAIRE IDs and use them.

Clean up docstrings.
2023-07-03 17:09:15 +02:00 · 2023-07-03 17:09:15 +02:00 · ae2ca63bc4
parent 577f1330da
commit ae2ca63bc4
4 changed files with 121 additions and 37 deletions
--- a/.flake8
+++ b/.flake8
@ -1,6 +1,9 @@
 [flake8]
 max-line-length = 88
-extend-ignore = E203
+extend-ignore =
    E203,
    # E501 line too long for docstrings
    D501
 per-file-ignores =
    exploration/*.py:E501
 docstring-convention = numpy
--- a/config/environment.yml
+++ b/config/environment.yml
@ -16,7 +16,6 @@ dependencies:
  - pandas
  - psycopg2 >= 2.9.1
  - pre-commit
  - pydocstyle
  - python-dotenv
  - pytz
  - pyprojroot
--- a/exploration/expl_esm_labels.py
+++ b/exploration/expl_esm_labels.py
@ -20,7 +20,7 @@ import datetime
 import seaborn as sns
 import participants.query_db
-from features.esm import clean_up_esm, get_esm_data, preprocess_esm
+from features.esm import QUESTIONNAIRE_IDS, clean_up_esm, get_esm_data, preprocess_esm
 from features.esm_JCQ import reverse_jcq_demand_control_scoring
 from features.esm_SAM import extract_stressful_events
@ -48,8 +48,14 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)
 # %%
 df_esm_PANAS = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] == 8)
+    (
-    | (df_esm_preprocessed["questionnaire_id"] == 9)
+        df_esm_preprocessed["questionnaire_id"]
        == QUESTIONNAIRE_IDS["PANAS_positive_affect"]
    )
    | (
        df_esm_preprocessed["questionnaire_id"]
        == QUESTIONNAIRE_IDS["PANAS_negative_affect"]
    )
 ]
 df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)
@ -126,8 +132,14 @@ df_SAM_all.head()
 # %%
 df_esm_SAM = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 87)
+    (
-    & (df_esm_preprocessed["questionnaire_id"] <= 93)
+        df_esm_preprocessed["questionnaire_id"]
        >= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
    )
    & (
        df_esm_preprocessed["questionnaire_id"]
        <= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
    )
 ]
 df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
@ -135,9 +147,10 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM)
 # ## Stressful events
 # %%
-df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign(
+df_esm_SAM_event = df_esm_SAM_clean[
-    stressful_event=lambda x: (x.esm_user_answer_numeric > 0)
+    df_esm_SAM_clean["questionnaire_id"]
-)
+    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
 ].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0))
 # %%
 df_esm_SAM_daily_events = (
@ -191,8 +204,8 @@ df_esm_SAM_daily = (
 # %%
 df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[
-    (df_esm_SAM_daily["questionnaire_id"] == 88)
+    (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"])
-    | (df_esm_SAM_daily["questionnaire_id"] == 89)
+    | (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"])
 ]
 # %%
@ -204,7 +217,8 @@ df_esm_SAM_summary_participant = (
 # %%
 df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[
-    df_esm_SAM_summary_participant["questionnaire_id"] == 87
+    df_esm_SAM_summary_participant["questionnaire_id"]
    == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"]
 ]
 df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"]
@ -218,8 +232,8 @@ sns.displot(
 # %%
 df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[
-    (df_esm_SAM_summary_participant["questionnaire_id"] == 88)
+    # The mask must be built from the same dataframe that is being indexed.
+    (
+        df_esm_SAM_summary_participant["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["appraisal_threat"]
+    )
-    | (df_esm_SAM_summary_participant["questionnaire_id"] == 89)
+    | (
+        df_esm_SAM_summary_participant["questionnaire_id"]
+        == QUESTIONNAIRE_IDS["appraisal_challenge"]
+    )
 ]
 df_esm_SAM_threat_challenge_summary_participant[
    "event subscale"
@ -263,7 +277,8 @@ df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").descri
 # %%
 df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[
-    df_esm_SAM_summary_participant["questionnaire_id"] == 93
+    df_esm_SAM_summary_participant["questionnaire_id"]
    == QUESTIONNAIRE_IDS["appraisal_stressfulness_period"]
 ]
 # %%
@ -283,8 +298,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1)
 # %%
 df_esm_JCQ_demand_control = df_esm_preprocessed[
-    (df_esm_preprocessed["questionnaire_id"] >= 10)
+    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"])
-    & (df_esm_preprocessed["questionnaire_id"] <= 11)
+    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"])
 ]
 df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control)
@ -343,4 +358,11 @@ fig6.set_axis_labels(x_var="participant standard deviation", y_var="frequency")
 if save_figs:
    # NOTE(review): fig6 is the figure whose x-axis was just labelled
    # "participant standard deviation"; the original saved fig5 under this
    # "std" file name, which looks like a copy-paste slip. Confirm.
    fig6.figure.savefig("JCQ_std_participant.pdf", dpi=300)
 # %% [markdown]
 # # COPE Inventory
 # %%
 df_esm_COPE = df_esm_preprocessed[
    (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"])
    & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"])
 ]
--- a/features/esm.py
+++ b/features/esm.py
@ -20,11 +20,47 @@ ANSWER_DAY_OFF = "DayOff3421"
 ANSWER_SET_EVENING = "DayFinishedSetEvening"
 MAX_MORNING_LENGTH = 3
-# When the participants was not yet at work at the time of the first (morning) EMA,
+# When the participant was not yet at work at the time of the first (morning) EMA,
 # only three items were answered.
 # Two sleep-related items and one indicating NOT starting work yet.
 # Daytime EMAs are all longer; in fact, they always consist of at least 6 items.
 # Numeric IDs of the ESM questionnaires, keyed by a descriptive name.
 # The values are compared against the "questionnaire_id" column of the
 # preprocessed ESM dataframe.
 # Some callers select a whole group of questionnaires with a >= / <= range
 # between two entries (e.g. COPE_active .. COPE_emotions), so the relative
 # order of the IDs within a group matters.
 QUESTIONNAIRE_IDS = {
    "sleep_quality": 1,
    "PANAS_positive_affect": 8,
    "PANAS_negative_affect": 9,
    "JCQ_job_demand": 10,
    "JCQ_job_control": 11,
    "JCQ_supervisor_support": 12,
    "JCQ_coworker_support": 13,
    "PFITS_supervisor": 14,
    "PFITS_coworkers": 15,
    "UWES_vigor": 16,
    "UWES_dedication": 17,
    "UWES_absorption": 18,
    "COPE_active": 19,
    "COPE_support": 20,
    "COPE_emotions": 21,
    "balance_life_work": 22,
    "balance_work_life": 23,
    "recovery_experience_detachment": 24,
    "recovery_experience_relaxation": 25,
    "symptoms": 26,
    "appraisal_stressfulness_event": 87,
    "appraisal_threat": 88,
    "appraisal_challenge": 89,
    "appraisal_event_time": 90,
    "appraisal_event_duration": 91,
    "appraisal_event_work_related": 92,
    "appraisal_stressfulness_period": 93,
    "late_work": 94,
    "work_hours": 95,
    "left_work": 96,
    "activities": 97,
    "coffee_breaks": 98,
    "at_work_yet": 99,
 }
 def get_esm_data(usernames: Collection) -> pd.DataFrame:
    """
@ -52,6 +88,8 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame:
 def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Convert timestamps and expand JSON column.
    Convert timestamps into human-readable datetimes and dates
        and expand the JSON column into several Pandas DF columns.
@ -63,7 +101,8 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
-        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
+        A dataframe with added columns: datetime in Ljubljana timezone
            and all fields from ESM_JSON column.
    """
    df_esm = helper.get_date_from_timestamp(df_esm)
@ -76,31 +115,39 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
 def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.
-    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE
+
    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
        and SESSION_STATUS_COMPLETE
    This is done in three steps.
    First, the esm_status is considered.
-    If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
+    If any of the ESMs in a session has a status *other than* "answered",
        then this session is taken as unfinished.
    Second, the sessions which do not represent full questionnaires are identified.
-    These are sessions where participants only marked they are finished with the day or have not yet started working.
+    These are sessions where participants only marked they are finished with the day
        or have not yet started working.
    Third, the sessions with only one item are marked with their trigger.
-    We never offered questionnaires with single items, so we can be sure these are unfinished.
+    We never offered questionnaires with single items,
        so we can be sure these are unfinished.
    Finally, all sessions that remain are marked as completed.
-    By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.
+    By going through different possibilities in expl_esm_adherence.ipynb,
        this turned out to be a reasonable option.
    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
            which must include the session ID (esm_session).
    Returns
    -------
    df_session_counts: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
            with their statuses and the number of items.
    """
    sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
@ -155,17 +202,22 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat
 def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening.)
+    Classify EMA sessions into morning, workday, or evening.
    For each EMA session, determine the time of the first user answer
        and its time type (morning, workday, or evening).
    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
            which must include the session ID (esm_session).
    Returns
    -------
    df_session_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
            with their time type and timestamp of first answer.
    """
    df_session_time = (
        df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
@ -179,13 +231,17 @@ def classify_sessions_by_completion_time(
    df_esm_preprocessed: pd.DataFrame,
 ) -> pd.DataFrame:
    """
-    The point of this function is to not only classify sessions by using the previously defined functions.
+    Classify sessions and correct the time type.
    The point of this function is to not only classify sessions
        by using the previously defined functions.
    It also serves to "correct" the time type of some EMA sessions.
    A morning questionnaire could seamlessly transition into a daytime questionnaire,
        if the participant was already at work.
    In this case, the "time" label changed mid-session.
-    Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
+    Because of the way classify_sessions_by_time works,
        this questionnaire was classified as "morning".
    But for all intents and purposes, it can be treated as a "daytime" EMA.
    The way this scenario is differentiated from a true "morning" questionnaire,
@ -194,13 +250,16 @@ def classify_sessions_by_completion_time(
    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
-        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        A preprocessed dataframe of esm data,
            which must include the session ID (esm_session).
    Returns
    -------
    df_session_counts_time: pd.DataFrame
-        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
-            their time type (with some morning EMAs reclassified) and timestamp of first answer.
+            the number of items,
            their time type (with some morning EMAs reclassified)
            and timestamp of first answer.
    """
    df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
@ -219,7 +278,8 @@ def classify_sessions_by_completion_time(
 def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
-    This function eliminates invalid ESM responses.
+    Eliminate invalid ESM responses.
    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".