From ae2ca63bc4912ef55a29897203563b5b2fe5c607 Mon Sep 17 00:00:00 2001 From: junos Date: Mon, 3 Jul 2023 17:09:15 +0200 Subject: [PATCH] Define QUESTIONNAIRE IDs and use them. Clean up docstrings. --- .flake8 | 5 +- config/environment.yml | 1 - exploration/expl_esm_labels.py | 54 +++++++++++++------ features/esm.py | 98 +++++++++++++++++++++++++++------- 4 files changed, 121 insertions(+), 37 deletions(-) diff --git a/.flake8 b/.flake8 index 0d24263..5138cbd 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,9 @@ [flake8] max-line-length = 88 -extend-ignore = E203 +extend-ignore = + E203, + # E501 line too long for docstrings + D501 per-file-ignores = exploration/*.py:E501 docstring-convention = numpy diff --git a/config/environment.yml b/config/environment.yml index 084c5c7..9806941 100644 --- a/config/environment.yml +++ b/config/environment.yml @@ -16,7 +16,6 @@ dependencies: - pandas - psycopg2 >= 2.9.1 - pre-commit - - pydocstyle - python-dotenv - pytz - pyprojroot diff --git a/exploration/expl_esm_labels.py b/exploration/expl_esm_labels.py index d45371b..f8f3a53 100644 --- a/exploration/expl_esm_labels.py +++ b/exploration/expl_esm_labels.py @@ -20,7 +20,7 @@ import datetime import seaborn as sns import participants.query_db -from features.esm import clean_up_esm, get_esm_data, preprocess_esm +from features.esm import QUESTIONNAIRE_IDS, clean_up_esm, get_esm_data, preprocess_esm from features.esm_JCQ import reverse_jcq_demand_control_scoring from features.esm_SAM import extract_stressful_events @@ -48,8 +48,14 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive) # %% df_esm_PANAS = df_esm_preprocessed[ - (df_esm_preprocessed["questionnaire_id"] == 8) - | (df_esm_preprocessed["questionnaire_id"] == 9) + ( + df_esm_preprocessed["questionnaire_id"] + == QUESTIONNAIRE_IDS["PANAS_positive_affect"] + ) + | ( + df_esm_preprocessed["questionnaire_id"] + == QUESTIONNAIRE_IDS["PANAS_negative_affect"] + ) ] df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS) @@ -126,8 
+132,14 @@ df_SAM_all.head() # %% df_esm_SAM = df_esm_preprocessed[ - (df_esm_preprocessed["questionnaire_id"] >= 87) - & (df_esm_preprocessed["questionnaire_id"] <= 93) + ( + df_esm_preprocessed["questionnaire_id"] + >= QUESTIONNAIRE_IDS["appraisal_stressfulness_event"] + ) + & ( + df_esm_preprocessed["questionnaire_id"] + <= QUESTIONNAIRE_IDS["appraisal_stressfulness_period"] + ) ] df_esm_SAM_clean = clean_up_esm(df_esm_SAM) @@ -135,9 +147,10 @@ df_esm_SAM_clean = clean_up_esm(df_esm_SAM) # ## Stressful events # %% -df_esm_SAM_event = df_esm_SAM_clean[df_esm_SAM_clean["questionnaire_id"] == 87].assign( - stressful_event=lambda x: (x.esm_user_answer_numeric > 0) -) +df_esm_SAM_event = df_esm_SAM_clean[ + df_esm_SAM_clean["questionnaire_id"] + == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"] +].assign(stressful_event=lambda x: (x.esm_user_answer_numeric > 0)) # %% df_esm_SAM_daily_events = ( @@ -191,8 +204,8 @@ df_esm_SAM_daily = ( # %% df_esm_SAM_daily_threat_challenge = df_esm_SAM_daily[ - (df_esm_SAM_daily["questionnaire_id"] == 88) - | (df_esm_SAM_daily["questionnaire_id"] == 89) + (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"]) + | (df_esm_SAM_daily["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"]) ] # %% @@ -204,7 +217,8 @@ df_esm_SAM_summary_participant = ( # %% df_esm_SAM_event_stressfulness_summary_participant = df_esm_SAM_summary_participant[ - df_esm_SAM_summary_participant["questionnaire_id"] == 87 + df_esm_SAM_summary_participant["questionnaire_id"] + == QUESTIONNAIRE_IDS["appraisal_stressfulness_event"] ] df_esm_SAM_event_stressfulness_summary_participant.describe()["mean"] @@ -218,8 +232,8 @@ sns.displot( # %% df_esm_SAM_threat_challenge_summary_participant = df_esm_SAM_summary_participant[ - (df_esm_SAM_summary_participant["questionnaire_id"] == 88) - | (df_esm_SAM_summary_participant["questionnaire_id"] == 89) + (df_esm_SAM_summary_participant["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_threat"]) + | 
(df_esm_SAM_summary_participant["questionnaire_id"] == QUESTIONNAIRE_IDS["appraisal_challenge"]) ] df_esm_SAM_threat_challenge_summary_participant[ "event subscale" @@ -263,7 +277,8 @@ df_esm_SAM_threat_challenge_summary_participant.groupby("event subscale").descri # %% df_esm_SAM_period_summary_participant = df_esm_SAM_summary_participant[ - df_esm_SAM_summary_participant["questionnaire_id"] == 93 + df_esm_SAM_summary_participant["questionnaire_id"] + == QUESTIONNAIRE_IDS["appraisal_stressfulness_period"] ] # %% @@ -283,8 +298,8 @@ sns.displot(data=df_esm_SAM_period_summary_participant, x="std", binwidth=0.1) # %% df_esm_JCQ_demand_control = df_esm_preprocessed[ - (df_esm_preprocessed["questionnaire_id"] >= 10) - & (df_esm_preprocessed["questionnaire_id"] <= 11) + (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["JCQ_job_demand"]) + & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["JCQ_job_control"]) ] df_esm_JCQ_demand_control_clean = clean_up_esm(df_esm_JCQ_demand_control) @@ -343,4 +358,11 @@ fig6.set_axis_labels(x_var="participant standard deviation", y_var="frequency") if save_figs: fig5.figure.savefig("JCQ_std_participant.pdf", dpi=300) +# %% [markdown] +# # COPE Inventory + # %% +df_esm_COPE = df_esm_preprocessed[ + (df_esm_preprocessed["questionnaire_id"] >= QUESTIONNAIRE_IDS["COPE_active"]) + & (df_esm_preprocessed["questionnaire_id"] <= QUESTIONNAIRE_IDS["COPE_emotions"]) +] diff --git a/features/esm.py b/features/esm.py index 066542b..3d27d77 100644 --- a/features/esm.py +++ b/features/esm.py @@ -20,11 +20,47 @@ ANSWER_DAY_OFF = "DayOff3421" ANSWER_SET_EVENING = "DayFinishedSetEvening" MAX_MORNING_LENGTH = 3 -# When the participants was not yet at work at the time of the first (morning) EMA, +# When the participant was not yet at work at the time of the first (morning) EMA, # only three items were answered. # Two sleep related items and one indicating NOT starting work yet.
# Daytime EMAs are all longer, in fact they always consist of at least 6 items. +QUESTIONNAIRE_IDS = { + "sleep_quality": 1, + "PANAS_positive_affect": 8, + "PANAS_negative_affect": 9, + "JCQ_job_demand": 10, + "JCQ_job_control": 11, + "JCQ_supervisor_support": 12, + "JCQ_coworker_support": 13, + "PFITS_supervisor": 14, + "PFITS_coworkers": 15, + "UWES_vigor": 16, + "UWES_dedication": 17, + "UWES_absorption": 18, + "COPE_active": 19, + "COPE_support": 20, + "COPE_emotions": 21, + "balance_life_work": 22, + "balance_work_life": 23, + "recovery_experience_detachment": 24, + "recovery_experience_relaxation": 25, + "symptoms": 26, + "appraisal_stressfulness_event": 87, + "appraisal_threat": 88, + "appraisal_challenge": 89, + "appraisal_event_time": 90, + "appraisal_event_duration": 91, + "appraisal_event_work_related": 92, + "appraisal_stressfulness_period": 93, + "late_work": 94, + "work_hours": 95, + "left_work": 96, + "activities": 97, + "coffee_breaks": 98, + "at_work_yet": 99, +} + def get_esm_data(usernames: Collection) -> pd.DataFrame: """ @@ -52,8 +88,10 @@ def get_esm_data(usernames: Collection) -> pd.DataFrame: def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame: """ + Convert timestamps and expand JSON column. + Convert timestamps into human-readable datetimes and dates - and expand the JSON column into several Pandas DF columns. + and expand the JSON column into several Pandas DF columns. Parameters ---------- @@ -63,7 +101,8 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame: Returns ------- df_esm_preprocessed: pd.DataFrame - A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column. + A dataframe with added columns: datetime in Ljubljana timezone + and all fields from ESM_JSON column. 
""" df_esm = helper.get_date_from_timestamp(df_esm) @@ -76,31 +115,39 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame: def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame: """ For each distinct EMA session, determine how the participant responded to it. - Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE + + Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, + and SESSION_STATUS_COMPLETE This is done in three steps. First, the esm_status is considered. - If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished. + If any of the ESMs in a session has a status *other than* "answered", + then this session is taken as unfinished. Second, the sessions which do not represent full questionnaires are identified. - These are sessions where participants only marked they are finished with the day or have not yet started working. + These are sessions where participants only marked they are finished with the day + or have not yet started working. Third, the sessions with only one item are marked with their trigger. - We never offered questionnaires with single items, so we can be sure these are unfinished. + We never offered questionnaires with single items, + so we can be sure these are unfinished. Finally, all sessions that remain are marked as completed. - By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option. + By going through different possibilities in expl_esm_adherence.ipynb, + this turned out to be a reasonable option. Parameters ---------- df_esm_preprocessed: pd.DataFrame - A preprocessed dataframe of esm data, which must include the session ID (esm_session). + A preprocessed dataframe of esm data, + which must include the session ID (esm_session). 
Returns ------- df_session_counts: pd.Dataframe - A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items. + A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) + with their statuses and the number of items. """ sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY) @@ -155,17 +202,22 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame: """ - For each EMA session, determine the time of the first user answer and its time type (morning, workday, or evening.) + Classify EMA sessions into morning, workday, or evening. + + For each EMA session, determine the time of the first user answer + and its time type (morning, workday, or evening). Parameters ---------- df_esm_preprocessed: pd.DataFrame - A preprocessed dataframe of esm data, which must include the session ID (esm_session). + A preprocessed dataframe of esm data, + which must include the session ID (esm_session). Returns ------- df_session_time: pd.DataFrame - A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer. + A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) + with their time type and timestamp of first answer. """ df_session_time = ( df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"]) @@ -179,13 +231,17 @@ def classify_sessions_by_completion_time( df_esm_preprocessed: pd.DataFrame, ) -> pd.DataFrame: """ - The point of this function is to not only classify sessions by using the previously defined functions. + Classify sessions and correct the time type. + + The point of this function is to not only classify sessions + by using the previously defined functions. It also serves to "correct" the time type of some EMA sessions. A morning questionnaire could seamlessly transition into a daytime questionnaire, if the participant was already at work. 
In this case, the "time" label changed mid-session. - Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning". + Because of the way classify_sessions_by_time works, + this questionnaire was classified as "morning". But for all intents and purposes, it can be treated as a "daytime" EMA. The way this scenario is differentiated from a true "morning" questionnaire, @@ -194,13 +250,16 @@ def classify_sessions_by_completion_time( Parameters ---------- df_esm_preprocessed: pd.DataFrame - A preprocessed dataframe of esm data, which must include the session ID (esm_session). + A preprocessed dataframe of esm data, + which must include the session ID (esm_session). Returns ------- df_session_counts_time: pd.DataFrame - A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items, - their time type (with some morning EMAs reclassified) and timestamp of first answer. + A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, + the number of items, + their time type (with some morning EMAs reclassified) + and timestamp of first answer. """ df_session_counts = classify_sessions_by_completion(df_esm_preprocessed) @@ -219,7 +278,8 @@ def classify_sessions_by_completion_time( def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame: """ - This function eliminates invalid ESM responses. + Eliminate invalid ESM responses. + It removes unanswered ESMs and those that indicate end of work and similar. It also extracts a numeric answer from strings such as "4 - I strongly agree".