Clean up ESM by eliminating non-answers.

Convert radio string answer to numeric.
2021-07-03 16:34:11 +02:00 · 2021-07-03 16:34:11 +02:00 · e0da6757ec
parent 74392f229a
commit e0da6757ec
2 changed files with 27 additions and 14 deletions
--- a/exploration/expl_esm_labels.py
+++ b/exploration/expl_esm_labels.py
@@ -25,12 +25,6 @@ if nb_dir not in sys.path:
 import participants.query_db
 from features.esm import *

-# %% [markdown]
-# # ESM data
-
-# %% [markdown]
-# Only take data from the main part of the study. The pilot data have different structure, there were especially many additions to ESM_JSON.
-
 # %%
 participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
@@ -39,15 +33,20 @@ df_esm_inactive = get_esm_data(participants_inactive_usernames)

 # %%
 df_esm_preprocessed = preprocess_esm(df_esm_inactive)
-df_esm_clean = clean_up_esm(df_esm_preprocessed)
+
+# %% [markdown]
+# # PANAS

 # %%
-df_esm_PANAS = df_esm_clean[
-    (df_esm_clean["questionnaire_id"] == 8) | (df_esm_clean["questionnaire_id"] == 9)
+df_esm_PANAS = df_esm_preprocessed[
+    (df_esm_preprocessed["questionnaire_id"] == 8)
+    | (df_esm_preprocessed["questionnaire_id"] == 9)
 ]
-df_esm_PANAS_grouped = df_esm_PANAS.groupby(["participant_id", "questionnaire_id"])
+df_esm_PANAS_clean = clean_up_esm(df_esm_PANAS)

 # %%
-df_esm_PANAS.head()
+df_esm_PANAS_grouped = df_esm_PANAS_clean.groupby(
+    ["participant_id", "questionnaire_id"]
+)

 # %%
--- a/features/esm.py
+++ b/features/esm.py
@@ -17,6 +17,10 @@ SESSION_STATUS_UNANSWERED = "ema_unanswered"
 SESSION_STATUS_DAY_FINISHED = "day_finished"
 SESSION_STATUS_COMPLETE = "ema_completed"

+ANSWER_DAY_FINISHED = "DayFinished3421"
+ANSWER_DAY_OFF = "DayOff3421"
+ANSWER_SET_EVENING = "DayFinishedSetEvening"
+
 MAX_MORNING_LENGTH = 3
 # When the participants was not yet at work at the time of the first (morning) EMA,
 # only three items were answered.
@@ -119,10 +123,10 @@ def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.Dat
    # 2. Identify non-sessions, i.e. answers about the end of the day.
    non_session = sessions_grouped.apply(
        lambda x: (
-            (x.esm_user_answer == "DayFinished3421")  # I finished working for today.
-            | (x.esm_user_answer == "DayOff3421")  # I am not going to work today.
+            (x.esm_user_answer == ANSWER_DAY_FINISHED)  # I finished working for today.
+            | (x.esm_user_answer == ANSWER_DAY_OFF)  # I am not going to work today.
            | (
-                x.esm_user_answer == "DayFinishedSetEvening"
+                x.esm_user_answer == ANSWER_SET_EVENING
            )  # When would you like to answer the evening EMA?
        ).any()
    )
@@ -234,4 +238,14 @@ def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    df_esm_clean = df_esm_preprocessed[
        df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED
    ]
+    df_esm_clean = df_esm_clean[
+        ~df_esm_clean["esm_user_answer"].isin(
+            [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING]
+        )
+    ]
+    df_esm_clean = df_esm_clean.assign(
+        esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype(
+            int
+        )
+    )
    return df_esm_clean