[WIP] Prepare a function to classify adherence and illustrate steps in Jupyter Notebook.

2021-06-07 19:32:38 +02:00 · 2021-06-07 19:32:38 +02:00 · d5cd76f05a
parent 224dedaced
commit d5cd76f05a
2 changed files with 186 additions and 14 deletions
--- a/exploration/expl_esm.py
+++ b/exploration/expl_esm.py
@ -32,7 +32,9 @@ from features.esm import *
 # Only take data from the main part of the study. The pilot data have different structure, there were especially many additions to ESM_JSON.

 # %%
-participants_inactive_usernames = participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01"))
+participants_inactive_usernames = participants.query_db.get_usernames(
+    collection_start=datetime.date.fromisoformat("2020-08-01")
+)
 df_esm_inactive = get_esm_data(participants_inactive_usernames)

 # %%
@ -47,7 +49,7 @@ df_esm_preprocessed.columns

 # %% [markdown]
 # The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day.
-# Traditionally, concordance (adherence) in EMA study is simply calculated as the ratio of (daily) answered EMAs. 
+# Traditionally, concordance (adherence) in an EMA study is simply calculated as the ratio of (daily) answered EMAs.
 # This is possible for studies with simple EMA design, such that they are presented at fixed schedule and expired within a certain limit.
 #
 # Since EMAs were triggered more flexibly in our study, a different approach is needed.
@ -59,7 +61,9 @@ df_esm_preprocessed.columns
 # One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first.

 # %%
-session_counts = df_esm_preprocessed.groupby(["participant_id","esm_session"]).count()["id"]
+session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"]).count()[
+    "id"
+]

 # %% [markdown]
 # Group data by participant_id and esm_session and count the number of instances (by id). Session counts are therefore counts of how many times a specific session ID appears *within* a specific participant.
@ -73,9 +77,13 @@ sns.displot(session_counts.to_numpy(), binwidth=1, height=8)
 # ### Unique session IDs

 # %%
-df_session_counts = pd.DataFrame(session_counts).rename(columns={"id": "esm_session_count"})
+df_session_counts = pd.DataFrame(session_counts).rename(
+    columns={"id": "esm_session_count"}
+)
 df_session_1 = df_session_counts[(df_session_counts["esm_session_count"] == 1)]
-df_esm_unique_session = df_session_1.join(df_esm_preprocessed.set_index(["participant_id","esm_session"]))
+df_esm_unique_session = df_session_1.join(
+    df_esm_preprocessed.set_index(["participant_id", "esm_session"])
+)

 # %%
 df_esm_unique_session["esm_user_answer"].value_counts()
@ -85,10 +93,14 @@ df_esm_unique_session["esm_user_answer"].value_counts()
 # What do the answers "Ne" represent?

 # %%
-df_esm_unique_session.query("esm_user_answer == 'Ne'")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]].head()
+df_esm_unique_session.query("esm_user_answer == 'Ne'")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+].head()

 # %%
-df_esm_unique_session.loc[df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger"].value_counts()
+df_esm_unique_session.loc[
+    df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger"
+].value_counts()

 # %% [markdown]
 # These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs.
@ -103,22 +115,31 @@ df_esm_unique_session.loc[df_esm_unique_session["esm_user_answer"].str.contains(
 # There are some session IDs that only appear twice or three times.

 # %%
-df_session_counts[(df_session_counts["esm_session_count"] < 4) & (df_session_counts["esm_session_count"] > 1)]
+df_session_counts[
+    (df_session_counts["esm_session_count"] < 4)
+    & (df_session_counts["esm_session_count"] > 1)
+]

 # %% [markdown]
 # Some represent the morning EMAs that only contained three questions.

 # %%
-df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]]
+df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+]

 # %%
-df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]]
+df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+]

 # %% [markdown]
 # Others represent interrupted EMA sessions.

 # %%
-df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]]
+df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+]

 # %% [markdown]
 # ### Long sessions
@ -127,7 +148,9 @@ df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[[ "esm_tri
 df_session_counts[(df_session_counts["esm_session_count"] > 40)]

 # %%
-df_esm_preprocessed.query("participant_id == 83").sort_values("_id")[[ "esm_trigger","datetime_lj", "_id", "username", "device_id"]]
+df_esm_preprocessed.query("participant_id == 83").sort_values("_id")[
+    ["esm_trigger", "datetime_lj", "_id", "username", "device_id"]
+]

 # %% [markdown]
 # Both, session ID and \_ID (and others) reset on application reinstall. Here, it can be seen that the application was reinstalled on 2 April (actually, the phone was replaced as reported by the participant).
@ -135,11 +158,108 @@ df_esm_preprocessed.query("participant_id == 83").sort_values("_id")[[ "esm_trig
 # Session IDs should therefore be grouped while taking the device ID into account.

 # %%
-session_counts_device = df_esm_preprocessed.groupby(["participant_id", "device_id", "esm_session"]).count()["id"]
+session_counts_device = df_esm_preprocessed.groupby(
+    ["participant_id", "device_id", "esm_session"]
+).count()["id"]
 sns.displot(session_counts_device.to_numpy(), binwidth=1, height=8)

 # %% [markdown]
 # ## Other possibilities

 # %% [markdown]
-# There are also answers that describe what happened to a pending question: "Removed%"
+# Prepare a dataframe with session response as determined from other indices.
+
+# %%
+import numpy as np
+
+# Start from the per-device session counts computed above and rename the
+# counted column so that it describes what it holds.
+df_session_counts = pd.DataFrame(session_counts_device).rename(
+    columns={"id": "esm_session_count"}
+)
+# The response label is unknown at first. The column uses object dtype because
+# the cells below fill it with string labels, and np.nan replaces the np.NaN
+# alias, which was removed in NumPy 2.0.
+df_session_counts["session_response"] = pd.Series(
+    np.nan, index=df_session_counts.index, dtype=object
+)
+# Reusable grouping of the ESM rows by session, used by the cells below.
+session_group_by = df_esm_preprocessed.groupby(
+    ["participant_id", "device_id", "esm_session"]
+)
+df_session_counts.count()
+
+# %% [markdown]
+# ### ESM statuses
+
+# %% [markdown]
+# The status of the ESM can be: 0-new, 1-dismissed, 2-answered, 3-expired, 4-visible, or 5-branched.
+#
+# Which statuses appear in the data?
+
+# %%
+df_esm_preprocessed["esm_status"].value_counts()
+
+# %% [markdown]
+# Most of the ESMs were answered (2). We can group all others as unanswered.
+
+# %%
+# Flag every session in which at least one ESM has a status other than 2 (answered).
+contains_status_not_2 = session_group_by.apply(
+    lambda session: session["esm_status"].ne(2).any()
+)
+df_session_counts.loc[contains_status_not_2, "session_response"] = "esm_unanswered"
+
+# %%
+df_session_counts.count()
+
+# %% [markdown]
+# ### Day finished or off
+
+# %%
+# Flag every session that contains a "DayFinished3421" or "DayOff3421" answer.
+non_session = session_group_by.apply(
+    lambda session: (
+        session["esm_user_answer"].eq("DayFinished3421")
+        | session["esm_user_answer"].eq("DayOff3421")
+    ).any()
+)
+df_session_counts.loc[non_session, "session_response"] = "day_finished"
+
+# %%
+df_session_counts.count()
+
+# %% [markdown]
+# ### Removed
+
+# %% [markdown]
+# There are also answers, starting with "Removed%", that explicitly describe what happened to a pending question.
+
+# %%
+# Flag every session in which at least one answer contains the text "Removed".
+esm_removed = session_group_by.apply(
+    lambda session: session["esm_user_answer"].str.contains("Removed").any()
+)
+
+# %%
+df_session_counts.loc[esm_removed]
+
+# %%
+df_session_counts.loc[esm_removed, "session_response"].value_counts()
+
+# %% [markdown]
+# It turns out that these had been accounted for with ESM statuses.
+
+# %% [markdown]
+# ### Evening_last
+
+# %% [markdown]
+# When the evening EMA session comes to an end, the trigger should reflect this, that is, it should say `evening_last`.
+
+# %%
+# Flag every session in which at least one trigger ends with "_last".
+finished_sessions = session_group_by.apply(
+    lambda session: session["esm_trigger"].str.endswith("_last").any()
+)
+df_session_counts.loc[finished_sessions, "session_response"] = "esm_finished"
+
+# %%
+df_session_counts.count()
+
+# %%
+df_esm_preprocessed["esm_trigger"].value_counts()
+
+# %%
+# Plot the session lengths of the sessions that have no response label yet.
+sns.displot(
+    data=df_session_counts.loc[df_session_counts["session_response"].isna()],
+    x="esm_session_count",
+    binwidth=1,
+    height=8,
+)
+
+# %%
--- a/features/esm.py
+++ b/features/esm.py
@ -1,6 +1,7 @@
 import datetime
 from collections.abc import Collection

+import numpy as np
 import pandas as pd
 from pytz import timezone

@ -55,3 +56,54 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
        columns=["esm_trigger"]
    )  # The esm_trigger column is already present in the main df.
    return df_esm.join(df_esm_json)
+
+
+def classify_sessions_adherence(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
+    """
+    For each distinct EMA session, determine how the participant responded to it.
+
+    Sessions are identified by the combination of participant_id, device_id and
+    esm_session. Each session is labelled in several steps, where a later step
+    overrides an earlier one:
+
+    1. esm_unanswered: at least one ESM in the session has a status other than 2.
+    2. day_finished: at least one answer is "DayFinished3421" or "DayOff3421".
+    3. esm_finished: at least one trigger ends with "_last".
+
+    Sessions that match none of the steps keep a missing (NaN) session_response.
+    #TODO Finish the documentation.
+
+    Parameters
+    ----------
+    df_esm_preprocessed: pd.DataFrame
+        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
+        The columns id, participant_id, device_id, esm_status, esm_user_answer
+        and esm_trigger are used as well.
+
+    Returns
+    -------
+    df_session_counts: pd.DataFrame
+        A dataframe indexed by (participant_id, device_id, esm_session) with two columns:
+        esm_session_count (the number of non-missing id values in the session)
+        and session_response (the label assigned above).
+    """
+    session_keys = ["participant_id", "device_id", "esm_session"]
+    sessions_grouped = df_esm_preprocessed.groupby(session_keys)
+
+    df_session_counts = sessions_grouped["id"].count().to_frame("esm_session_count")
+    # Object dtype, because the column is filled with string labels below.
+    # np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
+    df_session_counts["session_response"] = pd.Series(
+        np.nan, index=df_session_counts.index, dtype=object
+    )
+
+    # Evaluate every criterion once per row, then reduce per session with any().
+    # Both groupings use the same keys and default options, so the resulting
+    # index matches the one of df_session_counts.
+    row_flags = df_esm_preprocessed[session_keys].assign(
+        esm_unanswered=df_esm_preprocessed["esm_status"] != 2,
+        day_finished=df_esm_preprocessed["esm_user_answer"].isin(
+            ["DayFinished3421", "DayOff3421"]
+        ),
+        # na=False: a missing trigger does not mark the session as finished.
+        esm_finished=df_esm_preprocessed["esm_trigger"].str.endswith(
+            "_last", na=False
+        ),
+    )
+    session_flags = row_flags.groupby(session_keys).any()
+
+    # The order matters: a label assigned later overrides an earlier one.
+    for label in ("esm_unanswered", "day_finished", "esm_finished"):
+        df_session_counts.loc[session_flags[label], "session_response"] = label
+
+    # TODO Look at evening-evening_last sequence, if everything is caught with finished sessions
+
+    # TODO What can be done about morning EMA, perhaps morning-morning_first (sic!) sequence?
+
+    # TODO What can be done about workday EMA.
+
+    # Return the classified sessions rather than the raw per-column counts,
+    # which would discard the labels computed above.
+    return df_session_counts