Analyze adherence:

Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion.
2021-06-11 20:28:24 +02:00 · 2021-06-11 20:28:24 +02:00 · 23c3613c60
parent 7a12f68dfe
commit 23c3613c60
2 changed files with 202 additions and 9 deletions
--- a/exploration/expl_esm.py
+++ b/exploration/expl_esm.py
@ -328,7 +328,7 @@ df_esm_session_7 = df_session_7.join(
    how="left",
 )

-# %% jupyter={"outputs_hidden": true} tags=[]
+# %% tags=[]
 with pd.option_context(
    "display.max_rows", None, "display.max_columns", None
 ):  # more options can be specified also
@ -347,7 +347,7 @@ df_esm_session_27 = df_session_27.join(
    how="left",
 )

-# %% jupyter={"outputs_hidden": true} tags=[]
+# %% tags=[]
 with pd.option_context(
    "display.max_rows", None, "display.max_columns", None
 ):  # more options can be specified also
@ -357,3 +357,28 @@ with pd.option_context(
 # These are all morning questionnaires with morning *and* workday items, with the feedback added and also branched in the longest possible way.

 # %%
+df_session_6 = df_session_counts[
+    (df_session_counts["esm_session_count"] == 6)
+    & df_session_counts.session_response.isna()
+]
+df_esm_session_6 = df_session_6.join(
+    df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"]),
+    how="left",
+)
+
+# %%
+display(df_esm_session_6[["esm_trigger", "esm_instructions", "esm_user_answer"]])
+
+# %%
+df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
+df_session_time = classify_sessions_by_time(df_esm_preprocessed)
+
+# %% [markdown]
+# The sessions were classified by time by taking the **first** record in a session.
+# However, a morning questionnaire could seamlessly transition into a daytime questionnaire, if the participant was already at work.
+# In this case, the "time" label changed mid-session.
+#
+# Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
+# But for all intents and purposes, it can be treated as a "daytime" EMA.
+#
+# This is corrected in `classify_sessions_by_completion_time`
--- a/statistical_analysis/concordance.py
+++ b/statistical_analysis/concordance.py
@ -55,19 +55,19 @@ df_esm_inactive = get_esm_data(participants_inactive_usernames)

 # %%
 df_esm_preprocessed = preprocess_esm(df_esm_inactive)
-df_session_counts = classify_sessions_adherence(df_esm_preprocessed)
+df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocessed)

 # %%
-tbl_session_outcomes = df_session_counts.reset_index()[
+tbl_session_outcomes = df_session_counts_time.reset_index()[
    "session_response"
 ].value_counts()

 # %%
-print("All sessions:", len(df_session_counts))
+print("All sessions:", len(df_session_counts_time))
 print("-------------------------------------")
 print(tbl_session_outcomes)
 print("-------------------------------------")
-print(tbl_session_outcomes / len(df_session_counts))
+print(tbl_session_outcomes / len(df_session_counts_time))

 # %%
 VARIABLES_TO_TRANSLATE = {
@ -83,11 +83,11 @@ baseline_inactive = baseline_inactive.assign(
 )

 # %%
-df_session_counts
+df_session_counts_time

 # %%
-df_session_finished = df_session_counts[
-    df_session_counts["session_response"] == "esm_finished"
+df_session_finished = df_session_counts_time[
+    df_session_counts_time["session_response"] == SESSION_STATUS_COMPLETE
 ].reset_index()

 # %%
@ -132,3 +132,171 @@ lr_ols = smf.ols(
 )
 ls_result = lr_ols.fit()
 ls_result.summary()
+
+# %% [markdown]
+# # Concordance by type
+
+# %% [markdown]
+# ## Workday EMA
+
+# %% [markdown]
+# ### Filter the EMA of interest.
+
+# %% [markdown]
+# Work with only completed EMA.
+
+# %% tags=[]
+df_session_counts_time_completed = df_session_counts_time[
+    df_session_counts_time.session_response == SESSION_STATUS_COMPLETE
+]  # Same status constant as the df_session_finished filter; assumes it is "ema_completed".
+
+# %% [markdown]
+# To be able to compare EMA sessions *within* one day, add a date-part column.
+#
+# **NOTE**: Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM, the datetime is first shifted 4 h earlier.
+
+# %%
+df_session_counts_time_completed = df_session_counts_time_completed.assign(
+    date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
+)
+
+# %%
+df_session_counts_time_completed
+
+# %% [markdown]
+# Next, calculate the differences between subsequent records. But first, group the records by participant and device ID (as usual) and by *time*. This way, the differences are calculated between EMA sessions of the same type.
+
+# %% tags=[]
+df_session_time_diff = (
+    df_session_counts_time_completed[["datetime_lj", "date_lj", "time"]]
+    .groupby(["participant_id", "device_id", "time"])
+    .diff()
+    .rename(
+        columns={
+            "datetime_lj": "previous_same_type_time_diff",
+            "date_lj": "time_diff_days",
+        }
+    )
+)
+
+# %%
+df_session_time_diff
+
+# %% tags=[]
+df_session_counts_time_diff = df_session_counts_time_completed.join(
+    df_session_time_diff, how="left"
+)
+
+# %% [markdown]
+# Now, select only the daytime EMAs of interest. Discard the differences between *different day* EMAs.
+
+# %%
+time_workday_completed_less_than_1_day = (
+    (df_session_counts_time_diff.time == "daytime")  # Only take daytime EMAs.
+    & ~(
+        df_session_counts_time_diff.previous_same_type_time_diff.isna()
+    )  # Only where the diff was actually calculated.
+    & (df_session_counts_time_diff.time_diff_days == datetime.timedelta(0))
+)  # Only take differences *within* a day.
+
+# %% tags=[]
+df_session_workday = df_session_counts_time_diff[time_workday_completed_less_than_1_day]
+
+# %%
+df_session_workday = df_session_workday.assign(
+    time_diff_minutes=lambda x: x.previous_same_type_time_diff.dt.total_seconds() / 60
+)  # total_seconds() keeps the days part and the sign, which .seconds drops.
+
+# %%
+sns.displot(df_session_workday["time_diff_minutes"], binwidth=5, height=5)
+
+# %% [markdown]
+# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.
+
+# %%
+df_session_workday[df_session_workday.time_diff_minutes < 30]
+
+# %% [markdown]
+# There are only 2 instances; look at them individually.
+
+# %%
+df_esm_preprocessed.loc[
+    (df_esm_preprocessed.participant_id == 35) & (df_esm_preprocessed.esm_session == 6),
+    ["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"],
+]
+
+# %%
+df_esm_preprocessed.loc[
+    (df_esm_preprocessed.participant_id == 45)
+    & (df_esm_preprocessed.esm_session < 3)
+    & (df_esm_preprocessed.device_id == "d848b1c4-33cc-4e22-82ae-96d6b6458a33"),
+    ["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"],
+]
+
+# %% [markdown]
+# As these signify bugs, we can safely discard them in the following analysis.
+
+# %%
+df_session_workday = df_session_workday[df_session_workday.time_diff_minutes >= 30]
+
+# %% [markdown]
+# ### All participants
+
+# %%
+df_session_workday.describe()
+
+# %% [markdown]
+# These statistics look reasonable.
+
+# %% [markdown]
+# ### Differences between participants
+
+# %%
+df_mean_daytime_interval = df_session_workday.groupby("participant_id").mean()
+
+# %%
+df_mean_daytime_interval.describe()
+
+# %%
+sns.displot(df_mean_daytime_interval.time_diff_minutes, binwidth=5, height=5)
+
+# %%
+df_count_daytime_per_participant = df_session_workday.groupby(
+    ["participant_id", "date_lj"]
+).count()
+
+# %%
+df_count_daytime_per_participant["time"].describe()
+
+# %%
+sns.displot(df_count_daytime_per_participant.time, binwidth=1, height=5)
+
+# %% [markdown]
+# ## Evening EMA
+
+# %% [markdown]
+# For evening EMA, determine, for each day on which any EMA session was completed, whether an evening EMA is also present.
+#
+# Note that we are only dealing with true EMA sessions; non-sessions etc. have already been filtered out.
+
+# %%
+s_evening_completed = df_session_counts_time_completed.groupby(
+    ["participant_id", "device_id", "date_lj"]
+).apply(lambda x: (x.time == "evening").any())
+
+# %%
+df_session_counts_time_completed
+
+# %%
+s_evening_completed_ratio = (
+    s_evening_completed.groupby("participant_id").sum()
+    / s_evening_completed.groupby("participant_id").count()
+)
+
+# %%
+s_evening_completed_ratio.describe()
+
+# %%
+sns.displot(s_evening_completed_ratio - 0.001, binwidth=0.1, height=5)
+
+# %%