[WIP] Prepare a function to classify adherence and illustrate steps in Jupyter Notebook.

communication
junos 2021-06-07 19:32:38 +02:00
parent 224dedaced
commit d5cd76f05a
2 changed files with 186 additions and 14 deletions

View File

@ -32,7 +32,9 @@ from features.esm import *
# Only take data from the main part of the study. The pilot data have different structure, there were especially many additions to ESM_JSON. # Only take data from the main part of the study. The pilot data have different structure, there were especially many additions to ESM_JSON.
# %% # %%
participants_inactive_usernames = participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01")) participants_inactive_usernames = participants.query_db.get_usernames(
collection_start=datetime.date.fromisoformat("2020-08-01")
)
df_esm_inactive = get_esm_data(participants_inactive_usernames) df_esm_inactive = get_esm_data(participants_inactive_usernames)
# %% # %%
@ -47,7 +49,7 @@ df_esm_preprocessed.columns
# %% [markdown] # %% [markdown]
# The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day. # The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day.
# Traditionally, concordance (adherence) in EMA study is simply calculated as the ratio of (daily) answered EMAs. # Traditionally, concordance (adherence) in EMA study is simply calculated as the ratio of (daily) answered EMAs.
# This is possible for studies with simple EMA design, such that they are presented at fixed schedule and expired within a certain limit. # This is possible for studies with simple EMA design, such that they are presented at fixed schedule and expired within a certain limit.
# #
# Since EMAs were triggered more flexibly in our study, a different approach is needed. # Since EMAs were triggered more flexibly in our study, a different approach is needed.
@ -59,7 +61,9 @@ df_esm_preprocessed.columns
# One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first. # One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first.
# %% # %%
session_counts = df_esm_preprocessed.groupby(["participant_id","esm_session"]).count()["id"] session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"]).count()[
"id"
]
# %% [markdown] # %% [markdown]
# Group data by participant_id and esm_session and count the number of instances (by id). Session counts are therefore counts of how many times a specific session ID appears *within* a specific participant. # Group data by participant_id and esm_session and count the number of instances (by id). Session counts are therefore counts of how many times a specific session ID appears *within* a specific participant.
@ -73,9 +77,13 @@ sns.displot(session_counts.to_numpy(), binwidth=1, height=8)
# ### Unique session IDs # ### Unique session IDs
# %% # %%
df_session_counts = pd.DataFrame(session_counts).rename(columns={"id": "esm_session_count"}) df_session_counts = pd.DataFrame(session_counts).rename(
columns={"id": "esm_session_count"}
)
df_session_1 = df_session_counts[(df_session_counts["esm_session_count"] == 1)] df_session_1 = df_session_counts[(df_session_counts["esm_session_count"] == 1)]
df_esm_unique_session = df_session_1.join(df_esm_preprocessed.set_index(["participant_id","esm_session"])) df_esm_unique_session = df_session_1.join(
df_esm_preprocessed.set_index(["participant_id", "esm_session"])
)
# %% # %%
df_esm_unique_session["esm_user_answer"].value_counts() df_esm_unique_session["esm_user_answer"].value_counts()
@ -85,10 +93,14 @@ df_esm_unique_session["esm_user_answer"].value_counts()
# What do the answers "Ne" represent? # What do the answers "Ne" represent?
# %% # %%
df_esm_unique_session.query("esm_user_answer == 'Ne'")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]].head() df_esm_unique_session.query("esm_user_answer == 'Ne'")[
["esm_trigger", "esm_instructions", "esm_user_answer"]
].head()
# %% # %%
df_esm_unique_session.loc[df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger"].value_counts() df_esm_unique_session.loc[
df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger"
].value_counts()
# %% [markdown] # %% [markdown]
# These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs. # These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs.
@ -103,22 +115,31 @@ df_esm_unique_session.loc[df_esm_unique_session["esm_user_answer"].str.contains(
# There are some session IDs that only appear twice or three times. # There are some session IDs that only appear twice or three times.
# %% # %%
df_session_counts[(df_session_counts["esm_session_count"] < 4) & (df_session_counts["esm_session_count"] > 1)] df_session_counts[
(df_session_counts["esm_session_count"] < 4)
& (df_session_counts["esm_session_count"] > 1)
]
# %% [markdown] # %% [markdown]
# Some represent the morning EMAs that only contained three questions. # Some represent the morning EMAs that only contained three questions.
# %% # %%
df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]] df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[
["esm_trigger", "esm_instructions", "esm_user_answer"]
]
# %% # %%
df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]] df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[
["esm_trigger", "esm_instructions", "esm_user_answer"]
]
# %% [markdown] # %% [markdown]
# Others represent interrupted EMA sessions. # Others represent interrupted EMA sessions.
# %% # %%
df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]] df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[
["esm_trigger", "esm_instructions", "esm_user_answer"]
]
# %% [markdown] # %% [markdown]
# ### Long sessions # ### Long sessions
@ -127,7 +148,9 @@ df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[[ "esm_tri
df_session_counts[(df_session_counts["esm_session_count"] > 40)] df_session_counts[(df_session_counts["esm_session_count"] > 40)]
# %% # %%
df_esm_preprocessed.query("participant_id == 83").sort_values("_id")[[ "esm_trigger","datetime_lj", "_id", "username", "device_id"]] df_esm_preprocessed.query("participant_id == 83").sort_values("_id")[
["esm_trigger", "datetime_lj", "_id", "username", "device_id"]
]
# %% [markdown] # %% [markdown]
# Both, session ID and \_ID (and others) reset on application reinstall. Here, it can be seen that the application was reinstalled on 2 April (actually, the phone was replaced as reported by the participant). # Both, session ID and \_ID (and others) reset on application reinstall. Here, it can be seen that the application was reinstalled on 2 April (actually, the phone was replaced as reported by the participant).
@ -135,11 +158,108 @@ df_esm_preprocessed.query("participant_id == 83").sort_values("_id")[[ "esm_trig
# Session IDs should therefore be grouped while taking the device ID into account. # Session IDs should therefore be grouped while taking the device ID into account.
# %% # %%
session_counts_device = df_esm_preprocessed.groupby(["participant_id", "device_id", "esm_session"]).count()["id"] session_counts_device = df_esm_preprocessed.groupby(
["participant_id", "device_id", "esm_session"]
).count()["id"]
sns.displot(session_counts_device.to_numpy(), binwidth=1, height=8) sns.displot(session_counts_device.to_numpy(), binwidth=1, height=8)
# %% [markdown] # %% [markdown]
# ## Other possibilities # ## Other possibilities
# %% [markdown] # %% [markdown]
# There are also answers that describe what happened to a pending question: "Removed%" # Prepare a dataframe with session response as determined from other indices.
# %%
import numpy as np

# One row per (participant, device, session) with the number of ESM rows in it.
df_session_counts = pd.DataFrame(session_counts_device).rename(
    columns={"id": "esm_session_count"}
)
# Start with every session unclassified.
# Use np.nan: the np.NaN alias was removed in NumPy 2.0.
df_session_counts["session_response"] = np.nan
# Reusable grouping for the per-session checks in the cells below.
session_group_by = df_esm_preprocessed.groupby(
    ["participant_id", "device_id", "esm_session"]
)
df_session_counts.count()
# %% [markdown]
# ### ESM statuses
# %% [markdown]
# The status of the ESM can be: 0-new, 1-dismissed, 2-answered, 3-expired, 4-visible, or 5-branched.
#
# Which statuses appear in the data?
# %%
df_esm_preprocessed["esm_status"].value_counts()
# %% [markdown]
# Most of the ESMs were answered (2). We can group all others as unanswered.
# %%
# Flag every session in which at least one ESM has a status other than 2 (answered).
contains_status_not_2 = session_group_by.apply(
    lambda session: session["esm_status"].ne(2).any()
)
df_session_counts.loc[contains_status_not_2, "session_response"] = "esm_unanswered"
# %%
df_session_counts.count()
# %% [markdown]
# ### Day finished or off
# %%
# Flag every session that contains a "day finished" or "day off" answer.
non_session = session_group_by.apply(
    lambda session: session["esm_user_answer"]
    .isin(["DayFinished3421", "DayOff3421"])
    .any()
)
df_session_counts.loc[non_session, "session_response"] = "day_finished"
# %%
df_session_counts.count()
# %% [markdown]
# ### Removed
# %% [markdown]
# Some answers start with "Removed" (i.e. they match "Removed%") and explicitly describe what happened to a pending question.
# %%
# Flag every session in which at least one answer contains "Removed".
# na=False treats missing answers as non-matching, so the mask stays strictly boolean.
esm_removed = session_group_by.apply(
    lambda x: (x.esm_user_answer.str.contains("Removed", na=False)).any()
)
# %%
df_session_counts.loc[esm_removed]
# %%
df_session_counts.loc[esm_removed, "session_response"].value_counts()
# %% [markdown]
# It turns out that these had been accounted for with ESM statuses.
# %% [markdown]
# ### Evening_last
# %% [markdown]
# When the evening EMA session comes to an end, the trigger should reflect this, that is, it should say `evening_last`.
# %%
# Flag every session in which at least one trigger ends with "_last".
# na=False treats missing triggers as non-matching, so the mask stays strictly boolean.
finished_sessions = session_group_by.apply(
    lambda x: (x.esm_trigger.str.endswith("_last", na=False)).any()
)
df_session_counts.loc[finished_sessions, "session_response"] = "esm_finished"
# %%
df_session_counts.count()
# %%
df_esm_preprocessed["esm_trigger"].value_counts()
# %%
# Plot the distribution of esm_session_count for the sessions that are still
# unclassified, i.e. whose session_response is missing.
sns.displot(
df_session_counts[df_session_counts.session_response.isna()],
x="esm_session_count",
binwidth=1,
height=8,
)
# %%

View File

@ -1,6 +1,7 @@
import datetime import datetime
from collections.abc import Collection from collections.abc import Collection
import numpy as np
import pandas as pd import pandas as pd
from pytz import timezone from pytz import timezone
@ -55,3 +56,54 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
columns=["esm_trigger"] columns=["esm_trigger"]
) # The esm_trigger column is already present in the main df. ) # The esm_trigger column is already present in the main df.
return df_esm.join(df_esm_json) return df_esm.join(df_esm_json)
def classify_sessions_adherence(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.

    A session is identified by the combination of participant_id, device_id and esm_session.
    The classification is done in several steps; each later step overrides the earlier ones:

    1. esm_unanswered: at least one ESM in the session has a status other than 2.
    2. day_finished: at least one answer is "DayFinished3421" or "DayOff3421".
    3. esm_finished: at least one trigger ends with "_last".

    Sessions matched by none of the steps keep a missing (NaN) session_response.

    #TODO Finish the documentation.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data, which must include the columns
        participant_id, device_id, esm_session, id, esm_status, esm_user_answer and esm_trigger.

    Returns
    -------
    df_session_counts: pd.DataFrame
        A dataframe indexed by (participant_id, device_id, esm_session) with two columns:
        esm_session_count (the number of non-missing id values in the session)
        and session_response (one of the labels above or NaN).
    """
    session_keys = ["participant_id", "device_id", "esm_session"]
    sessions_grouped = df_esm_preprocessed.groupby(session_keys)

    df_session_counts = (
        sessions_grouped["id"].count().rename("esm_session_count").to_frame()
    )
    # Use an object column so that string labels can be written without an upcast,
    # and np.nan rather than np.NaN (the latter was removed in NumPy 2.0).
    df_session_counts["session_response"] = pd.Series(
        np.nan, index=df_session_counts.index, dtype=object
    )

    def any_in_session(row_mask: pd.Series) -> pd.Series:
        # Reduce a row-wise boolean mask to one boolean per session.
        return (
            df_esm_preprocessed[session_keys]
            .assign(_flag=row_mask)
            .groupby(session_keys)["_flag"]
            .any()
        )

    esm_not_answered = any_in_session(df_esm_preprocessed["esm_status"].ne(2))
    df_session_counts.loc[esm_not_answered, "session_response"] = "esm_unanswered"

    non_session = any_in_session(
        df_esm_preprocessed["esm_user_answer"].isin(["DayFinished3421", "DayOff3421"])
    )
    df_session_counts.loc[non_session, "session_response"] = "day_finished"

    # na=False: a missing trigger cannot mark a session as finished.
    finished_sessions = any_in_session(
        df_esm_preprocessed["esm_trigger"].str.endswith("_last", na=False)
    )
    df_session_counts.loc[finished_sessions, "session_response"] = "esm_finished"

    # TODO Look at evening-evening_last sequence, if everything is caught with finished sessions
    # TODO What can be done about morning EMA, perhaps morning-morning_first (sic!) sequence?
    # TODO What can be done about workday EMA.

    # Return the classification itself, not the raw grouped counts.
    return df_session_counts