diff --git a/exploration/expl_esm.py b/exploration/expl_esm.py index 72bf376..6934b58 100644 --- a/exploration/expl_esm.py +++ b/exploration/expl_esm.py @@ -32,7 +32,9 @@ from features.esm import * # Only take data from the main part of the study. The pilot data have different structure, there were especially many additions to ESM_JSON. # %% -participants_inactive_usernames = participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01")) +participants_inactive_usernames = participants.query_db.get_usernames( + collection_start=datetime.date.fromisoformat("2020-08-01") +) df_esm_inactive = get_esm_data(participants_inactive_usernames) # %% @@ -47,7 +49,7 @@ df_esm_preprocessed.columns # %% [markdown] # The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day. -# Traditionally, concordance (adherence) in EMA study is simply calculated as the ratio of (daily) answered EMAs. +# Traditionally, concordance (adherence) in EMA study is simply calculated as the ratio of (daily) answered EMAs. # This is possible for studies with simple EMA design, such that they are presented at fixed schedule and expired within a certain limit. # # Since EMAs were triggered more flexibly in our study, a different approach is needed. @@ -59,7 +61,9 @@ df_esm_preprocessed.columns # One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first. # %% -session_counts = df_esm_preprocessed.groupby(["participant_id","esm_session"]).count()["id"] +session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"]).count()[ + "id" +] # %% [markdown] # Group data by participant_id and esm_session and count the number of instances (by id). 
Session counts are therefore counts of how many times a specific session ID appears *within* a specific participant. @@ -73,9 +77,13 @@ sns.displot(session_counts.to_numpy(), binwidth=1, height=8) # ### Unique session IDs # %% -df_session_counts = pd.DataFrame(session_counts).rename(columns={"id": "esm_session_count"}) +df_session_counts = pd.DataFrame(session_counts).rename( + columns={"id": "esm_session_count"} +) df_session_1 = df_session_counts[(df_session_counts["esm_session_count"] == 1)] -df_esm_unique_session = df_session_1.join(df_esm_preprocessed.set_index(["participant_id","esm_session"])) +df_esm_unique_session = df_session_1.join( + df_esm_preprocessed.set_index(["participant_id", "esm_session"]) +) # %% df_esm_unique_session["esm_user_answer"].value_counts() @@ -85,10 +93,14 @@ df_esm_unique_session["esm_user_answer"].value_counts() # What do the answers "Ne" represent? # %% -df_esm_unique_session.query("esm_user_answer == 'Ne'")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]].head() +df_esm_unique_session.query("esm_user_answer == 'Ne'")[ + ["esm_trigger", "esm_instructions", "esm_user_answer"] +].head() # %% -df_esm_unique_session.loc[df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger"].value_counts() +df_esm_unique_session.loc[ + df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger" +].value_counts() # %% [markdown] # These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs. @@ -103,22 +115,31 @@ df_esm_unique_session.loc[df_esm_unique_session["esm_user_answer"].str.contains( # There are some session IDs that only appear twice or three times. 
# %% -df_session_counts[(df_session_counts["esm_session_count"] < 4) & (df_session_counts["esm_session_count"] > 1)] +df_session_counts[ + (df_session_counts["esm_session_count"] < 4) + & (df_session_counts["esm_session_count"] > 1) +] # %% [markdown] # Some represent the morning EMAs that only contained three questions. # %% -df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]] +df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[ + ["esm_trigger", "esm_instructions", "esm_user_answer"] +] # %% -df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]] +df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[ + ["esm_trigger", "esm_instructions", "esm_user_answer"] +] # %% [markdown] # Others represent interrupted EMA sessions. # %% -df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]] +df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[ + ["esm_trigger", "esm_instructions", "esm_user_answer"] +] # %% [markdown] # ### Long sessions @@ -127,7 +148,9 @@ df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[[ "esm_tri df_session_counts[(df_session_counts["esm_session_count"] > 40)] # %% -df_esm_preprocessed.query("participant_id == 83").sort_values("_id")[[ "esm_trigger","datetime_lj", "_id", "username", "device_id"]] +df_esm_preprocessed.query("participant_id == 83").sort_values("_id")[ + ["esm_trigger", "datetime_lj", "_id", "username", "device_id"] +] # %% [markdown] # Both, session ID and \_ID (and others) reset on application reinstall. Here, it can be seen that the application was reinstalled on 2 April (actually, the phone was replaced as reported by the participant). 
@@ -135,11 +158,108 @@ df_esm_preprocessed.query("participant_id == 83").sort_values("_id")[[ "esm_trig # Session IDs should therefore be grouped while taking the device ID into account. # %% -session_counts_device = df_esm_preprocessed.groupby(["participant_id", "device_id", "esm_session"]).count()["id"] +session_counts_device = df_esm_preprocessed.groupby( + ["participant_id", "device_id", "esm_session"] +).count()["id"] sns.displot(session_counts_device.to_numpy(), binwidth=1, height=8) # %% [markdown] # ## Other possibilities # %% [markdown] -# There are also answers that describe what happened to a pending question: "Removed%" +# Prepare a dataframe with session response as determined from other indices. + +# %% +import numpy as np + +df_session_counts = pd.DataFrame(session_counts_device).rename( + columns={"id": "esm_session_count"} +) +df_session_counts["session_response"] = np.NaN +session_group_by = df_esm_preprocessed.groupby( + ["participant_id", "device_id", "esm_session"] +) +df_session_counts.count() + +# %% [markdown] +# ### ESM statuses + +# %% [markdown] +# The status of the ESM can be: 0-new, 1-dismissed, 2-answered, 3-expired, 4-visible, or 5-branched. +# +# Which statuses appear in the data? + +# %% +df_esm_preprocessed["esm_status"].value_counts() + +# %% [markdown] +# Most of the ESMs were answered (2). We can group all others as unanswered. 
+ +# %% +contains_status_not_2 = session_group_by.apply(lambda x: (x.esm_status != 2).any()) +df_session_counts.loc[contains_status_not_2, "session_response"] = "esm_unanswered" + +# %% +df_session_counts.count() + +# %% [markdown] +# ### Day finished or off + +# %% +non_session = session_group_by.apply( + lambda x: ( + (x.esm_user_answer == "DayFinished3421") | (x.esm_user_answer == "DayOff3421") + ).any() +) +df_session_counts.loc[non_session, "session_response"] = "day_finished" + +# %% +df_session_counts.count() + +# %% [markdown] +# ### Removed + +# %% [markdown] +# There are also answers that explicitly describe what happened to a pending question that start with "Removed%". + +# %% +esm_removed = session_group_by.apply( + lambda x: (x.esm_user_answer.str.contains("Removed")).any() +) + +# %% +df_session_counts.loc[esm_removed] + +# %% +df_session_counts.loc[esm_removed, "session_response"].value_counts() + +# %% [markdown] +# It turns out that these had been accounted for with ESM statuses. + +# %% [markdown] +# ### Evening_last + +# %% [markdown] +# When the evening EMA session comes to an end, the trigger should reflect this, that is, it should say `evening_last`. 
def classify_sessions_adherence(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.

    A session is identified by the combination of participant_id, device_id and
    esm_session. The device ID is part of the key because session IDs restart
    when the application is reinstalled or the phone is replaced.

    Each session gets at most one label. The rules are applied in the order
    below and a later rule overrides an earlier one:

    1. ``esm_unanswered``: at least one row has esm_status different from 2
       (2 is the "answered" status).
    2. ``day_finished``: at least one answer is "DayFinished3421" or "DayOff3421".
    3. ``esm_finished``: at least one esm_trigger ends with "_last".

    Sessions that match none of the rules keep a missing value (NaN).

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data. It must include the columns
        participant_id, device_id, esm_session, id, esm_status,
        esm_user_answer and esm_trigger.

    Returns
    -------
    pd.DataFrame
        One row per session, indexed by (participant_id, device_id, esm_session),
        with the columns:
        esm_session_count - the number of non-null ``id`` values in the session,
        session_response - the label described above, or NaN if unclassified.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    session_keys = ["participant_id", "device_id", "esm_session"]
    required_columns = [
        *session_keys,
        "id",
        "esm_status",
        "esm_user_answer",
        "esm_trigger",
    ]
    missing_columns = [
        column
        for column in required_columns
        if column not in df_esm_preprocessed.columns
    ]
    if missing_columns:
        raise KeyError(
            f"df_esm_preprocessed is missing required columns: {missing_columns}"
        )

    df_session_counts = (
        df_esm_preprocessed.groupby(session_keys)["id"]
        .count()
        .to_frame("esm_session_count")
    )
    # Use object dtype from the start: the column holds string labels, and
    # writing strings into a float (NaN) column is deprecated in recent pandas.
    df_session_counts["session_response"] = pd.Series(
        np.nan, index=df_session_counts.index, dtype=object
    )

    # Evaluate every rule row-wise once, then reduce per session with any().
    row_flags = df_esm_preprocessed[session_keys].assign(
        esm_unanswered=df_esm_preprocessed["esm_status"] != 2,
        day_finished=df_esm_preprocessed["esm_user_answer"].isin(
            ["DayFinished3421", "DayOff3421"]
        ),
        # na=False: a missing trigger must count as "does not end with _last".
        esm_finished=df_esm_preprocessed["esm_trigger"].str.endswith(
            "_last", na=False
        ),
    )
    # Same keys and default groupby options as the counts above, so the rows
    # of session_flags line up one-to-one with df_session_counts.
    session_flags = row_flags.groupby(session_keys).any()

    # Order matters: later labels overwrite earlier ones.
    for response in ("esm_unanswered", "day_finished", "esm_finished"):
        is_hit = session_flags[response].to_numpy(dtype=bool)
        df_session_counts.loc[is_hit, "session_response"] = response

    # TODO Look at evening-evening_last sequence, if everything is caught with finished sessions

    # TODO What can be done about morning EMA, perhaps morning-morning_first (sic!) sequence?

    # TODO What can be done about workday EMA.

    # Return the classified sessions, not the raw per-column counts.
    return df_session_counts