diff --git a/exploration/expl_esm.py b/exploration/expl_esm.py index 5f469f4..6e332c2 100644 --- a/exploration/expl_esm.py +++ b/exploration/expl_esm.py @@ -34,9 +34,7 @@ from features.esm import * # Only take data from the main part of the study. The pilot data have different structure, there were especially many additions to ESM_JSON. # %% -participants_inactive_usernames = participants.query_db.get_usernames( - collection_start=datetime.date.fromisoformat("2020-08-01") -) +participants_inactive_usernames = participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01")) df_esm_inactive = get_esm_data(participants_inactive_usernames) # %% @@ -51,7 +49,7 @@ df_esm_preprocessed.columns # %% [markdown] # The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day. -# Traditionally, concordance (adherence) in EMA study is simply calculated as the ratio of (daily) answered EMAs. +# Traditionally, concordance (adherence) in EMA study is simply calculated as the ratio of (daily) answered EMAs. # This is possible for studies with simple EMA design, such that they are presented at fixed schedule and expired within a certain limit. # # Since EMAs were triggered more flexibly in our study, a different approach is needed. @@ -63,9 +61,12 @@ df_esm_preprocessed.columns # One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first. # %% -session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"])[ - "esm_session" -].count() +session_counts = df_esm_preprocessed.groupby(["participant_id","esm_session"]).count()["id"] + +# %% [markdown] +# Group data by participant_id and esm_session and count the number of instances (by id). 
Session counts are therefore counts of how many times a specific session ID appears *within* a specific participant. +# +# In the plot below, it is impossible to distinguish whether a specific count appears many times within the same participant or across different participants. # %% sns.displot(session_counts.to_numpy(), binwidth=1, height=8) @@ -74,11 +75,9 @@ sns.displot(session_counts.to_numpy(), binwidth=1, height=8) # ### Unique session IDs # %% -df_session_counts = pd.DataFrame(session_counts) -df_session_1 = df_session_counts[(df_session_counts["esm_session"] == 1)] -df_esm_unique_session = df_session_1.join( - df_esm_preprocessed.set_index(["participant_id", "esm_session"]) -) +df_session_counts = pd.DataFrame(session_counts).rename(columns={"id": "esm_session_count"}) +df_session_1 = df_session_counts[(df_session_counts["esm_session_count"] == 1)] +df_esm_unique_session = df_session_1.join(df_esm_preprocessed.set_index(["participant_id","esm_session"])) # %% df_esm_unique_session["esm_user_answer"].value_counts() @@ -88,14 +87,10 @@ df_esm_unique_session["esm_user_answer"].value_counts() # What do the answers "Ne" represent? # %% -df_esm_unique_session.query("esm_user_answer == 'Ne'")[ - ["esm_trigger", "esm_instructions", "esm_user_answer"] -].head() +df_esm_unique_session.query("esm_user_answer == 'Ne'")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]].head() # %% -df_esm_unique_session.loc[ - df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger" -].value_counts() +df_esm_unique_session.loc[df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger"].value_counts() # %% [markdown] # These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs. @@ -110,30 +105,36 @@ df_esm_unique_session.loc[ # There are some session IDs that only appear twice or three times. 
# %% -df_session_counts[ - (df_session_counts["esm_session"] < 4) & (df_session_counts["esm_session"] > 1) -] +df_session_counts[(df_session_counts["esm_session_count"] < 4) & (df_session_counts["esm_session_count"] > 1)] # %% [markdown] # Some represent the morning EMAs that only contained three questions. # %% -df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[ - ["esm_trigger", "esm_instructions", "esm_user_answer"] -] +df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]] # %% -df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[ - ["esm_trigger", "esm_instructions", "esm_user_answer"] -] +df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]] # %% [markdown] # Others represent interrupted EMA sessions. # %% -df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[ - ["esm_trigger", "esm_instructions", "esm_user_answer"] -] +df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[[ "esm_trigger", "esm_instructions", "esm_user_answer"]] + +# %% [markdown] +# ### Long sessions + +# %% +df_session_counts[(df_session_counts["esm_session_count"] > 40)] + +# %% +df_esm_preprocessed.query("participant_id == 83").sort_values("_id")[[ "esm_trigger","datetime_lj", "_id", "username"]] + +# %% [markdown] +# Both the session ID and \_ID (and others) are reset when the application is reinstalled. Here, it can be seen that the application was reinstalled on 2 April (in fact, the phone was replaced, as reported by the participant). +# +# Session IDs should therefore be grouped while taking the timestamp into account (e.g. by sorting first). # %% [markdown] # ## Other possibilities