# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
# `datetime` and `pd` are imported explicitly because the cells below use them
# directly; previously they were only available through the star import.
import datetime
import os
import sys

import pandas as pd
import seaborn as sns

# Put the parent of the working directory on sys.path before importing the
# project packages (assumes the notebook is run from a subdirectory of the
# repository -- TODO confirm).
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import participants.query_db
from features.esm import *

# %% [markdown]
# # ESM data

# %% [markdown]
# Only take data from the main part of the study. The pilot data have a different structure; in particular, there were many additions to ESM_JSON.

# %%
# Usernames of participants whose data collection started on or after 1 August 2020.
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
df_esm_inactive = get_esm_data(participants_inactive_usernames)

# %%
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
df_esm_preprocessed.head()

# %%
df_esm_preprocessed.columns

# %% [markdown]
# # Concordance

# %% [markdown]
# The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day.
# Traditionally, concordance (adherence) in an EMA study is simply calculated as the ratio of (daily) answered EMAs.
# This is possible for studies with a simple EMA design, in which EMAs are presented on a fixed schedule and expire within a certain limit.
#
# Since EMAs were triggered more flexibly in our study, a different approach is needed.

# %% [markdown]
# ## Session IDs

# %% [markdown]
# One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first.
# %%
# Number of ESM records per (participant, session ID). Selecting "id" before
# counting avoids counting every other column only to discard the result.
session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"])[
    "id"
].count()

# %% [markdown]
# Group data by participant_id and esm_session and count the number of instances (by id). Session counts are therefore counts of how many times a specific session ID appears *within* a specific participant.
#
# In the plot below, it is impossible to distinguish whether a specific count appears many times within the same or across different participants.

# %%
sns.displot(session_counts.to_numpy(), binwidth=1, height=8)

# %% [markdown]
# ### Unique session IDs

# %%
df_session_counts = pd.DataFrame(session_counts).rename(
    columns={"id": "esm_session_count"}
)
# Sessions whose ID occurs exactly once for a participant, joined back to
# their ESM records.
df_session_1 = df_session_counts[(df_session_counts["esm_session_count"] == 1)]
df_esm_unique_session = df_session_1.join(
    df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"])
)

# %%
df_esm_unique_session["esm_user_answer"].value_counts()

# %% [markdown]
# The "DayFinished3421" tag marks the last EMA, where the participant only marked "I finished with work for today" and did not answer any questions.
# What do the answers "Ne" represent?

# %%
df_esm_unique_session.query("esm_user_answer == 'Ne'")[
    ["esm_trigger", "esm_instructions", "esm_user_answer"]
].head()

# %%
# na=False: a missing answer must count as "no match"; otherwise the mask
# contains NaN and boolean indexing with .loc raises.
df_esm_unique_session.loc[
    df_esm_unique_session["esm_user_answer"].str.contains("Ne", na=False),
    "esm_trigger",
].value_counts()

# %% [markdown]
# These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs.

# %% [markdown]
# The other answers signify expired or interrupted EMAs.

# %% [markdown]
# ### "Almost" unique session IDs

# %% [markdown]
# There are some session IDs that only appear two or three times.

# %%
df_session_counts[
    (df_session_counts["esm_session_count"] < 4)
    & (df_session_counts["esm_session_count"] > 1)
]

# %% [markdown]
# Some represent the morning EMAs that only contained three questions.
# %%
df_esm_preprocessed.loc[
    (df_esm_preprocessed["participant_id"] == 89)
    & (df_esm_preprocessed["esm_session"] == 158),
    ["esm_trigger", "esm_instructions", "esm_user_answer"],
]

# %%
df_esm_preprocessed.loc[
    (df_esm_preprocessed["participant_id"] == 89)
    & (df_esm_preprocessed["esm_session"] == 157),
    ["esm_trigger", "esm_instructions", "esm_user_answer"],
]

# %% [markdown]
# Others represent interrupted EMA sessions.

# %%
df_esm_preprocessed.loc[
    (df_esm_preprocessed["participant_id"] == 31)
    & (df_esm_preprocessed["esm_session"] == 77),
    ["esm_trigger", "esm_instructions", "esm_user_answer"],
]

# %% tags=[]
# All ESM records belonging to sessions that consist of exactly two records.
df_esm_2 = pd.merge(
    df_session_counts[df_session_counts["esm_session_count"] == 2].reset_index(),
    df_esm_preprocessed,
    how="left",
    on=["participant_id", "esm_session"],
)
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(df_esm_2)

# %% [markdown] tags=[]
# ### Long sessions

# %%
df_session_counts[(df_session_counts["esm_session_count"] > 40)]

# %%
# Order participant 83's records by _id, then keep the identifying columns.
df_esm_preprocessed[df_esm_preprocessed["participant_id"] == 83].sort_values("_id")[
    ["esm_trigger", "datetime_lj", "_id", "username", "device_id"]
]

# %% [markdown]
# Both session ID and \_ID (and others) reset on application reinstall. Here, it can be seen that the application was reinstalled on 2 April (actually, the phone was replaced as reported by the participant).
#
# Session IDs should therefore be grouped while taking the device ID into account.

# %%
# Same count as before, but with device_id as an additional grouping key.
session_counts_device = df_esm_preprocessed.groupby(
    ["participant_id", "device_id", "esm_session"]
)["id"].count()
sns.displot(session_counts_device.to_numpy(), binwidth=1, height=8)

# %% [markdown]
# ## Other possibilities

# %% [markdown]
# Prepare a dataframe with session response as determined from other indices.
# %%
import numpy as np

df_session_counts = pd.DataFrame(session_counts_device).rename(
    columns={"id": "esm_session_count"}
)
# Create the label column as an all-missing *object* column. Plain
# `= np.nan` would create a float64 column, and writing string labels into it
# later is a silent upcast that pandas >= 2.1 deprecates.
df_session_counts["session_response"] = pd.Series(
    np.nan, index=df_session_counts.index, dtype="object"
)
# Reused by the cells below to evaluate one condition per session.
session_group_by = df_esm_preprocessed.groupby(
    ["participant_id", "device_id", "esm_session"]
)
df_session_counts.count()

# %% [markdown]
# ### ESM statuses

# %% [markdown]
# The status of the ESM can be: 0-new, 1-dismissed, 2-answered, 3-expired, 4-visible, or 5-branched.
#
# Which statuses appear in the data?

# %%
df_esm_preprocessed["esm_status"].value_counts()

# %% [markdown]
# Most of the ESMs were answered (2). We can group all others as unanswered.

# %%
# A session is labelled unanswered as soon as one of its records has a status
# other than 2.
contains_status_not_2 = session_group_by.apply(lambda x: (x.esm_status != 2).any())
df_session_counts.loc[contains_status_not_2, "session_response"] = "esm_unanswered"

# %%
df_session_counts.count()

# %% [markdown]
# ### Day finished or off

# %%
non_session = session_group_by.apply(
    lambda x: x.esm_user_answer.isin(
        ["DayFinished3421", "DayOff3421", "DayFinishedSetEvening"]
    ).any()
)
df_session_counts.loc[non_session, "session_response"] = "day_finished"

# %%
df_session_counts.count()

# %% [markdown]
# ### Removed

# %% [markdown]
# There are also answers starting with "Removed%" that explicitly describe what happened to a pending question.

# %%
# na=False keeps the per-session result a plain boolean even when an answer
# is missing, so it can be used as a mask below.
esm_removed = session_group_by.apply(
    lambda x: x.esm_user_answer.str.contains("Removed", na=False).any()
)

# %%
df_session_counts.loc[esm_removed]

# %%
df_session_counts.loc[esm_removed, "session_response"].value_counts()

# %% [markdown]
# It turns out that these had been accounted for with ESM statuses.
# %% [markdown]
# ### Singleton sessions

# %%
df_session_counts.count()

# %%
# Single-record sessions that have not been labelled yet. The selection is
# computed once here: it is displayed by this cell and reused by the next one.
df_session_1 = df_session_counts[
    (df_session_counts["esm_session_count"] == 1)
    & df_session_counts.session_response.isna()
]
df_session_1

# %%
df_esm_unique_session = df_session_1.join(
    df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"])
)
# For these sessions, the trigger of their only record becomes the label.
df_esm_unique_session = df_esm_unique_session["esm_trigger"].rename("session_response")

# %%
df_session_counts.loc[
    df_esm_unique_session.index, "session_response"
] = df_esm_unique_session

# %%
df_session_counts.count()

# %% [markdown]
# ### Evening_last

# %% [markdown]
# When the evening EMA session comes to an end, the trigger should reflect this, that is, it should say `evening_last`.

# %%
# na=False: a record without a trigger counts as "not last", so the
# per-session result stays a plain boolean mask.
finished_sessions = session_group_by.apply(
    lambda x: x.esm_trigger.str.endswith("_last", na=False).any()
)
df_session_counts.loc[finished_sessions, "session_response"] = "esm_finished"

# %%
df_session_counts.count()

# %%
df_esm_preprocessed["esm_trigger"].value_counts()

# %%
# Distribution of session lengths among sessions that are still unlabelled.
sns.displot(
    df_session_counts[df_session_counts.session_response.isna()],
    x="esm_session_count",
    binwidth=1,
    height=8,
)

# %% [markdown]
# ### Repeated sessions

# %% [markdown]
# The session lengths that repeat often can probably be used as filled-in EMAs. Let's only review the session lengths that are rare.
# %%
df_session_counts.loc[
    df_session_counts.session_response.isna(), "esm_session_count"
].value_counts().sort_index()


# %%
def unclassified_sessions_of_length(session_counts, n_records):
    """Return the rows of `session_counts` that have no label and a given length.

    Parameters
    ----------
    session_counts : pandas.DataFrame
        Frame with the columns ``esm_session_count`` and ``session_response``.
    n_records : int
        Value of ``esm_session_count`` to select.

    Returns
    -------
    pandas.DataFrame
        Rows where ``esm_session_count == n_records`` and ``session_response``
        is missing.
    """
    return session_counts[
        (session_counts["esm_session_count"] == n_records)
        & session_counts["session_response"].isna()
    ]


def attach_esm_records(sessions, esm):
    """Left-join the ESM records in `esm` onto the selected `sessions`.

    `esm` is indexed by participant_id, device_id and esm_session for the join,
    so it must contain those three columns.
    """
    return sessions.join(
        esm.set_index(["participant_id", "device_id", "esm_session"]),
        how="left",
    )


# %% tags=[]
df_session_7 = unclassified_sessions_of_length(df_session_counts, 7)
df_esm_session_7 = attach_esm_records(df_session_7, df_esm_preprocessed)

# %% tags=[]
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None
):  # more options can be specified also
    display(df_esm_session_7[["esm_trigger", "esm_instructions", "esm_user_answer"]])

# %% [markdown]
# These are all morning questionnaires with "commute" selected or rarely "long break" in the morning.

# %%
df_session_27 = unclassified_sessions_of_length(df_session_counts, 27)
df_esm_session_27 = attach_esm_records(df_session_27, df_esm_preprocessed)

# %% tags=[]
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None
):  # more options can be specified also
    display(df_esm_session_27[["esm_trigger", "esm_instructions", "esm_user_answer"]])

# %% [markdown]
# These are all morning questionnaires with morning *and* workday items, with the feedback added and also branched in the longest possible way.

# %%
df_session_6 = unclassified_sessions_of_length(df_session_counts, 6)
df_esm_session_6 = attach_esm_records(df_session_6, df_esm_preprocessed)

# %%
display(df_esm_session_6[["esm_trigger", "esm_instructions", "esm_user_answer"]])

# %% [markdown]
# The 6-question sessions are long interruptions of work during daytime.
# %% [markdown]
# # Count and classify sessions

# %%
# Both classifiers come from the `features.esm` star import. Note that
# `df_session_counts` is rebound here and no longer refers to the frame built
# step by step in the cells above.
df_session_counts = classify_sessions_by_completion(df_esm_preprocessed)
df_session_time = classify_sessions_by_time(df_esm_preprocessed)

# %%
df_session_time

# %% [markdown]
# The sessions were classified by time by taking the **first** record in a session.
# However, a morning questionnaire could seamlessly transition into a daytime questionnaire, if the participant was already at work.
# In this case, the "time" label changed mid-session.
#
# Because of the way classify_sessions_by_time works, this questionnaire was classified as "morning".
# But for all intents and purposes, it can be treated as a "daytime" EMA.
#
# This is corrected in `classify_sessions_by_completion_time`