# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
import datetime
import os
import sys

# pandas is imported explicitly because later cells use `pd` directly; relying
# on the star import below to provide it is fragile.
import pandas as pd
import seaborn as sns

# Put the parent of the current working directory on the import path.
# NOTE(review): presumably this is where the `participants` and `features`
# packages live when the notebook is started from its own folder - confirm.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# These imports must stay below the sys.path amendment above.
import participants.query_db
from features.esm import *

# %% [markdown]
# # ESM data

# %% [markdown]
# Only take data from the main part of the study. The pilot data have a different structure; in particular, there were many additions to ESM_JSON.

# %%
# Select usernames using a collection start date of 2020-08-01.
# NOTE(review): presumably this date marks the start of the main study and the
# helper returns participants who started on or after it - confirm against
# participants.query_db.get_usernames.
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
df_esm_inactive = get_esm_data(participants_inactive_usernames)

# %%
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
df_esm_preprocessed.head()

# %%
df_esm_preprocessed.columns

# %% [markdown]
# # Concordance

# %% [markdown]
# The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day.

# %% [markdown]
# ## Session IDs

# %% [markdown]
# One approach would be to count distinct session IDs, which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first.
# %%
# Count the non-null `esm_session` entries in every (participant, session)
# group, i.e. how many rows each session ID contributes.
session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"])[
    "esm_session"
].count()

# %%
# Distribution of the per-session row counts.
sns.displot(session_counts.to_numpy(), binwidth=1, height=8)

# %% [markdown]
# ### Unique session IDs

# %%
# Keep the sessions that occur exactly once and attach their original rows.
# The counts frame keeps the (participant_id, esm_session) index, so the join
# is done on that index; its "esm_session" column holds the count.
df_session_counts = pd.DataFrame(session_counts)
df_session_1 = df_session_counts[df_session_counts["esm_session"] == 1]
df_esm_unique_session = df_session_1.join(
    df_esm_preprocessed.set_index(["participant_id", "esm_session"])
)

# %%
df_esm_unique_session["esm_user_answer"].value_counts()

# %% [markdown]
# The "DayFinished3421" tag marks the last EMA, where the participant only marked "I finished with work for today" and did not answer any questions.
# What do the answers "Ne" represent?

# %%
df_esm_unique_session.query("esm_user_answer == 'Ne'")[
    ["esm_trigger", "esm_instructions", "esm_user_answer"]
].head()

# %%
# `na=False` treats missing answers as non-matching so the mask stays strictly
# boolean; without it a single missing answer makes `.loc` raise.
# Note that this is a substring match, so it is broader than the equality
# test used in the previous cell.
df_esm_unique_session.loc[
    df_esm_unique_session["esm_user_answer"].str.contains("Ne", na=False),
    "esm_trigger",
].value_counts()

# %% [markdown]
# These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs.

# %% [markdown]
# The other answers signify expired or interrupted EMAs.

# %% [markdown]
# ### "Almost" unique session IDs

# %% [markdown]
# There are some session IDs that appear only two or three times.

# %%
# Counts are integers, so "more than 1 and fewer than 4" is exactly 2 or 3;
# `between` is inclusive on both ends by default.
df_session_counts[df_session_counts["esm_session"].between(2, 3)]

# %% [markdown]
# Some represent the morning EMAs that only contained three questions.

# %%
df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[
    ["esm_trigger", "esm_instructions", "esm_user_answer"]
]

# %%
df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[
    ["esm_trigger", "esm_instructions", "esm_user_answer"]
]

# %% [markdown]
# Others represent interrupted EMA sessions.
# %%
# Show the trigger, instructions and answer for every row of session 77 of
# participant 31.
session_rows = df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")
session_rows[["esm_trigger", "esm_instructions", "esm_user_answer"]]

# %% [markdown]
# ## Other possibilities

# %% [markdown]
# There are also answers that describe what happened to a pending question: "Removed%"