From 2da9a8f9e35258684919723ce23e77fc5c0dde6f Mon Sep 17 00:00:00 2001 From: junos Date: Wed, 2 Jun 2021 18:35:00 +0200 Subject: [PATCH] Study session ID in depth. --- exploration/expl_esm.py | 113 +++++++++++++++++++++++++++++++++++++--- features/esm.py | 4 +- test/test_esm.py | 2 +- 3 files changed, 111 insertions(+), 8 deletions(-) diff --git a/exploration/expl_esm.py b/exploration/expl_esm.py index c41780a..ae7fd32 100644 --- a/exploration/expl_esm.py +++ b/exploration/expl_esm.py @@ -13,25 +13,126 @@ # name: straw2analysis # --- +import datetime + # %% import os import sys +import seaborn as sns + nb_dir = os.path.split(os.getcwd())[0] if nb_dir not in sys.path: sys.path.append(nb_dir) import participants.query_db from features.esm import * -# %% -df_esm_nokia = get_esm_data(["nokia_0000003"]) -print(df_esm_nokia) +# %% [markdown] +# # ESM data + +# %% [markdown] +# Only take data from the main part of the study. The pilot data have a different structure; in particular, there were many additions to ESM_JSON. # %% -df_esm_nokia_json = pd.json_normalize(df_esm_nokia["esm_json"]).drop(columns=["esm_trigger"]) -df_esm_nokia_full = df_esm_nokia.join(df_esm_nokia_json) +participants_inactive_usernames = participants.query_db.get_usernames( + collection_start=datetime.date.fromisoformat("2020-08-01") +) +df_esm_inactive = get_esm_data(participants_inactive_usernames) # %% -df_esm_nokia_full.loc[df_esm_nokia_full["esm_user_answer"].str.contains("Remove"),"esm_user_answer"].value_counts() +df_esm_preprocessed = preprocess_esm(df_esm_inactive) +df_esm_preprocessed.head() # %% +df_esm_preprocessed.columns + +# %% [markdown] +# # Concordance + +# %% [markdown] +# The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day. 
+ +# %% [markdown] +# ## Session IDs + +# %% [markdown] +# One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first. + +# %% +session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"])[ + "esm_session" +].count() + +# %% +sns.displot(session_counts.to_numpy(), binwidth=1, height=8) + +# %% [markdown] +# ### Unique session IDs + +# %% +df_session_counts = pd.DataFrame(session_counts) +df_session_1 = df_session_counts[(df_session_counts["esm_session"] == 1)] +df_esm_unique_session = df_session_1.join( + df_esm_preprocessed.set_index(["participant_id", "esm_session"]) +) + +# %% +df_esm_unique_session["esm_user_answer"].value_counts() + +# %% [markdown] +# The "DayFinished3421" tag marks the last EMA, where the participant only marked "I finished with work for today" and did not answer any questions. +# What do the answers "Ne" represent? + +# %% +df_esm_unique_session.query("esm_user_answer == 'Ne'")[ + ["esm_trigger", "esm_instructions", "esm_user_answer"] +].head() + +# %% +df_esm_unique_session.loc[ + df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger" +].value_counts() + +# %% [markdown] +# These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs. + +# %% [markdown] +# The other answers signify expired or interrupted EMAs. + +# %% [markdown] +# ### "Almost" unique session IDs + +# %% [markdown] +# There are some session IDs that only appear twice or three times. + +# %% +df_session_counts[ + (df_session_counts["esm_session"] < 4) & (df_session_counts["esm_session"] > 1) +] + +# %% [markdown] +# Some represent the morning EMAs that only contained three questions. 
+ +# %% +df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[ + ["esm_trigger", "esm_instructions", "esm_user_answer"] +] + +# %% +df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[ + ["esm_trigger", "esm_instructions", "esm_user_answer"] +] + +# %% [markdown] +# Others represent interrupted EMA sessions. + +# %% +df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[ + ["esm_trigger", "esm_instructions", "esm_user_answer"] +] + +# %% [markdown] +# ## Other possibilities + +# %% [markdown] +# There are also answers that describe what happened to a pending question: "Removed%" diff --git a/features/esm.py b/features/esm.py index b5dfbb1..66dc82c 100644 --- a/features/esm.py +++ b/features/esm.py @@ -38,5 +38,7 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame: df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply( lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ) ) - df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(columns=["esm_trigger"]) # The esm_trigger column is already present in the main df. + df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop( + columns=["esm_trigger"] + ) # The esm_trigger column is already present in the main df. return df_esm.join(df_esm_json) diff --git a/test/test_esm.py b/test/test_esm.py index 4cd65e3..c531093 100644 --- a/test/test_esm.py +++ b/test/test_esm.py @@ -11,7 +11,7 @@ from features.esm import preprocess_esm class EsmFeatures(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - cls.esm = pd.read_csv("../data/example_esm.csv", sep=';') + cls.esm = pd.read_csv("../data/example_esm.csv", sep=";") cls.esm["esm_json"] = cls.esm["esm_json"].apply(eval) def test_preprocess_esm(self):