Study session ID in depth.

2021-06-02 18:35:00 +02:00 · 2021-06-02 18:35:00 +02:00 · 2da9a8f9e3
parent 53f34965e2
commit 2da9a8f9e3
3 changed files with 111 additions and 8 deletions
--- a/exploration/expl_esm.py
+++ b/exploration/expl_esm.py
@ -13,25 +13,126 @@
 #     name: straw2analysis
 # ---

+import datetime
+
 # %%
 import os
 import sys

+import seaborn as sns
+
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import participants.query_db
 from features.esm import *

-# %%
-df_esm_nokia = get_esm_data(["nokia_0000003"])
-print(df_esm_nokia)
+# %% [markdown]
+# # ESM data
+
+# %% [markdown]
+# Only take data from the main part of the study. The pilot data have a different structure; in particular, there were many additions to ESM_JSON.

 # %%
-df_esm_nokia_json = pd.json_normalize(df_esm_nokia["esm_json"]).drop(columns=["esm_trigger"])
-df_esm_nokia_full = df_esm_nokia.join(df_esm_nokia_json)
+participants_inactive_usernames = participants.query_db.get_usernames(
+    collection_start=datetime.date.fromisoformat("2020-08-01")
+)
+df_esm_inactive = get_esm_data(participants_inactive_usernames)

 # %%
-df_esm_nokia_full.loc[df_esm_nokia_full["esm_user_answer"].str.contains("Remove"),"esm_user_answer"].value_counts()
+df_esm_preprocessed = preprocess_esm(df_esm_inactive)
+df_esm_preprocessed.head()

 # %%
+df_esm_preprocessed.columns
+
+# %% [markdown]
+# # Concordance
+
+# %% [markdown]
+# The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day.
+
+# %% [markdown]
+# ## Session IDs
+
+# %% [markdown]
+# One approach would be to count distinct session IDs, which are incremented for each group of EMAs. However, since not every answered question counts as a fulfilled EMA, some unique session IDs should be eliminated first.
+
+# %%
+session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"])[
+    "esm_session"
+].count()
+
+# %%
+sns.displot(session_counts.to_numpy(), binwidth=1, height=8)
+
+# %% [markdown]
+# ### Unique session IDs
+
+# %%
+df_session_counts = pd.DataFrame(session_counts)
+df_session_1 = df_session_counts[(df_session_counts["esm_session"] == 1)]
+df_esm_unique_session = df_session_1.join(
+    df_esm_preprocessed.set_index(["participant_id", "esm_session"])
+)
+
+# %%
+df_esm_unique_session["esm_user_answer"].value_counts()
+
+# %% [markdown]
+# The "DayFinished3421" tag marks the last EMA, where the participant only marked "I finished with work for today" and did not answer any questions.
+# What do the answers "Ne" represent?
+
+# %%
+df_esm_unique_session.query("esm_user_answer == 'Ne'")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+].head()
+
+# %%
+df_esm_unique_session.loc[
+    df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger"
+].value_counts()
+
+# %% [markdown]
+# These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs.
+
+# %% [markdown]
+# The other answers signify expired or interrupted EMAs.
+
+# %% [markdown]
+# ### "Almost" unique session IDs
+
+# %% [markdown]
+# There are some session IDs that only appear twice or three times.
+
+# %%
+df_session_counts[
+    (df_session_counts["esm_session"] < 4) & (df_session_counts["esm_session"] > 1)
+]
+
+# %% [markdown]
+# Some represent the morning EMAs that only contained three questions.
+
+# %%
+df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+]
+
+# %%
+df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+]
+
+# %% [markdown]
+# Others represent interrupted EMA sessions.
+
+# %%
+df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+]
+
+# %% [markdown]
+# ## Other possibilities
+
+# %% [markdown]
+# There are also answers that describe what happened to a pending question: "Removed%"
--- a/features/esm.py
+++ b/features/esm.py
@ -38,5 +38,7 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
        lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
    )
-    df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(columns=["esm_trigger"]) # The esm_trigger column is already present in the main df.
+    df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(
+        columns=["esm_trigger"]
+    )  # The esm_trigger column is already present in the main df.
    return df_esm.join(df_esm_json)
--- a/test/test_esm.py
+++ b/test/test_esm.py
@ -11,7 +11,7 @@ from features.esm import preprocess_esm
 class EsmFeatures(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
-        cls.esm = pd.read_csv("../data/example_esm.csv", sep=';')
+        cls.esm = pd.read_csv("../data/example_esm.csv", sep=";")
        cls.esm["esm_json"] = cls.esm["esm_json"].apply(eval)

    def test_preprocess_esm(self):