From 2da9a8f9e35258684919723ce23e77fc5c0dde6f Mon Sep 17 00:00:00 2001
From: junos <junos.lukan@ijs.si>
Date: Wed, 2 Jun 2021 18:35:00 +0200
Subject: [PATCH] Study session ID in depth.

---
 exploration/expl_esm.py | 113 +++++++++++++++++++++++++++++++++++++---
 features/esm.py         |   4 +-
 test/test_esm.py        |   2 +-
 3 files changed, 111 insertions(+), 8 deletions(-)

diff --git a/exploration/expl_esm.py b/exploration/expl_esm.py
index c41780a..ae7fd32 100644
--- a/exploration/expl_esm.py
+++ b/exploration/expl_esm.py
@@ -13,25 +13,126 @@
 #     name: straw2analysis
 # ---
 
+import datetime
+
 # %%
 import os
 import sys
 
+import seaborn as sns
+
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
     sys.path.append(nb_dir)
 import participants.query_db
 from features.esm import *
 
-# %%
-df_esm_nokia = get_esm_data(["nokia_0000003"])
-print(df_esm_nokia)
+# %% [markdown]
+# # ESM data
+
+# %% [markdown]
+# Only take data from the main part of the study. The pilot data have a different structure; in particular, there were many additions to ESM_JSON.
 
 # %%
-df_esm_nokia_json = pd.json_normalize(df_esm_nokia["esm_json"]).drop(columns=["esm_trigger"])
-df_esm_nokia_full = df_esm_nokia.join(df_esm_nokia_json)
+participants_inactive_usernames = participants.query_db.get_usernames(
+    collection_start=datetime.date.fromisoformat("2020-08-01")
+)
+df_esm_inactive = get_esm_data(participants_inactive_usernames)
 
 # %%
-df_esm_nokia_full.loc[df_esm_nokia_full["esm_user_answer"].str.contains("Remove"),"esm_user_answer"].value_counts()
+df_esm_preprocessed = preprocess_esm(df_esm_inactive)
+df_esm_preprocessed.head()
 
 # %%
+df_esm_preprocessed.columns
+
+# %% [markdown]
+# # Concordance
+
+# %% [markdown]
+# The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day.
+
+# %% [markdown]
+# ## Session IDs
+
+# %% [markdown]
+# One approach would be to count distinct session IDs, which are incremented for each group of EMAs. However, since not every answered question counts as a fulfilled EMA, some unique session IDs should be eliminated first.
+
+# %%
+session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"])[
+    "esm_session"
+].count()
+
+# %%
+sns.displot(session_counts.to_numpy(), binwidth=1, height=8)
+
+# %% [markdown]
+# ### Unique session IDs
+
+# %%
+df_session_counts = pd.DataFrame(session_counts)
+df_session_1 = df_session_counts[(df_session_counts["esm_session"] == 1)]
+df_esm_unique_session = df_session_1.join(
+    df_esm_preprocessed.set_index(["participant_id", "esm_session"])
+)
+
+# %%
+df_esm_unique_session["esm_user_answer"].value_counts()
+
+# %% [markdown]
+# The "DayFinished3421" tag marks the last EMA, where the participant only marked "I finished with work for today" and did not answer any questions.
+# What do the answers "Ne" represent?
+
+# %%
+df_esm_unique_session.query("esm_user_answer == 'Ne'")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+].head()
+
+# %%
+df_esm_unique_session.loc[
+    df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger"
+].value_counts()
+
+# %% [markdown]
+# These are all "first" questions of EMAs, which serve as a way to postpone the daytime or evening EMAs.
+
+# %% [markdown]
+# The other answers signify expired or interrupted EMAs.
+
+# %% [markdown]
+# ### "Almost" unique session IDs
+
+# %% [markdown]
+# There are some session IDs that appear only two or three times.
+
+# %%
+df_session_counts[
+    (df_session_counts["esm_session"] < 4) & (df_session_counts["esm_session"] > 1)
+]
+
+# %% [markdown]
+# Some represent the morning EMAs that only contained three questions.
+
+# %%
+df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+]
+
+# %%
+df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+]
+
+# %% [markdown]
+# Others represent interrupted EMA sessions.
+
+# %%
+df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[
+    ["esm_trigger", "esm_instructions", "esm_user_answer"]
+]
+
+# %% [markdown]
+# ## Other possibilities
+
+# %% [markdown]
+# There are also answers that describe what happened to a pending question: "Removed%"
diff --git a/features/esm.py b/features/esm.py
index b5dfbb1..66dc82c 100644
--- a/features/esm.py
+++ b/features/esm.py
@@ -38,5 +38,7 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
     df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
         lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
     )
-    df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(columns=["esm_trigger"]) # The esm_trigger column is already present in the main df.
+    df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(
+        columns=["esm_trigger"]
+    )  # The esm_trigger column is already present in the main df.
     return df_esm.join(df_esm_json)
diff --git a/test/test_esm.py b/test/test_esm.py
index 4cd65e3..c531093 100644
--- a/test/test_esm.py
+++ b/test/test_esm.py
@@ -11,7 +11,7 @@ from features.esm import preprocess_esm
 class EsmFeatures(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
-        cls.esm = pd.read_csv("../data/example_esm.csv", sep=';')
+        cls.esm = pd.read_csv("../data/example_esm.csv", sep=";")
         cls.esm["esm_json"] = cls.esm["esm_json"].apply(eval)
 
     def test_preprocess_esm(self):