Study session ID in depth.

2021-06-02 18:35:00 +02:00 · 2021-06-02 18:35:00 +02:00 · 2da9a8f9e3
parent 53f34965e2
commit 2da9a8f9e3
3 changed files with 111 additions and 8 deletions
--- a/exploration/expl_esm.py
+++ b/exploration/expl_esm.py
@@ -13,25 +13,126 @@
 #     name: straw2analysis
 # ---
 import datetime
 # %%
 import os
 import sys
 import seaborn as sns
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import participants.query_db
 from features.esm import *
-# %%
+# %% [markdown]
-df_esm_nokia = get_esm_data(["nokia_0000003"])
+# # ESM data
-print(df_esm_nokia)
+
 # %% [markdown]
 # Only take data from the main part of the study. The pilot data have a different structure; in particular, there were many additions to ESM_JSON.
 # %%
-df_esm_nokia_json = pd.json_normalize(df_esm_nokia["esm_json"]).drop(columns=["esm_trigger"])
+participants_inactive_usernames = participants.query_db.get_usernames(
-df_esm_nokia_full = df_esm_nokia.join(df_esm_nokia_json)
+    collection_start=datetime.date.fromisoformat("2020-08-01")
 )
 df_esm_inactive = get_esm_data(participants_inactive_usernames)
 # %%
-df_esm_nokia_full.loc[df_esm_nokia_full["esm_user_answer"].str.contains("Remove"),"esm_user_answer"].value_counts()
+df_esm_preprocessed = preprocess_esm(df_esm_inactive)
 df_esm_preprocessed.head()
 # %%
 df_esm_preprocessed.columns
 # %% [markdown]
 # # Concordance
 # %% [markdown]
 # The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day.
 # %% [markdown]
 # ## Session IDs
 # %% [markdown]
 # One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first.
 # %%
 session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"])[
    "esm_session"
 ].count()
 # %%
 sns.displot(session_counts.to_numpy(), binwidth=1, height=8)
 # %% [markdown]
 # ### Unique session IDs
 # %%
 df_session_counts = pd.DataFrame(session_counts)
 df_session_1 = df_session_counts[(df_session_counts["esm_session"] == 1)]
 df_esm_unique_session = df_session_1.join(
    df_esm_preprocessed.set_index(["participant_id", "esm_session"])
 )
 # %%
 df_esm_unique_session["esm_user_answer"].value_counts()
 # %% [markdown]
 # The "DayFinished3421" tag marks the last EMA, where the participant only marked "I finished with work for today" and did not answer any questions.
 # What do the answers "Ne" represent?
 # %%
 df_esm_unique_session.query("esm_user_answer == 'Ne'")[
    ["esm_trigger", "esm_instructions", "esm_user_answer"]
 ].head()
 # %%
 df_esm_unique_session.loc[
    df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger"
 ].value_counts()
 # %% [markdown]
 # These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs.
 # %% [markdown]
 # The other answers signify expired or interrupted EMAs.
 # %% [markdown]
 # ### "Almost" unique session IDs
 # %% [markdown]
 # There are some session IDs that only appear twice or three times.
 # %%
 df_session_counts[
    (df_session_counts["esm_session"] < 4) & (df_session_counts["esm_session"] > 1)
 ]
 # %% [markdown]
 # Some represent the morning EMAs that only contained three questions.
 # %%
 df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[
    ["esm_trigger", "esm_instructions", "esm_user_answer"]
 ]
 # %%
 df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[
    ["esm_trigger", "esm_instructions", "esm_user_answer"]
 ]
 # %% [markdown]
 # Others represent interrupted EMA sessions.
 # %%
 df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[
    ["esm_trigger", "esm_instructions", "esm_user_answer"]
 ]
 # %% [markdown]
 # ## Other possibilities
 # %% [markdown]
 # There are also answers that describe what happened to a pending question: "Removed%"
--- a/features/esm.py
+++ b/features/esm.py
@@ -38,5 +38,7 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
        lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
    )
-    df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(columns=["esm_trigger"]) # The esm_trigger column is already present in the main df.
+    df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(
        columns=["esm_trigger"]
    )  # The esm_trigger column is already present in the main df.
    return df_esm.join(df_esm_json)
--- a/test/test_esm.py
+++ b/test/test_esm.py
@@ -11,7 +11,7 @@ from features.esm import preprocess_esm
 class EsmFeatures(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
-        cls.esm = pd.read_csv("../data/example_esm.csv", sep=';')
+        cls.esm = pd.read_csv("../data/example_esm.csv", sep=";")
        cls.esm["esm_json"] = cls.esm["esm_json"].apply(eval)
    def test_preprocess_esm(self):