Study session ID in depth.

communication
junos 2021-06-02 18:35:00 +02:00
parent 53f34965e2
commit 2da9a8f9e3
3 changed files with 111 additions and 8 deletions

View File

@ -13,25 +13,126 @@
 # name: straw2analysis
 # ---
+import datetime
 # %%
 import os
 import sys
+import seaborn as sns
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
     sys.path.append(nb_dir)
 import participants.query_db
 from features.esm import *
-# %%
-df_esm_nokia = get_esm_data(["nokia_0000003"])
-print(df_esm_nokia)
+# %% [markdown]
+# # ESM data
+# %% [markdown]
+# Only take data from the main part of the study. The pilot data have different structure, there were especially many additions to ESM_JSON.
 # %%
-df_esm_nokia_json = pd.json_normalize(df_esm_nokia["esm_json"]).drop(columns=["esm_trigger"])
-df_esm_nokia_full = df_esm_nokia.join(df_esm_nokia_json)
+participants_inactive_usernames = participants.query_db.get_usernames(
+    collection_start=datetime.date.fromisoformat("2020-08-01")
+)
+df_esm_inactive = get_esm_data(participants_inactive_usernames)
 # %%
-df_esm_nokia_full.loc[df_esm_nokia_full["esm_user_answer"].str.contains("Remove"),"esm_user_answer"].value_counts()
+df_esm_preprocessed = preprocess_esm(df_esm_inactive)
+df_esm_preprocessed.head()
 # %%
df_esm_preprocessed.columns
# %% [markdown]
# # Concordance
# %% [markdown]
# The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day.
# %% [markdown]
# ## Session IDs
# %% [markdown]
# One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first.
# %%
session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"])[
"esm_session"
].count()
# %%
sns.displot(session_counts.to_numpy(), binwidth=1, height=8)
# %% [markdown]
# ### Unique session IDs
# %%
df_session_counts = pd.DataFrame(session_counts)
df_session_1 = df_session_counts[(df_session_counts["esm_session"] == 1)]
df_esm_unique_session = df_session_1.join(
df_esm_preprocessed.set_index(["participant_id", "esm_session"])
)
# %%
df_esm_unique_session["esm_user_answer"].value_counts()
# %% [markdown]
# The "DayFinished3421" tag marks the last EMA, where the participant only marked "I finished with work for today" and did not answer any questions.
# What do the answers "Ne" represent?
# %%
df_esm_unique_session.query("esm_user_answer == 'Ne'")[
["esm_trigger", "esm_instructions", "esm_user_answer"]
].head()
# %%
df_esm_unique_session.loc[
df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger"
].value_counts()
# %% [markdown]
# These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs.
# %% [markdown]
# The other answers signify expired or interrupted EMAs.
# %% [markdown]
# ### "Almost" unique session IDs
# %% [markdown]
# There are some session IDs that only appear twice or three times.
# %%
df_session_counts[
(df_session_counts["esm_session"] < 4) & (df_session_counts["esm_session"] > 1)
]
# %% [markdown]
# Some represent the morning EMAs that only contained three questions.
# %%
df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[
["esm_trigger", "esm_instructions", "esm_user_answer"]
]
# %%
df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[
["esm_trigger", "esm_instructions", "esm_user_answer"]
]
# %% [markdown]
# Others represent interrupted EMA sessions.
# %%
df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[
["esm_trigger", "esm_instructions", "esm_user_answer"]
]
# %% [markdown]
# ## Other possibilities
# %% [markdown]
# There are also answers that describe what happened to a pending question: "Removed%"

View File

@ -38,5 +38,7 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
     df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
         lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
     )
-    df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(columns=["esm_trigger"])  # The esm_trigger column is already present in the main df.
+    df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(
+        columns=["esm_trigger"]
+    )  # The esm_trigger column is already present in the main df.
     return df_esm.join(df_esm_json)

View File

@ -11,7 +11,7 @@ from features.esm import preprocess_esm
 class EsmFeatures(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
-        cls.esm = pd.read_csv("../data/example_esm.csv", sep=';')
+        cls.esm = pd.read_csv("../data/example_esm.csv", sep=";")
         cls.esm["esm_json"] = cls.esm["esm_json"].apply(eval)
     def test_preprocess_esm(self):