Study session ID in depth.
parent
53f34965e2
commit
2da9a8f9e3
|
@ -13,25 +13,126 @@
|
|||
# name: straw2analysis
|
||||
# ---
|
||||
|
||||
import datetime
|
||||
|
||||
# %%
|
||||
import os
|
||||
import sys
|
||||
|
||||
import seaborn as sns
|
||||
|
||||
nb_dir = os.path.split(os.getcwd())[0]
|
||||
if nb_dir not in sys.path:
|
||||
sys.path.append(nb_dir)
|
||||
import participants.query_db
|
||||
from features.esm import *
|
||||
|
||||
# %%
|
||||
df_esm_nokia = get_esm_data(["nokia_0000003"])
|
||||
print(df_esm_nokia)
|
||||
# %% [markdown]
|
||||
# # ESM data
|
||||
|
||||
# %% [markdown]
|
||||
# Only take data from the main part of the study. The pilot data have a different structure; in particular, there were many additions to ESM_JSON.
|
||||
|
||||
# %%
|
||||
df_esm_nokia_json = pd.json_normalize(df_esm_nokia["esm_json"]).drop(columns=["esm_trigger"])
|
||||
df_esm_nokia_full = df_esm_nokia.join(df_esm_nokia_json)
|
||||
participants_inactive_usernames = participants.query_db.get_usernames(
|
||||
collection_start=datetime.date.fromisoformat("2020-08-01")
|
||||
)
|
||||
df_esm_inactive = get_esm_data(participants_inactive_usernames)
|
||||
|
||||
# %%
|
||||
df_esm_nokia_full.loc[df_esm_nokia_full["esm_user_answer"].str.contains("Remove"),"esm_user_answer"].value_counts()
|
||||
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
|
||||
df_esm_preprocessed.head()
|
||||
|
||||
# %%
|
||||
df_esm_preprocessed.columns
|
||||
|
||||
# %% [markdown]
|
||||
# # Concordance
|
||||
|
||||
# %% [markdown]
|
||||
# The purpose of concordance is to count the number of EMA sessions that a participant answered in a day and possibly compare it to some maximum number of EMAs that could theoretically be presented for that day.
|
||||
|
||||
# %% [markdown]
|
||||
# ## Session IDs
|
||||
|
||||
# %% [markdown]
|
||||
# One approach would be to count distinct session IDs, which are incremented for each group of EMAs. However, since not every answered question counts as a fulfilled EMA, some unique session IDs should be eliminated first.
|
||||
|
||||
# %%
|
||||
session_counts = df_esm_preprocessed.groupby(["participant_id", "esm_session"])[
|
||||
"esm_session"
|
||||
].count()
|
||||
|
||||
# %%
|
||||
sns.displot(session_counts.to_numpy(), binwidth=1, height=8)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Unique session IDs
|
||||
|
||||
# %%
|
||||
df_session_counts = pd.DataFrame(session_counts)
|
||||
df_session_1 = df_session_counts[(df_session_counts["esm_session"] == 1)]
|
||||
df_esm_unique_session = df_session_1.join(
|
||||
df_esm_preprocessed.set_index(["participant_id", "esm_session"])
|
||||
)
|
||||
|
||||
# %%
|
||||
df_esm_unique_session["esm_user_answer"].value_counts()
|
||||
|
||||
# %% [markdown]
|
||||
# The "DayFinished3421" tag marks the last EMA, where the participant only marked "I finished with work for today" and did not answer any questions.
|
||||
# What do the answers "Ne" represent?
|
||||
|
||||
# %%
|
||||
df_esm_unique_session.query("esm_user_answer == 'Ne'")[
|
||||
["esm_trigger", "esm_instructions", "esm_user_answer"]
|
||||
].head()
|
||||
|
||||
# %%
|
||||
df_esm_unique_session.loc[
|
||||
df_esm_unique_session["esm_user_answer"].str.contains("Ne"), "esm_trigger"
|
||||
].value_counts()
|
||||
|
||||
# %% [markdown]
|
||||
# These are all "first" questions of EMAs which serve as a way to postpone the daytime or evening EMAs.
|
||||
|
||||
# %% [markdown]
|
||||
# The other answers signify expired or interrupted EMAs.
|
||||
|
||||
# %% [markdown]
|
||||
# ### "Almost" unique session IDs
|
||||
|
||||
# %% [markdown]
|
||||
# There are some session IDs that appear only two or three times.
|
||||
|
||||
# %%
|
||||
df_session_counts[
|
||||
(df_session_counts["esm_session"] < 4) & (df_session_counts["esm_session"] > 1)
|
||||
]
|
||||
|
||||
# %% [markdown]
|
||||
# Some represent the morning EMAs that only contained three questions.
|
||||
|
||||
# %%
|
||||
df_esm_preprocessed.query("participant_id == 89 & esm_session == 158")[
|
||||
["esm_trigger", "esm_instructions", "esm_user_answer"]
|
||||
]
|
||||
|
||||
# %%
|
||||
df_esm_preprocessed.query("participant_id == 89 & esm_session == 157")[
|
||||
["esm_trigger", "esm_instructions", "esm_user_answer"]
|
||||
]
|
||||
|
||||
# %% [markdown]
|
||||
# Others represent interrupted EMA sessions.
|
||||
|
||||
# %%
|
||||
df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[
|
||||
["esm_trigger", "esm_instructions", "esm_user_answer"]
|
||||
]
|
||||
|
||||
# %% [markdown]
|
||||
# ## Other possibilities
|
||||
|
||||
# %% [markdown]
|
||||
# There are also answers that describe what happened to a pending question: "Removed%"
|
||||
|
|
|
@ -38,5 +38,7 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
|||
df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
|
||||
lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
|
||||
)
|
||||
df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(columns=["esm_trigger"]) # The esm_trigger column is already present in the main df.
|
||||
df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(
|
||||
columns=["esm_trigger"]
|
||||
) # The esm_trigger column is already present in the main df.
|
||||
return df_esm.join(df_esm_json)
|
||||
|
|
|
@ -11,7 +11,7 @@ from features.esm import preprocess_esm
|
|||
class EsmFeatures(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls) -> None:
|
||||
cls.esm = pd.read_csv("../data/example_esm.csv", sep=';')
|
||||
cls.esm = pd.read_csv("../data/example_esm.csv", sep=";")
|
||||
cls.esm["esm_json"] = cls.esm["esm_json"].apply(eval)
|
||||
|
||||
def test_preprocess_esm(self):
|
||||
|
|
Loading…
Reference in New Issue