Finish labelling EMA sessions and document classify_sessions_adherence function.

communication
junos 2021-06-11 14:50:14 +02:00
parent 371e755159
commit f48e5469e0
5 changed files with 218 additions and 95 deletions

View File

@@ -13,35 +13,50 @@
# name: straw2analysis
# ---
import datetime
# %%
import os
import sys
import datetime
import seaborn as sns
import pandas as pd
import seaborn as sns
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path: sys.path.append(nb_dir)
if nb_dir not in sys.path:
sys.path.append(nb_dir)
import participants.query_db
# %%
baseline_si = pd.read_csv('E:/STRAWbaseline/results-survey637813.csv')
baseline_be_1 = pd.read_csv('E:/STRAWbaseline/results-survey358134.csv')
baseline_be_2 = pd.read_csv('E:/STRAWbaseline/results-survey413767.csv')
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")
# %%
participants_inactive_usernames = participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01"))
participants_inactive_usernames = participants.query_db.get_usernames(
collection_start=datetime.date.fromisoformat("2020-08-01")
)
# %%
baseline = pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner").reset_index().drop(columns="index")
baseline_inactive = baseline[baseline["Gebruikersnaam"].isin(participants_inactive_usernames)]
baseline = (
pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner")
.reset_index()
.drop(columns="index")
)
baseline_inactive = baseline[
baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
]
# %%
baseline
# %%
participants_inactive_usernames = pd.Series(participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01")))
participants_inactive_usernames = pd.Series(
participants.query_db.get_usernames(
collection_start=datetime.date.fromisoformat("2020-08-01")
)
)
# %% [markdown]
# # Demographic information
@@ -54,7 +69,9 @@ print(baseline_inactive.shape[0])
print(participants_inactive_usernames.shape[0])
# %%
participants_inactive_usernames[~participants_inactive_usernames.isin(baseline["Gebruikersnaam"])].sort_values()
participants_inactive_usernames[
~participants_inactive_usernames.isin(baseline["Gebruikersnaam"])
].sort_values()
# %%
baseline_inactive["startlanguage"].value_counts()
@@ -63,9 +80,10 @@ baseline_inactive["startlanguage"].value_counts()
baseline_inactive["Geslacht"].value_counts()
# %%
now = pd.Timestamp('now')
baseline_inactive = baseline_inactive.assign(dob = lambda x: pd.to_datetime(x.Geboortedatum),
age = lambda x: now - x.dob)
now = pd.Timestamp("now")
baseline_inactive = baseline_inactive.assign(
dob=lambda x: pd.to_datetime(x.Geboortedatum), age=lambda x: now - x.dob
)
# %%
baseline_inactive["age"].describe()

View File

@@ -61,9 +61,9 @@ df_esm_preprocessed.columns
# One approach would be to count distinct session IDs, which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first.
# %%
session_counts = df_esm_preprocessed.groupby(["participant_id", "device_id", "esm_session"]).count()[
"id"
]
session_counts = df_esm_preprocessed.groupby(
["participant_id", "device_id", "esm_session"]
).count()["id"]
# %% [markdown]
# Group data by participant_id, device_id, and esm_session and count the number of instances (by id). Session counts are therefore counts of how many times a specific session ID appears *within* a specific participant's data.
@@ -142,11 +142,17 @@ df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[
]
# %% tags=[]
df_esm_2 = df_session_counts[
df_session_counts["esm_session_count"] == 2
].reset_index().merge(df_esm_preprocessed, how="left", on=["participant_id", "device_id", "esm_session"])
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
#display(df_esm_2)
df_esm_2 = (
df_session_counts[df_session_counts["esm_session_count"] == 2]
.reset_index()
.merge(
df_esm_preprocessed,
how="left",
on=["participant_id", "device_id", "esm_session"],
)
)
# with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
# display(df_esm_2)
# %% [markdown] tags=[]
# ### Long sessions
@@ -215,7 +221,9 @@ df_session_counts.count()
# %%
non_session = session_group_by.apply(
lambda x: (
(x.esm_user_answer == "DayFinished3421") | (x.esm_user_answer == "DayOff3421") | (x.esm_user_answer == "DayFinishedSetEvening")
(x.esm_user_answer == "DayFinished3421")
| (x.esm_user_answer == "DayOff3421")
| (x.esm_user_answer == "DayFinishedSetEvening")
).any()
)
df_session_counts.loc[non_session, "session_response"] = "day_finished"
@@ -243,6 +251,36 @@ df_session_counts.loc[esm_removed, "session_response"].value_counts()
# %% [markdown]
# It turns out that these had been accounted for with ESM statuses.
# %% [markdown]
# ### Singleton sessions
# %%
df_session_counts.count()
# %%
df_session_counts[
(df_session_counts.esm_session_count == 1)
& df_session_counts.session_response.isna()
]
# %%
df_session_1 = df_session_counts[
(df_session_counts["esm_session_count"] == 1)
& df_session_counts.session_response.isna()
]
df_esm_unique_session = df_session_1.join(
df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"])
)
df_esm_unique_session = df_esm_unique_session["esm_trigger"].rename("session_response")
# %%
df_session_counts.loc[
df_esm_unique_session.index, "session_response"
] = df_esm_unique_session
# %%
df_session_counts.count()
# %% [markdown]
# ### Evening_last
@@ -270,38 +308,52 @@ sns.displot(
)
# %% [markdown]
# ### Singleton sessions
# ### Repeated sessions
# %% [markdown]
# The session lengths that repeat often can probably be taken as filled-in EMAs. Let's only review the session lengths that are rare.
# %%
df_session_counts.count()
df_session_counts.loc[
df_session_counts.session_response.isna(), "esm_session_count"
].value_counts().sort_index()
# %%
df_session_counts[(df_session_counts.esm_session_count == 1) & df_session_counts.session_response.isna()]
# %%
df_session_1 = df_session_counts[(df_session_counts["esm_session_count"] == 1) & df_session_counts.session_response.isna()]
df_esm_unique_session = df_session_1.join(
df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"])
df_session_7 = df_session_counts[
(df_session_counts["esm_session_count"] == 7)
& df_session_counts.session_response.isna()
]
df_esm_session_7 = df_session_7.join(
df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"]),
how="left",
)
df_esm_unique_session = df_esm_unique_session["esm_trigger"].rename("session_response")
# %% jupyter={"outputs_hidden": true} tags=[]
with pd.option_context(
"display.max_rows", None, "display.max_columns", None
): # more options can be specified also
display(df_esm_session_7[["esm_trigger", "esm_instructions", "esm_user_answer"]])
# %% [markdown]
# These are all morning questionnaires with "commute" selected or rarely "long break" in the morning.
# %%
df_session_counts.loc[df_esm_unique_session.index, "session_response"] = df_esm_unique_session
# %%
df_session_counts.count()
# %%
df_session_counts.merge()
# %%
df_esm_78243 = df_esm_preprocessed[df_esm_preprocessed["username"] == "uploader_78243"]
df_esm_78243 = df_esm_78243.sort_values("_id")[["id","_id","datetime_lj", "esm_status","esm_trigger","esm_instructions","esm_user_answer","esm_session"]]
# %%
df_esm_78243.columns
# %%
df_esm_78243.to_csv("example.csv")
df_session_27 = df_session_counts[
(df_session_counts["esm_session_count"] == 27)
& df_session_counts.session_response.isna()
]
df_esm_session_27 = df_session_27.join(
df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"]),
how="left",
)
# %% jupyter={"outputs_hidden": true} tags=[]
with pd.option_context(
"display.max_rows", None, "display.max_columns", None
): # more options can be specified also
display(df_esm_session_27[["esm_trigger", "esm_instructions", "esm_user_answer"]])
# %% [markdown]
# These are all morning questionnaires that include both morning *and* workday items, with the feedback added and branched in the longest possible way.
# %%

View File

@@ -2,11 +2,12 @@ from collections.abc import Collection
import pandas as pd
def read_baseline(paths: Collection) -> pd.DataFrame:
#TODO Read CSV files and concat them.
# TODO Read CSV files and concat them.
pass
def preprocess_baseline(df_baseline_from_csv: pd.DataFrame) -> pd.DataFrame:
#TODO Translate columns, calculate age.
# TODO Translate columns, calculate age.
pass

View File

@@ -68,10 +68,21 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
def classify_sessions_adherence(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
"""
For each distinct EMA session, determine how the participant responded to it.
Possible outcomes are: esm_unanswered
Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, SESSION_STATUS_COMPLETE, or the trigger of a singleton session.
This is done in several steps.
#TODO Finish the documentation.
This is done in three steps.
First, the esm_status is considered.
If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
Second, the sessions which do not represent full questionnaires are identified.
These are sessions where participants only marked they are finished with the day or have not yet started working.
Third, the sessions with only one item are marked with their trigger.
We never offered questionnaires with single items, so we can be sure these are unfinished.
Finally, sessions containing an ESM whose trigger ends in "_last", as well as all sessions that remain, are marked as completed.
Going through different possibilities in expl_esm.ipynb showed this to be a reasonable option.
Parameters
----------
@@ -80,47 +91,55 @@ def classify_sessions_adherence(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
Returns
-------
some dataframe
df_session_counts: pd.DataFrame
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
"""
sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
# 0. First, assign all session statuses as NaN.
df_session_counts = pd.DataFrame(sessions_grouped.count()["id"]).rename(
columns={"id": "esm_session_count"}
)
df_session_counts["session_response"] = np.NaN
esm_not_answered = sessions_grouped.apply(lambda x: (x.esm_status != ESM_STATUS_ANSWERED).any())
df_session_counts.loc[esm_not_answered, "session_response"] = SESSION_STATUS_UNANSWERED
# 1. Identify all ESMs with status other than answered.
esm_not_answered = sessions_grouped.apply(
lambda x: (x.esm_status != ESM_STATUS_ANSWERED).any()
)
df_session_counts.loc[
esm_not_answered, "session_response"
] = SESSION_STATUS_UNANSWERED
# 2. Identify non-sessions, i.e. answers about the end of the day.
non_session = sessions_grouped.apply(
lambda x: (
(x.esm_user_answer == "DayFinished3421") # I finished working for today.
| (x.esm_user_answer == "DayOff3421") # I am not going to work today.
| (x.esm_user_answer == "DayFinishedSetEvening") # When would you like to answer the evening EMA?
| (
x.esm_user_answer == "DayFinishedSetEvening"
) # When would you like to answer the evening EMA?
).any()
)
df_session_counts.loc[non_session, "session_response"] = SESSION_STATUS_DAY_FINISHED
singleton_sessions = (df_session_counts.esm_session_count == 1) & (df_session_counts.session_response.isna())
# 3. Identify sessions appearing only once, as those were not true EMAs for sure.
singleton_sessions = (df_session_counts.esm_session_count == 1) & (
df_session_counts.session_response.isna()
)
df_session_1 = df_session_counts[singleton_sessions]
df_esm_unique_session = df_session_1.join(
df_esm_preprocessed.set_index(GROUP_SESSIONS_BY), how="left"
)
df_esm_unique_session = df_esm_unique_session.assign(session_response=lambda x: x.esm_trigger)["session_response"]
df_session_counts.loc[df_esm_unique_session.index, "session_response"] = df_esm_unique_session
df_esm_unique_session = df_esm_unique_session.assign(
session_response=lambda x: x.esm_trigger
)["session_response"]
df_session_counts.loc[
df_esm_unique_session.index, "session_response"
] = df_esm_unique_session
finished_sessions = sessions_grouped.apply(
lambda x: (x.esm_trigger.str.endswith("_last")).any()
)
df_session_counts.loc[finished_sessions, "session_response"] = SESSION_STATUS_COMPLETE
# TODO Look at evening-evening_last sequence, if everything is caught with finished sessions
# TODO What can be done about morning EMA, perhaps morning-morning_first (sic!) sequence?
# TODO What can be done about the workday EMA?
df_session_counts.loc[df_session_counts.session_response.isna(), "session_response"] = "esm_finished"
# TODO But for now, simply take all other ESMs as answered.
# 4. Mark the remaining sessions as completed.
df_session_counts.loc[
df_session_counts.session_response.isna(), "session_response"
] = SESSION_STATUS_COMPLETE
return df_session_counts

View File

@@ -13,12 +13,14 @@
# name: straw2analysis
# ---
import datetime
# %%
import os
import sys
import datetime
import seaborn as sns
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
@@ -29,16 +31,24 @@ import participants.query_db
from features.esm import *
# %%
baseline_si = pd.read_csv('E:/STRAWbaseline/results-survey637813.csv')
baseline_be_1 = pd.read_csv('E:/STRAWbaseline/results-survey358134.csv')
baseline_be_2 = pd.read_csv('E:/STRAWbaseline/results-survey413767.csv')
baseline = pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner").reset_index().drop(columns="index")
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")
baseline = (
pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner")
.reset_index()
.drop(columns="index")
)
# %%
participants_inactive_usernames = participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01"))
participants_inactive_usernames = participants.query_db.get_usernames(
collection_start=datetime.date.fromisoformat("2020-08-01")
)
# %%
baseline_inactive = baseline[baseline["Gebruikersnaam"].isin(participants_inactive_usernames)]
baseline_inactive = baseline[
baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
]
# %%
df_esm_inactive = get_esm_data(participants_inactive_usernames)
@@ -48,38 +58,57 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)
df_session_counts = classify_sessions_adherence(df_esm_preprocessed)
# %%
tbl_session_outcomes = df_session_counts.reset_index()["session_response"].value_counts()
tbl_session_outcomes = df_session_counts.reset_index()[
"session_response"
].value_counts()
# %%
print("All sessions:", len(df_session_counts))
print("-------------------------------------")
print(tbl_session_outcomes)
print("-------------------------------------")
print(tbl_session_outcomes/len(df_session_counts))
print(tbl_session_outcomes / len(df_session_counts))
# %%
VARIABLES_TO_TRANSLATE = {
"Gebruikersnaam": "username",
"Geslacht": "gender",
"Geboortedatum": "date_of_birth"
"Geboortedatum": "date_of_birth",
}
baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)
now = pd.Timestamp('now')
baseline_inactive = baseline_inactive.assign(date_of_birth = lambda x: pd.to_datetime(x.date_of_birth),
age = lambda x: (now - x.date_of_birth).dt.days/365.25245)
now = pd.Timestamp("now")
baseline_inactive = baseline_inactive.assign(
date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
)
# %%
df_session_counts
# %%
df_session_finished = df_session_counts[df_session_counts["session_response"] == "esm_finished"].reset_index()
df_session_finished = df_session_counts[
df_session_counts["session_response"] == "esm_finished"
].reset_index()
# %%
df_participant_finished_sessions = df_session_finished.groupby("participant_id").count()["esm_session"].rename("finished_sessions")
df_participant_finished_sessions = (
df_session_finished.groupby("participant_id")
.count()["esm_session"]
.rename("finished_sessions")
)
# %%
df_adherence = baseline_inactive[["username", "gender", "age", "startlanguage"]].merge(df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(), how="left", on="username")
df_adherence = df_adherence.merge(df_participant_finished_sessions, how="left", left_on="participant_id", right_index=True)
df_adherence = baseline_inactive[["username", "gender", "age", "startlanguage"]].merge(
df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(),
how="left",
on="username",
)
df_adherence = df_adherence.merge(
df_participant_finished_sessions,
how="left",
left_on="participant_id",
right_index=True,
)
# %% tags=[]
df_adherence
@@ -91,11 +120,15 @@ df_adherence.describe()
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)
# %%
lm_adherence = smf.ols('finished_sessions ~ C(gender) + C(startlanguage) + age', data=df_adherence).fit()
table = sm.stats.anova_lm(lm_adherence, typ=2) # Type 2 ANOVA DataFrame
lm_adherence = smf.ols(
"finished_sessions ~ C(gender) + C(startlanguage) + age", data=df_adherence
).fit()
table = sm.stats.anova_lm(lm_adherence, typ=2) # Type 2 ANOVA DataFrame
print(table)
# %%
lr_ols = smf.ols('finished_sessions ~ C(gender) + C(startlanguage) + age', data=df_adherence)
lr_ols = smf.ols(
"finished_sessions ~ C(gender) + C(startlanguage) + age", data=df_adherence
)
ls_result = lr_ols.fit()
ls_result.summary()