Finish labelling EMA sessions and document classify_sessions_adherence function.

communication
junos 2021-06-11 14:50:14 +02:00
parent 371e755159
commit f48e5469e0
5 changed files with 218 additions and 95 deletions

File 1 of 5

@@ -13,35 +13,50 @@
 # name: straw2analysis
 # ---

+import datetime
+
 # %%
 import os
 import sys
-import datetime
-import seaborn as sns
+
 import pandas as pd
+import seaborn as sns

 nb_dir = os.path.split(os.getcwd())[0]
-if nb_dir not in sys.path: sys.path.append(nb_dir)
+if nb_dir not in sys.path:
+    sys.path.append(nb_dir)

 import participants.query_db

 # %%
-baseline_si = pd.read_csv('E:/STRAWbaseline/results-survey637813.csv')
-baseline_be_1 = pd.read_csv('E:/STRAWbaseline/results-survey358134.csv')
-baseline_be_2 = pd.read_csv('E:/STRAWbaseline/results-survey413767.csv')
+baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
+baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
+baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")

 # %%
-participants_inactive_usernames = participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01"))
+participants_inactive_usernames = participants.query_db.get_usernames(
+    collection_start=datetime.date.fromisoformat("2020-08-01")
+)

 # %%
-baseline = pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner").reset_index().drop(columns="index")
-baseline_inactive = baseline[baseline["Gebruikersnaam"].isin(participants_inactive_usernames)]
+baseline = (
+    pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner")
+    .reset_index()
+    .drop(columns="index")
+)
+baseline_inactive = baseline[
+    baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
+]

 # %%
 baseline

 # %%
-participants_inactive_usernames = pd.Series(participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01")))
+participants_inactive_usernames = pd.Series(
+    participants.query_db.get_usernames(
+        collection_start=datetime.date.fromisoformat("2020-08-01")
+    )
+)

 # %% [markdown]
 # # Demographic information
@@ -54,7 +69,9 @@ print(baseline_inactive.shape[0])
 print(participants_inactive_usernames.shape[0])

 # %%
-participants_inactive_usernames[~participants_inactive_usernames.isin(baseline["Gebruikersnaam"])].sort_values()
+participants_inactive_usernames[
+    ~participants_inactive_usernames.isin(baseline["Gebruikersnaam"])
+].sort_values()

 # %%
 baseline_inactive["startlanguage"].value_counts()
@@ -63,9 +80,10 @@ baseline_inactive["startlanguage"].value_counts()
 baseline_inactive["Geslacht"].value_counts()

 # %%
-now = pd.Timestamp('now')
-baseline_inactive = baseline_inactive.assign(dob = lambda x: pd.to_datetime(x.Geboortedatum),
-                                             age = lambda x: now - x.dob)
+now = pd.Timestamp("now")
+baseline_inactive = baseline_inactive.assign(
+    dob=lambda x: pd.to_datetime(x.Geboortedatum), age=lambda x: now - x.dob
+)

 # %%
 baseline_inactive["age"].describe()

File 2 of 5

@@ -61,9 +61,9 @@ df_esm_preprocessed.columns
 # One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first.

 # %%
-session_counts = df_esm_preprocessed.groupby(["participant_id", "device_id", "esm_session"]).count()[
-    "id"
-]
+session_counts = df_esm_preprocessed.groupby(
+    ["participant_id", "device_id", "esm_session"]
+).count()["id"]

 # %% [markdown]
 # Group data by participant_id, device_id, and esm_session and count the number of instances (by id). Session counts are therefore counts of how many times a specific session ID appears *within* a specific participant.
@@ -142,11 +142,17 @@ df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[
 ]

 # %% tags=[]
-df_esm_2 = df_session_counts[
-    df_session_counts["esm_session_count"] == 2
-].reset_index().merge(df_esm_preprocessed, how="left", on=["participant_id", "device_id", "esm_session"])
-#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
-#display(df_esm_2)
+df_esm_2 = (
+    df_session_counts[df_session_counts["esm_session_count"] == 2]
+    .reset_index()
+    .merge(
+        df_esm_preprocessed,
+        how="left",
+        on=["participant_id", "device_id", "esm_session"],
+    )
+)
+# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
+#     display(df_esm_2)

 # %% [markdown] tags=[]
 # ### Long sessions
@@ -215,7 +221,9 @@ df_session_counts.count()
 # %%
 non_session = session_group_by.apply(
     lambda x: (
-        (x.esm_user_answer == "DayFinished3421") | (x.esm_user_answer == "DayOff3421") | (x.esm_user_answer == "DayFinishedSetEvening")
+        (x.esm_user_answer == "DayFinished3421")
+        | (x.esm_user_answer == "DayOff3421")
+        | (x.esm_user_answer == "DayFinishedSetEvening")
     ).any()
 )
 df_session_counts.loc[non_session, "session_response"] = "day_finished"
@@ -243,6 +251,36 @@ df_session_counts.loc[esm_removed, "session_response"].value_counts()
 # %% [markdown]
 # It turns out that these had been accounted for with ESM statuses.

+# %% [markdown]
+# ### Singleton sessions
+
+# %%
+df_session_counts.count()
+
+# %%
+df_session_counts[
+    (df_session_counts.esm_session_count == 1)
+    & df_session_counts.session_response.isna()
+]
+
+# %%
+df_session_1 = df_session_counts[
+    (df_session_counts["esm_session_count"] == 1)
+    & df_session_counts.session_response.isna()
+]
+df_esm_unique_session = df_session_1.join(
+    df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"])
+)
+df_esm_unique_session = df_esm_unique_session["esm_trigger"].rename("session_response")
+
+# %%
+df_session_counts.loc[
+    df_esm_unique_session.index, "session_response"
+] = df_esm_unique_session
+
+# %%
+df_session_counts.count()
+
 # %% [markdown]
 # ### Evening_last
@@ -270,38 +308,52 @@ sns.displot(
 )

 # %% [markdown]
-# ### Singleton sessions
-
-# %%
-df_session_counts.count()
-
-# %%
-df_session_counts[(df_session_counts.esm_session_count == 1) & df_session_counts.session_response.isna()]
-
-# %%
-df_session_1 = df_session_counts[(df_session_counts["esm_session_count"] == 1) & df_session_counts.session_response.isna()]
-df_esm_unique_session = df_session_1.join(
-    df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"])
-)
-df_esm_unique_session = df_esm_unique_session["esm_trigger"].rename("session_response")
-
-# %%
-df_session_counts.loc[df_esm_unique_session.index, "session_response"] = df_esm_unique_session
-
-# %%
-df_session_counts.count()
-
-# %%
-df_session_counts.merge()
-
-# %%
-df_esm_78243 = df_esm_preprocessed[df_esm_preprocessed["username"] == "uploader_78243"]
-df_esm_78243 = df_esm_78243.sort_values("_id")[["id","_id","datetime_lj", "esm_status","esm_trigger","esm_instructions","esm_user_answer","esm_session"]]
-
-# %%
-df_esm_78243.columns
-
-# %%
-df_esm_78243.to_csv("example.csv")
+# ### Repeated sessions
+
+# %% [markdown]
+# The session lengths that repeat often can probably be used as filled-in EMAs. Let's only review the session lengths that are rare.
+
+# %%
+df_session_counts.loc[
+    df_session_counts.session_response.isna(), "esm_session_count"
+].value_counts().sort_index()
+
+# %%
+df_session_7 = df_session_counts[
+    (df_session_counts["esm_session_count"] == 7)
+    & df_session_counts.session_response.isna()
+]
+df_esm_session_7 = df_session_7.join(
+    df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"]),
+    how="left",
+)
+
+# %% jupyter={"outputs_hidden": true} tags=[]
+with pd.option_context(
+    "display.max_rows", None, "display.max_columns", None
+):  # more options can be specified also
+    display(df_esm_session_7[["esm_trigger", "esm_instructions", "esm_user_answer"]])
+
+# %% [markdown]
+# These are all morning questionnaires with "commute" selected or, rarely, "long break" in the morning.
+
+# %%
+df_session_27 = df_session_counts[
+    (df_session_counts["esm_session_count"] == 27)
+    & df_session_counts.session_response.isna()
+]
+df_esm_session_27 = df_session_27.join(
+    df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"]),
+    how="left",
+)
+
+# %% jupyter={"outputs_hidden": true} tags=[]
+with pd.option_context(
+    "display.max_rows", None, "display.max_columns", None
+):  # more options can be specified also
+    display(df_esm_session_27[["esm_trigger", "esm_instructions", "esm_user_answer"]])
+
+# %% [markdown]
+# These are all morning questionnaires with morning *and* workday items, with the feedback added and also branched in the longest possible way.

 # %%
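
As a quick aside for readers of this diff, here is a toy illustration of the session-count logic this notebook builds on, with made-up data that is independent of the study database:

import pandas as pd

# One participant with a three-item session and a singleton session.
df_toy = pd.DataFrame(
    {
        "id": [10, 11, 12, 13],
        "participant_id": [1, 1, 1, 1],
        "device_id": ["d1", "d1", "d1", "d1"],
        "esm_session": [1, 1, 1, 2],
    }
)
session_counts = df_toy.groupby(
    ["participant_id", "device_id", "esm_session"]
).count()["id"]
print(session_counts)
# esm_session 1 appears 3 times (three items); esm_session 2 is a singleton.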

File 3 of 5

@@ -2,11 +2,12 @@ from collections.abc import Collection
 import pandas as pd

 def read_baseline(paths: Collection) -> pd.DataFrame:
-    #TODO Read CSV files and concat them.
+    # TODO Read CSV files and concat them.
     pass

 def preprocess_baseline(df_baseline_from_csv: pd.DataFrame) -> pd.DataFrame:
-    #TODO Translate columns, calculate age.
+    # TODO Translate columns, calculate age.
     pass
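
These two TODO stubs mirror what the notebooks in this commit do inline. One possible way to fill them in, sketched from that notebook code; this is an illustration rather than part of the commit, with VARIABLES_TO_TRANSLATE and the year-length constant copied from the adherence notebook below:

from collections.abc import Collection

import pandas as pd

VARIABLES_TO_TRANSLATE = {
    "Gebruikersnaam": "username",
    "Geslacht": "gender",
    "Geboortedatum": "date_of_birth",
}


def read_baseline(paths: Collection) -> pd.DataFrame:
    # Read the CSV files and concatenate them, keeping only shared columns.
    return pd.concat(
        [pd.read_csv(path) for path in paths], join="inner", ignore_index=True
    )


def preprocess_baseline(df_baseline_from_csv: pd.DataFrame) -> pd.DataFrame:
    # Translate column names and calculate age in years,
    # as done in the adherence notebook.
    df_baseline = df_baseline_from_csv.rename(columns=VARIABLES_TO_TRANSLATE)
    now = pd.Timestamp("now")
    return df_baseline.assign(
        date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
        age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
    )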

File 4 of 5

@@ -68,10 +68,21 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
 def classify_sessions_adherence(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
     """
     For each distinct EMA session, determine how the participant responded to it.
-    Possible outcomes are: esm_unanswered
+    Possible outcomes are SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE;
+    single-item sessions are labelled with their esm_trigger instead.

-    This is done in several steps.
+    This is done in four steps.

-    #TODO Finish the documentation.
+    First, the esm_status is considered.
+    If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
+
+    Second, the sessions which do not represent full questionnaires are identified.
+    These are sessions where participants only marked they are finished with the day or have not yet started working.
+
+    Third, the sessions with only one item are marked with their trigger.
+    We never offered questionnaires with single items, so we can be sure these are unfinished.
+
+    Finally, all sessions that remain are marked as completed.
+    By going through different possibilities in expl_esm.ipynb, this turned out to be a reasonable option.

     Parameters
     ----------
@@ -80,47 +91,55 @@ def classify_sessions_adherence(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
     Returns
     -------
-    some dataframe
+    df_session_counts: pd.DataFrame
+        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
     """
     sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)

+    # 0. First, assign all session statuses as NaN.
     df_session_counts = pd.DataFrame(sessions_grouped.count()["id"]).rename(
         columns={"id": "esm_session_count"}
     )
     df_session_counts["session_response"] = np.NaN

-    esm_not_answered = sessions_grouped.apply(lambda x: (x.esm_status != ESM_STATUS_ANSWERED).any())
-    df_session_counts.loc[esm_not_answered, "session_response"] = SESSION_STATUS_UNANSWERED
+    # 1. Identify all ESMs with a status other than answered.
+    esm_not_answered = sessions_grouped.apply(
+        lambda x: (x.esm_status != ESM_STATUS_ANSWERED).any()
+    )
+    df_session_counts.loc[
+        esm_not_answered, "session_response"
+    ] = SESSION_STATUS_UNANSWERED

+    # 2. Identify non-sessions, i.e. answers about the end of the day.
     non_session = sessions_grouped.apply(
         lambda x: (
             (x.esm_user_answer == "DayFinished3421")  # I finished working for today.
             | (x.esm_user_answer == "DayOff3421")  # I am not going to work today.
-            | (x.esm_user_answer == "DayFinishedSetEvening")  # When would you like to answer the evening EMA?
+            | (
+                x.esm_user_answer == "DayFinishedSetEvening"
+            )  # When would you like to answer the evening EMA?
         ).any()
     )
     df_session_counts.loc[non_session, "session_response"] = SESSION_STATUS_DAY_FINISHED

-    singleton_sessions = (df_session_counts.esm_session_count == 1) & (df_session_counts.session_response.isna())
+    # 3. Identify sessions appearing only once, as those were not true EMAs for sure.
+    singleton_sessions = (df_session_counts.esm_session_count == 1) & (
+        df_session_counts.session_response.isna()
+    )
     df_session_1 = df_session_counts[singleton_sessions]
     df_esm_unique_session = df_session_1.join(
         df_esm_preprocessed.set_index(GROUP_SESSIONS_BY), how="left"
     )
-    df_esm_unique_session = df_esm_unique_session.assign(session_response=lambda x: x.esm_trigger)["session_response"]
-    df_session_counts.loc[df_esm_unique_session.index, "session_response"] = df_esm_unique_session
+    df_esm_unique_session = df_esm_unique_session.assign(
+        session_response=lambda x: x.esm_trigger
+    )["session_response"]
+    df_session_counts.loc[
+        df_esm_unique_session.index, "session_response"
+    ] = df_esm_unique_session

-    finished_sessions = sessions_grouped.apply(
-        lambda x: (x.esm_trigger.str.endswith("_last")).any()
-    )
-    df_session_counts.loc[finished_sessions, "session_response"] = SESSION_STATUS_COMPLETE
-    # TODO Look at evening-evening_last sequence, if everything is caught with finished sessions
-    # TODO What can be done about morning EMA, perhaps morning-morning_first (sic!) sequence?
-    # TODO What can be done about workday EMA.
-    df_session_counts.loc[df_session_counts.session_response.isna(), "session_response"] = "esm_finished"
-    # TODO But for now, simply take all other ESMs as answered.
+    # 4. Mark the remaining sessions as completed.
+    df_session_counts.loc[
+        df_session_counts.session_response.isna(), "session_response"
+    ] = SESSION_STATUS_COMPLETE

     return df_session_counts
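
As a usage sketch, the function can be exercised on toy data. This assumes GROUP_SESSIONS_BY is ["participant_id", "device_id", "esm_session"], matching the grouping used in the notebooks, and that the module constants are importable as the notebooks do; the data and expected labels are illustrative only:

import pandas as pd

from features.esm import *  # star import, as in the exploration notebooks

df_toy = pd.DataFrame(
    {
        "id": range(6),
        "participant_id": [1] * 6,
        "device_id": ["d1"] * 6,
        "esm_session": [1, 1, 1, 2, 3, 4],
        # "not_answered" stands in for any status other than ESM_STATUS_ANSWERED.
        "esm_status": [ESM_STATUS_ANSWERED] * 4
        + ["not_answered", ESM_STATUS_ANSWERED],
        "esm_user_answer": ["Good", "7", "Yes", "DayFinished3421", "", "5"],
        "esm_trigger": ["morning"] * 3 + ["evening", "workday", "morning"],
    }
)

df_session_counts = classify_sessions_adherence(df_toy)
print(df_session_counts)
# Expected session_response values:
#   session 1 -> SESSION_STATUS_COMPLETE (three answered items)
#   session 2 -> SESSION_STATUS_DAY_FINISHED ("DayFinished3421" answer)
#   session 3 -> SESSION_STATUS_UNANSWERED (status other than answered)
#   session 4 -> "morning" (singleton session, labelled with its esm_trigger)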

File 5 of 5

@@ -13,12 +13,14 @@
 # name: straw2analysis
 # ---

+import datetime
+
 # %%
 import os
 import sys
-import datetime
-import seaborn as sns
+
 import pandas as pd
+import seaborn as sns
 import statsmodels.api as sm
 import statsmodels.formula.api as smf
@@ -29,16 +31,24 @@ import participants.query_db
 from features.esm import *

 # %%
-baseline_si = pd.read_csv('E:/STRAWbaseline/results-survey637813.csv')
-baseline_be_1 = pd.read_csv('E:/STRAWbaseline/results-survey358134.csv')
-baseline_be_2 = pd.read_csv('E:/STRAWbaseline/results-survey413767.csv')
-baseline = pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner").reset_index().drop(columns="index")
+baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
+baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
+baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")
+baseline = (
+    pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner")
+    .reset_index()
+    .drop(columns="index")
+)

 # %%
-participants_inactive_usernames = participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01"))
+participants_inactive_usernames = participants.query_db.get_usernames(
+    collection_start=datetime.date.fromisoformat("2020-08-01")
+)

 # %%
-baseline_inactive = baseline[baseline["Gebruikersnaam"].isin(participants_inactive_usernames)]
+baseline_inactive = baseline[
+    baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
+]

 # %%
 df_esm_inactive = get_esm_data(participants_inactive_usernames)
@@ -48,38 +58,57 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)
 df_session_counts = classify_sessions_adherence(df_esm_preprocessed)

 # %%
-tbl_session_outcomes = df_session_counts.reset_index()["session_response"].value_counts()
+tbl_session_outcomes = df_session_counts.reset_index()[
+    "session_response"
+].value_counts()

 # %%
 print("All sessions:", len(df_session_counts))
 print("-------------------------------------")
 print(tbl_session_outcomes)
 print("-------------------------------------")
-print(tbl_session_outcomes/len(df_session_counts))
+print(tbl_session_outcomes / len(df_session_counts))

 # %%
 VARIABLES_TO_TRANSLATE = {
     "Gebruikersnaam": "username",
     "Geslacht": "gender",
-    "Geboortedatum": "date_of_birth"
+    "Geboortedatum": "date_of_birth",
 }
 baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)
-now = pd.Timestamp('now')
-baseline_inactive = baseline_inactive.assign(date_of_birth = lambda x: pd.to_datetime(x.date_of_birth),
-                                             age = lambda x: (now - x.date_of_birth).dt.days/365.25245)
+now = pd.Timestamp("now")
+baseline_inactive = baseline_inactive.assign(
+    date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
+    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
+)

 # %%
 df_session_counts

 # %%
-df_session_finished = df_session_counts[df_session_counts["session_response"] == "esm_finished"].reset_index()
+df_session_finished = df_session_counts[
+    df_session_counts["session_response"] == "esm_finished"
+].reset_index()

 # %%
-df_participant_finished_sessions = df_session_finished.groupby("participant_id").count()["esm_session"].rename("finished_sessions")
+df_participant_finished_sessions = (
+    df_session_finished.groupby("participant_id")
+    .count()["esm_session"]
+    .rename("finished_sessions")
+)

 # %%
-df_adherence = baseline_inactive[["username", "gender", "age", "startlanguage"]].merge(df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(), how="left", on="username")
-df_adherence = df_adherence.merge(df_participant_finished_sessions, how="left", left_on="participant_id", right_index=True)
+df_adherence = baseline_inactive[["username", "gender", "age", "startlanguage"]].merge(
+    df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(),
+    how="left",
+    on="username",
+)
+df_adherence = df_adherence.merge(
+    df_participant_finished_sessions,
+    how="left",
+    left_on="participant_id",
+    right_index=True,
+)

 # %% tags=[]
 df_adherence
@@ -91,11 +120,15 @@ df_adherence.describe()
 sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)

 # %%
-lm_adherence = smf.ols('finished_sessions ~ C(gender) + C(startlanguage) + age', data=df_adherence).fit()
+lm_adherence = smf.ols(
+    "finished_sessions ~ C(gender) + C(startlanguage) + age", data=df_adherence
+).fit()
 table = sm.stats.anova_lm(lm_adherence, typ=2)  # Type 2 ANOVA DataFrame
 print(table)

 # %%
-lr_ols = smf.ols('finished_sessions ~ C(gender) + C(startlanguage) + age', data=df_adherence)
+lr_ols = smf.ols(
+    "finished_sessions ~ C(gender) + C(startlanguage) + age", data=df_adherence
+)
 ls_result = lr_ols.fit()
 ls_result.summary()
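
A possible caveat for the regression above: participants without any finished session carry NaN in finished_sessions after the left merge, and statsmodels' formula interface typically drops such rows. If the intent is to count them as zero adherence instead, a one-line adjustment would be (an assumption about the intended analysis, not part of this commit):

# Count participants with no finished sessions as 0 instead of dropping them.
df_adherence["finished_sessions"] = df_adherence["finished_sessions"].fillna(0)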