439 lines
12 KiB
Python
439 lines
12 KiB
Python
# ---
|
|
# jupyter:
|
|
# jupytext:
|
|
# formats: ipynb,py:percent
|
|
# text_representation:
|
|
# extension: .py
|
|
# format_name: percent
|
|
# format_version: '1.3'
|
|
# jupytext_version: 1.12.0
|
|
# kernelspec:
|
|
# display_name: straw2analysis
|
|
# language: python
|
|
# name: straw2analysis
|
|
# ---
|
|
|
|
# %%
|
|
# %matplotlib inline
|
|
import datetime
|
|
import os
|
|
import sys
|
|
|
|
import matplotlib.pyplot as plt
|
|
import pandas as pd
|
|
import seaborn as sns
|
|
import statsmodels.api as sm
|
|
import statsmodels.formula.api as smf
|
|
|
|
# Put the parent of the current working directory on the import path so the
# project-local imports that follow can be resolved.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
|
|
import participants.query_db
|
|
from features.esm import *
|
|
|
|
# %%
|
|
# Whether the figures built below are also written to PDF files.
SAVE_FIGS = True

# Geometry and colour shared by the seaborn figures in this notebook.
FIG_HEIGHT = 5
FIG_ASPECT = 1.7
FIG_COLOUR = "#28827C"

# Three-step font scale; each step is 2 pt larger than the previous one.
SMALL_SIZE = 14
MEDIUM_SIZE = SMALL_SIZE + 2
BIGGER_SIZE = MEDIUM_SIZE + 2

# Matplotlib defaults, grouped by rc section.
plt.rc("font", size=SMALL_SIZE)  # default text size
plt.rc(
    "axes",
    titlesize=SMALL_SIZE,  # axes title
    labelsize=MEDIUM_SIZE,  # x and y labels
)
plt.rc("xtick", labelsize=SMALL_SIZE)  # x tick labels
plt.rc("ytick", labelsize=SMALL_SIZE)  # y tick labels
plt.rc("legend", fontsize=SMALL_SIZE)  # legend text
plt.rc("figure", titlesize=BIGGER_SIZE)  # figure title
|
|
|
|
# %%
|
|
# Three baseline survey exports, read from a fixed local drive.
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")

# Stack the exports, keeping only the columns common to all of them
# (join="inner"), and renumber the rows from zero.
baseline = pd.concat(
    [baseline_si, baseline_be_1, baseline_be_2],
    join="inner",
    ignore_index=True,
)
|
|
|
|
# %%
|
|
# Usernames returned by the participants database for the given collection
# start date (2020-08-01).
collection_start = datetime.date(2020, 8, 1)
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=collection_start
)

# %%
# Keep only the baseline rows whose username is among the queried ones.
is_queried_username = baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
baseline_inactive = baseline[is_queried_username]
|
|
|
|
# %%
|
|
# Survey column names and the English names used in the rest of the analysis.
VARIABLES_TO_TRANSLATE = {
    "Gebruikersnaam": "username",
    "Geslacht": "gender",
    "Geboortedatum": "date_of_birth",
}

# Rebind the renamed frame instead of renaming in place: `baseline_inactive`
# is a filtered selection of `baseline`, and the `copy` keyword used before is
# deprecated in recent pandas releases.
baseline_inactive = baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE)

# Age is computed relative to the moment the notebook is run.
now = pd.Timestamp("now")
baseline_inactive = baseline_inactive.assign(
    date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
    # NOTE(review): the divisor 365.25245 is kept as found; the mean Gregorian
    # year is 365.2425 days -- confirm whether this constant is a typo.
    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
)
|
|
|
|
# %%
|
|
# ESM data of the selected participants, as returned by the project helper.
df_esm_inactive = get_esm_data(participants_inactive_usernames)

# %% [markdown]
# # Classify EMA sessions

# %%
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocessed)

# %% [markdown]
# Sessions are now classified according to the type of a session (a true questionnaire or simple single questions) and the user's response.

# %%
# Session responses that are merged into the single label "interrupted".
INTERRUPTED_SESSION_RESPONSES = [
    "during_work_first",
    "ema_unanswered",
    "evening_first",
    "morning",
    "morning_first",
]

session_response_cat = df_session_counts_time["session_response"].astype("category")

# `remove_categories` raises a ValueError for labels that are not categories
# of the column, so only the labels actually present in the data are removed.
interrupted_present = [
    label
    for label in INTERRUPTED_SESSION_RESPONSES
    if label in session_response_cat.cat.categories
]

# Removing a category turns its rows into NaN; those rows (and any rows that
# were already missing) are then labelled "interrupted".
# `rename_categories` cannot be used for this, because it does not allow
# several old categories to be mapped onto the same new name.
df_session_counts_time["session_response_cat"] = (
    session_response_cat.cat.remove_categories(interrupted_present)
    .cat.add_categories("interrupted")
    .fillna("interrupted")
)
|
|
|
|
# %%
|
|
df_session_counts_time.session_response_cat

# %%
# Absolute number of sessions per outcome category.
tbl_session_outcomes = df_session_counts_time["session_response_cat"].value_counts()

# %%
# The same table as a share of all sessions.
n_sessions = len(df_session_counts_time)
tbl_session_outcomes_relative = tbl_session_outcomes / n_sessions

# %%
print(tbl_session_outcomes_relative.to_latex(escape=True))

# %%
separator = "-------------------------------------"
print("All sessions:", n_sessions)
print(separator)
print(tbl_session_outcomes)
print(separator)
print(tbl_session_outcomes_relative)
|
|
|
|
# %% [markdown]
|
|
# ## Consider only true EMA sessions
|
|
|
|
# %%
|
|
# Sessions whose response equals SESSION_STATUS_COMPLETE, with the index
# levels turned back into ordinary columns.
is_complete = df_session_counts_time["session_response"] == SESSION_STATUS_COMPLETE
df_session_finished = df_session_counts_time[is_complete].reset_index()

# %%
# Number of finished sessions per participant.
df_participant_finished_sessions = (
    df_session_finished.groupby("participant_id")["esm_session"]
    .count()
    .rename("finished_sessions")
)
|
|
|
|
# %%
|
|
# One row per baseline participant: demographics, the participant ID found in
# the ESM data and the number of finished sessions. Both merges are left
# merges, so participants without a match keep NaN in the added columns.
username_to_participant_id = df_esm_preprocessed[
    ["username", "participant_id"]
].drop_duplicates()
df_adherence = (
    baseline_inactive[["username", "gender", "age", "startlanguage"]]
    .merge(username_to_participant_id, how="left", on="username")
    .merge(
        df_participant_finished_sessions,
        how="left",
        left_on="participant_id",
        right_index=True,
    )
)

# %% tags=[]
df_adherence

# %%
df_adherence.describe()

# %%
df_adherence[["gender", "startlanguage"]].value_counts()

# %%
# Distribution of finished sessions over participants, in bins of 5 sessions.
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=FIG_HEIGHT)
|
|
|
|
# %%
|
|
# Finished sessions modelled on gender, start language and age.
# NOTE(review): `finished_sessions` comes from a left merge, so participants
# without any finished session hold NaN rather than 0 and are presumably
# dropped by the formula interface -- confirm this is intended.
formula_finished_sessions = "finished_sessions ~ C(gender) + C(startlanguage) + age"

lm_adherence = smf.ols(formula_finished_sessions, data=df_adherence).fit()
table = sm.stats.anova_lm(lm_adherence, typ=2)  # Type 2 ANOVA DataFrame
print(table)

# %%
# The same model, kept unfitted in `lr_ols` and fitted in `ls_result`.
lr_ols = smf.ols(formula_finished_sessions, data=df_adherence)
ls_result = lr_ols.fit()
ls_result.summary()
|
|
|
|
# %% [markdown]
|
|
# # Concordance by type
|
|
|
|
# %% [markdown]
|
|
# ## Workday EMA
|
|
|
|
# %% [markdown]
|
|
# ### Filter the EMA of interest.
|
|
|
|
# %% [markdown]
|
|
# Work with only completed EMA.
|
|
|
|
# %% tags=[]
|
|
# NOTE(review): this filters on the literal "ema_completed", while
# `df_session_finished` filters on SESSION_STATUS_COMPLETE -- presumably the
# same value; confirm.
is_completed_ema = df_session_counts_time.session_response == "ema_completed"
df_session_counts_time_completed = df_session_counts_time[is_completed_ema]

# %% [markdown]
# To be able to compare EMA sessions *within* one day, add a date-part column.
#
# **NOTE**: Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM, the datetime is first translated to 4 h earlier.

# %%
# Sessions up to 4 AM are attributed to the previous calendar day.
day_rollover = datetime.timedelta(hours=4)
df_session_counts_time_completed = df_session_counts_time_completed.assign(
    date_lj=lambda x: (x.datetime_lj - day_rollover).dt.date
)

# %%
df_session_counts_time_completed
|
|
|
|
# %% [markdown]
|
|
# Next, calculate differences between subsequent records. But first group them by participant and device ID (as usual) and *time*. This way, the differences between the same type of EMA sessions are calculated.
|
|
|
|
# %% tags=[]
|
|
# Row-to-row differences of the timestamp and of the (shifted) date, taken
# separately within each participant / device / EMA-type group.
diff_column_names = {
    "datetime_lj": "previous_same_type_time_diff",
    "date_lj": "time_diff_days",
}
sessions_by_type = df_session_counts_time_completed[
    ["datetime_lj", "date_lj", "time"]
].groupby(["participant_id", "device_id", "time"])
df_session_time_diff = sessions_by_type.diff().rename(columns=diff_column_names)

# %%
df_session_time_diff

# %% tags=[]
# Attach the differences to the completed sessions (aligned on the index).
df_session_counts_time_diff = df_session_counts_time_completed.join(
    df_session_time_diff, how="left"
)
|
|
|
|
# %% [markdown]
|
|
# Now, select only the daytime EMAs of interest. Discard the differences between *different day* EMAs.
|
|
|
|
# %% tags=[]
|
|
# Only take daytime EMAs.
is_daytime = df_session_counts_time_diff.time == "daytime"
# Only where the diff was actually calculated.
has_previous_session = df_session_counts_time_diff.previous_same_type_time_diff.notna()
# Only take differences *within* a day.
is_same_day = df_session_counts_time_diff.time_diff_days == datetime.timedelta(0)

time_workday_completed_less_than_1_day = is_daytime & has_previous_session & is_same_day

# %% tags=[]
df_session_workday = df_session_counts_time_diff[time_workday_completed_less_than_1_day]

# %%
# Use the full duration of the difference. `.dt.seconds` would return only the
# seconds component of the timedelta, which silently wraps for negative
# differences (e.g. rows that are not in chronological order).
df_session_workday = df_session_workday.assign(
    time_diff_minutes=lambda x: x.previous_same_type_time_diff.dt.total_seconds() / 60
)
|
|
|
|
# %%
|
|
# Histogram of the time between consecutive same-day workday EMAs (5 min bins).
g1 = sns.displot(
    df_session_workday["time_diff_minutes"],
    binwidth=5,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g1.set_axis_labels("Time difference [min]", "Session count")
g1.set(xlim=(0, 570))

if SAVE_FIGS:
    g1.savefig("WorkdayEMAtimeDiff.pdf")
|
|
|
|
# %% [markdown]
|
|
# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.
|
|
|
|
# %%
|
|
df_session_workday[df_session_workday.time_diff_minutes < 30]

# %% [markdown]
# There are only 2 instances, look at them individually.

# %%
# First case: participant 35, session 7 on one specific device.
is_first_case = (
    (df_esm_preprocessed.participant_id == 35)
    & (df_esm_preprocessed.esm_session == 7)
    & (df_esm_preprocessed.device_id == "62a44038-3ccb-401e-a69c-6f22152c54a6")
)
first_case_columns = [
    "esm_trigger",
    "esm_session",
    "datetime_lj",
    "esm_instructions",
    "device_id",
    "_id",
]
df_esm_preprocessed.loc[is_first_case, first_case_columns]

# %%
# Second case: participant 45, sessions below 3 on one specific device.
is_second_case = (
    (df_esm_preprocessed.participant_id == 45)
    & (df_esm_preprocessed.esm_session < 3)
    & (df_esm_preprocessed.device_id == "d848b1c4-33cc-4e22-82ae-96d6b6458a33")
)
second_case_columns = ["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"]
df_esm_preprocessed.loc[is_second_case, second_case_columns]
|
|
|
|
# %% [markdown]
|
|
# As these signify bugs, we can safely discard them in the following analysis.
|
|
|
|
# %%
|
|
# Sessions closer than 30 min are treated as bugs, so keep exactly the
# complement of the `< 30` selection. The previous `> 29` cut-off would have
# kept intervals between 29 and 30 minutes.
df_session_workday = df_session_workday[df_session_workday.time_diff_minutes >= 30]
|
|
|
|
# %% [markdown]
|
|
# ### All participants
|
|
|
|
# %%
|
|
df_session_workday.describe()

# %%
# Share of the remaining intervals that are shorter than 120 minutes.
intervals_under_two_hours = df_session_workday[
    df_session_workday["time_diff_minutes"] < 120
]
len(intervals_under_two_hours) / len(df_session_workday)
|
|
|
|
# %% [markdown]
|
|
# These statistics look reasonable.
|
|
|
|
# %% [markdown]
|
|
# ### Differences between participants
|
|
|
|
# %%
|
|
# Per-participant median of the numeric columns (despite the "mean" in the
# variable name, this is a median). `numeric_only=True` is needed because the
# frame also holds non-numeric columns (e.g. `time`, `date_lj`), for which
# recent pandas versions raise instead of silently dropping them.
df_mean_daytime_interval = df_session_workday.groupby("participant_id").median(
    numeric_only=True
)

# %%
df_mean_daytime_interval.describe()
|
|
|
|
# %%
|
|
# Histogram of the per-participant median interval (5 min bins).
g2 = sns.displot(
    df_mean_daytime_interval.time_diff_minutes,
    binwidth=5,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g2.set_axis_labels("Median time difference [min]", "Participant count")

if SAVE_FIGS:
    g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
|
|
|
|
# %%
|
|
# Add the per-participant medians to the adherence table (left merge on the
# participant ID, which is the index of the median table).
df_adherence = df_adherence.merge(
    df_mean_daytime_interval,
    how="left",
    left_on="participant_id",
    right_index=True,
)

# %%
# Median interval modelled on gender, start language and age.
formula_time_diff_median = "time_diff_minutes ~ C(gender) + C(startlanguage) + age"
lr_ols_time_diff_median = smf.ols(formula_time_diff_median, data=df_adherence)
ls_result_time_diff_median = lr_ols_time_diff_median.fit()
ls_result_time_diff_median.summary()
|
|
|
|
# %%
|
|
# Non-null counts of every column per participant and day.
# NOTE(review): the rows are intervals between sessions (first session of a
# day has no difference and was filtered out), so this presumably counts
# intervals rather than sessions -- confirm.
participant_day = ["participant_id", "date_lj"]
df_count_daytime_per_participant = df_session_workday.groupby(participant_day).count()

# %%
df_count_daytime_per_participant["time"].describe()

# %%
sns.displot(
    df_count_daytime_per_participant.time,
    binwidth=1,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
|
|
|
|
# %% [markdown]
|
|
# ## Evening EMA
|
|
|
|
# %% [markdown]
|
|
# For the evening EMA, determine whether an evening EMA is also present on each day on which any EMA session was completed.
|
|
#
|
|
# Note, we are only dealing with true EMA sessions, non-sessions etc. have already been filtered out.
|
|
|
|
# %%
|
|
def _has_evening_session(day_sessions):
    """Return True if any session in the group has ``time == "evening"``."""
    return (day_sessions.time == "evening").any()


# One boolean per participant, device and day.
# NOTE(review): newer pandas versions warn when `apply` operates on grouping
# columns (`date_lj` is one) -- confirm against the pandas version in use.
s_evening_completed = df_session_counts_time_completed.groupby(
    ["participant_id", "device_id", "date_lj"]
).apply(_has_evening_session)

# %%
df_session_counts_time_completed

# %%
s_evening_completed.sum()
|
|
|
|
# %%
|
|
# Per participant: days with an evening EMA divided by all days with any
# completed EMA.
evening_by_participant = s_evening_completed.groupby("participant_id")
s_evening_completed_ratio = (
    evening_by_participant.sum() / evening_by_participant.count()
)

# %%
s_evening_completed_ratio.describe()
|
|
|
|
# %%
|
|
# Histogram of the per-participant evening ratio (bins of 0.05).
# NOTE(review): the ratios are shifted down by 0.001 before plotting,
# presumably so that values of exactly 1.0 fall inside the last bin -- confirm.
g3 = sns.displot(
    s_evening_completed_ratio - 0.001,
    binwidth=0.05,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g3.set_axis_labels("Ratio of days with the evening EMA filled out", "Participant count")
# The left limit is larger than the right one, so the x axis runs from high
# to low ratios.
g3.set(xlim=(1.01, 0.59))

if SAVE_FIGS:
    g3.savefig("EveningEMAratioParticip.pdf")
|
|
|
|
# %%
|
|
# Add the evening ratio to the adherence table (left merge on participant ID).
evening_ratio_column = s_evening_completed_ratio.rename("evening_EMA_ratio")
df_adherence = df_adherence.merge(
    evening_ratio_column,
    how="left",
    left_on="participant_id",
    right_index=True,
)

# %%
# Evening ratio modelled on gender, start language and age.
formula_evening_ratio = "evening_EMA_ratio ~ C(gender) + C(startlanguage) + age"
lr_ols_evening_ratio = smf.ols(formula_evening_ratio, data=df_adherence)
ls_result_evening_ratio = lr_ols_evening_ratio.fit()
ls_result_evening_ratio.summary()
|