stress_at_work_analysis/statistical_analysis/adherence.py

429 lines
12 KiB
Python

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.12.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
# %%
# %matplotlib inline
import datetime
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Put the parent of the current working directory on the module search path
# (only once), so that the project packages imported right below can be found.
# NOTE(review): presumably the notebook is launched from a sub-directory of the
# repository root - confirm.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
import participants.query_db
from features.esm import *
# %%
# Switch for writing the figures to PDF files, plus the geometry and colour
# shared by the seaborn plots in this notebook.
SAVE_FIGS = True
FIG_HEIGHT = 5
FIG_ASPECT = 1.7
FIG_COLOUR = "#28827C"

# Font-size ladder: every step is two points larger than the previous one.
SMALL_SIZE = 14
MEDIUM_SIZE = SMALL_SIZE + 2
BIGGER_SIZE = MEDIUM_SIZE + 2

# Register the sizes as matplotlib rc defaults.
plt.rc("font", size=SMALL_SIZE)  # default text size
plt.rc(
    "axes",
    titlesize=SMALL_SIZE,  # axes title
    labelsize=MEDIUM_SIZE,  # x and y axis labels
)
plt.rc("xtick", labelsize=SMALL_SIZE)  # x tick labels
plt.rc("ytick", labelsize=SMALL_SIZE)  # y tick labels
plt.rc("legend", fontsize=SMALL_SIZE)  # legend entries
plt.rc("figure", titlesize=BIGGER_SIZE)  # figure title
# %%
# Read the three baseline survey exports.
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")

# Stack the exports row-wise, keeping only the columns that all three share
# (join="inner"), and number the resulting rows from zero.
baseline = pd.concat(
    [baseline_si, baseline_be_1, baseline_be_2],
    join="inner",
    ignore_index=True,
)
# %%
# Ask the participants database for usernames, passing 2020-08-01 as
# `collection_start`.
# NOTE(review): the exact meaning of `collection_start` is defined by
# `participants.query_db.get_usernames` - confirm there.
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
# %%
# Keep only the baseline rows whose "Gebruikersnaam" (username) value is among
# those usernames.
baseline_inactive = baseline.loc[
    baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
]
# %%
# Survey column names (Dutch) mapped onto the English names used from here on.
VARIABLES_TO_TRANSLATE = {
    "Gebruikersnaam": "username",
    "Geslacht": "gender",
    "Geboortedatum": "date_of_birth",
}
# Rename out of place: `baseline_inactive` was obtained from `baseline` by
# boolean indexing, so it should not be mutated in place, and the `copy`
# keyword (deprecated in recent pandas releases) is not needed.
baseline_inactive = baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE)
now = pd.Timestamp("now")
baseline_inactive = baseline_inactive.assign(
    # Parsed first: the `age` callable below sees the converted column, since
    # `assign` evaluates its keyword arguments in order.
    date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
    # Age in years at the moment the notebook is run.
    # NOTE(review): 365.25245 is kept as in the original; the mean Gregorian
    # year is 365.2425 days - confirm which value is intended.
    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
)
# %%
# Load the ESM/EMA records for the selected usernames.
# NOTE(review): `get_esm_data`, `preprocess_esm` and
# `classify_sessions_by_completion_time` presumably come from the star import
# of `features.esm`; their exact return schemas are not visible here.
df_esm_inactive = get_esm_data(participants_inactive_usernames)
# %% [markdown]
# # Classify EMA sessions
# %%
# Preprocess the raw records, then derive one classification per session; the
# result is used below through its `session_response`, `time` and
# `datetime_lj` columns.
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocessed)
# %% [markdown]
# Sessions are now classified according to the type of session (a true questionnaire or simple single questions) and the user's response.
# %%
# Build a categorical copy of the response status in which the statuses listed
# below are merged into one "interrupted" category. Removing a category turns
# its rows into missing values; these (together with any value that was
# already missing) are then filled with the newly added "interrupted" label.
# Renaming the categories instead is not an option, because category names
# must stay unique.
df_session_counts_time["session_response_cat"] = (
    df_session_counts_time["session_response"]
    .astype("category")
    .cat.remove_categories(
        [
            "during_work_first",
            "ema_unanswered",
            "evening_first",
            "morning",
            "morning_first",
        ]
    )
    .cat.add_categories("interrupted")
    .fillna("interrupted")
)
# %%
df_session_counts_time["session_response_cat"]
# %%
# Absolute number of sessions in each outcome category.
tbl_session_outcomes = df_session_counts_time["session_response_cat"].value_counts()
# %%
# The same table as a share of all classified sessions.
tbl_session_outcomes_relative = tbl_session_outcomes / df_session_counts_time.shape[0]
# %%
# LaTeX source of the relative table.
print(tbl_session_outcomes_relative.to_latex(escape=True))
# %%
# Plain-text overview: total, absolute counts, relative counts.
print("All sessions:", df_session_counts_time.shape[0])
print("-------------------------------------")
print(tbl_session_outcomes)
print("-------------------------------------")
print(tbl_session_outcomes_relative)
# %% [markdown]
# ## Consider only true EMA sessions
# %%
# Sessions whose response status equals SESSION_STATUS_COMPLETE; the index is
# turned back into ordinary columns so it can be grouped and counted below.
df_session_finished = df_session_counts_time.loc[
    df_session_counts_time["session_response"].eq(SESSION_STATUS_COMPLETE)
].reset_index()
# %%
# Number of finished sessions per participant (non-missing `esm_session`
# entries), stored under the name "finished_sessions".
df_participant_finished_sessions = (
    df_session_finished.groupby("participant_id")["esm_session"]
    .count()
    .rename("finished_sessions")
)
# %%
# Assemble the per-participant adherence table with two left joins, so every
# baseline row is kept even when no ESM data or finished session matches it:
#   1. attach `participant_id` through the shared `username`;
#   2. attach the finished-session count, which is indexed by participant_id.
df_adherence = (
    baseline_inactive[["username", "gender", "age", "startlanguage"]]
    .merge(
        df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(),
        how="left",
        on="username",
    )
    .merge(
        df_participant_finished_sessions,
        how="left",
        left_on="participant_id",
        right_index=True,
    )
)
# %% tags=[]
df_adherence
# %%
df_adherence.describe()
# %%
# Participant counts per gender / survey-language combination.
df_adherence[["gender", "startlanguage"]].value_counts()
# %%
# Distribution of the number of finished sessions, in bins of five.
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=FIG_HEIGHT)
# %%
# Linear model of finished sessions on gender, survey language and age,
# summarised as a type 2 ANOVA table.
lm_adherence = smf.ols(
    formula="finished_sessions ~ C(gender) + C(startlanguage) + age",
    data=df_adherence,
).fit()
table = sm.stats.anova_lm(lm_adherence, typ=2)  # Type 2 ANOVA DataFrame
print(table)
# %%
# The same model specification, shown as the full regression summary.
lr_ols = smf.ols(
    formula="finished_sessions ~ C(gender) + C(startlanguage) + age",
    data=df_adherence,
)
ls_result = lr_ols.fit()
ls_result.summary()
# %% [markdown]
# # Concordance by type
# %% [markdown]
# ## Workday EMA
# %% [markdown]
# ### Filter the EMA of interest.
# %% [markdown]
# Work with only completed EMA.
# %% tags=[]
# Keep only the sessions whose response status is "ema_completed".
df_session_counts_time_completed = df_session_counts_time.loc[
    df_session_counts_time["session_response"] == "ema_completed"
]
# %% [markdown]
# To be able to compare EMA sessions *within* one day, add a date-part column.
#
# **NOTE**: Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM, the datetime is first translated to 4 h earlier.
# %%
# Shift every timestamp back by four hours before taking its date, so that a
# session answered shortly after midnight still counts towards the previous day.
df_session_counts_time_completed = df_session_counts_time_completed.assign(
    date_lj=lambda sessions: (
        sessions["datetime_lj"] - datetime.timedelta(hours=4)
    ).dt.date
)
# %%
df_session_counts_time_completed
# %% [markdown]
# Next, calculate differences between subsequent records. But first, group them by participant and device ID (as usual) and *time*. This way, the differences are calculated between EMA sessions of the same type.
# %% tags=[]
# Row-to-row differences within each (participant, device, session type) group.
# `participant_id` and `device_id` are not among the selected columns, so they
# are resolved as index levels; `time` is grouped on as a column, which leaves
# `datetime_lj` and `date_lj` as the columns that are actually differenced.
df_session_time_diff = (
    df_session_counts_time_completed[["datetime_lj", "date_lj", "time"]]
    .groupby(["participant_id", "device_id", "time"])
    .diff()
    .rename(
        columns={
            # Time elapsed since the previous session of the same type.
            "datetime_lj": "previous_same_type_time_diff",
            # Difference of the (4 h shifted) dates; zero means "same day".
            "date_lj": "time_diff_days",
        }
    )
)
# %%
df_session_time_diff
# %% tags=[]
# Attach the two difference columns to the completed sessions; the join is on
# the index, which `diff()` leaves aligned with the original rows.
df_session_counts_time_diff = df_session_counts_time_completed.join(
    df_session_time_diff, how="left"
)
# %% [markdown]
# Now, select only the daytime EMAs of interest. Discard the differences between *different day* EMAs.
# %% tags=[]
# Boolean mask selecting the daytime sessions that have a same-day predecessor.
time_workday_completed_less_than_1_day = (
    # Only take daytime EMAs.
    (df_session_counts_time_diff.time == "daytime")
    # Only where the diff was actually calculated.
    & df_session_counts_time_diff.previous_same_type_time_diff.notna()
    # Only take differences *within* a day.
    & (df_session_counts_time_diff.time_diff_days == datetime.timedelta(0))
)
# %% tags=[]
df_session_workday = df_session_counts_time_diff[time_workday_completed_less_than_1_day]
# %%
# Interval to the previous daytime session in minutes. `total_seconds()` is
# used rather than `.dt.seconds`: the latter returns only the seconds
# *component* of a timedelta (0..86399), which drops the sub-second part and
# wraps around for negative or multi-day differences.
df_session_workday = df_session_workday.assign(
    time_diff_minutes=lambda x: x.previous_same_type_time_diff.dt.total_seconds() / 60
)
# %%
# Histogram (5-minute bins) of the interval between consecutive daytime EMA
# sessions on the same day.
g1 = sns.displot(
    df_session_workday.time_diff_minutes,
    binwidth=5,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g1.set_axis_labels(x_var="Time difference [min]", y_var="Session count")
g1.set(xlim=(0, 570))
# Write the figure to disk only when figure saving is switched on.
if SAVE_FIGS:
    g1.savefig("WorkdayEMAtimeDiff.pdf")
# %% [markdown]
# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.
# %%
# Daytime sessions that follow the previous one by less than 30 minutes.
df_session_workday[df_session_workday["time_diff_minutes"] < 30]
# %% [markdown]
# There are only 2 instances, look at them individually.
# %%
# First instance: raw records of participant 35, session 7, on one device.
df_esm_preprocessed.loc[
    df_esm_preprocessed["participant_id"].eq(35)
    & df_esm_preprocessed["esm_session"].eq(7)
    & df_esm_preprocessed["device_id"].eq("62a44038-3ccb-401e-a69c-6f22152c54a6"),
    [
        "esm_trigger",
        "esm_session",
        "datetime_lj",
        "esm_instructions",
        "device_id",
        "_id",
    ],
]
# %%
# Second instance: raw records of participant 45, sessions 0-2, on one device.
df_esm_preprocessed.loc[
    df_esm_preprocessed["participant_id"].eq(45)
    & df_esm_preprocessed["esm_session"].lt(3)
    & df_esm_preprocessed["device_id"].eq("d848b1c4-33cc-4e22-82ae-96d6b6458a33"),
    ["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"],
]
# %% [markdown]
# As these signify bugs, we can safely discard them in the following analysis.
# %%
# Discard the sessions that are closer than 30 minutes to their predecessor.
# The threshold is `>= 30` so that it is the exact complement of the `< 30`
# selection inspected earlier; the former `> 29` kept intervals between 29 and
# 30 minutes even though they violate the 30-minute design rule.
df_session_workday = df_session_workday[df_session_workday.time_diff_minutes >= 30]
# %% [markdown]
# ### All participants
# %%
df_session_workday.describe()
# %%
# Share of the remaining intervals that are shorter than two hours (the mean
# of a boolean column is the fraction of True values).
(df_session_workday["time_diff_minutes"] < 120).mean()
# %% [markdown]
# These statistics look reasonable.
# %% [markdown]
# ### Differences between participants
# %%
# Per-participant *median* of every numeric column (the variable name says
# "mean", but the statistic is the median, as the plot label below states).
# `numeric_only=True` is passed explicitly: the frame also holds non-numeric
# columns such as `time` and `date_lj`, for which a median is undefined, and
# pandas >= 2.0 no longer drops such columns silently.
df_mean_daytime_interval = df_session_workday.groupby("participant_id").median(
    numeric_only=True
)
# %%
df_mean_daytime_interval.describe()
# %%
# Histogram (5-minute bins) of the participants' median daytime interval.
g2 = sns.displot(
    df_mean_daytime_interval.time_diff_minutes,
    binwidth=5,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g2.set_axis_labels("Median time difference [min]", "Participant count")
# Write the figure to disk only when figure saving is switched on.
if SAVE_FIGS:
    g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")
# %%
# Attach the per-participant medians (indexed by participant_id) to the
# adherence table; the left join keeps participants without daytime intervals.
df_adherence = df_adherence.merge(
    right=df_mean_daytime_interval,
    how="left",
    left_on="participant_id",
    right_index=True,
)
# %%
# Regress the median daytime interval on gender, survey language and age.
lr_ols_time_diff_median = smf.ols(
    formula="time_diff_minutes ~ C(gender) + C(startlanguage) + age",
    data=df_adherence,
)
ls_result_time_diff_median = lr_ols_time_diff_median.fit()
ls_result_time_diff_median.summary()
# %%
# Non-missing entries per column for every participant and (shifted) date;
# the `time` column therefore gives the number of daytime intervals that day.
df_count_daytime_per_participant = df_session_workday.groupby(
    by=["participant_id", "date_lj"]
).count()
# %%
df_count_daytime_per_participant.time.describe()
# %%
# Distribution of that daily count, one bin per integer value.
sns.displot(
    df_count_daytime_per_participant["time"],
    binwidth=1,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
# %% [markdown]
# ## Evening EMA
# %% [markdown]
# For evening EMA, determine whether, on each day on which any EMA session was completed, an evening EMA is also present.
#
# Note, we are only dealing with true EMA sessions, non-sessions etc. have already been filtered out.
# %%
# One boolean per participant, device and (shifted) date: True when at least
# one of that day's completed sessions is of the "evening" type.
s_evening_completed = df_session_counts_time_completed.groupby(
    by=["participant_id", "device_id", "date_lj"]
).apply(lambda day_sessions: day_sessions["time"].eq("evening").any())
# %%
df_session_counts_time_completed
# %%
# Total number of days with an evening EMA.
s_evening_completed.sum()
# %%
# Per participant: days with an evening EMA divided by all days with any
# completed session.
s_evening_completed_ratio = (
    s_evening_completed.groupby("participant_id")
    .sum()
    .div(s_evening_completed.groupby("participant_id").count())
)
# %%
s_evening_completed_ratio.describe()
# %%
# Histogram of the per-participant evening-EMA ratio in bins of 0.05.
# NOTE(review): the 0.001 offset presumably moves ratios of exactly 1.0 into
# the last visible bin - confirm.
g3 = sns.displot(
    s_evening_completed_ratio.sub(0.001),
    binwidth=0.05,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g3.set_axis_labels(
    x_var="Ratio of days with the evening EMA filled out",
    y_var="Participant count",
)
# The limits are given from high to low, so the x axis is drawn reversed.
g3.set(xlim=(1.01, 0.59))
# Write the figure to disk only when figure saving is switched on.
if SAVE_FIGS:
    g3.savefig("EveningEMAratioParticip.pdf")
# %%
# Attach the evening-EMA ratio (indexed by participant_id) to the adherence
# table under the column name "evening_EMA_ratio".
df_adherence = df_adherence.merge(
    right=s_evening_completed_ratio.rename("evening_EMA_ratio"),
    how="left",
    left_on="participant_id",
    right_index=True,
)
# %%
# Regress the evening-EMA ratio on gender, survey language and age.
lr_ols_evening_ratio = smf.ols(
    formula="evening_EMA_ratio ~ C(gender) + C(startlanguage) + age",
    data=df_adherence,
)
ls_result_evening_ratio = lr_ols_evening_ratio.fit()
ls_result_evening_ratio.summary()