stress_at_work_analysis/statistical_analysis/adherence.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.12.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import datetime
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
import participants.query_db
from features.esm import *

# %%
# Switch for writing figures to disk, plus the geometry and colour shared by
# all seaborn figures below.
SAVE_FIGS = True
FIG_HEIGHT = 5
FIG_ASPECT = 1.7
FIG_COLOUR = "#28827C"

# Font-size ladder: every step is two points larger than the previous one.
SMALL_SIZE = 14
MEDIUM_SIZE = SMALL_SIZE + 2
BIGGER_SIZE = MEDIUM_SIZE + 2

# Apply the font sizes to matplotlib's global defaults in a single update.
plt.rcParams.update(
    {
        "font.size": SMALL_SIZE,  # default text size
        "axes.titlesize": SMALL_SIZE,  # axes title
        "axes.labelsize": MEDIUM_SIZE,  # x and y axis labels
        "xtick.labelsize": SMALL_SIZE,  # x tick labels
        "ytick.labelsize": SMALL_SIZE,  # y tick labels
        "legend.fontsize": SMALL_SIZE,  # legend entries
        "figure.titlesize": BIGGER_SIZE,  # figure title
    }
)

# %%
# Load the three baseline survey exports.
# NOTE(review): the "si"/"be" suffixes presumably denote the country of each
# survey -- confirm against the survey documentation.
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")

# Stack the exports, keeping only the columns common to all of them
# (join="inner").  ignore_index=True gives the result a fresh 0..n-1 index
# directly, instead of materialising the old index as a column and dropping it
# again (which would also drop a genuine data column named "index").
baseline = pd.concat(
    [baseline_si, baseline_be_1, baseline_be_2],
    join="inner",
    ignore_index=True,
)

# %%
# Usernames returned by the participants database for the given collection
# start date.
# NOTE(review): the exact meaning of `collection_start` is defined in
# participants.query_db -- confirm there.
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date(2020, 8, 1)
)

# %%
# Restrict the baseline questionnaire to the rows of those usernames.
baseline_inactive = baseline.loc[
    baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
]

# %%
# Dutch survey column names mapped to the English names used in this analysis.
VARIABLES_TO_TRANSLATE = {
    "Gebruikersnaam": "username",
    "Geslacht": "gender",
    "Geboortedatum": "date_of_birth",
}
# Rename into a new object instead of in place: `baseline_inactive` is a
# filtered selection of `baseline`, and mutating such a selection in place can
# trigger pandas' chained-assignment (SettingWithCopy) warning.  The `copy`
# keyword is dropped as well, since it is deprecated in recent pandas.
baseline_inactive = baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE)

# Reference time for the age computation: the moment the cell is executed.
now = pd.Timestamp("now")
baseline_inactive = baseline_inactive.assign(
    # Parse the date of birth so that it can be subtracted from `now`.
    date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
    # Age in years: elapsed whole days divided by the length of a year in days.
    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
)

# %%
df_esm_inactive = get_esm_data(participants_inactive_usernames)

# %% [markdown]
# # Classify EMA sessions

# %%
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocessed)

# %% [markdown]
# Sessions are now classified according to the type of session (a true questionnaire or simple single questions) and the user's response.

# %%
# Build a categorical version of the session response in which the five
# outcomes listed below are collapsed into a single "interrupted" category:
# removing a category turns its rows into NaN, and those rows are then filled
# with the newly added "interrupted" label.
df_session_counts_time["session_response_cat"] = (
    df_session_counts_time["session_response"]
    .astype("category")
    .cat.remove_categories(
        [
            "during_work_first",
            "ema_unanswered",
            "evening_first",
            "morning",
            "morning_first",
        ]
    )
    .cat.add_categories("interrupted")
    .fillna("interrupted")
)

# %%
df_session_counts_time["session_response_cat"]

# %%
# Absolute number of sessions in each (collapsed) response category.
tbl_session_outcomes = df_session_counts_time["session_response_cat"].value_counts()

# %%
# The same table expressed as a share of all sessions.
tbl_session_outcomes_relative = tbl_session_outcomes.div(len(df_session_counts_time))

# %%
# LaTeX source of the relative table.
print(tbl_session_outcomes_relative.to_latex(escape=True))

# %%
# Plain-text overview: total, absolute counts, relative counts.
print("All sessions:", len(df_session_counts_time))
print("-------------------------------------")
print(tbl_session_outcomes)
print("-------------------------------------")
print(tbl_session_outcomes_relative)

# %% [markdown]
# ## Consider only true EMA sessions

# %%
# Keep only the sessions whose response equals SESSION_STATUS_COMPLETE and
# flatten the index into ordinary columns.
df_session_finished = df_session_counts_time.loc[
    df_session_counts_time["session_response"].eq(SESSION_STATUS_COMPLETE)
].reset_index()

# %%
# Number of finished sessions per participant (non-null `esm_session` entries).
df_participant_finished_sessions = (
    df_session_finished.groupby("participant_id")["esm_session"]
    .count()
    .rename("finished_sessions")
)

# %%
# One row per baseline participant: demographics, the participant ID looked up
# via the username, and the number of finished sessions.  Both joins are left
# joins, so every baseline row is kept and unmatched values are NaN.
df_adherence = (
    baseline_inactive[["username", "gender", "age", "startlanguage"]]
    .merge(
        df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(),
        on="username",
        how="left",
    )
    .merge(
        df_participant_finished_sessions,
        left_on="participant_id",
        right_index=True,
        how="left",
    )
)

# %% tags=[]
df_adherence

# %%
df_adherence.describe()

# %%
df_adherence[["gender", "startlanguage"]].value_counts()

# %%
# Histogram of finished sessions per participant, in bins of five sessions.
sns.displot(data=df_adherence, x="finished_sessions", binwidth=5, height=FIG_HEIGHT)

# %%
# Linear model of the number of finished sessions on gender, start language
# and age, summarised as a type 2 ANOVA table.
lm_adherence = smf.ols(
    formula="finished_sessions ~ C(gender) + C(startlanguage) + age",
    data=df_adherence,
).fit()
table = sm.stats.anova_lm(lm_adherence, typ=2)
print(table)

# %%
# The same model once more, this time for the full regression summary.
lr_ols = smf.ols(
    formula="finished_sessions ~ C(gender) + C(startlanguage) + age",
    data=df_adherence,
)
ls_result = lr_ols.fit()
ls_result.summary()

# %% [markdown]
# # Concordance by type

# %% [markdown]
# ## Workday EMA

# %% [markdown]
# ### Filter the EMA of interest.

# %% [markdown]
# Work with only completed EMA.

# %% tags=[]
# NOTE(review): the earlier "true EMA sessions" filter compares against
# SESSION_STATUS_COMPLETE, whereas this one uses a string literal -- confirm
# that the two are meant to be the same value.
df_session_counts_time_completed = df_session_counts_time.loc[
    df_session_counts_time["session_response"] == "ema_completed"
]

# %% [markdown]
# To be able to compare EMA sessions *within* one day, add a date-part column.
#
# **NOTE**: Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM, the timestamp is first shifted 4 h back.

# %%
# Calendar date of the timestamp after moving it four hours into the past.
df_session_counts_time_completed = df_session_counts_time_completed.assign(
    date_lj=lambda sessions: sessions["datetime_lj"]
    .sub(datetime.timedelta(hours=4))
    .dt.date
)

# %%
df_session_counts_time_completed

# %% [markdown]
# Next, calculate the differences between subsequent records. But first, group them by participant and device ID (as usual) and by *time*. This way, the differences are calculated between EMA sessions of the same type.

# %% tags=[]
# Row-to-row differences within each (participant, device, time) group, so
# every difference is taken between consecutive completed sessions that share
# the same `time` value.  The first row of each group has no predecessor and
# therefore gets a missing value.
# NOTE(review): this relies on the rows already being in chronological order
# within each group -- confirm against the upstream helpers.
df_session_time_diff = (
    df_session_counts_time_completed[["datetime_lj", "date_lj", "time"]]
    .groupby(["participant_id", "device_id", "time"])
    .diff()
    .rename(
        columns={
            # Elapsed time since the previous session in the same group.
            "datetime_lj": "previous_same_type_time_diff",
            # Difference between the `date_lj` values of the two sessions.
            "date_lj": "time_diff_days",
        }
    )
)

# %%
df_session_time_diff

# %% tags=[]
# Attach the differences to the completed sessions; the join is on the index.
df_session_counts_time_diff = df_session_counts_time_completed.join(
    df_session_time_diff, how="left"
)

# %% [markdown]
# Now, select only the daytime EMAs of interest. Discard the differences between *different day* EMAs.

# %% tags=[]
# Boolean mask selecting the rows used for the workday interval analysis.
time_workday_completed_less_than_1_day = (
    (df_session_counts_time_diff.time == "daytime")  # Only take daytime EMAs.
    # Only where the diff was actually calculated.
    & df_session_counts_time_diff.previous_same_type_time_diff.notna()
    # Only take differences *within* a day.
    & (df_session_counts_time_diff.time_diff_days == datetime.timedelta(0))
)

# %% tags=[]
df_session_workday = df_session_counts_time_diff[time_workday_completed_less_than_1_day]

# %%
# Interval to the previous daytime session in minutes.  total_seconds() is used
# instead of the `.dt.seconds` component: the component silently drops whole
# days and wraps negative differences around to large positive values, whereas
# total_seconds() reports the true signed duration (including fractions).
df_session_workday = df_session_workday.assign(
    time_diff_minutes=lambda x: x.previous_same_type_time_diff.dt.total_seconds() / 60
)

# %%
# Histogram of the intervals between consecutive daytime sessions, in
# five-minute bins.
g1 = sns.displot(
    data=df_session_workday,
    x="time_diff_minutes",
    color=FIG_COLOUR,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    binwidth=5,
)
g1.set_axis_labels(x_var="Time difference [min]", y_var="Session count")
g1.set(xlim=(0, 570))
if SAVE_FIGS:
    g1.savefig("WorkdayEMAtimeDiff.pdf")

# %% [markdown]
# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.

# %%
df_session_workday.loc[df_session_workday["time_diff_minutes"].lt(30)]

# %% [markdown]
# There are only 2 instances; look at them individually.

# %%
# Raw records of the first suspicious session.
df_esm_preprocessed.loc[
    df_esm_preprocessed["participant_id"].eq(35)
    & df_esm_preprocessed["esm_session"].eq(7)
    & df_esm_preprocessed["device_id"].eq("62a44038-3ccb-401e-a69c-6f22152c54a6"),
    [
        "esm_trigger",
        "esm_session",
        "datetime_lj",
        "esm_instructions",
        "device_id",
        "_id",
    ],
]

# %%
# Raw records of the second case: this participant's sessions numbered below 3.
df_esm_preprocessed.loc[
    df_esm_preprocessed["participant_id"].eq(45)
    & df_esm_preprocessed["esm_session"].lt(3)
    & df_esm_preprocessed["device_id"].eq("d848b1c4-33cc-4e22-82ae-96d6b6458a33"),
    ["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"],
]

# %% [markdown]
# As these signify bugs, we can safely discard them in the following analysis.

# %%
# NOTE(review): the inspection above uses a 30-minute threshold, whereas this
# filter keeps everything above 29 minutes -- confirm the tolerance is intended.
df_session_workday = df_session_workday.loc[
    df_session_workday["time_diff_minutes"].gt(29)
]

# %% [markdown]
# ### All participants

# %%
df_session_workday.describe()

# %%
# Share of intervals that are shorter than two hours.
len(df_session_workday.loc[df_session_workday["time_diff_minutes"] < 120]) / len(
    df_session_workday
)

# %% [markdown]
# These statistics look reasonable.

# %% [markdown]
# ### Differences between participants

# %%
# Per-participant *median* of the numeric columns (the variable name says
# "mean", but the statistic computed here is the median).
# numeric_only=True is passed explicitly: the frame also holds non-numeric
# columns (e.g. `time`, `date_lj`), and from pandas 2.0 on the default
# numeric_only=False makes median() raise a TypeError on them.
df_mean_daytime_interval = df_session_workday.groupby("participant_id").median(
    numeric_only=True
)

# %%
df_mean_daytime_interval.describe()

# %%
# Histogram of the per-participant median interval, in five-minute bins.
g2 = sns.displot(
    data=df_mean_daytime_interval,
    x="time_diff_minutes",
    color=FIG_COLOUR,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    binwidth=5,
)
g2.set_axis_labels(x_var="Median time difference [min]", y_var="Participant count")
if SAVE_FIGS:
    g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")

# %%
# Add the per-participant interval statistics to the adherence table; the left
# join keeps participants that have no daytime interval data.
df_adherence = df_adherence.merge(
    df_mean_daytime_interval,
    left_on="participant_id",
    right_index=True,
    how="left",
)

# %%
# Linear model of the median daytime interval on gender, start language and age.
lr_ols_time_diff_median = smf.ols(
    formula="time_diff_minutes ~ C(gender) + C(startlanguage) + age",
    data=df_adherence,
)
ls_result_time_diff_median = lr_ols_time_diff_median.fit()
ls_result_time_diff_median.summary()

# %%
# Non-null counts of every column per participant and (shifted) date.
df_count_daytime_per_participant = df_session_workday.groupby(
    by=["participant_id", "date_lj"]
).count()

# %%
df_count_daytime_per_participant["time"].describe()

# %%
# Histogram of the per-day counts, one bin per count value.
sns.displot(
    df_count_daytime_per_participant["time"],
    color=FIG_COLOUR,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    binwidth=1,
)

# %% [markdown]
# ## Evening EMA

# %% [markdown]
# For evening EMA, determine whether an evening EMA is also present on each day on which any EMA session was completed.
#
# Note, we are only dealing with true EMA sessions, non-sessions etc. have already been filtered out.

# %%
def _has_evening_session(day_sessions):
    """Return True if any row of the group has `time` equal to "evening"."""
    return (day_sessions["time"] == "evening").any()


# One boolean per participant, device and day that has a completed session.
s_evening_completed = df_session_counts_time_completed.groupby(
    ["participant_id", "device_id", "date_lj"]
).apply(_has_evening_session)

# %%
df_session_counts_time_completed

# %%
s_evening_completed.sum()

# %%
# Per participant: days with an evening session divided by all counted days.
s_evening_completed_ratio = (
    s_evening_completed.groupby(level="participant_id")
    .sum()
    .div(s_evening_completed.groupby(level="participant_id").count())
)

# %%
s_evening_completed_ratio.describe()

# %%
# Histogram of the per-participant evening ratio in bins of 0.05.
# NOTE(review): the 0.001 offset presumably keeps ratios of exactly 1.0 inside
# the last bin -- confirm.  The x limits are given high-to-low, so the axis is
# drawn reversed.
g3 = sns.displot(
    s_evening_completed_ratio.sub(0.001),
    color=FIG_COLOUR,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    binwidth=0.05,
)
g3.set_axis_labels(
    x_var="Ratio of days with the evening EMA filled out",
    y_var="Participant count",
)
g3.set(xlim=(1.01, 0.59))
if SAVE_FIGS:
    g3.savefig("EveningEMAratioParticip.pdf")

# %%
# Add the evening ratio to the adherence table under an explicit column name;
# the left join keeps participants without any ratio.
df_adherence = df_adherence.merge(
    s_evening_completed_ratio.rename("evening_EMA_ratio"),
    left_on="participant_id",
    right_index=True,
    how="left",
)

# %%
# Linear model of the evening ratio on gender, start language and age.
lr_ols_evening_ratio = smf.ols(
    formula="evening_EMA_ratio ~ C(gender) + C(startlanguage) + age",
    data=df_adherence,
)
ls_result_evening_ratio = lr_ols_evening_ratio.fit()
ls_result_evening_ratio.summary()