stress_at_work_analysis/statistical_analysis/adherence.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
import datetime

# %%
import os
import sys

import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
import participants.query_db
from features.esm import *

# %%
# Read the three baseline survey exports, keeping one frame per file.
baseline_si, baseline_be_1, baseline_be_2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "E:/STRAWbaseline/results-survey637813.csv",
        "E:/STRAWbaseline/results-survey358134.csv",
        "E:/STRAWbaseline/results-survey413767.csv",
    )
)

# join="inner" keeps only the columns present in all three exports; the
# per-file row labels are then replaced by a fresh running index.
baseline_frames = [baseline_si, baseline_be_1, baseline_be_2]
baseline = pd.concat(baseline_frames, join="inner")
baseline = baseline.reset_index().drop(columns="index")

# %%
# Usernames reported by the participants database for the given
# collection start date.
collection_start_date = datetime.date.fromisoformat("2020-08-01")
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=collection_start_date
)

# %%
# Restrict the baseline survey to the rows belonging to those usernames.
is_selected_username = baseline["Gebruikersnaam"].isin(
    participants_inactive_usernames
)
baseline_inactive = baseline[is_selected_username]

# %%
# Survey column names and the English names used in the rest of the analysis.
VARIABLES_TO_TRANSLATE = {
    "Gebruikersnaam": "username",
    "Geslacht": "gender",
    "Geboortedatum": "date_of_birth",
}
# `baseline_inactive` comes from boolean indexing of `baseline`. Renaming such
# a derived frame in place triggers pandas' SettingWithCopyWarning (and the
# `copy` keyword is deprecated), so rebind the result of a regular rename.
baseline_inactive = baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE)
now = pd.Timestamp("now")
baseline_inactive = baseline_inactive.assign(
    # Parse the birth dates first; `age` below relies on the parsed column.
    date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
    # Age in (fractional) years: whole elapsed days divided by the year length
    # chosen here.
    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
)

# %%
# Fetch the raw ESM records of the selected participants.
df_esm_inactive = get_esm_data(participants_inactive_usernames)

# %% [markdown]
# # Classify EMA sessions

# %%
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocessed)

# %% [markdown]
# Sessions are now classified according to the type of a session (a true questionnaire or simple single questions) and the user's response.

# %%
df_session_counts_time

# %%
# Frequency table of the session outcomes.
session_responses = df_session_counts_time.reset_index()["session_response"]
tbl_session_outcomes = session_responses.value_counts()

# %%
# Print absolute counts first, then the same table as shares of all sessions.
n_sessions_total = len(df_session_counts_time)
separator_line = "-------------------------------------"
print("All sessions:", n_sessions_total)
print(separator_line)
print(tbl_session_outcomes)
print(separator_line)
print(tbl_session_outcomes / n_sessions_total)

# %% [markdown]
# ## Consider only true EMA sessions

# %%
# Keep only the sessions whose response equals the "complete" status.
is_session_complete = (
    df_session_counts_time["session_response"] == SESSION_STATUS_COMPLETE
)
df_session_finished = df_session_counts_time.loc[is_session_complete].reset_index()

# %%
# Count the finished sessions of every participant.
df_participant_finished_sessions = (
    df_session_finished.groupby("participant_id")["esm_session"]
    .count()
    .rename("finished_sessions")
)

# %%
# Attach participant IDs to the baseline data via the username, then add the
# per-participant session counts. Both are left joins, so every baseline row
# is kept even when no match exists.
baseline_columns = ["username", "gender", "age", "startlanguage"]
username_to_participant = df_esm_preprocessed[
    ["username", "participant_id"]
].drop_duplicates()
df_adherence = (
    baseline_inactive[baseline_columns]
    .merge(username_to_participant, how="left", on="username")
    .merge(
        df_participant_finished_sessions,
        how="left",
        left_on="participant_id",
        right_index=True,
    )
)

# %% tags=[]
df_adherence

# %%
df_adherence.describe()

# %%
df_adherence[["gender", "startlanguage"]].value_counts()

# %%
sns.displot(df_adherence["finished_sessions"], height=5, binwidth=5)

# %%
# The same model formula is fitted twice: once for the ANOVA table and once
# for the regression summary in the next cell.
adherence_formula = "finished_sessions ~ C(gender) + C(startlanguage) + age"
lm_adherence = smf.ols(formula=adherence_formula, data=df_adherence).fit()
# Type 2 ANOVA table, returned as a DataFrame.
table = sm.stats.anova_lm(lm_adherence, typ=2)
print(table)

# %%
lr_ols = smf.ols(formula=adherence_formula, data=df_adherence)
ls_result = lr_ols.fit()
ls_result.summary()

# %% [markdown]
# # Concordance by type

# %% [markdown]
# ## Workday EMA

# %% [markdown]
# ### Filter the EMA of interest.

# %% [markdown]
# Work with only completed EMA.

# %% tags=[]
# Keep completed sessions only. The shared status constant is used instead of
# a hard-coded label so that this filter cannot drift from the one that builds
# `df_session_finished`.
# NOTE(review): assumes SESSION_STATUS_COMPLETE is the "ema_completed" label
# that was previously spelled out here - confirm in features.esm.
df_session_counts_time_completed = df_session_counts_time[
    df_session_counts_time.session_response == SESSION_STATUS_COMPLETE
]

# %% [markdown]
# To be able to compare EMA sessions *within* one day, add a date-part column.
#
# **NOTE**: Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM, the datetime is first translated to 4 h earlier.

# %%
# Shift every timestamp 4 h back before taking its date part, so a session
# recorded before 4 AM is assigned to the previous calendar day.
shifted_datetime_lj = df_session_counts_time_completed.datetime_lj - datetime.timedelta(
    hours=4
)
df_session_counts_time_completed = df_session_counts_time_completed.assign(
    date_lj=shifted_datetime_lj.dt.date
)

# %%
df_session_counts_time_completed

# %% [markdown]
# Next, calculate differences between subsequent records. But first group them by participant and device ID (as usual) and *time*. This way, the differences between the same type of EMA sessions are calculated.

# %% tags=[]
# Row-to-row differences within each (participant, device, time) group; the
# differenced columns are renamed to say what they now hold.
diff_column_names = {
    "datetime_lj": "previous_same_type_time_diff",
    "date_lj": "time_diff_days",
}
sessions_by_type = df_session_counts_time_completed[
    ["datetime_lj", "date_lj", "time"]
].groupby(["participant_id", "device_id", "time"])
df_session_time_diff = sessions_by_type.diff().rename(columns=diff_column_names)

# %%
df_session_time_diff

# %% tags=[]
# Put the differences next to the session data they were computed from.
df_session_counts_time_diff = df_session_counts_time_completed.join(
    df_session_time_diff, how="left"
)

# %% [markdown]
# Now, select only the daytime EMAs of interest. Discard the differences between *different day* EMAs.

# %% tags=[]
time_workday_completed_less_than_1_day = (
    (df_session_counts_time_diff.time == "daytime")  # Only take daytime EMAs.
    & ~(
        df_session_counts_time_diff.previous_same_type_time_diff.isna()
    )  # Only where the diff was actually calculated.
    & (df_session_counts_time_diff.time_diff_days == datetime.timedelta(0))
)  # Only take differences *within* a day.

# %% tags=[]
df_session_workday = df_session_counts_time_diff[time_workday_completed_less_than_1_day]

# %%
# Convert the whole timedelta to minutes. `.dt.seconds` would return only the
# seconds *component* (0-86399), which silently wraps around for negative or
# multi-day differences and drops sub-second parts; `.dt.total_seconds()`
# returns the full duration.
df_session_workday = df_session_workday.assign(
    time_diff_minutes=lambda x: x.previous_same_type_time_diff.dt.total_seconds()
    / 60
)

# %%
# Histogram of the within-day intervals between daytime sessions (5 min bins).
g1 = sns.displot(
    df_session_workday.time_diff_minutes,
    color="#28827C",
    height=5,
    aspect=1.5,
    binwidth=5,
)
g1.set_axis_labels("Time difference [min]", "Session count")
# g1.savefig("WorkdayEMAtimeDiff.pdf")

# %% [markdown]
# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.

# %%
df_session_workday.loc[df_session_workday["time_diff_minutes"] < 30]

# %% [markdown]
# There are only 2 instances, look at them individually.

# %%
# First case: one specific session of participant 35 on one device.
is_first_case = (
    (df_esm_preprocessed.participant_id == 35)
    & (df_esm_preprocessed.esm_session == 7)
    & (df_esm_preprocessed.device_id == "62a44038-3ccb-401e-a69c-6f22152c54a6")
)
first_case_columns = [
    "esm_trigger",
    "esm_session",
    "datetime_lj",
    "esm_instructions",
    "device_id",
    "_id",
]
df_esm_preprocessed.loc[is_first_case, first_case_columns]

# %%
# Second case: the first sessions (numbers below 3) of participant 45 on one
# device.
is_second_case = (
    (df_esm_preprocessed.participant_id == 45)
    & (df_esm_preprocessed.esm_session < 3)
    & (df_esm_preprocessed.device_id == "d848b1c4-33cc-4e22-82ae-96d6b6458a33")
)
second_case_columns = ["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"]
df_esm_preprocessed.loc[is_second_case, second_case_columns]

# %% [markdown]
# As these signify bugs, we can safely discard them in the following analysis.

# %%
# Drop every interval shorter than 30 min. The threshold is the exact
# complement of the `< 30` inspection of the too-close sessions; the former
# `> 29` kept intervals between 29 and 30 min, which are below the designed
# minimum as well.
df_session_workday = df_session_workday[df_session_workday.time_diff_minutes >= 30]

# %% [markdown]
# ### All participants

# %%
df_session_workday.describe()

# %%
# Share of the remaining intervals that are shorter than 2 hours. The mean of
# the boolean mask equals (matching rows) / (all rows) and gives NaN instead of
# a ZeroDivisionError when no rows are left.
(df_session_workday["time_diff_minutes"] < 120).mean()

# %% [markdown]
# These statistics look reasonable.

# %% [markdown]
# ### Differences between participants

# %%
# Per-participant *median* interval between daytime sessions (the variable
# name says "mean" for historical reasons). Only `time_diff_minutes` is
# aggregated: taking the median of every column fails on the non-numeric ones
# in pandas >= 2.0 and would merge unrelated medians into `df_adherence` below.
df_mean_daytime_interval = df_session_workday.groupby("participant_id")[
    ["time_diff_minutes"]
].median()

# %%
df_mean_daytime_interval.describe()

# %%
g2 = sns.displot(
    df_mean_daytime_interval.time_diff_minutes,
    binwidth=5,
    height=5,
    aspect=1.5,
    color="#28827C",
)
g2.set_axis_labels("Median time difference [min]", "Participant count")
# g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")

# %%
# Add the median interval to the adherence table (left join keeps every
# participant, with NaN where no interval is available).
df_adherence = df_adherence.merge(
    df_mean_daytime_interval, how="left", left_on="participant_id", right_index=True
)

# %%
# Regress the median interval on the same predictors as the adherence model.
lr_ols_time_diff_median = smf.ols(
    "time_diff_minutes ~ C(gender) + C(startlanguage) + age", data=df_adherence
)
ls_result_time_diff_median = lr_ols_time_diff_median.fit()
ls_result_time_diff_median.summary()

# %%
# Non-null counts per participant and day. Every row of `df_session_workday`
# is a daytime session with a same-day predecessor, so these counts are
# intervals per day rather than sessions per day.
workday_by_participant_day = df_session_workday.groupby(["participant_id", "date_lj"])
df_count_daytime_per_participant = workday_by_participant_day.count()

# %%
df_count_daytime_per_participant["time"].describe()

# %%
sns.displot(
    df_count_daytime_per_participant["time"],
    color="#28827C",
    height=5,
    aspect=1.5,
    binwidth=1,
)

# %% [markdown]
# ## Evening EMA

# %% [markdown]
# For evening EMAs, determine, for each day on which any EMA session was completed, whether an evening EMA is also present.
#
# Note, we are only dealing with true EMA sessions, non-sessions etc. have already been filtered out.

# %%
# One boolean per participant, device and day: was any of that day's completed
# sessions an evening one?
completed_by_day = df_session_counts_time_completed.groupby(
    ["participant_id", "device_id", "date_lj"]
)
s_evening_completed = completed_by_day.apply(
    lambda day_sessions: (day_sessions.time == "evening").any()
)

# %%
df_session_counts_time_completed

# %%
s_evening_completed.sum()

# %%
# Share of days with an evening EMA, per participant: days flagged True
# divided by all days of that participant.
evening_by_participant = s_evening_completed.groupby("participant_id")
s_evening_completed_ratio = (
    evening_by_participant.sum() / evening_by_participant.count()
)

# %%
s_evening_completed_ratio.describe()

# %%
# NOTE(review): the ratios are shifted down by 0.001 before binning,
# presumably so that a ratio of exactly 1 falls into the bin just below 1.0 -
# confirm.
g3 = sns.displot(
    s_evening_completed_ratio - 0.001,
    color="#28827C",
    height=5,
    aspect=1.5,
    binwidth=0.05,
)
g3.set_axis_labels("Ratio of days with the evening EMA filled out", "Participant count")
# The limits are given in descending order, so the x-axis runs from high to
# low ratios.
g3.set(xlim=(1.01, 0.59))
# g3.savefig("EveningEMAratioParticip.pdf")

# %%
# Add the evening EMA ratio to the adherence table under its own column name
# (left join keeps every participant).
evening_ratio_column = s_evening_completed_ratio.rename("evening_EMA_ratio")
df_adherence = df_adherence.merge(
    evening_ratio_column,
    left_on="participant_id",
    right_index=True,
    how="left",
)

# %%
# Regress the evening ratio on the same predictors as the adherence model.
evening_ratio_formula = "evening_EMA_ratio ~ C(gender) + C(startlanguage) + age"
lr_ols_evening_ratio = smf.ols(formula=evening_ratio_formula, data=df_adherence)
ls_result_evening_ratio = lr_ols_evening_ratio.fit()
ls_result_evening_ratio.summary()

# %%
[WIP] Start calculating concordance. Note, workday and morning EMAs have not been properly dealt with, but assumed answered. 2021-06-08 16:07:39 +02:00			`# ---`
			`# jupyter:`
			`# jupytext:`
			`# formats: ipynb,py:percent`
			`# text_representation:`
			`# extension: .py`
			`# format_name: percent`
			`# format_version: '1.3'`
			`# jupytext_version: 1.11.2`
			`# kernelspec:`
			`# display_name: straw2analysis`
			`# language: python`
			`# name: straw2analysis`
			`# ---`

Rename a file. 2021-06-11 20:30:18 +02:00			`# %%`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00			`import datetime`

[WIP] Start calculating concordance. Note, workday and morning EMAs have not been properly dealt with, but assumed answered. 2021-06-08 16:07:39 +02:00			`# %%`
			`import os`
			`import sys`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00
[WIP] Start calculating concordance. Note, workday and morning EMAs have not been properly dealt with, but assumed answered. 2021-06-08 16:07:39 +02:00			`import pandas as pd`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00			`import seaborn as sns`
Classify and count adherence. Add statsmodels. 2021-06-08 16:40:01 +02:00			`import statsmodels.api as sm`
Correct adherence data to only count sessions once. Add age as a float predictor. Obtain the same result with linear regression. 2021-06-08 22:32:14 +02:00			`import statsmodels.formula.api as smf`
[WIP] Start calculating concordance. Note, workday and morning EMAs have not been properly dealt with, but assumed answered. 2021-06-08 16:07:39 +02:00
			`nb_dir = os.path.split(os.getcwd())[0]`
			`if nb_dir not in sys.path:`
			`sys.path.append(nb_dir)`
			`import participants.query_db`
			`from features.esm import *`

			`# %%`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00			`baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")`
			`baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")`
			`baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")`
			`baseline = (`
			`pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner")`
			`.reset_index()`
			`.drop(columns="index")`
			`)`
[WIP] Start calculating concordance. Note, workday and morning EMAs have not been properly dealt with, but assumed answered. 2021-06-08 16:07:39 +02:00
			`# %%`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00			`participants_inactive_usernames = participants.query_db.get_usernames(`
			`collection_start=datetime.date.fromisoformat("2020-08-01")`
			`)`
[WIP] Start calculating concordance. Note, workday and morning EMAs have not been properly dealt with, but assumed answered. 2021-06-08 16:07:39 +02:00
			`# %%`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00			`baseline_inactive = baseline[`
			`baseline["Gebruikersnaam"].isin(participants_inactive_usernames)`
			`]`
[WIP] Start calculating concordance. Note, workday and morning EMAs have not been properly dealt with, but assumed answered. 2021-06-08 16:07:39 +02:00
Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00			`# %%`
			`VARIABLES_TO_TRANSLATE = {`
			`"Gebruikersnaam": "username",`
			`"Geslacht": "gender",`
			`"Geboortedatum": "date_of_birth",`
			`}`
			`baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)`
			`now = pd.Timestamp("now")`
			`baseline_inactive = baseline_inactive.assign(`
			`date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),`
			`age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,`
			`)`

[WIP] Start calculating concordance. Note, workday and morning EMAs have not been properly dealt with, but assumed answered. 2021-06-08 16:07:39 +02:00			`# %%`
			`df_esm_inactive = get_esm_data(participants_inactive_usernames)`
Classify and count adherence. Add statsmodels. 2021-06-08 16:40:01 +02:00
Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00			`# %% [markdown]`
			`# # Classify EMA sessions`

Classify and count adherence. Add statsmodels. 2021-06-08 16:40:01 +02:00			`# %%`
[WIP] Start calculating concordance. Note, workday and morning EMAs have not been properly dealt with, but assumed answered. 2021-06-08 16:07:39 +02:00			`df_esm_preprocessed = preprocess_esm(df_esm_inactive)`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00			`df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocessed)`
Classify and count adherence. Add statsmodels. 2021-06-08 16:40:01 +02:00
Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00			`# %% [markdown]`
			`# Sessions are now classified according to the type of a session (a true questionnaire or simple single questions) and users response.`

			`# %%`
			`df_session_counts_time`

Classify and count adherence. Add statsmodels. 2021-06-08 16:40:01 +02:00			`# %%`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00			`tbl_session_outcomes = df_session_counts_time.reset_index()[`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00			`"session_response"`
			`].value_counts()`
[WIP] Start calculating concordance. Note, workday and morning EMAs have not been properly dealt with, but assumed answered. 2021-06-08 16:07:39 +02:00
Classify and count adherence. Add statsmodels. 2021-06-08 16:40:01 +02:00			`# %%`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00			`print("All sessions:", len(df_session_counts_time))`
Classify and count adherence. Add statsmodels. 2021-06-08 16:40:01 +02:00			`print("-------------------------------------")`
			`print(tbl_session_outcomes)`
			`print("-------------------------------------")`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00			`print(tbl_session_outcomes / len(df_session_counts_time))`
Classify and count adherence. Add statsmodels. 2021-06-08 16:40:01 +02:00
Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00			`# %% [markdown]`
			`# ## Consider only true EMA sessions`
Add ANOVA for adherence and gender + country. 2021-06-08 17:16:08 +02:00
			`# %%`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00			`df_session_finished = df_session_counts_time[`
			`df_session_counts_time["session_response"] == SESSION_STATUS_COMPLETE`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00			`].reset_index()`
Add ANOVA for adherence and gender + country. 2021-06-08 17:16:08 +02:00
			`# %%`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00			`df_participant_finished_sessions = (`
			`df_session_finished.groupby("participant_id")`
			`.count()["esm_session"]`
			`.rename("finished_sessions")`
			`)`
Add ANOVA for adherence and gender + country. 2021-06-08 17:16:08 +02:00
			`# %%`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00			`df_adherence = baseline_inactive[["username", "gender", "age", "startlanguage"]].merge(`
			`df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(),`
			`how="left",`
			`on="username",`
			`)`
			`df_adherence = df_adherence.merge(`
			`df_participant_finished_sessions,`
			`how="left",`
			`left_on="participant_id",`
			`right_index=True,`
			`)`
Correct adherence data to only count sessions once. Add age as a float predictor. Obtain the same result with linear regression. 2021-06-08 22:32:14 +02:00
			`# %% tags=[]`
			`df_adherence`
Add ANOVA for adherence and gender + country. 2021-06-08 17:16:08 +02:00
Add a plot of adherence. 2021-06-08 22:42:56 +02:00			`# %%`
			`df_adherence.describe()`

Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00			`# %%`
			`df_adherence[["gender", "startlanguage"]].value_counts()`

Add a plot of adherence. 2021-06-08 22:42:56 +02:00			`# %%`
			`sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)`

Add ANOVA for adherence and gender + country. 2021-06-08 17:16:08 +02:00			`# %%`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00			`lm_adherence = smf.ols(`
			`"finished_sessions ~ C(gender) + C(startlanguage) + age", data=df_adherence`
			`).fit()`
			`table = sm.stats.anova_lm(lm_adherence, typ=2) # Type 2 ANOVA DataFrame`
Add ANOVA for adherence and gender + country. 2021-06-08 17:16:08 +02:00			`print(table)`
Correct adherence data to only count sessions once. Add age as a float predictor. Obtain the same result with linear regression. 2021-06-08 22:32:14 +02:00
			`# %%`
Finish labelling EMA sessions and document classify_sessions_adherence function. 2021-06-11 14:50:14 +02:00			`lr_ols = smf.ols(`
			`"finished_sessions ~ C(gender) + C(startlanguage) + age", data=df_adherence`
			`)`
Correct adherence data to only count sessions once. Add age as a float predictor. Obtain the same result with linear regression. 2021-06-08 22:32:14 +02:00			`ls_result = lr_ols.fit()`
			`ls_result.summary()`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00
			`# %% [markdown]`
			`# # Concordance by type`

			`# %% [markdown]`
			`# ## Workday EMA`

			`# %% [markdown]`
			`# ### Filter the EMA of interest.`

			`# %% [markdown]`
			`# Work with only completed EMA.`

			`# %% tags=[]`
			`df_session_counts_time_completed = df_session_counts_time[`
			`df_session_counts_time.session_response == "ema_completed"`
			`]`

			`# %% [markdown]`
			`# To be able to compare EMA sessions within one day, add a date-part column.`
			`#`
			`# NOTE: Since daytime EMAs could theoretically last beyond midnight, but never after 4 AM, the datetime is first translated to 4 h earlier.`

			`# %%`
			`df_session_counts_time_completed = df_session_counts_time_completed.assign(`
			`date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date`
			`)`

			`# %%`
			`df_session_counts_time_completed`

			`# %% [markdown]`
			`# Next, calculate differences between subsequent record. But first group them by participant and device ID (as usual) and time. This way, the differences between the same type of EMA sessions are calculated.`

			`# %% tags=[]`
			`df_session_time_diff = (`
			`df_session_counts_time_completed[["datetime_lj", "date_lj", "time"]]`
			`.groupby(["participant_id", "device_id", "time"])`
			`.diff()`
			`.rename(`
			`columns={`
			`"datetime_lj": "previous_same_type_time_diff",`
			`"date_lj": "time_diff_days",`
			`}`
			`)`
			`)`

			`# %%`
			`df_session_time_diff`

			`# %% tags=[]`
			`df_session_counts_time_diff = df_session_counts_time_completed.join(`
			`df_session_time_diff, how="left"`
			`)`

			`# %% [markdown]`
			`# Now, select only the daytime EMAs of interest. Discard the differences between different day EMAs.`

Prepare figures for publication. 2021-07-02 16:02:55 +02:00			`# %% tags=[]`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00			`time_workday_completed_less_than_1_day = (`
			`(df_session_counts_time_diff.time == "daytime") # Only take daytime EMAs.`
			`& ~(`
			`df_session_counts_time_diff.previous_same_type_time_diff.isna()`
			`) # Only where the diff was actually calculated.`
			`& (df_session_counts_time_diff.time_diff_days == datetime.timedelta(0))`
			`) # Only take differences within a day.`

			`# %% tags=[]`
			`df_session_workday = df_session_counts_time_diff[time_workday_completed_less_than_1_day]`

			`# %%`
			`df_session_workday = df_session_workday.assign(`
			`time_diff_minutes=lambda x: x.previous_same_type_time_diff.dt.seconds / 60`
			`)`

			`# %%`
Start exploring PANAS data. Add a function to clean up ESM data. 2021-07-02 16:33:48 +02:00			`g1 = sns.displot(`
			`df_session_workday["time_diff_minutes"],`
			`binwidth=5,`
			`height=5,`
			`aspect=1.5,`
			`color="#28827C",`
			`)`
Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00			`g1.set_axis_labels("Time difference [min]", "Session count")`
Start exploring PANAS data. Add a function to clean up ESM data. 2021-07-02 16:33:48 +02:00			`# g1.savefig("WorkdayEMAtimeDiff.pdf")`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00
			`# %% [markdown]`
			`# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.`

			`# %%`
			`df_session_workday[df_session_workday.time_diff_minutes < 30]`

			`# %% [markdown]`
			`# There are only 2 instances, look at them individually.`

			`# %%`
			`df_esm_preprocessed.loc[`
Fix formatting. 2021-07-04 14:34:57 +02:00			`(df_esm_preprocessed.participant_id == 35)`
			`& (df_esm_preprocessed.esm_session == 7)`
			`& (df_esm_preprocessed.device_id == "62a44038-3ccb-401e-a69c-6f22152c54a6"),`
			`[`
			`"esm_trigger",`
			`"esm_session",`
			`"datetime_lj",`
			`"esm_instructions",`
			`"device_id",`
			`"_id",`
			`],`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00			`]`

			`# %%`
			`df_esm_preprocessed.loc[`
			`(df_esm_preprocessed.participant_id == 45)`
			`& (df_esm_preprocessed.esm_session < 3)`
			`& (df_esm_preprocessed.device_id == "d848b1c4-33cc-4e22-82ae-96d6b6458a33"),`
			`["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"],`
			`]`

			`# %% [markdown]`
			`# As these signify bugs, we can safely discard them in the following analysis.`

			`# %%`
			`df_session_workday = df_session_workday[df_session_workday.time_diff_minutes > 29]`

			`# %% [markdown]`
			`# ### All participants`

			`# %%`
			`df_session_workday.describe()`

Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00			`# %%`
			`df_session_workday[df_session_workday["time_diff_minutes"] < 120].shape[`
			`0`
			`] / df_session_workday.shape[0]`

Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00			`# %% [markdown]`
			`# These statistics look reasonable.`

			`# %% [markdown]`
			`# ### Differences between participants`

			`# %%`
Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00			`df_mean_daytime_interval = df_session_workday.groupby("participant_id").median()`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00
			`# %%`
			`df_mean_daytime_interval.describe()`

			`# %%`
Start exploring PANAS data. Add a function to clean up ESM data. 2021-07-02 16:33:48 +02:00			`g2 = sns.displot(`
			`df_mean_daytime_interval.time_diff_minutes,`
			`binwidth=5,`
			`height=5,`
			`aspect=1.5,`
			`color="#28827C",`
			`)`
Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00			`g2.set_axis_labels("Median time difference [min]", "Participant count")`
Start exploring PANAS data. Add a function to clean up ESM data. 2021-07-02 16:33:48 +02:00			`# g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")`
Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00
			`# %%`
			`df_adherence = df_adherence.merge(`
			`df_mean_daytime_interval, how="left", left_on="participant_id", right_index=True`
			`)`

			`# %%`
			`lr_ols_time_diff_median = smf.ols(`
			`"time_diff_minutes ~ C(gender) + C(startlanguage) + age", data=df_adherence`
			`)`
			`ls_result_time_diff_median = lr_ols_time_diff_median.fit()`
			`ls_result_time_diff_median.summary()`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00
			`# %%`
			`df_count_daytime_per_participant = df_session_workday.groupby(`
			`["participant_id", "date_lj"]`
			`).count()`

			`# %%`
			`df_count_daytime_per_participant["time"].describe()`

			`# %%`
Start exploring PANAS data. Add a function to clean up ESM data. 2021-07-02 16:33:48 +02:00			`sns.displot(`
			`df_count_daytime_per_participant.time,`
			`binwidth=1,`
			`height=5,`
			`aspect=1.5,`
			`color="#28827C",`
			`)`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00
			`# %% [markdown]`
			`# ## Evening EMA`

			`# %% [markdown]`
			`# For evening EMA, determine whether in a day that any EMA session was completed, an evening EMA is also present.`
			`#`
			`# Note, we are only dealing with true EMA sessions, non-sessions etc. have already been filtered out.`

			`# %%`
			`s_evening_completed = df_session_counts_time_completed.groupby(`
			`["participant_id", "device_id", "date_lj"]`
			`).apply(lambda x: (x.time == "evening").any())`

			`# %%`
			`df_session_counts_time_completed`

Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00			`# %%`
			`s_evening_completed.sum()`

Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00			`# %%`
			`s_evening_completed_ratio = (`
			`s_evening_completed.groupby("participant_id").sum()`
			`/ s_evening_completed.groupby("participant_id").count()`
			`)`

			`# %%`
			`s_evening_completed_ratio.describe()`

			`# %%`
Start exploring PANAS data. Add a function to clean up ESM data. 2021-07-02 16:33:48 +02:00			`g3 = sns.displot(`
			`s_evening_completed_ratio - 0.001,`
			`binwidth=0.05,`
			`height=5,`
			`aspect=1.5,`
			`color="#28827C",`
			`)`
Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00			`g3.set_axis_labels("Ratio of days with the evening EMA filled out", "Participant count")`
Start exploring PANAS data. Add a function to clean up ESM data. 2021-07-02 16:33:48 +02:00			`g3.set(xlim=(1.01, 0.59))`
			`# g3.savefig("EveningEMAratioParticip.pdf")`
Export figures and add additional linear regression analyses. 2021-06-14 17:09:45 +02:00
			`# %%`
			`df_adherence = df_adherence.merge(`
			`s_evening_completed_ratio.rename("evening_EMA_ratio"),`
			`how="left",`
			`left_on="participant_id",`
			`right_index=True,`
			`)`

			`# %%`
			`lr_ols_evening_ratio = smf.ols(`
			`"evening_EMA_ratio ~ C(gender) + C(startlanguage) + age", data=df_adherence`
			`)`
			`ls_result_evening_ratio = lr_ols_evening_ratio.fit()`
			`ls_result_evening_ratio.summary()`
Analyze adherence: Look at time differences between subsequent daytime EMA. Look at the daily evening EMA proportion. 2021-06-11 20:28:24 +02:00
			`# %%`