diff --git a/exploration/expl_esm.py b/exploration/expl_esm.py index 3ec86f7..aa33a66 100644 --- a/exploration/expl_esm.py +++ b/exploration/expl_esm.py @@ -328,7 +328,7 @@ df_esm_session_7 = df_session_7.join( how="left", ) -# %% jupyter={"outputs_hidden": true} tags=[] +# %% tags=[] with pd.option_context( "display.max_rows", None, "display.max_columns", None ): # more options can be specified also @@ -347,7 +347,7 @@ df_esm_session_27 = df_session_27.join( how="left", ) -# %% jupyter={"outputs_hidden": true} tags=[] +# %% tags=[] with pd.option_context( "display.max_rows", None, "display.max_columns", None ): # more options can be specified also @@ -357,3 +357,28 @@ with pd.option_context( # These are all morning questionnaires with morning *and* workday items, with the feedback added and also branched in the longest possible way. # %% +df_session_6 = df_session_counts[ + (df_session_counts["esm_session_count"] == 6) + & df_session_counts.session_response.isna() +] +df_esm_session_6 = df_session_6.join( + df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"]), + how="left", +) + +# %% +display(df_esm_session_6[["esm_trigger", "esm_instructions", "esm_user_answer"]]) + +# %% +df_session_counts = classify_sessions_by_completion(df_esm_preprocessed) +df_session_time = classify_sessions_by_time(df_esm_preprocessed) + +# %% [markdown] +# The sessions were classified by time by taking the **first** record in a session. +# However, a morning questionnaire could seamlessly transition into a daytime questionnaire if the participant was already at work. +# In this case, the "time" label changed mid-session. +# +# Because of the way `classify_sessions_by_time` works, this questionnaire was classified as "morning". +# But for all intents and purposes, it can be treated as a "daytime" EMA. 
+# +# This is corrected in `classify_sessions_by_completion_time`. diff --git a/statistical_analysis/concordance.py b/statistical_analysis/concordance.py index 55600d3..2c5ecfd 100644 --- a/statistical_analysis/concordance.py +++ b/statistical_analysis/concordance.py @@ -55,19 +55,19 @@ df_esm_inactive = get_esm_data(participants_inactive_usernames) # %% df_esm_preprocessed = preprocess_esm(df_esm_inactive) -df_session_counts = classify_sessions_adherence(df_esm_preprocessed) +df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocessed) # %% -tbl_session_outcomes = df_session_counts.reset_index()[ +tbl_session_outcomes = df_session_counts_time.reset_index()[ "session_response" ].value_counts() # %% -print("All sessions:", len(df_session_counts)) +print("All sessions:", len(df_session_counts_time)) print("-------------------------------------") print(tbl_session_outcomes) print("-------------------------------------") -print(tbl_session_outcomes / len(df_session_counts)) +print(tbl_session_outcomes / len(df_session_counts_time)) # %% VARIABLES_TO_TRANSLATE = { @@ -83,11 +83,11 @@ baseline_inactive = baseline_inactive.assign( ) # %% -df_session_counts +df_session_counts_time # %% -df_session_finished = df_session_counts[ - df_session_counts["session_response"] == "esm_finished" +df_session_finished = df_session_counts_time[ + df_session_counts_time["session_response"] == SESSION_STATUS_COMPLETE ].reset_index() # %% @@ -132,3 +132,171 @@ lr_ols = smf.ols( ) ls_result = lr_ols.fit() ls_result.summary() + +# %% [markdown] +# # Concordance by type + +# %% [markdown] +# ## Workday EMA + +# %% [markdown] +# ### Filter the EMA of interest. + +# %% [markdown] +# Work with only completed EMA. + +# %% tags=[] +df_session_counts_time_completed = df_session_counts_time[ + df_session_counts_time.session_response == SESSION_STATUS_COMPLETE # Same status constant as used for df_session_finished. +] + +# %% [markdown] +# To be able to compare EMA sessions *within* one day, add a date-part column. 
+# +# **NOTE**: Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM, the datetime is first translated to 4 h earlier. + +# %% +df_session_counts_time_completed = df_session_counts_time_completed.assign( + date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date +) + +# %% +df_session_counts_time_completed + +# %% [markdown] +# Next, calculate differences between subsequent records. But first group them by participant and device ID (as usual) and *time*. This way, the differences between the same type of EMA sessions are calculated. + +# %% tags=[] +df_session_time_diff = ( + df_session_counts_time_completed[["datetime_lj", "date_lj", "time"]] + .groupby(["participant_id", "device_id", "time"]) + .diff() + .rename( + columns={ + "datetime_lj": "previous_same_type_time_diff", + "date_lj": "time_diff_days", + } + ) +) + +# %% +df_session_time_diff + +# %% tags=[] +df_session_counts_time_diff = df_session_counts_time_completed.join( + df_session_time_diff, how="left" +) + +# %% [markdown] +# Now, select only the daytime EMAs of interest. Discard the differences between *different day* EMAs. + +# %% +time_workday_completed_less_than_1_day = ( + (df_session_counts_time_diff.time == "daytime") # Only take daytime EMAs. + & ~( + df_session_counts_time_diff.previous_same_type_time_diff.isna() + ) # Only where the diff was actually calculated. + & (df_session_counts_time_diff.time_diff_days == datetime.timedelta(0)) +) # Only take differences *within* a day. + +# %% tags=[] +df_session_workday = df_session_counts_time_diff[time_workday_completed_less_than_1_day] + +# %% +df_session_workday = df_session_workday.assign( + time_diff_minutes=lambda x: x.previous_same_type_time_diff.dt.total_seconds() / 60 # Full signed duration; .dt.seconds is only the within-day component. +) + +# %% +sns.displot(df_session_workday["time_diff_minutes"], binwidth=5, height=5) + +# %% [markdown] +# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those. 
+ +# %% +df_session_workday[df_session_workday.time_diff_minutes < 30] + +# %% [markdown] +# There are only 2 instances; look at them individually. + +# %% +df_esm_preprocessed.loc[ + (df_esm_preprocessed.participant_id == 35) & (df_esm_preprocessed.esm_session == 6), + ["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"], +] + +# %% +df_esm_preprocessed.loc[ + (df_esm_preprocessed.participant_id == 45) + & (df_esm_preprocessed.esm_session < 3) + & (df_esm_preprocessed.device_id == "d848b1c4-33cc-4e22-82ae-96d6b6458a33"), + ["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"], +] + +# %% [markdown] +# As these signify bugs, we can safely discard them in the following analysis. + +# %% +df_session_workday = df_session_workday[df_session_workday.time_diff_minutes >= 30] # Exact complement of the < 30 filter inspected above. + +# %% [markdown] +# ### All participants + +# %% +df_session_workday.describe() + +# %% [markdown] +# These statistics look reasonable. + +# %% [markdown] +# ### Differences between participants + +# %% +df_mean_daytime_interval = df_session_workday.groupby("participant_id").mean() + +# %% +df_mean_daytime_interval.describe() + +# %% +sns.displot(df_mean_daytime_interval.time_diff_minutes, binwidth=5, height=5) + +# %% +df_count_daytime_per_participant = df_session_workday.groupby( + ["participant_id", "date_lj"] +).count() + +# %% +df_count_daytime_per_participant["time"].describe() + +# %% +sns.displot(df_count_daytime_per_participant.time, binwidth=1, height=5) + +# %% [markdown] +# ## Evening EMA + +# %% [markdown] +# For evening EMA, determine whether, on each day on which any EMA session was completed, an evening EMA is also present. +# +# Note: we are only dealing with true EMA sessions; non-sessions etc. have already been filtered out. 
+ +# %% +s_evening_completed = df_session_counts_time_completed.groupby( + ["participant_id", "device_id", "date_lj"] +).apply(lambda x: (x.time == "evening").any()) # True for each participant/device/day group containing at least one "evening" session. + +# %% +df_session_counts_time_completed + +# %% +s_evening_completed_ratio = ( + s_evening_completed.groupby("participant_id").sum() + / s_evening_completed.groupby("participant_id").count() +) # Per participant: share of groups flagged True (sum / count). + +# %% +s_evening_completed_ratio.describe() + +# %% +sns.displot(s_evening_completed_ratio - 0.001, binwidth=0.1, height=5) # NOTE(review): the 0.001 shift presumably keeps ratios of exactly 1 out of a separate top bin - confirm. + +# %%