diff --git a/exploration/expl_baseline.py b/exploration/expl_baseline.py index 709960b..333ea73 100644 --- a/exploration/expl_baseline.py +++ b/exploration/expl_baseline.py @@ -89,6 +89,6 @@ baseline_inactive = baseline_inactive.assign( baseline_inactive["age"].describe() # %% -3710/365.25 +3618 / 365.25 # %% diff --git a/statistical_analysis/adherence.py b/statistical_analysis/adherence.py index 1106617..613fff2 100644 --- a/statistical_analysis/adherence.py +++ b/statistical_analysis/adherence.py @@ -51,25 +51,6 @@ baseline_inactive = baseline[ baseline["Gebruikersnaam"].isin(participants_inactive_usernames) ] -# %% -df_esm_inactive = get_esm_data(participants_inactive_usernames) - -# %% -df_esm_preprocessed = preprocess_esm(df_esm_inactive) -df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocessed) - -# %% -tbl_session_outcomes = df_session_counts_time.reset_index()[ - "session_response" -].value_counts() - -# %% -print("All sessions:", len(df_session_counts_time)) -print("-------------------------------------") -print(tbl_session_outcomes) -print("-------------------------------------") -print(tbl_session_outcomes / len(df_session_counts_time)) - # %% VARIABLES_TO_TRANSLATE = { "Gebruikersnaam": "username", @@ -83,9 +64,37 @@ baseline_inactive = baseline_inactive.assign( age=lambda x: (now - x.date_of_birth).dt.days / 365.25245, ) +# %% +df_esm_inactive = get_esm_data(participants_inactive_usernames) + +# %% [markdown] +# # Classify EMA sessions + +# %% +df_esm_preprocessed = preprocess_esm(df_esm_inactive) +df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocessed) + +# %% [markdown] +# Sessions are now classified according to the type of a session (a true questionnaire or simple single questions) and the user's response. 
+ # %% df_session_counts_time +# %% +tbl_session_outcomes = df_session_counts_time.reset_index()[ + "session_response" +].value_counts() + +# %% +print("All sessions:", len(df_session_counts_time)) +print("-------------------------------------") +print(tbl_session_outcomes) +print("-------------------------------------") +print(tbl_session_outcomes / len(df_session_counts_time)) + +# %% [markdown] +# ## Consider only true EMA sessions + # %% df_session_finished = df_session_counts_time[ df_session_counts_time["session_response"] == SESSION_STATUS_COMPLETE @@ -117,6 +126,9 @@ df_adherence # %% df_adherence.describe() +# %% +df_adherence[["gender", "startlanguage"]].value_counts() + # %% sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5) @@ -209,7 +221,9 @@ df_session_workday = df_session_workday.assign( ) # %% -sns.displot(df_session_workday["time_diff_minutes"], binwidth=5, height=5) +g1 = sns.displot(df_session_workday["time_diff_minutes"], binwidth=5, height=5) +g1.set_axis_labels("Time difference [min]", "Session count") +# g1.savefig("WorkdayEMAtimeDiff.pdf") # %% [markdown] # There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those. @@ -246,6 +260,11 @@ df_session_workday = df_session_workday[df_session_workday.time_diff_minutes > 2 # %% df_session_workday.describe() +# %% +df_session_workday[df_session_workday["time_diff_minutes"] < 120].shape[ + 0 +] / df_session_workday.shape[0] + # %% [markdown] # These statistics look reasonable. 
@@ -253,13 +272,27 @@ df_session_workday.describe() # ### Differences between participants # %% -df_mean_daytime_interval = df_session_workday.groupby("participant_id").mean() +df_mean_daytime_interval = df_session_workday.groupby("participant_id").median() # %% df_mean_daytime_interval.describe() # %% -sns.displot(df_mean_daytime_interval.time_diff_minutes, binwidth=5, height=5) +g2 = sns.displot(df_mean_daytime_interval.time_diff_minutes, binwidth=5, height=5) +g2.set_axis_labels("Median time difference [min]", "Participant count") +# g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf") + +# %% +df_adherence = df_adherence.merge( + df_mean_daytime_interval, how="left", left_on="participant_id", right_index=True +) + +# %% +lr_ols_time_diff_median = smf.ols( + "time_diff_minutes ~ C(gender) + C(startlanguage) + age", data=df_adherence +) +ls_result_time_diff_median = lr_ols_time_diff_median.fit() +ls_result_time_diff_median.summary() # %% df_count_daytime_per_participant = df_session_workday.groupby( @@ -288,6 +321,9 @@ s_evening_completed = df_session_counts_time_completed.groupby( # %% df_session_counts_time_completed +# %% +s_evening_completed.sum() + # %% s_evening_completed_ratio = ( s_evening_completed.groupby("participant_id").sum() @@ -298,6 +334,23 @@ s_evening_completed_ratio = ( s_evening_completed_ratio.describe() # %% -sns.displot(s_evening_completed_ratio - 0.001, binwidth=0.1, height=5) +g3 = sns.displot(s_evening_completed_ratio - 0.001, binwidth=0.05, height=5) +g3.set_axis_labels("Ratio of days with the evening EMA filled out", "Participant count") +# g3.savefig("EveningEMAratioParticip.pdf") + +# %% +df_adherence = df_adherence.merge( + s_evening_completed_ratio.rename("evening_EMA_ratio"), + how="left", + left_on="participant_id", + right_index=True, +) + +# %% +lr_ols_evening_ratio = smf.ols( + "evening_EMA_ratio ~ C(gender) + C(startlanguage) + age", data=df_adherence +) +ls_result_evening_ratio = lr_ols_evening_ratio.fit() 
+ls_result_evening_ratio.summary() # %%