2021-06-08 16:07:39 +02:00
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
2021-08-04 17:41:09 +02:00
# jupytext_version: 1.11.4
2021-06-08 16:07:39 +02:00
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
2021-06-11 20:30:18 +02:00
# %%
2021-08-04 17:41:09 +02:00
# %matplotlib inline
2021-06-11 14:50:14 +02:00
import datetime
2021-06-08 16:07:39 +02:00
import os
import sys
2021-06-11 14:50:14 +02:00
2021-08-04 17:41:09 +02:00
import matplotlib . pyplot as plt
2021-06-08 16:07:39 +02:00
import pandas as pd
2021-06-11 14:50:14 +02:00
import seaborn as sns
2021-06-08 16:40:01 +02:00
import statsmodels . api as sm
2021-06-08 22:32:14 +02:00
import statsmodels . formula . api as smf
2021-06-08 16:07:39 +02:00
# Put the parent of the current working directory on the import path (once),
# so that the project imports that follow can be resolved from there.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
import participants . query_db
from features . esm import *
2021-08-04 17:41:09 +02:00
# %%
# Switch for exporting figures, plus the geometry and colour shared by the
# plots in this notebook.
SAVE_FIGS = True
FIG_HEIGHT, FIG_ASPECT = 5, 1.7
FIG_COLOUR = " #28827C "

# Font sizes: each step is two points larger than the previous one.
SMALL_SIZE = 14
MEDIUM_SIZE = SMALL_SIZE + 2
BIGGER_SIZE = MEDIUM_SIZE + 2

# Push the font sizes into matplotlib's rc settings, one call per entry and in
# a fixed order.
for _rc_group, _rc_settings in (
    (" font ", dict(size=SMALL_SIZE)),  # controls default text sizes
    (" axes ", dict(titlesize=SMALL_SIZE)),  # fontsize of the axes title
    (" axes ", dict(labelsize=MEDIUM_SIZE)),  # fontsize of the x and y labels
    (" xtick ", dict(labelsize=SMALL_SIZE)),  # fontsize of the tick labels
    (" ytick ", dict(labelsize=SMALL_SIZE)),  # fontsize of the tick labels
    (" legend ", dict(fontsize=SMALL_SIZE)),  # legend fontsize
    (" figure ", dict(titlesize=BIGGER_SIZE)),  # fontsize of the figure title
):
    plt.rc(_rc_group, **_rc_settings)
2021-06-08 16:07:39 +02:00
# %%
2021-06-11 14:50:14 +02:00
# Read the three baseline survey exports.
# NOTE(review): the paths are absolute and machine-specific; consider moving
# the directory into a configurable setting.
baseline_si = pd.read_csv(" E:/STRAWbaseline/results-survey637813.csv ")
baseline_be_1 = pd.read_csv(" E:/STRAWbaseline/results-survey358134.csv ")
baseline_be_2 = pd.read_csv(" E:/STRAWbaseline/results-survey413767.csv ")

# Stack the exports into one table. The inner join keeps only the columns
# that all three exports share. ``ignore_index=True`` renumbers the rows
# 0..n-1 directly, instead of materialising the old row labels as an "index"
# column and dropping it again (which would also remove a genuine survey
# column if one happened to be called "index").
baseline = pd.concat(
    [baseline_si, baseline_be_1, baseline_be_2],
    join=" inner ",
    ignore_index=True,
)
2021-06-08 16:07:39 +02:00
# %%
2021-06-11 14:50:14 +02:00
# Ask the participants database for usernames, passing 1 August 2020 as the
# ``collection_start`` argument.
# NOTE(review): how ``collection_start`` filters participants is defined in
# participants.query_db and is not visible here — confirm there.
_COLLECTION_START = datetime.date.fromisoformat(" 2020-08-01 ")
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=_COLLECTION_START
)
2021-06-08 16:07:39 +02:00
# %%
2021-06-11 14:50:14 +02:00
# Keep only the baseline rows whose username is among the selected
# participants.
baseline_inactive = baseline[
    baseline[" Gebruikersnaam "].isin(participants_inactive_usernames)
]

# %%
# Survey column names mapped to the English names used in the rest of the
# notebook.
VARIABLES_TO_TRANSLATE = {
    " Gebruikersnaam ": " username ",
    " Geslacht ": " gender ",
    " Geboortedatum ": " date_of_birth ",
}
# Rename into a new frame rather than in place: ``baseline_inactive`` is a
# boolean-filtered selection of ``baseline``, and modifying such a selection
# in place triggers pandas' SettingWithCopyWarning.
baseline_inactive = baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE)

# Age is measured against the moment this cell runs, so it changes between
# runs.
now = pd.Timestamp(" now ")
baseline_inactive = baseline_inactive.assign(
    date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
    # Whole days since birth converted to years.
    # NOTE(review): 365.25245 matches neither the Julian (365.25) nor the
    # Gregorian (365.2425) mean year — possibly a typo; value left unchanged.
    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
)
2021-06-08 16:07:39 +02:00
# %%
# Fetch the ESM data for the selected usernames.
df_esm_inactive = get_esm_data(participants_inactive_usernames)

# %% [markdown]
# # Classify EMA sessions
# %%
# Preprocess the fetched data, then hand the preprocessed frame to the
# session classifier.
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
df_session_counts_time = df_esm_preprocessed.pipe(
    classify_sessions_by_completion_time
)
2021-06-08 16:40:01 +02:00
2021-06-14 17:09:45 +02:00
# %% [markdown]
# Sessions are now classified according to the type of session (a true questionnaire or simple single questions) and the user's response.
# %%
# Show the classified sessions.
df_session_counts_time
# %%
# Number of sessions per value of the session response.
tbl_session_outcomes = (
    df_session_counts_time.reset_index()[" session_response "].value_counts()
)

# %%
# Print the total, the per-response counts and the per-response shares of
# all sessions.
_n_sessions = len(df_session_counts_time)
_separator = " ------------------------------------- "
print(" All sessions: ", _n_sessions)
print(_separator)
print(tbl_session_outcomes)
print(_separator)
print(tbl_session_outcomes / _n_sessions)
2021-06-08 16:40:01 +02:00
2021-06-14 17:09:45 +02:00
# %% [markdown]
# ## Consider only true EMA sessions
2021-06-08 17:16:08 +02:00
# %%
2021-06-11 20:28:24 +02:00
# Select the sessions whose response equals SESSION_STATUS_COMPLETE and move
# the index levels back into ordinary columns.
_is_complete = df_session_counts_time[" session_response "] == SESSION_STATUS_COMPLETE
df_session_finished = df_session_counts_time.loc[_is_complete].reset_index()
# %%
# Count the selected sessions per participant.
df_participant_finished_sessions = (
    df_session_finished.groupby(" participant_id ")[" esm_session "]
    .count()
    .rename(" finished_sessions ")
)
2021-06-08 17:16:08 +02:00
# %%
2021-06-11 14:50:14 +02:00
# Demographic columns taken from the baseline survey.
_baseline_columns = [" username ", " gender ", " age ", " startlanguage "]
# One row per (username, participant_id) pair found in the ESM data.
_username_to_participant = df_esm_preprocessed[
    [" username ", " participant_id "]
].drop_duplicates()

# Attach the participant ID to each baseline row, then the number of finished
# sessions. Both are left joins, so every baseline row is kept; rows without
# a match get missing values.
df_adherence = (
    baseline_inactive[_baseline_columns]
    .merge(_username_to_participant, how=" left ", on=" username ")
    .merge(
        df_participant_finished_sessions,
        how=" left ",
        left_on=" participant_id ",
        right_index=True,
    )
)
2021-06-08 22:32:14 +02:00
# %% tags=[]
# Show the adherence table.
df_adherence

# %%
# Summary statistics of the adherence table.
df_adherence.describe()
# %%
# Number of participants per gender / start-language combination.
_grouping_columns = [" gender ", " startlanguage "]
df_adherence[_grouping_columns].value_counts()
# %%
# Histogram of finished sessions per participant, in bins of width 5.
_finished_sessions = df_adherence[" finished_sessions "]
sns.displot(_finished_sessions, binwidth=5, height=FIG_HEIGHT)
2021-06-08 22:42:56 +02:00
2021-06-08 17:16:08 +02:00
# %%
2021-06-11 14:50:14 +02:00
# Linear model of the number of finished sessions on gender, start language
# and age; the same formula is used for both fits below.
_FINISHED_SESSIONS_FORMULA = " finished_sessions ~ C(gender) + C(startlanguage) + age "

lm_adherence = smf.ols(_FINISHED_SESSIONS_FORMULA, data=df_adherence).fit()
table = sm.stats.anova_lm(lm_adherence, typ=2)  # Type 2 ANOVA DataFrame
print(table)
# %%
# Fit the model again, keeping the unfitted model object, and show the full
# regression summary.
lr_ols = smf.ols(_FINISHED_SESSIONS_FORMULA, data=df_adherence)
ls_result = lr_ols.fit()
ls_result.summary()
2021-06-11 20:28:24 +02:00
# %% [markdown]
# # Concordance by type
# %% [markdown]
# ## Workday EMA
# %% [markdown]
# ### Filter the EMA of interest.
# %% [markdown]
# Work with only completed EMA.
# %% tags=[]
# Keep only the sessions whose response matches the literal below.
# NOTE(review): an earlier cell selects sessions with SESSION_STATUS_COMPLETE;
# if that constant holds this same value, prefer it over the literal.
_is_completed = df_session_counts_time.session_response == " ema_completed "
df_session_counts_time_completed = df_session_counts_time.loc[_is_completed]
# %% [markdown]
# To be able to compare EMA sessions *within* one day, add a date-part column.
#
# **NOTE**: Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM, the datetime is first translated to 4 h earlier.
# %%
# Add a date column derived from the session timestamp moved four hours
# earlier.
_DAY_SHIFT = datetime.timedelta(hours=4)
df_session_counts_time_completed = df_session_counts_time_completed.assign(
    date_lj=lambda x: (x.datetime_lj - _DAY_SHIFT).dt.date
)
# %%
df_session_counts_time_completed
# %% [markdown]
# Next, calculate differences between subsequent records. But first, group them by participant and device ID (as usual) and *time*. This way, the differences between EMA sessions of the same type are calculated.
# %% tags=[]
# Row-to-row differences of the timestamp and of the date, computed within
# each (participant, device, time) group.
# NOTE(review): diff() uses the existing row order; this assumes the rows of
# each group are already in chronological order — confirm upstream.
_diff_column_names = {
    " datetime_lj ": " previous_same_type_time_diff ",
    " date_lj ": " time_diff_days ",
}
_sessions_by_type = df_session_counts_time_completed[
    [" datetime_lj ", " date_lj ", " time "]
].groupby([" participant_id ", " device_id ", " time "])
df_session_time_diff = _sessions_by_type.diff().rename(columns=_diff_column_names)
# %%
df_session_time_diff
# %% tags=[]
# Put the differences next to the session rows they belong to (aligned on the
# index).
df_session_counts_time_diff = df_session_counts_time_completed.join(
    df_session_time_diff, how=" left "
)
# %% [markdown]
# Now, select only the daytime EMAs of interest. Discard the differences between *different day* EMAs.
2021-07-02 16:02:55 +02:00
# %% tags=[]
2021-06-11 20:28:24 +02:00
# Row mask combining three conditions on the session/diff table.
time_workday_completed_less_than_1_day = (
    # Only take daytime EMAs.
    (df_session_counts_time_diff.time == " daytime ")
    # Only where the diff was actually calculated.
    & df_session_counts_time_diff.previous_same_type_time_diff.notna()
    # Only take differences *within* a day.
    & (df_session_counts_time_diff.time_diff_days == datetime.timedelta(0))
)
# %% tags=[]
df_session_workday = df_session_counts_time_diff[time_workday_completed_less_than_1_day]
# %%
# Express the interval to the previous same-type session in minutes.
# ``total_seconds()`` is the full duration of each timedelta, whereas
# ``.dt.seconds`` is only its seconds component (whole days and sub-second
# parts are left out), which would misreport any interval that is negative
# or not a whole number of seconds.
df_session_workday = df_session_workday.assign(
    time_diff_minutes=lambda x: x.previous_same_type_time_diff.dt.total_seconds() / 60
)
# %%
2021-07-02 16:33:48 +02:00
# Histogram of the within-day intervals between daytime EMA sessions, in bins
# of width 5 and using the shared figure geometry and colour.
_figure_style = dict(height=FIG_HEIGHT, aspect=FIG_ASPECT, color=FIG_COLOUR)
g1 = sns.displot(df_session_workday[" time_diff_minutes "], binwidth=5, **_figure_style)
g1.set_axis_labels(" Time difference [min] ", " Session count ")
g1.set(xlim=(0, 570))
if SAVE_FIGS:
    g1.savefig(" WorkdayEMAtimeDiff.pdf ")
2021-06-11 20:28:24 +02:00
# %% [markdown]
# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.
# %%
# Show the sessions that follow the previous one by less than 30 minutes.
df_session_workday.loc[df_session_workday.time_diff_minutes.lt(30)]
# %% [markdown]
# There are only 2 instances; look at them individually.
# %%
# Raw records of session 7 of participant 35 on one specific device.
_first_case = (
    (df_esm_preprocessed.participant_id == 35)
    & (df_esm_preprocessed.esm_session == 7)
    & (df_esm_preprocessed.device_id == " 62a44038-3ccb-401e-a69c-6f22152c54a6 ")
)
_first_case_columns = [
    " esm_trigger ",
    " esm_session ",
    " datetime_lj ",
    " esm_instructions ",
    " device_id ",
    " _id ",
]
df_esm_preprocessed.loc[_first_case, _first_case_columns]
# %%
# Raw records of the sessions numbered below 3 of participant 45 on one
# specific device.
_second_case = (
    (df_esm_preprocessed.participant_id == 45)
    & (df_esm_preprocessed.esm_session < 3)
    & (df_esm_preprocessed.device_id == " d848b1c4-33cc-4e22-82ae-96d6b6458a33 ")
)
_second_case_columns = [
    " esm_trigger ",
    " esm_session ",
    " datetime_lj ",
    " esm_instructions ",
]
df_esm_preprocessed.loc[_second_case, _second_case_columns]
# %% [markdown]
# As these signify bugs, we can safely discard them in the following analysis.
# %%
# Discard the sessions inspected above, i.e. every interval shorter than
# 30 minutes. The threshold is written as ``>= 30`` so that it is the exact
# complement of the ``< 30`` selection used to find them; ``> 29`` would keep
# intervals between 29 and 30 minutes.
df_session_workday = df_session_workday[df_session_workday.time_diff_minutes >= 30]
# %% [markdown]
# ### All participants
# %%
# Summary statistics of the retained workday sessions.
df_session_workday.describe()
# %%
# Fraction of the retained sessions with an interval below 120 minutes.
_below_two_hours = df_session_workday[" time_diff_minutes "] < 120
len(df_session_workday[_below_two_hours]) / len(df_session_workday)
2021-06-11 20:28:24 +02:00
# %% [markdown]
# These statistics look reasonable.
# %% [markdown]
# ### Differences between participants
# %%
2021-06-14 17:09:45 +02:00
# Per-participant median of the workday session columns (despite the "mean"
# in the variable name, this is a median).
# ``numeric_only=True`` limits the aggregation to numeric columns. The frame
# also holds text and date columns, for which a median is undefined; since
# pandas 2.0 these raise a TypeError instead of being dropped silently. The
# column used downstream, ``time_diff_minutes``, is numeric and is kept.
df_mean_daytime_interval = df_session_workday.groupby(" participant_id ").median(
    numeric_only=True
)
# %%
df_mean_daytime_interval.describe()
# %%
2021-07-02 16:33:48 +02:00
# Histogram of the per-participant median interval, in bins of width 5 and
# using the shared figure geometry and colour.
_figure_style = dict(height=FIG_HEIGHT, aspect=FIG_ASPECT, color=FIG_COLOUR)
g2 = sns.displot(
    df_mean_daytime_interval.time_diff_minutes, binwidth=5, **_figure_style
)
g2.set_axis_labels(" Median time difference [min] ", " Participant count ")
if SAVE_FIGS:
    g2.savefig(" WorkdayEMAtimeDiffMedianParticip.pdf ")
2021-06-14 17:09:45 +02:00
# %%
# Attach the per-participant medians to the adherence table (left join on
# the participant ID, which is the index of the medians frame).
df_adherence = pd.merge(
    df_adherence,
    df_mean_daytime_interval,
    how=" left ",
    left_on=" participant_id ",
    right_index=True,
)
# %%
# Linear model of the median interval on gender, start language and age.
_time_diff_formula = " time_diff_minutes ~ C(gender) + C(startlanguage) + age "
lr_ols_time_diff_median = smf.ols(_time_diff_formula, data=df_adherence)
ls_result_time_diff_median = lr_ols_time_diff_median.fit()
ls_result_time_diff_median.summary()
2021-06-11 20:28:24 +02:00
# %%
# Count the retained workday rows per participant and date.
_sessions_per_day = df_session_workday.groupby([" participant_id ", " date_lj "])
df_count_daytime_per_participant = _sessions_per_day.count()
# %%
# Summary statistics of the per-day counts.
df_count_daytime_per_participant[" time "].describe()
# %%
# Histogram of the per-day counts, one bin per count value.
_figure_style = dict(height=FIG_HEIGHT, aspect=FIG_ASPECT, color=FIG_COLOUR)
sns.displot(df_count_daytime_per_participant.time, binwidth=1, **_figure_style)
2021-06-11 20:28:24 +02:00
# %% [markdown]
# ## Evening EMA
# %% [markdown]
# For evening EMA, determine, for each day on which any EMA session was completed, whether an evening EMA is also present.
#
# Note, we are only dealing with true EMA sessions, non-sessions etc. have already been filtered out.
# %%
def _has_evening_session(day_sessions):
    """Return True if any row of *day_sessions* has the evening time label."""
    return (day_sessions.time == " evening ").any()


# One boolean per (participant, device, date): whether that group contains an
# evening session.
s_evening_completed = df_session_counts_time_completed.groupby(
    [" participant_id ", " device_id ", " date_lj "]
).apply(_has_evening_session)
# %%
df_session_counts_time_completed
# %%
# Number of groups that contain an evening session.
s_evening_completed.sum()
2021-06-11 20:28:24 +02:00
# %%
# Per participant: groups with an evening session divided by all groups.
_evening_by_participant = s_evening_completed.groupby(" participant_id ")
s_evening_completed_ratio = (
    _evening_by_participant.sum() / _evening_by_participant.count()
)
# %%
s_evening_completed_ratio.describe()
# %%
2021-07-02 16:33:48 +02:00
# Histogram of the per-participant evening ratio in bins of width 0.05.
# NOTE(review): the ratios are shifted down by 0.001 before plotting —
# presumably so that a ratio of exactly 1.0 falls inside the top bin; confirm.
_shifted_ratio = s_evening_completed_ratio - 0.001
_figure_style = dict(height=FIG_HEIGHT, aspect=FIG_ASPECT, color=FIG_COLOUR)
g3 = sns.displot(_shifted_ratio, binwidth=0.05, **_figure_style)
g3.set_axis_labels(
    " Ratio of days with the evening EMA filled out ", " Participant count "
)
# The limits are given high-to-low, so the x-axis runs from 1.01 down to 0.59.
g3.set(xlim=(1.01, 0.59))
if SAVE_FIGS:
    g3.savefig(" EveningEMAratioParticip.pdf ")
2021-06-14 17:09:45 +02:00
# %%
# Attach the evening ratio to the adherence table under its own column name
# (left join on the participant ID, which is the index of the ratio series).
_evening_ratio = s_evening_completed_ratio.rename(" evening_EMA_ratio ")
df_adherence = pd.merge(
    df_adherence,
    _evening_ratio,
    how=" left ",
    left_on=" participant_id ",
    right_index=True,
)
# %%
# Linear model of the evening ratio on gender, start language and age.
_evening_ratio_formula = " evening_EMA_ratio ~ C(gender) + C(startlanguage) + age "
lr_ols_evening_ratio = smf.ols(_evening_ratio_formula, data=df_adherence)
ls_result_evening_ratio = lr_ols_evening_ratio.fit()
ls_result_evening_ratio.summary()