# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.11.4
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import datetime
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import participants.query_db
from features.esm import *

# %%
SAVE_FIGS = True
FIG_HEIGHT = 5
FIG_ASPECT = 1.7
FIG_COLOUR = "#28827C"

SMALL_SIZE = 14
MEDIUM_SIZE = SMALL_SIZE + 2
BIGGER_SIZE = MEDIUM_SIZE + 2

plt.rc("font", size=SMALL_SIZE)  # controls default text sizes
plt.rc("axes", titlesize=SMALL_SIZE)  # fontsize of the axes title
plt.rc("axes", labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
plt.rc("xtick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("ytick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("legend", fontsize=SMALL_SIZE)  # legend fontsize
plt.rc("figure", titlesize=BIGGER_SIZE)  # fontsize of the figure title

# %%
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")
baseline = (
    pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner")
    .reset_index()
    .drop(columns="index")
)

# %%
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)

# %%
baseline_inactive = baseline[
    baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
]

# %%
VARIABLES_TO_TRANSLATE = {
    "Gebruikersnaam": "username",
    "Geslacht": "gender",
    "Geboortedatum": "date_of_birth",
}
# Rename by reassignment (rather than in place) to avoid modifying a slice of `baseline`.
baseline_inactive = baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE)
now = pd.Timestamp("now")
baseline_inactive = baseline_inactive.assign(
    date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
    # 365.2425 = average number of days in a Gregorian calendar year.
    age=lambda x: (now - x.date_of_birth).dt.days / 365.2425,
)

# %%
df_esm_inactive = get_esm_data(participants_inactive_usernames)

# %% [markdown]
# # Classify EMA sessions

# %%
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocessed)

# %% [markdown]
# Sessions are now classified according to the type of session (a true questionnaire or simple single questions) and the user's response.
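
# %% [markdown]
# As an optional, purely illustrative check (not part of the original pipeline), the classified sessions can be cross-tabulated by session type and outcome. This sketch only assumes that `time` and `session_response` are regular columns of `df_session_counts_time`, as they are used further below.

# %%
pd.crosstab(df_session_counts_time["time"], df_session_counts_time["session_response"])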

# %%
df_session_counts_time

# %%
tbl_session_outcomes = df_session_counts_time.reset_index()[
    "session_response"
].value_counts()

# %%
print("All sessions:", len(df_session_counts_time))
print("-------------------------------------")
print(tbl_session_outcomes)
print("-------------------------------------")
print(tbl_session_outcomes / len(df_session_counts_time))

# %% [markdown]
# ## Consider only true EMA sessions

# %%
df_session_finished = df_session_counts_time[
    df_session_counts_time["session_response"] == SESSION_STATUS_COMPLETE
].reset_index()

# %%
df_participant_finished_sessions = (
    df_session_finished.groupby("participant_id")
    .count()["esm_session"]
    .rename("finished_sessions")
)

# %%
df_adherence = baseline_inactive[["username", "gender", "age", "startlanguage"]].merge(
    df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(),
    how="left",
    on="username",
)
df_adherence = df_adherence.merge(
    df_participant_finished_sessions,
    how="left",
    left_on="participant_id",
    right_index=True,
)

# %% tags=[]
df_adherence

# %%
df_adherence.describe()

# %%
df_adherence[["gender", "startlanguage"]].value_counts()

# %%
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=FIG_HEIGHT)

# %%
lm_adherence = smf.ols(
    "finished_sessions ~ C(gender) + C(startlanguage) + age", data=df_adherence
).fit()

table = sm.stats.anova_lm(lm_adherence, typ=2)  # Type 2 ANOVA DataFrame
print(table)

# %%
lr_ols = smf.ols(
    "finished_sessions ~ C(gender) + C(startlanguage) + age", data=df_adherence
)
ls_result = lr_ols.fit()
ls_result.summary()

# %% [markdown]
# # Concordance by type

# %% [markdown]
# ## Workday EMA

# %% [markdown]
# ### Filter the EMAs of interest

# %% [markdown]
# Work only with completed EMA sessions.

# %% tags=[]
df_session_counts_time_completed = df_session_counts_time[
    df_session_counts_time.session_response == "ema_completed"
]

# %% [markdown]
# To be able to compare EMA sessions *within* one day, add a date-part column.
#
# **NOTE**: Since daytime EMAs could *theoretically* last beyond midnight, but never past 4 AM, the datetime is first shifted 4 h earlier.

# %%
df_session_counts_time_completed = df_session_counts_time_completed.assign(
    date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
)

# %%
df_session_counts_time_completed

# %% [markdown]
# Next, calculate the differences between subsequent records. But first, group them by participant and device ID (as usual) and by *time*. This way, the differences are calculated between EMA sessions of the same type.

# %% tags=[]
df_session_time_diff = (
    df_session_counts_time_completed[["datetime_lj", "date_lj", "time"]]
    .groupby(["participant_id", "device_id", "time"])
    .diff()
    .rename(
        columns={
            "datetime_lj": "previous_same_type_time_diff",
            "date_lj": "time_diff_days",
        }
    )
)

# %%
df_session_time_diff

# %% tags=[]
df_session_counts_time_diff = df_session_counts_time_completed.join(
    df_session_time_diff, how="left"
)

# %% [markdown]
# Now, select only the daytime EMAs of interest and discard the differences between EMAs on *different days*.

# %% tags=[]
time_workday_completed_less_than_1_day = (
    (df_session_counts_time_diff.time == "daytime")  # Only take daytime EMAs.
    & ~(
        df_session_counts_time_diff.previous_same_type_time_diff.isna()
    )  # Only where the diff was actually calculated.
    & (df_session_counts_time_diff.time_diff_days == datetime.timedelta(0))
)  # Only take differences *within* a day.
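
# %% [markdown]
# To make the grouped diff above concrete, here is a minimal, self-contained sketch on hypothetical toy data (not from the study): within each participant-device-type group, `diff()` returns NaT for the first session and the gap to the previous same-type session otherwise. The last gap below spans two days and is exactly the kind of difference the mask above discards.

# %%
toy = pd.DataFrame(
    {
        "participant_id": [1, 1, 1, 1],
        "device_id": ["a", "a", "a", "a"],
        "time": ["daytime", "daytime", "evening", "daytime"],
        "datetime_lj": pd.to_datetime(
            [
                "2020-09-01 09:00",
                "2020-09-01 12:00",
                "2020-09-01 19:00",
                "2020-09-02 09:30",
            ]
        ),
    }
)
toy.groupby(["participant_id", "device_id", "time"])["datetime_lj"].diff()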

# %% tags=[]
df_session_workday = df_session_counts_time_diff[
    time_workday_completed_less_than_1_day
]

# %%
df_session_workday = df_session_workday.assign(
    time_diff_minutes=lambda x: x.previous_same_type_time_diff.dt.seconds / 60
)

# %%
g1 = sns.displot(
    df_session_workday["time_diff_minutes"],
    binwidth=5,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g1.set_axis_labels("Time difference [min]", "Session count")
g1.set(xlim=(0, 570))
if SAVE_FIGS:
    g1.savefig("WorkdayEMAtimeDiff.pdf")

# %% [markdown]
# Some sessions are really close together. By design, none should be closer than 30 min, so let's take a look at those.

# %%
df_session_workday[df_session_workday.time_diff_minutes < 30]

# %% [markdown]
# There are only 2 such instances, so look at them individually.

# %%
df_esm_preprocessed.loc[
    (df_esm_preprocessed.participant_id == 35)
    & (df_esm_preprocessed.esm_session == 7)
    & (df_esm_preprocessed.device_id == "62a44038-3ccb-401e-a69c-6f22152c54a6"),
    [
        "esm_trigger",
        "esm_session",
        "datetime_lj",
        "esm_instructions",
        "device_id",
        "_id",
    ],
]

# %%
df_esm_preprocessed.loc[
    (df_esm_preprocessed.participant_id == 45)
    & (df_esm_preprocessed.esm_session < 3)
    & (df_esm_preprocessed.device_id == "d848b1c4-33cc-4e22-82ae-96d6b6458a33"),
    ["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"],
]

# %% [markdown]
# Since these indicate bugs, we can safely discard them in the following analysis.

# %%
df_session_workday = df_session_workday[df_session_workday.time_diff_minutes > 29]

# %% [markdown]
# ### All participants

# %%
df_session_workday.describe()

# %%
df_session_workday[df_session_workday["time_diff_minutes"] < 120].shape[
    0
] / df_session_workday.shape[0]

# %% [markdown]
# These statistics look reasonable.

# %% [markdown]
# ### Differences between participants

# %%
# Per-participant median of the time differences.
df_mean_daytime_interval = df_session_workday.groupby("participant_id").median()

# %%
df_mean_daytime_interval.describe()

# %%
g2 = sns.displot(
    df_mean_daytime_interval.time_diff_minutes,
    binwidth=5,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g2.set_axis_labels("Median time difference [min]", "Participant count")
if SAVE_FIGS:
    g2.savefig("WorkdayEMAtimeDiffMedianParticip.pdf")

# %%
df_adherence = df_adherence.merge(
    df_mean_daytime_interval, how="left", left_on="participant_id", right_index=True
)

# %%
lr_ols_time_diff_median = smf.ols(
    "time_diff_minutes ~ C(gender) + C(startlanguage) + age", data=df_adherence
)
ls_result_time_diff_median = lr_ols_time_diff_median.fit()
ls_result_time_diff_median.summary()

# %%
df_count_daytime_per_participant = df_session_workday.groupby(
    ["participant_id", "date_lj"]
).count()

# %%
df_count_daytime_per_participant["time"].describe()

# %%
sns.displot(
    df_count_daytime_per_participant.time,
    binwidth=1,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)

# %% [markdown]
# ## Evening EMA

# %% [markdown]
# For the evening EMA, determine whether an evening EMA is also present on each day on which any EMA session was completed.
#
# Note that we are only dealing with true EMA sessions here; non-sessions etc. have already been filtered out.
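
# %% [markdown]
# A minimal sketch of the per-day indicator computed next, on hypothetical toy data: a day yields True if any of its completed sessions is an evening EMA, and False otherwise.

# %%
toy_days = pd.DataFrame(
    {
        "participant_id": [1, 1, 1],
        "device_id": ["a", "a", "a"],
        "date_lj": [
            datetime.date(2020, 9, 1),
            datetime.date(2020, 9, 1),
            datetime.date(2020, 9, 2),
        ],
        "time": ["daytime", "evening", "daytime"],
    }
)
toy_days.groupby(["participant_id", "device_id", "date_lj"]).apply(
    lambda x: (x.time == "evening").any()
)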

# %%
s_evening_completed = df_session_counts_time_completed.groupby(
    ["participant_id", "device_id", "date_lj"]
).apply(lambda x: (x.time == "evening").any())

# %%
df_session_counts_time_completed

# %%
s_evening_completed.sum()

# %%
s_evening_completed_ratio = (
    s_evening_completed.groupby("participant_id").sum()
    / s_evening_completed.groupby("participant_id").count()
)

# %%
s_evening_completed_ratio.describe()

# %%
g3 = sns.displot(
    s_evening_completed_ratio - 0.001,
    binwidth=0.05,
    height=FIG_HEIGHT,
    aspect=FIG_ASPECT,
    color=FIG_COLOUR,
)
g3.set_axis_labels("Ratio of days with the evening EMA filled out", "Participant count")
g3.set(xlim=(1.01, 0.59))
if SAVE_FIGS:
    g3.savefig("EveningEMAratioParticip.pdf")

# %%
df_adherence = df_adherence.merge(
    s_evening_completed_ratio.rename("evening_EMA_ratio"),
    how="left",
    left_on="participant_id",
    right_index=True,
)

# %%
lr_ols_evening_ratio = smf.ols(
    "evening_EMA_ratio ~ C(gender) + C(startlanguage) + age", data=df_adherence
)
ls_result_evening_ratio = lr_ols_evening_ratio.fit()
ls_result_evening_ratio.summary()
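
# %% [markdown]
# Optionally, if a compact table of the estimates is preferred over the full summary, the fitted statsmodels results expose them directly; a minimal sketch for the fit above:

# %%
pd.DataFrame(
    {
        "coef": ls_result_evening_ratio.params,
        "p_value": ls_result_evening_ratio.pvalues,
    }
).join(ls_result_evening_ratio.conf_int().rename(columns={0: "ci_low", 1: "ci_high"}))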