# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
import datetime

# %%
import os
import sys

import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Make the parent of the current working directory importable so that the
# project packages used below (participants, features) can be found.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
import participants.query_db
from features.esm import *

# %%
# Load the three baseline survey exports and stack them, keeping only the
# columns that all three have in common (join="inner").
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")
baseline = pd.concat(
    [baseline_si, baseline_be_1, baseline_be_2], join="inner"
).reset_index(drop=True)

# %%
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)

# %%
# Keep only the baseline rows of the participants selected above.
baseline_inactive = baseline[
    baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
]

# %%
df_esm_inactive = get_esm_data(participants_inactive_usernames)

# %%
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
df_session_counts_time = classify_sessions_by_completion_time(df_esm_preprocessed)

# %%
# Frequency of each session outcome.
tbl_session_outcomes = df_session_counts_time["session_response"].value_counts()

# %%
print("All sessions:", len(df_session_counts_time))
print("-------------------------------------")
print(tbl_session_outcomes)
print("-------------------------------------")
print(tbl_session_outcomes / len(df_session_counts_time))

# %%
VARIABLES_TO_TRANSLATE = {
    "Gebruikersnaam": "username",
    "Geslacht": "gender",
    "Geboortedatum": "date_of_birth",
}
# `baseline_inactive` is a boolean-indexed selection of `baseline`; renaming
# it in place is a chained-assignment pattern (SettingWithCopyWarning), so
# rebind the name to the renamed frame instead.
baseline_inactive = baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE)
now = pd.Timestamp("now")
# NOTE(review): 365.25245 is neither the Julian (365.25) nor the Gregorian
# (365.2425) mean year length -- confirm the intended divisor.
baseline_inactive = baseline_inactive.assign(
    date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
)

# %%
df_session_counts_time

# %%
df_session_finished = df_session_counts_time[
    df_session_counts_time["session_response"] == SESSION_STATUS_COMPLETE
].reset_index()

# %%
# Number of finished sessions per participant.
df_participant_finished_sessions = (
    df_session_finished.groupby("participant_id")["esm_session"]
    .count()
    .rename("finished_sessions")
)

# %%
# Attach participant_id to the baseline data via username, then add the
# finished-session counts.
# NOTE(review): both merges are left joins, so participants without any
# finished session get NaN (not 0) in `finished_sessions` -- confirm that
# this is the intended treatment for the models below.
df_adherence = baseline_inactive[["username", "gender", "age", "startlanguage"]].merge(
    df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(),
    how="left",
    on="username",
)
df_adherence = df_adherence.merge(
    df_participant_finished_sessions,
    how="left",
    left_on="participant_id",
    right_index=True,
)

# %% tags=[]
df_adherence

# %%
df_adherence.describe()

# %%
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)

# %%
# The same model formula is used for the ANOVA table and the OLS summary.
ADHERENCE_FORMULA = "finished_sessions ~ C(gender) + C(startlanguage) + age"

lm_adherence = smf.ols(ADHERENCE_FORMULA, data=df_adherence).fit()
table = sm.stats.anova_lm(lm_adherence, typ=2)  # Type 2 ANOVA DataFrame
print(table)

# %%
lr_ols = smf.ols(ADHERENCE_FORMULA, data=df_adherence)
ls_result = lr_ols.fit()
ls_result.summary()

# %% [markdown]
# # Concordance by type

# %% [markdown]
# ## Workday EMA

# %% [markdown]
# ### Filter the EMA of interest.

# %% [markdown]
# Work with only completed EMA.

# %% tags=[]
# NOTE(review): presumably "ema_completed" is the value of
# SESSION_STATUS_COMPLETE used above -- confirm and use the constant.
df_session_counts_time_completed = df_session_counts_time[
    df_session_counts_time.session_response == "ema_completed"
]

# %% [markdown]
# To be able to compare EMA sessions *within* one day, add a date-part column.
#
# **NOTE**: Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM, the datetime is first translated to 4 h earlier.

# %%
# Shift timestamps back by 4 hours before taking the date, so that sessions
# shortly after midnight are attributed to the previous day.
df_session_counts_time_completed = df_session_counts_time_completed.assign(
    date_lj=lambda x: (x.datetime_lj - datetime.timedelta(hours=4)).dt.date
)

# %%
df_session_counts_time_completed

# %% [markdown]
# Next, calculate differences between subsequent records. But first group them by participant and device ID (as usual) and *time*. This way, the differences between the same type of EMA sessions are calculated.

# %% tags=[]
# Row-to-row differences within each (participant, device, session type)
# group, for both the full timestamp and the shifted date.
df_session_time_diff = (
    df_session_counts_time_completed[["datetime_lj", "date_lj", "time"]]
    .groupby(["participant_id", "device_id", "time"])
    .diff()
    .rename(
        columns={
            "datetime_lj": "previous_same_type_time_diff",
            "date_lj": "time_diff_days",
        }
    )
)

# %%
df_session_time_diff

# %% tags=[]
df_session_counts_time_diff = df_session_counts_time_completed.join(
    df_session_time_diff, how="left"
)

# %% [markdown]
# Now, select only the daytime EMAs of interest. Discard the differences between *different day* EMAs.

# %%
time_workday_completed_less_than_1_day = (
    # Only take daytime EMAs.
    (df_session_counts_time_diff.time == "daytime")
    # Only where the diff was actually calculated.
    & df_session_counts_time_diff.previous_same_type_time_diff.notna()
    # Only take differences *within* a day.
    & (df_session_counts_time_diff.time_diff_days == datetime.timedelta(0))
)

# %% tags=[]
df_session_workday = df_session_counts_time_diff[time_workday_completed_less_than_1_day]

# %%
# Use total_seconds() rather than .dt.seconds: the latter returns only the
# seconds *component* of the timedelta, so a negative (or multi-day)
# difference would silently wrap into a misleading positive value.
df_session_workday = df_session_workday.assign(
    time_diff_minutes=lambda x: x.previous_same_type_time_diff.dt.total_seconds() / 60
)

# %%
sns.displot(df_session_workday["time_diff_minutes"], binwidth=5, height=5)

# %% [markdown]
# There are some sessions that are really close together. By design, none should be closer than 30 min. Let's take a look at those.

# %%
df_session_workday[df_session_workday.time_diff_minutes < 30]

# %% [markdown]
# There are only 2 instances, look at them individually.

# %%
df_esm_preprocessed.loc[
    (df_esm_preprocessed.participant_id == 35)
    & (df_esm_preprocessed.esm_session == 6),
    ["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"],
]

# %%
df_esm_preprocessed.loc[
    (df_esm_preprocessed.participant_id == 45)
    & (df_esm_preprocessed.esm_session < 3)
    & (df_esm_preprocessed.device_id == "d848b1c4-33cc-4e22-82ae-96d6b6458a33"),
    ["esm_trigger", "esm_session", "datetime_lj", "esm_instructions"],
]

# %% [markdown]
# As these signify bugs, we can safely discard them in the following analysis.

# %%
# Keep exactly the complement of the rows inspected above (< 30 min), in line
# with the 30-minute design limit; a "> 29" cut-off would retain intervals
# between 29 and 30 minutes.
df_session_workday = df_session_workday[df_session_workday.time_diff_minutes >= 30]

# %% [markdown]
# ### All participants

# %%
df_session_workday.describe()

# %% [markdown]
# These statistics look reasonable.

# %% [markdown]
# ### Differences between participants

# %%
# Restrict the mean to numeric columns: pandas >= 2.0 raises a TypeError on
# string columns instead of silently dropping them.
df_mean_daytime_interval = df_session_workday.groupby("participant_id").mean(
    numeric_only=True
)

# %%
df_mean_daytime_interval.describe()

# %%
sns.displot(df_mean_daytime_interval.time_diff_minutes, binwidth=5, height=5)

# %%
# Number of retained daytime intervals per participant and day.
df_count_daytime_per_participant = df_session_workday.groupby(
    ["participant_id", "date_lj"]
).count()

# %%
df_count_daytime_per_participant["time"].describe()

# %%
sns.displot(df_count_daytime_per_participant.time, binwidth=1, height=5)

# %% [markdown]
# ## Evening EMA

# %% [markdown]
# For evening EMA, determine whether in a day that any EMA session was completed, an evening EMA is also present.
#
# Note, we are only dealing with true EMA sessions, non-sessions etc. have already been filtered out.

# %%
# One boolean per (participant, device, day): was any completed session of
# that day an evening one?
s_evening_completed = df_session_counts_time_completed.groupby(
    ["participant_id", "device_id", "date_lj"]
).apply(lambda x: (x.time == "evening").any())

# %%
df_session_counts_time_completed

# %%
# Share of days with an evening EMA, per participant.
evening_by_participant = s_evening_completed.groupby("participant_id")
s_evening_completed_ratio = (
    evening_by_participant.sum() / evening_by_participant.count()
)

# %%
s_evening_completed_ratio.describe()

# %%
# NOTE(review): the 0.001 offset presumably keeps a ratio of exactly 1.0
# inside the last 0.1-wide bin -- confirm.
sns.displot(s_evening_completed_ratio - 0.001, binwidth=0.1, height=5)

# %%