102 lines
3.2 KiB
Python
102 lines
3.2 KiB
Python
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
|
|
|
|
# %%
|
|
import os
|
|
import sys
|
|
import datetime
|
|
import seaborn as sns
|
|
import pandas as pd
|
|
import statsmodels.api as sm
|
|
import statsmodels.formula.api as smf
|
|
|
|
# Put the parent of the current working directory on the import path so the
# project packages imported below can be found when the notebook is run from
# its own sub-directory.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
|
|
import participants.query_db
|
|
from features.esm import *
|
|
|
|
# %%
|
|
# Load the three baseline survey exports (one CSV file per survey id).
baseline_si = pd.read_csv('E:/STRAWbaseline/results-survey637813.csv')
baseline_be_1 = pd.read_csv('E:/STRAWbaseline/results-survey358134.csv')
baseline_be_2 = pd.read_csv('E:/STRAWbaseline/results-survey413767.csv')

# Stack the exports into one table.
# join="inner" keeps only the columns that are present in all three files.
# ignore_index=True renumbers the rows 0..n-1 directly; unlike
# reset_index().drop(columns="index") it does not fail when an export happens
# to contain a column that is itself called "index".
baseline = pd.concat(
    [baseline_si, baseline_be_1, baseline_be_2],
    join="inner",
    ignore_index=True,
)
|
|
|
|
# %%
|
|
# Usernames returned by the participants database for collection_start
# 2020-08-01.
# NOTE(review): how `collection_start` filters participants is defined in
# participants.query_db.get_usernames -- confirm there.
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date(2020, 8, 1)
)

# %%
# Keep only the baseline rows whose "Gebruikersnaam" value is one of those
# usernames.
baseline_inactive = baseline.loc[
    baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
]

# %%
# Fetch the ESM data for the same set of usernames.
df_esm_inactive = get_esm_data(participants_inactive_usernames)

# %%
# Preprocess the raw ESM data, then derive the per-session adherence table
# from the preprocessed data.
df_esm_preprocessed = preprocess_esm(df_esm_inactive)
df_session_counts = classify_sessions_adherence(df_esm_preprocessed)
|
|
|
|
# %%
|
|
# Count how often each "session_response" value occurs across all sessions.
tbl_session_outcomes = (
    df_session_counts
    .reset_index()["session_response"]
    .value_counts()
)

# %%
# Report the total number of sessions, the absolute counts per outcome and
# the same counts as a fraction of all sessions.
print(f"All sessions: {len(df_session_counts)}")
print("-------------------------------------")
print(tbl_session_outcomes)
print("-------------------------------------")
print(tbl_session_outcomes.div(len(df_session_counts)))
|
|
|
|
# %%
|
|
# Mapping from the Dutch column names used in the baseline export to the
# English names used in the rest of this notebook.
VARIABLES_TO_TRANSLATE = {
    "Gebruikersnaam": "username",
    "Geslacht": "gender",
    "Geboortedatum": "date_of_birth",
}

# baseline_inactive was obtained by boolean-indexing `baseline`, so renaming
# it in place can trigger pandas' SettingWithCopyWarning, and the `copy`
# keyword of rename() is deprecated in recent pandas. Rebinding the name to
# the renamed frame gives the same columns without either problem.
baseline_inactive = baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE)

# Reference instant for the age computation; taken at execution time, so the
# computed ages depend on when the notebook is run.
now = pd.Timestamp('now')

# Parse the date of birth and derive the age in (fractional) years. The `age`
# lambda sees the already converted `date_of_birth` column because assign()
# evaluates its keyword arguments in order.
# NOTE(review): 365.25245 days per year matches neither the Julian (365.25)
# nor the Gregorian (365.2425) mean year length -- possibly a typo; confirm.
baseline_inactive = baseline_inactive.assign(
    date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
    age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
)
|
|
|
|
# %%
|
|
# Show the per-session adherence table.
df_session_counts

# %%
# Keep only the sessions whose response is "esm_finished"; reset_index()
# turns the index levels into ordinary columns for the grouping below.
df_session_finished = df_session_counts.loc[
    df_session_counts["session_response"] == "esm_finished"
].reset_index()

# %%
# Number of finished sessions per participant (count of non-null
# "esm_session" entries), as a Series named "finished_sessions".
df_participant_finished_sessions = (
    df_session_finished
    .groupby("participant_id")["esm_session"]
    .count()
    .rename("finished_sessions")
)
|
|
|
|
# %%
|
|
# Attach each baseline participant's participant_id (looked up by username in
# the preprocessed ESM data). Left join: baseline rows without a matching
# username are kept and get a missing participant_id.
df_adherence = pd.merge(
    baseline_inactive[["username", "gender", "age", "startlanguage"]],
    df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(),
    on="username",
    how="left",
)
# Add the number of finished sessions, matching participant_id against the
# index of the per-participant counts. Again a left join, so participants
# absent from the counts end up with a missing value rather than 0.
df_adherence = pd.merge(
    df_adherence,
    df_participant_finished_sessions,
    left_on="participant_id",
    right_index=True,
    how="left",
)

# %% tags=[]
# Show the assembled adherence table.
df_adherence

# %%
# Summary statistics of the numeric columns.
df_adherence.describe()

# %%
# Distribution of the number of finished sessions, in bins of width 5.
sns.displot(df_adherence["finished_sessions"], height=5, binwidth=5)
|
|
|
|
# %%
|
|
# Ordinary least squares model of the number of finished sessions on gender
# and start language (both treated as categorical) and age, followed by a
# type 2 ANOVA table of the fitted model.
lm_adherence = smf.ols(
    formula='finished_sessions ~ C(gender) + C(startlanguage) + age',
    data=df_adherence,
).fit()
table = sm.stats.anova_lm(lm_adherence, typ=2)  # Type 2 ANOVA DataFrame
print(table)

# %%
# Same model specification, kept as separate model and result objects so the
# full regression summary can be displayed.
lr_ols = smf.ols(
    formula='finished_sessions ~ C(gender) + C(startlanguage) + age',
    data=df_adherence,
)
ls_result = lr_ols.fit()
ls_result.summary()
|