Finish labelling EMA sessions and document classify_sessions_adherence function.
parent
371e755159
commit
f48e5469e0
|
@ -13,35 +13,50 @@
|
||||||
# name: straw2analysis
|
# name: straw2analysis
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import datetime
|
|
||||||
import seaborn as sns
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
|
||||||
nb_dir = os.path.split(os.getcwd())[0]
|
nb_dir = os.path.split(os.getcwd())[0]
|
||||||
if nb_dir not in sys.path: sys.path.append(nb_dir)
|
if nb_dir not in sys.path:
|
||||||
|
sys.path.append(nb_dir)
|
||||||
|
|
||||||
import participants.query_db
|
import participants.query_db
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
baseline_si = pd.read_csv('E:/STRAWbaseline/results-survey637813.csv')
|
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
|
||||||
baseline_be_1 = pd.read_csv('E:/STRAWbaseline/results-survey358134.csv')
|
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
|
||||||
baseline_be_2 = pd.read_csv('E:/STRAWbaseline/results-survey413767.csv')
|
baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
participants_inactive_usernames = participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01"))
|
participants_inactive_usernames = participants.query_db.get_usernames(
|
||||||
|
collection_start=datetime.date.fromisoformat("2020-08-01")
|
||||||
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
baseline = pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner").reset_index().drop(columns="index")
|
baseline = (
|
||||||
baseline_inactive = baseline[baseline["Gebruikersnaam"].isin(participants_inactive_usernames)]
|
pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner")
|
||||||
|
.reset_index()
|
||||||
|
.drop(columns="index")
|
||||||
|
)
|
||||||
|
baseline_inactive = baseline[
|
||||||
|
baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
|
||||||
|
]
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
baseline
|
baseline
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
participants_inactive_usernames = pd.Series(participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01")))
|
participants_inactive_usernames = pd.Series(
|
||||||
|
participants.query_db.get_usernames(
|
||||||
|
collection_start=datetime.date.fromisoformat("2020-08-01")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# # Demographic information
|
# # Demographic information
|
||||||
|
@ -54,7 +69,9 @@ print(baseline_inactive.shape[0])
|
||||||
print(participants_inactive_usernames.shape[0])
|
print(participants_inactive_usernames.shape[0])
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
participants_inactive_usernames[~participants_inactive_usernames.isin(baseline["Gebruikersnaam"])].sort_values()
|
participants_inactive_usernames[
|
||||||
|
~participants_inactive_usernames.isin(baseline["Gebruikersnaam"])
|
||||||
|
].sort_values()
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
baseline_inactive["startlanguage"].value_counts()
|
baseline_inactive["startlanguage"].value_counts()
|
||||||
|
@ -63,9 +80,10 @@ baseline_inactive["startlanguage"].value_counts()
|
||||||
baseline_inactive["Geslacht"].value_counts()
|
baseline_inactive["Geslacht"].value_counts()
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
now = pd.Timestamp('now')
|
now = pd.Timestamp("now")
|
||||||
baseline_inactive = baseline_inactive.assign(dob = lambda x: pd.to_datetime(x.Geboortedatum),
|
baseline_inactive = baseline_inactive.assign(
|
||||||
age = lambda x: now - x.dob)
|
dob=lambda x: pd.to_datetime(x.Geboortedatum), age=lambda x: now - x.dob
|
||||||
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
baseline_inactive["age"].describe()
|
baseline_inactive["age"].describe()
|
||||||
|
|
|
@ -61,9 +61,9 @@ df_esm_preprocessed.columns
|
||||||
# One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first.
|
# One approach would be to count distinct session IDs which are incremented for each group of EMAs. However, since not every question answered counts as a fulfilled EMA, some unique session IDs should be eliminated first.
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
session_counts = df_esm_preprocessed.groupby(["participant_id", "device_id", "esm_session"]).count()[
|
session_counts = df_esm_preprocessed.groupby(
|
||||||
"id"
|
["participant_id", "device_id", "esm_session"]
|
||||||
]
|
).count()["id"]
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# Group data by participant_id and esm_session and count the number of instances (by id). Session counts are therefore counts of how many times a specific session ID appears *within* a specific participant.
|
# Group data by participant_id and esm_session and count the number of instances (by id). Session counts are therefore counts of how many times a specific session ID appears *within* a specific participant.
|
||||||
|
@ -142,11 +142,17 @@ df_esm_preprocessed.query("participant_id == 31 & esm_session == 77")[
|
||||||
]
|
]
|
||||||
|
|
||||||
# %% tags=[]
|
# %% tags=[]
|
||||||
df_esm_2 = df_session_counts[
|
df_esm_2 = (
|
||||||
df_session_counts["esm_session_count"] == 2
|
df_session_counts[df_session_counts["esm_session_count"] == 2]
|
||||||
].reset_index().merge(df_esm_preprocessed, how="left", on=["participant_id", "device_id", "esm_session"])
|
.reset_index()
|
||||||
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
|
.merge(
|
||||||
#display(df_esm_2)
|
df_esm_preprocessed,
|
||||||
|
how="left",
|
||||||
|
on=["participant_id", "device_id", "esm_session"],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
|
||||||
|
# display(df_esm_2)
|
||||||
|
|
||||||
# %% [markdown] tags=[]
|
# %% [markdown] tags=[]
|
||||||
# ### Long sessions
|
# ### Long sessions
|
||||||
|
@ -215,7 +221,9 @@ df_session_counts.count()
|
||||||
# %%
|
# %%
|
||||||
non_session = session_group_by.apply(
|
non_session = session_group_by.apply(
|
||||||
lambda x: (
|
lambda x: (
|
||||||
(x.esm_user_answer == "DayFinished3421") | (x.esm_user_answer == "DayOff3421") | (x.esm_user_answer == "DayFinishedSetEvening")
|
(x.esm_user_answer == "DayFinished3421")
|
||||||
|
| (x.esm_user_answer == "DayOff3421")
|
||||||
|
| (x.esm_user_answer == "DayFinishedSetEvening")
|
||||||
).any()
|
).any()
|
||||||
)
|
)
|
||||||
df_session_counts.loc[non_session, "session_response"] = "day_finished"
|
df_session_counts.loc[non_session, "session_response"] = "day_finished"
|
||||||
|
@ -243,6 +251,36 @@ df_session_counts.loc[esm_removed, "session_response"].value_counts()
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# It turns out that these had been accounted for with ESM statuses.
|
# It turns out that these had been accounted for with ESM statuses.
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# ### Singleton sessions
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_session_counts.count()
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_session_counts[
|
||||||
|
(df_session_counts.esm_session_count == 1)
|
||||||
|
& df_session_counts.session_response.isna()
|
||||||
|
]
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_session_1 = df_session_counts[
|
||||||
|
(df_session_counts["esm_session_count"] == 1)
|
||||||
|
& df_session_counts.session_response.isna()
|
||||||
|
]
|
||||||
|
df_esm_unique_session = df_session_1.join(
|
||||||
|
df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"])
|
||||||
|
)
|
||||||
|
df_esm_unique_session = df_esm_unique_session["esm_trigger"].rename("session_response")
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_session_counts.loc[
|
||||||
|
df_esm_unique_session.index, "session_response"
|
||||||
|
] = df_esm_unique_session
|
||||||
|
|
||||||
|
# %%
|
||||||
|
df_session_counts.count()
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ### Evening_last
|
# ### Evening_last
|
||||||
|
|
||||||
|
@ -270,38 +308,52 @@ sns.displot(
|
||||||
)
|
)
|
||||||
|
|
||||||
# %% [markdown]
|
# %% [markdown]
|
||||||
# ### Singleton sessions
|
# ### Repeated sessions
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# Session lengths that repeat often can probably be treated as filled-in EMAs. Let's only review the session lengths that are rare.
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_session_counts.count()
|
df_session_counts.loc[
|
||||||
|
df_session_counts.session_response.isna(), "esm_session_count"
|
||||||
|
].value_counts().sort_index()
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_session_counts[(df_session_counts.esm_session_count == 1) & df_session_counts.session_response.isna()]
|
df_session_7 = df_session_counts[
|
||||||
|
(df_session_counts["esm_session_count"] == 7)
|
||||||
# %%
|
& df_session_counts.session_response.isna()
|
||||||
df_session_1 = df_session_counts[(df_session_counts["esm_session_count"] == 1) & df_session_counts.session_response.isna()]
|
]
|
||||||
df_esm_unique_session = df_session_1.join(
|
df_esm_session_7 = df_session_7.join(
|
||||||
df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"])
|
df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"]),
|
||||||
|
how="left",
|
||||||
)
|
)
|
||||||
df_esm_unique_session = df_esm_unique_session["esm_trigger"].rename("session_response")
|
|
||||||
|
# %% jupyter={"outputs_hidden": true} tags=[]
|
||||||
|
with pd.option_context(
|
||||||
|
"display.max_rows", None, "display.max_columns", None
|
||||||
|
): # more options can be specified also
|
||||||
|
display(df_esm_session_7[["esm_trigger", "esm_instructions", "esm_user_answer"]])
|
||||||
|
|
||||||
|
# %% [markdown]
|
||||||
|
# These are all morning questionnaires with "commute" selected or rarely "long break" in the morning.
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_session_counts.loc[df_esm_unique_session.index, "session_response"] = df_esm_unique_session
|
df_session_27 = df_session_counts[
|
||||||
|
(df_session_counts["esm_session_count"] == 27)
|
||||||
# %%
|
& df_session_counts.session_response.isna()
|
||||||
df_session_counts.count()
|
]
|
||||||
|
df_esm_session_27 = df_session_27.join(
|
||||||
# %%
|
df_esm_preprocessed.set_index(["participant_id", "device_id", "esm_session"]),
|
||||||
df_session_counts.merge()
|
how="left",
|
||||||
|
)
|
||||||
# %%
|
|
||||||
df_esm_78243 = df_esm_preprocessed[df_esm_preprocessed["username"] == "uploader_78243"]
|
# %% jupyter={"outputs_hidden": true} tags=[]
|
||||||
df_esm_78243 = df_esm_78243.sort_values("_id")[["id","_id","datetime_lj", "esm_status","esm_trigger","esm_instructions","esm_user_answer","esm_session"]]
|
with pd.option_context(
|
||||||
|
"display.max_rows", None, "display.max_columns", None
|
||||||
# %%
|
): # more options can be specified also
|
||||||
df_esm_78243.columns
|
display(df_esm_session_27[["esm_trigger", "esm_instructions", "esm_user_answer"]])
|
||||||
|
|
||||||
# %%
|
# %% [markdown]
|
||||||
df_esm_78243.to_csv("example.csv")
|
# These are all morning questionnaires with morning *and* workday items, with the feedback added and also branched in the longest possible way.
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
|
|
@ -2,11 +2,12 @@ from collections.abc import Collection
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
def read_baseline(paths: Collection) -> pd.DataFrame:
|
def read_baseline(paths: Collection) -> pd.DataFrame:
|
||||||
#TODO Read CSV files and concat them.
|
# TODO Read CSV files and concat them.
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def preprocess_baseline(df_baseline_from_csv: pd.DataFrame) -> pd.DataFrame:
|
def preprocess_baseline(df_baseline_from_csv: pd.DataFrame) -> pd.DataFrame:
|
||||||
#TODO Translate columns, calculate age.
|
# TODO Translate columns, calculate age.
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -68,10 +68,21 @@ def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
|
||||||
def classify_sessions_adherence(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
def classify_sessions_adherence(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
For each distinct EMA session, determine how the participant responded to it.
|
For each distinct EMA session, determine how the participant responded to it.
|
||||||
Possible outcomes are: esm_unanswered
|
Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE; sessions consisting of a single item are instead labelled with their esm_trigger.
|
||||||
|
|
||||||
This is done in several steps.
|
This is done in three steps.
|
||||||
#TODO Finish the documentation.
|
|
||||||
|
First, the esm_status is considered.
|
||||||
|
If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.
|
||||||
|
|
||||||
|
Second, the sessions which do not represent full questionnaires are identified.
|
||||||
|
These are sessions where participants only marked they are finished with the day or have not yet started working.
|
||||||
|
|
||||||
|
Third, the sessions with only one item are marked with their trigger.
|
||||||
|
We never offered questionnaires with single items, so we can be sure these are unfinished.
|
||||||
|
|
||||||
|
Finally, all sessions that remain are marked as completed.
|
||||||
|
By going through different possibilities in expl_esm.ipynb, this turned out to be a reasonable option.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
@ -80,47 +91,55 @@ def classify_sessions_adherence(df_esm_preprocessed: pd.DataFrame) -> pd.DataFra
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
some dataframe
|
df_session_counts: pd.DataFrame
|
||||||
|
A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
|
||||||
"""
|
"""
|
||||||
sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
|
sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
|
||||||
|
|
||||||
|
# 0. First, assign all session statuses as NaN.
|
||||||
df_session_counts = pd.DataFrame(sessions_grouped.count()["id"]).rename(
|
df_session_counts = pd.DataFrame(sessions_grouped.count()["id"]).rename(
|
||||||
columns={"id": "esm_session_count"}
|
columns={"id": "esm_session_count"}
|
||||||
)
|
)
|
||||||
df_session_counts["session_response"] = np.NaN
|
df_session_counts["session_response"] = np.NaN
|
||||||
|
|
||||||
esm_not_answered = sessions_grouped.apply(lambda x: (x.esm_status != ESM_STATUS_ANSWERED).any())
|
# 1. Identify all ESMs with status other than answered.
|
||||||
df_session_counts.loc[esm_not_answered, "session_response"] = SESSION_STATUS_UNANSWERED
|
esm_not_answered = sessions_grouped.apply(
|
||||||
|
lambda x: (x.esm_status != ESM_STATUS_ANSWERED).any()
|
||||||
|
)
|
||||||
|
df_session_counts.loc[
|
||||||
|
esm_not_answered, "session_response"
|
||||||
|
] = SESSION_STATUS_UNANSWERED
|
||||||
|
|
||||||
|
# 2. Identify non-sessions, i.e. answers about the end of the day.
|
||||||
non_session = sessions_grouped.apply(
|
non_session = sessions_grouped.apply(
|
||||||
lambda x: (
|
lambda x: (
|
||||||
(x.esm_user_answer == "DayFinished3421") # I finished working for today.
|
(x.esm_user_answer == "DayFinished3421") # I finished working for today.
|
||||||
| (x.esm_user_answer == "DayOff3421") # I am not going to work today.
|
| (x.esm_user_answer == "DayOff3421") # I am not going to work today.
|
||||||
| (x.esm_user_answer == "DayFinishedSetEvening") # When would you like to answer the evening EMA?
|
| (
|
||||||
|
x.esm_user_answer == "DayFinishedSetEvening"
|
||||||
|
) # When would you like to answer the evening EMA?
|
||||||
).any()
|
).any()
|
||||||
)
|
)
|
||||||
df_session_counts.loc[non_session, "session_response"] = SESSION_STATUS_DAY_FINISHED
|
df_session_counts.loc[non_session, "session_response"] = SESSION_STATUS_DAY_FINISHED
|
||||||
|
|
||||||
singleton_sessions = (df_session_counts.esm_session_count == 1) & (df_session_counts.session_response.isna())
|
# 3. Identify sessions containing only a single item (session ID appearing only once), as those were certainly not true EMAs.
|
||||||
|
singleton_sessions = (df_session_counts.esm_session_count == 1) & (
|
||||||
|
df_session_counts.session_response.isna()
|
||||||
|
)
|
||||||
df_session_1 = df_session_counts[singleton_sessions]
|
df_session_1 = df_session_counts[singleton_sessions]
|
||||||
df_esm_unique_session = df_session_1.join(
|
df_esm_unique_session = df_session_1.join(
|
||||||
df_esm_preprocessed.set_index(GROUP_SESSIONS_BY), how="left"
|
df_esm_preprocessed.set_index(GROUP_SESSIONS_BY), how="left"
|
||||||
)
|
)
|
||||||
df_esm_unique_session = df_esm_unique_session.assign(session_response=lambda x: x.esm_trigger)["session_response"]
|
df_esm_unique_session = df_esm_unique_session.assign(
|
||||||
df_session_counts.loc[df_esm_unique_session.index, "session_response"] = df_esm_unique_session
|
session_response=lambda x: x.esm_trigger
|
||||||
|
)["session_response"]
|
||||||
|
df_session_counts.loc[
|
||||||
|
df_esm_unique_session.index, "session_response"
|
||||||
|
] = df_esm_unique_session
|
||||||
|
|
||||||
finished_sessions = sessions_grouped.apply(
|
# 4. Mark the remaining sessions as completed.
|
||||||
lambda x: (x.esm_trigger.str.endswith("_last")).any()
|
df_session_counts.loc[
|
||||||
)
|
df_session_counts.session_response.isna(), "session_response"
|
||||||
df_session_counts.loc[finished_sessions, "session_response"] = SESSION_STATUS_COMPLETE
|
] = SESSION_STATUS_COMPLETE
|
||||||
|
|
||||||
# TODO Look at evening-evening_last sequence, if everything is caught with finished sessions
|
|
||||||
|
|
||||||
# TODO What can be done about morning EMA, perhaps morning-morning_first (sic!) sequence?
|
|
||||||
|
|
||||||
# TODO What can be done about workday EMA.
|
|
||||||
|
|
||||||
df_session_counts.loc[df_session_counts.session_response.isna(), "session_response"] = "esm_finished"
|
|
||||||
# TODO But for now, simply take all other ESMs as answered.
|
|
||||||
|
|
||||||
return df_session_counts
|
return df_session_counts
|
||||||
|
|
|
@ -13,12 +13,14 @@
|
||||||
# name: straw2analysis
|
# name: straw2analysis
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import datetime
|
|
||||||
import seaborn as sns
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
import statsmodels.api as sm
|
import statsmodels.api as sm
|
||||||
import statsmodels.formula.api as smf
|
import statsmodels.formula.api as smf
|
||||||
|
|
||||||
|
@ -29,16 +31,24 @@ import participants.query_db
|
||||||
from features.esm import *
|
from features.esm import *
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
baseline_si = pd.read_csv('E:/STRAWbaseline/results-survey637813.csv')
|
baseline_si = pd.read_csv("E:/STRAWbaseline/results-survey637813.csv")
|
||||||
baseline_be_1 = pd.read_csv('E:/STRAWbaseline/results-survey358134.csv')
|
baseline_be_1 = pd.read_csv("E:/STRAWbaseline/results-survey358134.csv")
|
||||||
baseline_be_2 = pd.read_csv('E:/STRAWbaseline/results-survey413767.csv')
|
baseline_be_2 = pd.read_csv("E:/STRAWbaseline/results-survey413767.csv")
|
||||||
baseline = pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner").reset_index().drop(columns="index")
|
baseline = (
|
||||||
|
pd.concat([baseline_si, baseline_be_1, baseline_be_2], join="inner")
|
||||||
|
.reset_index()
|
||||||
|
.drop(columns="index")
|
||||||
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
participants_inactive_usernames = participants.query_db.get_usernames(collection_start=datetime.date.fromisoformat("2020-08-01"))
|
participants_inactive_usernames = participants.query_db.get_usernames(
|
||||||
|
collection_start=datetime.date.fromisoformat("2020-08-01")
|
||||||
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
baseline_inactive = baseline[baseline["Gebruikersnaam"].isin(participants_inactive_usernames)]
|
baseline_inactive = baseline[
|
||||||
|
baseline["Gebruikersnaam"].isin(participants_inactive_usernames)
|
||||||
|
]
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_esm_inactive = get_esm_data(participants_inactive_usernames)
|
df_esm_inactive = get_esm_data(participants_inactive_usernames)
|
||||||
|
@ -48,38 +58,57 @@ df_esm_preprocessed = preprocess_esm(df_esm_inactive)
|
||||||
df_session_counts = classify_sessions_adherence(df_esm_preprocessed)
|
df_session_counts = classify_sessions_adherence(df_esm_preprocessed)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
tbl_session_outcomes = df_session_counts.reset_index()["session_response"].value_counts()
|
tbl_session_outcomes = df_session_counts.reset_index()[
|
||||||
|
"session_response"
|
||||||
|
].value_counts()
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
print("All sessions:", len(df_session_counts))
|
print("All sessions:", len(df_session_counts))
|
||||||
print("-------------------------------------")
|
print("-------------------------------------")
|
||||||
print(tbl_session_outcomes)
|
print(tbl_session_outcomes)
|
||||||
print("-------------------------------------")
|
print("-------------------------------------")
|
||||||
print(tbl_session_outcomes/len(df_session_counts))
|
print(tbl_session_outcomes / len(df_session_counts))
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
VARIABLES_TO_TRANSLATE = {
|
VARIABLES_TO_TRANSLATE = {
|
||||||
"Gebruikersnaam": "username",
|
"Gebruikersnaam": "username",
|
||||||
"Geslacht": "gender",
|
"Geslacht": "gender",
|
||||||
"Geboortedatum": "date_of_birth"
|
"Geboortedatum": "date_of_birth",
|
||||||
}
|
}
|
||||||
baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)
|
baseline_inactive.rename(columns=VARIABLES_TO_TRANSLATE, copy=False, inplace=True)
|
||||||
now = pd.Timestamp('now')
|
now = pd.Timestamp("now")
|
||||||
baseline_inactive = baseline_inactive.assign(date_of_birth = lambda x: pd.to_datetime(x.date_of_birth),
|
baseline_inactive = baseline_inactive.assign(
|
||||||
age = lambda x: (now - x.date_of_birth).dt.days/365.25245)
|
date_of_birth=lambda x: pd.to_datetime(x.date_of_birth),
|
||||||
|
age=lambda x: (now - x.date_of_birth).dt.days / 365.25245,
|
||||||
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_session_counts
|
df_session_counts
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_session_finished = df_session_counts[df_session_counts["session_response"] == "esm_finished"].reset_index()
|
df_session_finished = df_session_counts[
|
||||||
|
df_session_counts["session_response"] == "esm_finished"
|
||||||
|
].reset_index()
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_participant_finished_sessions = df_session_finished.groupby("participant_id").count()["esm_session"].rename("finished_sessions")
|
df_participant_finished_sessions = (
|
||||||
|
df_session_finished.groupby("participant_id")
|
||||||
|
.count()["esm_session"]
|
||||||
|
.rename("finished_sessions")
|
||||||
|
)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
df_adherence = baseline_inactive[["username", "gender", "age", "startlanguage"]].merge(df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(), how="left", on="username")
|
df_adherence = baseline_inactive[["username", "gender", "age", "startlanguage"]].merge(
|
||||||
df_adherence = df_adherence.merge(df_participant_finished_sessions, how="left", left_on="participant_id", right_index=True)
|
df_esm_preprocessed[["username", "participant_id"]].drop_duplicates(),
|
||||||
|
how="left",
|
||||||
|
on="username",
|
||||||
|
)
|
||||||
|
df_adherence = df_adherence.merge(
|
||||||
|
df_participant_finished_sessions,
|
||||||
|
how="left",
|
||||||
|
left_on="participant_id",
|
||||||
|
right_index=True,
|
||||||
|
)
|
||||||
|
|
||||||
# %% tags=[]
|
# %% tags=[]
|
||||||
df_adherence
|
df_adherence
|
||||||
|
@ -91,11 +120,15 @@ df_adherence.describe()
|
||||||
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)
|
sns.displot(df_adherence["finished_sessions"], binwidth=5, height=5)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
lm_adherence = smf.ols('finished_sessions ~ C(gender) + C(startlanguage) + age', data=df_adherence).fit()
|
lm_adherence = smf.ols(
|
||||||
table = sm.stats.anova_lm(lm_adherence, typ=2) # Type 2 ANOVA DataFrame
|
"finished_sessions ~ C(gender) + C(startlanguage) + age", data=df_adherence
|
||||||
|
).fit()
|
||||||
|
table = sm.stats.anova_lm(lm_adherence, typ=2) # Type 2 ANOVA DataFrame
|
||||||
print(table)
|
print(table)
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
lr_ols = smf.ols('finished_sessions ~ C(gender) + C(startlanguage) + age', data=df_adherence)
|
lr_ols = smf.ols(
|
||||||
|
"finished_sessions ~ C(gender) + C(startlanguage) + age", data=df_adherence
|
||||||
|
)
|
||||||
ls_result = lr_ols.fit()
|
ls_result = lr_ols.fit()
|
||||||
ls_result.summary()
|
ls_result.summary()
|
||||||
|
|
Loading…
Reference in New Issue