2021-05-27 18:10:34 +02:00
|
|
|
from collections.abc import Collection
|
|
|
|
|
2021-06-07 19:32:38 +02:00
|
|
|
import numpy as np
|
2021-05-27 18:10:34 +02:00
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
from config.models import ESM, Participant
|
2021-08-11 17:26:44 +02:00
|
|
|
from features import helper
|
2021-05-27 18:10:34 +02:00
|
|
|
from setup import db_engine, session
|
|
|
|
|
2021-06-11 13:50:24 +02:00
|
|
|
# Value of the esm_status column that marks an ESM item as answered.
ESM_STATUS_ANSWERED = 2

# Columns that together identify a single EMA session.
GROUP_SESSIONS_BY = [
    "participant_id",
    "device_id",
    "esm_session",
]

# Labels assigned to whole sessions by classify_sessions_by_completion.
SESSION_STATUS_UNANSWERED = "ema_unanswered"
SESSION_STATUS_DAY_FINISHED = "day_finished"
SESSION_STATUS_COMPLETE = "ema_completed"

# Special values of esm_user_answer that do not belong to a real questionnaire.
ANSWER_DAY_FINISHED = "DayFinished3421"  # I finished working for today.
ANSWER_DAY_OFF = "DayOff3421"  # I am not going to work today.
ANSWER_SET_EVENING = (
    "DayFinishedSetEvening"  # When would you like to answer the evening EMA?
)

# When the participant was not yet at work at the time of the first (morning) EMA,
# only three items were answered:
# two sleep-related items and one indicating NOT having started work yet.
# Daytime EMAs are all longer; in fact, they always consist of at least 6 items.
MAX_MORNING_LENGTH = 3
|
|
|
|
|
2023-07-03 17:09:15 +02:00
|
|
|
# Maps a questionnaire (sub)scale name to its numeric questionnaire ID.
# NOTE(review): the source of these IDs is not visible in this file;
# confirm against the study's questionnaire definitions before changing any.
QUESTIONNAIRE_IDS = {
    "sleep_quality": 1,
    # PANAS_* scales
    "PANAS_positive_affect": 8,
    "PANAS_negative_affect": 9,
    # JCQ_* scales
    "JCQ_job_demand": 10,
    "JCQ_job_control": 11,
    "JCQ_supervisor_support": 12,
    "JCQ_coworker_support": 13,
    # PFITS_* scales
    "PFITS_supervisor": 14,
    "PFITS_coworkers": 15,
    # UWES_* scales
    "UWES_vigor": 16,
    "UWES_dedication": 17,
    "UWES_absorption": 18,
    # COPE_* scales
    "COPE_active": 19,
    "COPE_support": 20,
    "COPE_emotions": 21,
    # Balance, recovery and symptoms
    "balance_life_work": 22,
    "balance_work_life": 23,
    "recovery_experience_detachment": 24,
    "recovery_experience_relaxation": 25,
    "symptoms": 26,
    # appraisal_* items
    "appraisal_stressfulness_event": 87,
    "appraisal_threat": 88,
    "appraisal_challenge": 89,
    "appraisal_event_time": 90,
    "appraisal_event_duration": 91,
    "appraisal_event_work_related": 92,
    "appraisal_stressfulness_period": 93,
    # Work-day related items
    "late_work": 94,
    "work_hours": 95,
    "left_work": 96,
    "activities": 97,
    "coffee_breaks": 98,
    "at_work_yet": 99,
}
|
|
|
|
|
2021-05-27 18:10:34 +02:00
|
|
|
|
|
|
|
def get_esm_data(usernames: Collection) -> pd.DataFrame:
    """
    Query the esm table for the given participants and load the result.

    Parameters
    ----------
    usernames: Collection
        Usernames of the participants whose ESM rows should be selected.

    Returns
    -------
    df_esm: pd.DataFrame
        The query result: the ESM columns together with the participant's username.
    """
    # Link every ESM row to its participant and keep only the requested usernames.
    belongs_to_participant = Participant.id == ESM.participant_id
    is_requested_user = Participant.username.in_(usernames)
    esm_statement = (
        session.query(ESM, Participant.username)
        .filter(belongs_to_participant)
        .filter(is_requested_user)
        .statement
    )
    with db_engine.connect() as connection:
        return pd.read_sql(esm_statement, connection)
|
|
|
|
|
|
|
|
|
|
|
|
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Convert timestamps and expand JSON column.

    Timestamps are converted by helper.get_date_from_timestamp and
    the esm_json column is expanded into several Pandas DF columns.

    Parameters
    ----------
    df_esm: pd.DataFrame
        A dataframe of esm data. It must contain the esm_json column.

    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
        The dataframe returned by helper.get_date_from_timestamp
        with all fields from the esm_json column (except esm_trigger) added as columns.
    """
    df_esm = helper.get_date_from_timestamp(df_esm)

    # The esm_trigger column is already present in the main df.
    df_esm_json = pd.json_normalize(df_esm["esm_json"]).drop(columns=["esm_trigger"])

    # json_normalize may return a fresh 0..n-1 index, while the rows are still
    # in the same order as df_esm. Reuse the original labels so that the join
    # below pairs each row with its own JSON fields even when df_esm does not
    # carry a default RangeIndex (e.g. after filtering).
    # NOTE(review): the join is label based, so index labels are assumed unique.
    df_esm_json.index = df_esm.index

    return df_esm.join(df_esm_json)
|
2021-06-07 19:32:38 +02:00
|
|
|
|
|
|
|
|
2021-06-11 16:34:09 +02:00
|
|
|
def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.

    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED,
    SESSION_STATUS_COMPLETE, and, for single-item sessions, the value of esm_trigger.

    This is done in four steps, where a later step overrides an earlier one
    unless stated otherwise.

    First, the esm_status is considered.
    If any of the ESMs in a session has a status *other than* "answered",
    then this session is taken as unfinished.

    Second, the sessions which do not represent full questionnaires are identified.
    These are sessions where participants only marked they are finished with the day
    or have not yet started working.

    Third, the sessions with only one item, which have not been labelled
    by the previous two steps, are marked with their trigger.
    We never offered questionnaires with single items,
    so we can be sure these are unfinished.

    Finally, all sessions that remain are marked as completed.
    By going through different possibilities in expl_esm_adherence.ipynb,
    this turned out to be a reasonable option.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data,
        which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts: pd.DataFrame
        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
        with their statuses and the number of items.
    """
    sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)
    # Grouping keys as Series, used to aggregate row-wise boolean masks per session.
    session_keys = [df_esm_preprocessed[key] for key in GROUP_SESSIONS_BY]

    # 0. First, count the items and leave all session statuses undetermined.
    df_session_counts = (
        sessions_grouped["id"].count().rename("esm_session_count").to_frame()
    )
    # Use object dtype from the start: the column is later filled with strings,
    # and writing strings into a float column is deprecated in recent pandas.
    df_session_counts["session_response"] = pd.Series(
        np.nan, index=df_session_counts.index, dtype=object
    )

    # 1. Identify all sessions containing an ESM with status other than answered.
    esm_not_answered = (
        (df_esm_preprocessed["esm_status"] != ESM_STATUS_ANSWERED)
        .groupby(session_keys)
        .any()
    )
    df_session_counts.loc[
        esm_not_answered, "session_response"
    ] = SESSION_STATUS_UNANSWERED

    # 2. Identify non-sessions, i.e. answers about the end of the day.
    is_day_end_answer = df_esm_preprocessed["esm_user_answer"].isin(
        [
            ANSWER_DAY_FINISHED,  # I finished working for today.
            ANSWER_DAY_OFF,  # I am not going to work today.
            ANSWER_SET_EVENING,  # When would you like to answer the evening EMA?
        ]
    )
    non_session = is_day_end_answer.groupby(session_keys).any()
    df_session_counts.loc[non_session, "session_response"] = SESSION_STATUS_DAY_FINISHED

    # 3. Identify sessions appearing only once, as those were not true EMAs for sure.
    # Only sessions that are still unlabelled are marked with their trigger.
    singleton_sessions = (df_session_counts["esm_session_count"] == 1) & (
        df_session_counts["session_response"].isna()
    )
    # A singleton session has a single item, so the first trigger is its trigger.
    session_trigger = sessions_grouped["esm_trigger"].first()
    df_session_counts.loc[
        singleton_sessions, "session_response"
    ] = session_trigger[singleton_sessions]

    # 4. Mark the remaining sessions as completed.
    df_session_counts.loc[
        df_session_counts["session_response"].isna(), "session_response"
    ] = SESSION_STATUS_COMPLETE

    return df_session_counts
|
2021-06-11 16:34:09 +02:00
|
|
|
|
|
|
|
|
|
|
|
def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    Determine the time type and the first timestamp of each EMA session.

    The rows are ordered by participant and datetime_lj, and then,
    for every session, the first entries of time and datetime_lj are taken.
    Note that GroupBy.first() by default returns the first non-missing value
    of each column.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data,
        which must include the session ID (esm_session).

    Returns
    -------
    df_session_time: pd.DataFrame
        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY)
        with their time type and timestamp of first answer.
    """
    chronological = df_esm_preprocessed.sort_values(["participant_id", "datetime_lj"])
    first_per_session = chronological.groupby(GROUP_SESSIONS_BY).first()
    return first_per_session[["time", "datetime_lj"]]
|
2021-06-11 16:44:33 +02:00
|
|
|
|
|
|
|
|
|
|
|
def classify_sessions_by_completion_time(
    df_esm_preprocessed: pd.DataFrame,
) -> pd.DataFrame:
    """
    Combine both session classifications and fix the time type of long mornings.

    Besides joining the results of classify_sessions_by_completion and
    classify_sessions_by_time, this function "corrects" the time type
    of some EMA sessions.

    A morning questionnaire could seamlessly transition into a daytime questionnaire
    if the participant was already at work.
    In this case, the "time" label changed mid-session.
    Because of the way classify_sessions_by_time works,
    such a questionnaire was classified as "morning",
    but for all intents and purposes, it can be treated as a "daytime" EMA.

    This scenario is told apart from a true "morning" questionnaire,
    where the participant was NOT yet at work, by the session length:
    a "morning" session with more than MAX_MORNING_LENGTH items is relabelled.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data,
        which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts_time: pd.DataFrame
        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses,
        the number of items,
        their time type (with some morning EMAs reclassified)
        and timestamp of first answer.
    """
    completion = classify_sessions_by_completion(df_esm_preprocessed)
    timing = classify_sessions_by_time(df_esm_preprocessed)
    sessions = timing.join(completion)

    labelled_morning = sessions["time"] == "morning"
    too_long_for_morning = sessions["esm_session_count"] > MAX_MORNING_LENGTH
    sessions.loc[labelled_morning & too_long_for_morning, "time"] = "daytime"

    return sessions
|
2021-07-02 16:33:48 +02:00
|
|
|
|
|
|
|
|
|
|
|
def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    Eliminate invalid ESM responses.

    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".
    The whole leading integer is extracted, so that multi-digit answers
    (e.g. "10") are not truncated to their first digit.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data.

    Returns
    -------
    df_esm_clean: pd.DataFrame
        A subset of the original dataframe (as a copy)
        with an added column esm_user_answer_numeric.
        This is NaN for ESM types other than radio, scale, and number.

    Raises
    ------
    ValueError
        If an answer of a numeric ESM type does not start with an integer.
    """
    is_answered = df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED
    is_day_end_marker = df_esm_preprocessed["esm_user_answer"].isin(
        [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING]
    )
    # Work on a copy, so that adding a column below neither touches the input
    # dataframe nor triggers a SettingWithCopyWarning.
    df_esm_clean = df_esm_preprocessed[is_answered & ~is_day_end_marker].copy()

    df_esm_clean["esm_user_answer_numeric"] = np.nan
    esm_type_numeric = [
        ESM.ESM_TYPE.get("radio"),
        ESM.ESM_TYPE.get("scale"),
        ESM.ESM_TYPE.get("number"),
    ]
    is_numeric_type = df_esm_clean["esm_type"].isin(esm_type_numeric)

    if is_numeric_type.any():
        # Take all leading digits rather than only the first character.
        leading_integer = df_esm_clean.loc[
            is_numeric_type, "esm_user_answer"
        ].str.extract(r"^\s*(-?\d+)", expand=False)
        if leading_integer.isna().any():
            # Keep failing loudly on unparseable answers, as int conversion did.
            raise ValueError(
                "Some answers of numeric ESM types do not start with an integer."
            )
        df_esm_clean.loc[
            is_numeric_type, "esm_user_answer_numeric"
        ] = leading_integer.astype(int)

    return df_esm_clean
|