stress_at_work_analysis/features/esm.py

import datetime
from collections.abc import Collection

import numpy as np
import pandas as pd
from pytz import timezone

from config.models import ESM, Participant
from setup import db_engine, session

# Time zone in which answer timestamps are expressed (used by preprocess_esm).
TZ_LJ = timezone("Europe/Ljubljana")
# Value of the esm_status column that marks an ESM item as answered.
ESM_STATUS_ANSWERED = 2

# Columns that together identify one ESM session when grouping.
GROUP_SESSIONS_BY = ["participant_id", "device_id", "esm_session"]

# Labels written to the session_response column by classify_sessions_by_completion.
SESSION_STATUS_UNANSWERED = "ema_unanswered"
SESSION_STATUS_DAY_FINISHED = "day_finished"
SESSION_STATUS_COMPLETE = "ema_completed"


def get_esm_data(usernames: Collection) -> pd.DataFrame:
    """
    Load the ESM rows of the selected participants from the database.

    Parameters
    ----------
    usernames: Collection
        Usernames of the participants whose ESM rows should be fetched.
        They are used in the WHERE ... IN condition of the query.

    Returns
    -------
    df_esm: pd.DataFrame
        One row per ESM record, together with the username of its participant.
    """
    # Join condition between the two tables and the username filter.
    belongs_to_participant = Participant.id == ESM.participant_id
    username_is_selected = Participant.username.in_(usernames)

    esm_with_username = session.query(ESM, Participant.username)
    esm_with_username = esm_with_username.filter(belongs_to_participant)
    esm_with_username = esm_with_username.filter(username_is_selected)

    # The connection is only needed while the query result is being read.
    with db_engine.connect() as connection:
        return pd.read_sql(esm_with_username.statement, connection)


def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Convert timestamps into human-readable datetimes and expand the JSON column into several Pandas DF columns.

    The expanded JSON fields are attached to the rows of df_esm by position,
    so the result is correct for any index of df_esm (e.g. after filtering rows),
    not only for a default 0..n-1 index. The index of df_esm is kept in the result.

    Note that the datetime_lj column is also added to the passed df_esm in place.

    Parameters
    ----------
    df_esm: pd.DataFrame
        A dataframe of esm data. It must contain the columns
        double_esm_user_answer_timestamp (milliseconds since the epoch) and esm_json.

    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
    """
    # Timestamps are stored in milliseconds, fromtimestamp expects seconds.
    df_esm["datetime_lj"] = df_esm["double_esm_user_answer_timestamp"].apply(
        lambda x: datetime.datetime.fromtimestamp(x / 1000.0, tz=TZ_LJ)
    )
    # Pass a plain list, so that the normalized frame is always indexed 0..n-1,
    # independently of the index of df_esm and of the pandas version.
    # The esm_trigger column is already present in the main df, so it is dropped
    # here; errors="ignore" keeps empty input (no JSON columns at all) working.
    df_esm_json = pd.json_normalize(df_esm["esm_json"].tolist()).drop(
        columns=["esm_trigger"], errors="ignore"
    )
    # Join row by row (by position), then restore the original index.
    df_esm_preprocessed = df_esm.reset_index(drop=True).join(df_esm_json)
    df_esm_preprocessed.index = df_esm.index
    return df_esm_preprocessed


def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.
    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE.
    Sessions consisting of a single item are an exception: they get the value of their esm_trigger instead.

    This is done in four steps. A later step can overwrite the status set by an earlier one,
    except for step 3 and 4, which only touch sessions that have no status yet.

    First, the esm_status is considered.
    If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.

    Second, the sessions which do not represent full questionnaires are identified.
    These are sessions where participants only marked they are finished with the day or have not yet started working.

    Third, the sessions with only one item are marked with their trigger.
    We never offered questionnaires with single items, so we can be sure these are unfinished.

    Finally, all sessions that remain are marked as completed.
    By going through different possibilities in expl_esm.ipynb, this turned out to be a reasonable option.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data, which must include the session ID (esm_session).
        The columns id, esm_status, esm_user_answer, and esm_trigger are also used.

    Returns
    -------
    df_session_counts: pd.DataFrame
        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses (session_response)
        and the number of items (esm_session_count).
    """
    sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)

    # 0. First, assign all session statuses as NaN.
    df_session_counts = pd.DataFrame(sessions_grouped.count()["id"]).rename(
        columns={"id": "esm_session_count"}
    )
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    # The column is created with object dtype, since it is filled with strings below;
    # a float column would need to be upcast on the first assignment.
    df_session_counts["session_response"] = pd.Series(
        np.nan, index=df_session_counts.index, dtype=object
    )

    # 1. Identify all ESMs with status other than answered.
    esm_not_answered = sessions_grouped.apply(
        lambda x: (x.esm_status != ESM_STATUS_ANSWERED).any()
    )
    df_session_counts.loc[
        esm_not_answered, "session_response"
    ] = SESSION_STATUS_UNANSWERED

    # 2. Identify non-sessions, i.e. answers about the end of the day.
    non_session = sessions_grouped.apply(
        lambda x: (
            (x.esm_user_answer == "DayFinished3421")  # I finished working for today.
            | (x.esm_user_answer == "DayOff3421")  # I am not going to work today.
            | (
                x.esm_user_answer == "DayFinishedSetEvening"
            )  # When would you like to answer the evening EMA?
        ).any()
    )
    df_session_counts.loc[non_session, "session_response"] = SESSION_STATUS_DAY_FINISHED

    # 3. Identify sessions appearing only once, as those were not true EMAs for sure.
    # Only sessions that did not get a status in steps 1 and 2 are considered.
    singleton_sessions = (df_session_counts.esm_session_count == 1) & (
        df_session_counts.session_response.isna()
    )
    df_session_1 = df_session_counts[singleton_sessions]
    # Bring in the original ESM rows of these sessions to read their esm_trigger.
    df_esm_unique_session = df_session_1.join(
        df_esm_preprocessed.set_index(GROUP_SESSIONS_BY), how="left"
    )
    df_esm_unique_session = df_esm_unique_session.assign(
        session_response=lambda x: x.esm_trigger
    )["session_response"]
    df_session_counts.loc[
        df_esm_unique_session.index, "session_response"
    ] = df_esm_unique_session

    # 4. Mark the remaining sessions as completed.
    df_session_counts.loc[
        df_session_counts.session_response.isna(), "session_response"
    ] = SESSION_STATUS_COMPLETE

    return df_session_counts


def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    Find when each EMA session started and which time type (morning, workday, or evening) it belongs to.

    The rows are ordered by participant and answer time, and the first entry of every session is kept.
    NOTE(review): GroupBy.first() skips missing values per column by default, so "time" and
    "datetime_lj" may come from different rows if the earliest row has a missing value - confirm this is intended.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data, which must include the session ID (esm_session),
        as well as the columns time and datetime_lj.

    Returns
    -------
    df_session_time: pd.DataFrame
        One row per session (grouped by GROUP_SESSIONS_BY) with its time type (time)
        and the timestamp of the first answer (datetime_lj).
    """
    # Order answers chronologically within each participant.
    chronological = df_esm_preprocessed.sort_values(
        by=["participant_id", "datetime_lj"]
    )
    first_per_session = chronological.groupby(GROUP_SESSIONS_BY).first()
    return first_per_session[["time", "datetime_lj"]]