# stress_at_work_analysis/features/esm_SAM.py

import numpy as np
import pandas as pd

import features.esm

# Presumably the bounds of the original SAM answer scale (not used in this
# part of the module) -- TODO confirm.
SAM_ORIGINAL_MIN = 1
SAM_ORIGINAL_MAX = 5

# Questionnaire IDs of the individual SAM parts, keyed by a descriptive name.
QUESTIONNAIRE_ID_SAM = dict(
    event_stress=87,
    event_threat=88,
    event_challenge=89,
    event_time=90,
    event_duration=91,
    event_work_related=92,
    period_stress=93,
)
# Lowest and highest SAM questionnaire ID, used to select the whole SAM range.
QUESTIONNAIRE_ID_SAM_LOW, QUESTIONNAIRE_ID_SAM_HIGH = (
    min(QUESTIONNAIRE_ID_SAM.values()),
    max(QUESTIONNAIRE_ID_SAM.values()),
)

# Each questionnaire occurs only once within each esm_session on the same device
# within the same participant, so these columns identify one set of answers.
GROUP_QUESTIONNAIRES_BY = ["participant_id", "device_id", "esm_session"]


# Expected question texts ("esm_instructions") for each SAM question ID.
#
# Each value is a tuple of three wordings of the same question: English, Dutch
# and Slovenian. reassign_question_ids() passes these tuples to
# str.startswith(), so every entry only has to be a *prefix* of the text that
# occurs in the data.
#
# IDs 87-100 and 101-114 list the same 14 questions in the same order (the
# English wordings are identical; some Dutch/Slovenian wordings differ slightly).
# Since reassign_question_ids() takes the first key whose tuple matches, a text
# that matches both ranges is assigned the ID from the lower range.
# NOTE(review): presumably the two ranges correspond to the protocol versions
# before and after the COVID-related change -- confirm.
DICT_SAM_QUESTION_IDS = {
    87: (
        "Was there a particular event that created tension in you?",
        "Was er een bepaalde gebeurtenis die spanning veroorzaakte?",
        "Je prišlo do kakega dogodka, ki je v vas ustvaril napetost?",
    ),
    88: (
        "Did this event make you feel anxious?",
        "Voelde je je angstig door deze gebeurtenis?",
        "Ste se zaradi tega dogodka počutili tesnobno?",
    ),
    89: (
        "Will the outcome of this event be negative?",
        "Zal de uitkomst van deze gebeurtenis negatief zijn? ",
        "Bo izid tega dogodka negativen?",
    ),
    90: (
        "How threatening was this event?",
        "Hoe bedreigend was deze gebeurtenis?",
        "Kako grozeč je bil ta dogodek?",
    ),
    91: (
        "Is this going to have a negative impact on you?",
        "Zal dit een negatieve impact op je hebben?",
        "Ali bo to negativno vplivalo na vas?",
    ),
    92: (
        "Is this going to have a positive impact on you?",
        "Zal dit een positief effect op je hebben?",
        "Ali bo to pozitivno vplivalo na vas?",
    ),
    93: (
        "How eager are you to tackle this event?",
        "Hoe graag wil je deze gebeurtenis aanpakken?",
        "Kako zagnani ste bili pri spopadanju s tem dogodkom?",
    ),
    94: (
        "To what extent can you become a stronger person because of this event?",
        "In welke mate kan je een sterkere persoon worden door deze gebeurtenis?",
        "V kolikšni meri lahko zaradi tega dogodka postanete močnejša oseba?",
    ),
    95: (
        "To what extent are you excited thinking about the outcome of this event?",
        # The Dutch wording is not a complete sentence; it is matched as a prefix.
        "In welke mate ben je enthousiast bij de gedachte aan",
        "V kolikšni meri vas misel na izid tega dogodka navdušuje?",
    ),
    96: (
        "At what time did this event occur?",
        "Hoe laat vond deze gebeurtenis plaats?",
        "Kdaj se je ta dogodek zgodil?",
    ),
    97: (
        "How long did this event last?",
        "Hoe lang duurde deze gebeurtenis?",
        "Kako dolgo je trajal ta dogodek?",
    ),
    98: (
        "Was/is this event work-related?",
        "Was/is deze gebeurtenis werkgerelateerd?",
        "Je (bil) ta dogodek povezan s službo?",
    ),
    99: (
        "Did this overall period create tension in you?",
        "Heeft deze globale periode spanning veroorzaakt?",
        "Je to obdobje kot celota v vas ustvarilo napetost?",
    ),
    100: (
        "To what extent do you perceive this overall period as stressful?",
        "In welke mate ervaar je deze globale periode als stressvol?",
        "V kolikšni meri ste to obdobje dojemali kot stresno?",
    ),
    # Second block: the same questions again under IDs 101-114.
    101: (
        "Was there a particular event that created tension in you?",
        "Was er een bepaalde gebeurtenis die spanning veroorzaakte?",
        "Je prišlo do kakega dogodka, ki je v vas ustvaril napetost?",
    ),
    102: (
        "Did this event make you feel anxious?",
        "Voelde je je angstig door deze gebeurtenis?",
        "Ste se zaradi tega dogodka počutili tesnobne?",
    ),
    103: (
        "Will the outcome of this event be negative?",
        "Zal de uitkomst van deze gebeurtenis negatief zijn? ",
        "Bo izid tega dogodka negativen?",
    ),
    104: (
        "How threatening was this event?",
        "Hoe bedreigend was deze gebeurtenis?",
        "Kako grozeč je bil ta dogodek?",
    ),
    105: (
        "Is this going to have a negative impact on you?",
        "Zal dit een negatieve impact op je hebben?",
        "Ali bo to negativno vplivalo na vas?",
    ),
    106: (
        "Is this going to have a positive impact on you?",
        "Zal dit een positief effect op je hebben?",
        "Ali bo to pozitivno vplivalo na vas?",
    ),
    107: (
        "How eager are you to tackle this event?",
        "Hoe graag wil je deze gebeurtenis aanpakken?",
        "Kako zagnani ste bili, da se spopadete s tem dogodkom?",
    ),
    108: (
        "To what extent can you become a stronger person because of this event?",
        "In welke mate kan je een sterkere persoon worden door deze gebeurtenis?",
        "V kolikšni meri lahko zaradi tega dogodka postanete močnejša oseba?",
    ),
    109: (
        "To what extent are you excited thinking about the outcome of this event?",
        # The Dutch wording is not a complete sentence; it is matched as a prefix.
        "In welke mate ben je enthousiast bij de gedachte",
        "V kolikšni meri vas misel na izid tega dogodka navdušuje?",
    ),
    110: (
        "At what time did this event occur?",
        "Hoe laat vond deze gebeurtenis plaats?",
        "Kdaj se je ta dogodek zgodil?",
    ),
    111: (
        "How long did this event last?",
        "Hoe lang duurde deze gebeurtenis?",
        "Kako dolgo je trajal ta dogodek?",
    ),
    112: (
        "Was/is this event work-related?",
        "Was/is deze gebeurtenis werkgerelateerd?",
        "Je bil ali je ta dogodek povezan s službo?",
    ),
    113: (
        "Did this overall period create tension in you?",
        "Heeft deze globale periode spanning veroorzaakt?",
        "Je to celo obdobje v vas ustvarilo napetost?",
    ),
    114: (
        "To what extent do you perceive this overall period as stressful?",
        "In welke mate ervaar je deze globale periode als stressvol?",
        "V kolikšni meri ste celo to obdobje dojemali kot stresno?",
    ),
}


def extract_stressful_events(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Collect all event-related Stress Appraisal Measure features per ESM session.

    Participants were asked: "Was there a particular event that created tension in you?"
    and then a set of follow-up questions about that event.
    The raw data are preprocessed, restricted to the SAM questionnaires and
    cleaned up. Then each follow-up question is processed by its own function
    and the resulting columns are joined onto the threat/challenge means.
    The joins use the default (left) join of DataFrame.join, so the result has
    one row per session present in the threat/challenge table.

    Parameters
    ----------
    df_esm: pd.DataFrame
        A raw dataframe of all ESM data.

    Returns
    -------
    df_esm_events: pd.DataFrame
        A dataframe indexed by GROUP_QUESTIONNAIRES_BY with the columns
        threat_mean, challenge_mean, event_present, event_stressfulness,
        event_work_related, event_time, event_duration and event_duration_info.

    """
    # 0. Select only questions from Stress Appraisal Measure.
    esm_preprocessed = features.esm.preprocess_esm(df_esm)
    questionnaire_ids = esm_preprocessed["questionnaire_id"]
    at_least_low = questionnaire_ids >= QUESTIONNAIRE_ID_SAM_LOW
    at_most_high = questionnaire_ids <= QUESTIONNAIRE_ID_SAM_HIGH
    sam_clean = features.esm.clean_up_esm(esm_preprocessed[at_least_low & at_most_high])

    # 1. The threat and challenge means form the base table (one row per session).
    df_esm_events = calculate_threat_challenge_means(sam_clean)

    # 2.-5. Each remaining step returns a long dataframe from which only
    # the listed new columns are taken and joined on the session keys.
    follow_up_steps = (
        (detect_stressful_event, ["event_present", "event_stressfulness"]),
        (detect_event_work_related, ["event_work_related"]),
        (convert_event_time, ["event_time"]),
        (extract_event_duration, ["event_duration", "event_duration_info"]),
    )
    for process_step, new_columns in follow_up_steps:
        step_result = process_step(sam_clean)
        step_features = step_result[GROUP_QUESTIONNAIRES_BY + new_columns]
        df_esm_events = df_esm_events.join(
            step_features.set_index(GROUP_QUESTIONNAIRES_BY)
        )

    return df_esm_events


def calculate_threat_challenge_means(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    Average the threat and challenge answers within each ESM session.

    Threat and challenge are two Stress Appraisal Measure subscales.
    Their numeric answers are averaged separately for every combination of
    participant, device and ESM session and returned in wide format.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_threat_challenge_mean_wide: pd.DataFrame
        A dataframe indexed by participant_id, device_id and esm_session
        with the subscale means in the columns threat_mean and challenge_mean.
    """
    threat_id = QUESTIONNAIRE_ID_SAM.get("event_threat")
    challenge_id = QUESTIONNAIRE_ID_SAM.get("event_challenge")

    # Keep only the answers belonging to one of the two subscales.
    questionnaire_ids = df_esm_sam_clean["questionnaire_id"]
    is_subscale = (questionnaire_ids == threat_id) | (questionnaire_ids == challenge_id)
    subscale_answers = df_esm_sam_clean[is_subscale]

    # One row per session, one column per questionnaire, cells hold the mean.
    subscale_means = subscale_answers.pivot_table(
        index=["participant_id", "device_id", "esm_session"],
        columns=["questionnaire_id"],
        values=["esm_user_answer_numeric"],
        aggfunc="mean",
    )

    # The pivot yields two column levels (value name, questionnaire ID);
    # only the questionnaire ID is kept and then replaced by a readable name.
    questionnaire_level = subscale_means.columns.get_level_values(1)
    subscale_means.columns = questionnaire_level
    subscale_means.columns.name = None
    readable_names = {threat_id: "threat_mean", challenge_id: "challenge_mean"}
    return subscale_means.rename(columns=readable_names)


def detect_stressful_event(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    Flag whether a stressful event was reported and how stressful it was.

    Participants were asked: "Was there a particular event that created tension in you?"
    with these answer options:
        0 - No,
        1 - Yes, slightly,
        2 - Yes, moderately,
        3 - Yes, considerably,
        4 - Yes, extremely.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_stress: pd.DataFrame
        The rows of the event_stress questionnaire with two new columns:
            - event_present, True when the numeric answer is greater than 0,
            - event_stressfulness, the numeric answer itself.

    """
    stress_questionnaire_id = QUESTIONNAIRE_ID_SAM.get("event_stress")
    is_stress_item = df_esm_sam_clean["questionnaire_id"] == stress_questionnaire_id
    stress_answers = df_esm_sam_clean[is_stress_item]

    numeric_answer = stress_answers["esm_user_answer_numeric"]
    return stress_answers.assign(
        event_present=numeric_answer > 0,
        event_stressfulness=numeric_answer,
    )


def detect_event_work_related(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    Add a column with the answer to "Was/is this event work-related?".

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_work_related: pd.DataFrame
        The rows of the event_work_related questionnaire with a new column
        event_work_related, which is True when the numeric answer is
        greater than 0 and False otherwise.

    """
    work_related_id = QUESTIONNAIRE_ID_SAM.get("event_work_related")
    is_work_related_item = df_esm_sam_clean["questionnaire_id"] == work_related_id
    work_related_answers = df_esm_sam_clean[is_work_related_item]

    answered_yes = work_related_answers["esm_user_answer_numeric"] > 0
    return work_related_answers.assign(event_work_related=answered_yes)


def convert_event_time(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    Parse the answers about the time of the event into datetimes.

    The answers are strings such as "2020-09-29 00:05:00 +0200".
    Conversion errors are coerced, so every answer that does not match
    this format becomes Not a Time (NaT).
    NOTE: Since the only available non-datetime answer to this question was
        "0 - I do not remember", the NaTs can be interpreted to mean this.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_time: pd.DataFrame
        The rows of the event_time questionnaire with a new column
        event_time holding the parsed answers.
    """
    # Example answer: 2020-09-29 00:05:00 +0200
    answer_format = "%Y-%m-%d %H:%M:%S %z"

    time_questionnaire_id = QUESTIONNAIRE_ID_SAM.get("event_time")
    is_time_item = df_esm_sam_clean["questionnaire_id"] == time_questionnaire_id
    time_answers = df_esm_sam_clean[is_time_item]

    parsed_time = pd.to_datetime(
        time_answers["esm_user_answer"],
        errors="coerce",
        format=answer_format,
        exact=True,
    )
    return time_answers.assign(event_time=parsed_time)


def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
    """
    Extract the reported duration of the stressful event.

    If participants indicated a stressful event, they were asked:
        "How long did this event last? (Answer in hours and minutes)"
    This function extracts this duration and saves the additional answers:
        0 - I do not remember,
        1 - It is still going on.

    Parameters
    ----------
    df_esm_sam_clean: pd.DataFrame
        A cleaned up dataframe of Stress Appraisal Measure items.

    Returns
    -------
    df_esm_event_duration: pd.DataFrame
        The rows of the event_duration questionnaire with two new columns:
            - event_duration, the time part of the datetime answer
              (missing when the answer could not be parsed),
            - event_duration_info (float), set only where event_duration is
              missing; it holds the leading digit of the answer:
                0 - I do not remember,
                1 - It is still going on.
              It is NaN where a duration was found, and also where the answer
              is missing or does not start with a digit.
    """
    is_duration_item = df_esm_sam_clean[
        "questionnaire_id"
    ] == QUESTIONNAIRE_ID_SAM.get("event_duration")
    # Example answer: 2020-09-29 00:05:00 +0200
    # The last six characters (the UTC offset) are cut off before parsing.
    df_esm_event_duration = df_esm_sam_clean[is_duration_item].assign(
        event_duration=lambda x: pd.to_datetime(
            x.esm_user_answer.str.slice(start=0, stop=-6),
            errors="coerce",
            format="%Y-%m-%d %H:%M:%S",
        ).dt.time
    )
    # TODO Explore the values recorded in event_duration and possibly fix mistakes.
    # For example, participants reported setting 23:50:00 instead of 00:50:00.

    # For the events where no duration was found (i.e. event_duration is missing),
    # we can determine whether:
    #   - this event is still going on ("1 - It is still going on")
    #   - the participant couldn't remember its duration ("0 - I do not remember")
    # Generally, these answers were converted to esm_user_answer_numeric in clean_up_esm
    # but only the numeric types of questions and answers.
    # Since this was of "datetime" type, convert these specific answers here again.
    duration_missing = df_esm_event_duration["event_duration"].isna()
    # errors="coerce" turns missing answers and non-digit first characters
    # into NaN instead of raising, as a plain int conversion would.
    answer_code = pd.to_numeric(
        df_esm_event_duration["esm_user_answer"].str.slice(stop=1),
        errors="coerce",
    )
    # Write only this one column and keep the code only where no duration was
    # parsed. All other columns stay untouched and no row-wise realignment is
    # needed. The cast keeps the dtype float even when no NaN is present.
    df_esm_event_duration["event_duration_info"] = answer_code.where(
        duration_missing
    ).astype(float)

    return df_esm_event_duration


# TODO: How many questions about the stressfulness of the period were asked
#  and how does this relate to events?


def reassign_question_ids(df_sam_cleaned: pd.DataFrame) -> pd.DataFrame:
    """
    Fix question IDs to match their actual content.

    Unfortunately, when altering the protocol to adapt to COVID pandemic,
    we did not retain original question IDs.
    This means that for participants before 2021, they are different
    from for the rest of them.
    This function searches for question IDs by matching their strings.

    First, for every question ID listed in DICT_SAM_QUESTION_IDS, all question
    texts found under that ID in the data must start with one of the wordings
    expected for it. Question IDs that occur in the data but not in the
    dictionary are not checked.
    Then every row gets the first question ID in DICT_SAM_QUESTION_IDS whose
    wordings match the start of its esm_instructions.

    Parameters
    ----------
    df_sam_cleaned: pd.DataFrame
        A cleaned up dataframe, which must also include esm_user_answer_numeric.
        The columns question_id and esm_instructions are used here.

    Returns
    -------
    df_sam_fixed: pd.DataFrame
        A copy of the dataframe with fixed question IDs. Rows whose instructions
        match no known wording (or are not strings) get a missing question ID.

    Raises
    ------
    KeyError
        If a question text found under a known question ID does not start
        with any of the wordings expected for that ID.
    """
    # Tabulate all distinct question texts occurring under each question ID.
    # rename() without arguments clears the name of the counts, presumably so
    # that reset_index() cannot clash with the esm_instructions index level.
    df_esm_sam_unique_questions = (
        df_sam_cleaned.groupby("question_id")
        .esm_instructions.value_counts()
        .rename()
        .reset_index()
    )

    # First, check that we anticipated all esm instructions.
    for q_id, expected_wordings in DICT_SAM_QUESTION_IDS.items():
        # Look for all questions ("instructions") occurring under this ID.
        actual_questions = df_esm_sam_unique_questions.loc[
            df_esm_sam_unique_questions["question_id"] == q_id,
            "esm_instructions",
        ]
        if actual_questions.empty:
            # This question ID does not occur in the data; nothing to verify.
            continue
        # str.startswith accepts a tuple, so any of the wordings may match.
        questions_matches = actual_questions.str.startswith(expected_wordings)
        # Test the match results, not the question strings themselves.
        if not questions_matches.all():
            print("One of the questions that occur in the data was undefined.")
            print("This were the questions found in the data: ")
            # Report exactly the unexpected question texts.
            raise KeyError(actual_questions[~questions_matches])

    def _matching_question_id(instructions):
        """Return the first question ID whose wordings start `instructions`."""
        if not isinstance(instructions, str):
            # Missing instructions cannot be matched to any question.
            return None
        return next(
            (
                key
                for key, wordings in DICT_SAM_QUESTION_IDS.items()
                if instructions.startswith(wordings)
            ),
            None,
        )

    # Next, replace question IDs.
    df_sam_fixed = df_sam_cleaned.copy()
    df_sam_fixed["question_id"] = df_sam_cleaned["esm_instructions"].apply(
        _matching_question_id
    )

    return df_sam_fixed