Implement a method to recode JCQ answers.

2021-07-05 18:24:22 +02:00 · 2021-07-05 18:24:22 +02:00 · e2808422db
parent 459f7a2c72
commit e2808422db
2 changed files with 79 additions and 1 deletions
--- a/features/esm.py
+++ b/features/esm.py
@ -228,7 +228,8 @@ def classify_sessions_by_completion_time(
 def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    This function eliminates invalid ESM responses.
-    It removes unanswered ESMs.
+    It removes unanswered ESMs and those that indicate end of work and similar.
    It also extracts a numeric answer from strings such as "4 - I strongly agree".
    Parameters
    ----------
--- a/features/esm_JCQ.py
+++ b/features/esm_JCQ.py
@ -1,3 +1,8 @@
 import pandas as pd
# Highest value on the original JCQ answer scale (1-4).
# Together with JCQ_ORIGINAL_MIN it is used to reverse negatively phrased items
# (reversed = JCQ_ORIGINAL_MAX + JCQ_ORIGINAL_MIN - score).
JCQ_ORIGINAL_MAX = 4
# Lowest value on the original JCQ answer scale (1-4).
JCQ_ORIGINAL_MIN = 1
 dict_JCQ_demand_control_reverse = {
    75: (
        "I was NOT asked",
@ -29,3 +34,75 @@ dict_JCQ_demand_control_reverse = {
        "Pri svojem delu sem imela zelo malo svobode",
    ),
 }
 def reverse_jcq_demand_control_scoring(
     df_esm_jcq_demand_control: pd.DataFrame,
 ) -> pd.DataFrame:
     """
     Recode answers of the Job Content Questionnaire (JCQ) demand/control items.

     Answers are first incremented by 1 to be in line with the original (1-4) scoring.
     Then, answers to negatively phrased questions are reversed (i.e. 1 becomes 4 etc.).
     These questions are listed in dict_JCQ_demand_control_reverse and identified by their question ID.
     Before recoding, the question texts present in the data are checked against the literal phrasing
     stored in that dictionary to protect against wrong numbering of questions (differing question IDs).

     Parameters
     ----------
     df_esm_jcq_demand_control: pd.DataFrame
         A cleaned up dataframe, which must include the columns question_id, esm_instructions
         and esm_user_answer_numeric.

     Returns
     -------
     df_esm_jcq_demand_control: pd.DataFrame
         A copy of the dataframe with a column esm_user_score containing answers recoded and reversed.
         The input dataframe is not modified.

     Raises
     ------
     KeyError
         If a required column is missing (the dataframe was probably not cleaned with
         features.esm.clean_up_esm), or if a question whose ID is marked for reversal
         has a text that does not start with any of the expected phrasings.
     """
     required_columns = ("question_id", "esm_instructions", "esm_user_answer_numeric")
     missing_columns = [
         column
         for column in required_columns
         if column not in df_esm_jcq_demand_control.columns
     ]
     if missing_columns:
         # Fail loudly instead of returning a dataframe without esm_user_score.
         raise KeyError(
             "Please, clean the dataframe first using features.esm.clean_up_esm. "
             f"Missing column(s): {missing_columns}"
         )

     # Tabulate all distinct question texts that occur for each question ID.
     question_texts = (
         df_esm_jcq_demand_control[["question_id", "esm_instructions"]]
         .dropna()
         .drop_duplicates()
     )

     for q_id, expected_phrasings in dict_JCQ_demand_control_reverse.items():
         # These are all texts that occur in the data for a question to be reversed.
         texts_in_data = question_texts.loc[
             question_texts["question_id"] == q_id, "esm_instructions"
         ]
         # str.startswith accepts a tuple of alternatives (e.g. different languages).
         texts_match = texts_in_data.str.startswith(expected_phrasings)
         # Use logical `not`: bitwise `~` on a plain bool yields a truthy integer.
         if not texts_match.all():
             unexpected_texts = texts_in_data[~texts_match].tolist()
             raise KeyError(
                 "One of the answers that occur in the data should not be reversed. "
                 f"This was the answer found in the data for question ID {q_id}: "
                 f"{unexpected_texts}"
             )

     # Increment the original answer by 1
     # to keep in line with traditional scoring (JCQ_ORIGINAL_MIN - JCQ_ORIGINAL_MAX).
     # `assign` returns a new dataframe, so the caller's data stays untouched.
     df_scored = df_esm_jcq_demand_control.assign(
         esm_user_score=lambda x: x["esm_user_answer_numeric"] + 1
     )

     # Reverse the items that require it; only the score column is written.
     needs_reversal = df_scored["question_id"].isin(
         list(dict_JCQ_demand_control_reverse)
     )
     df_scored.loc[needs_reversal, "esm_user_score"] = (
         JCQ_ORIGINAL_MAX
         + JCQ_ORIGINAL_MIN
         - df_scored.loc[needs_reversal, "esm_user_score"]
     )
     return df_scored