Extract method to reuse.

2023-07-03 21:13:50 +02:00 · 2023-07-03 21:13:50 +02:00 · 82b53bc0d3
parent c688580fe8
commit 82b53bc0d3
3 changed files with 66 additions and 126 deletions
--- a/features/esm.py
+++ b/features/esm.py
@ -347,3 +347,69 @@ def increment_answers(df_esm_clean: pd.DataFrame, increment_by=1):
        print("Please, clean the dataframe first using features.esm.clean_up_esm.")
        print(e)
    return df_esm_clean
 def reassign_question_ids(
    df_esm_cleaned: pd.DataFrame, question_ids_content: dict
 ) -> pd.DataFrame:
    """
    Fix question IDs to match their actual content.

    Unfortunately, when altering the protocol to adapt to COVID pandemic,
    we did not retain original question IDs.
    This means that for participants before 2021, they are different
    from for the rest of them.
    This function searches for question IDs by matching their strings.

    Parameters
    ----------
    df_esm_cleaned: pd.DataFrame
        A cleaned up dataframe, which must also include esm_user_answer_numeric.
        The columns read by this function are question_id and esm_instructions.
    question_ids_content: dict
        A dictionary, linking question IDs with their content ("instructions").
        Each value is handed to startswith, so it can be a single prefix
        or a tuple of accepted prefixes.

    Returns
    -------
    df_esm_fixed: pd.DataFrame
        A copy of the dataframe with question_id replaced by the first
        dictionary key whose content prefixes the row's esm_instructions,
        or None when no entry matches. The input dataframe is not modified.

    Raises
    ------
    KeyError
        If, for a question ID listed in the dictionary, the data contain
        instructions that do not start with any of the expected content.
        The offending instructions are carried by the exception.
    """
    # Tabulate all distinct instructions seen under each question ID.
    # rename() without arguments clears the name of the counts Series,
    # presumably so that reset_index() cannot clash with the
    # esm_instructions index level.
    df_esm_unique_questions = (
        df_esm_cleaned.groupby("question_id")
        .esm_instructions.value_counts()
        .rename()
        .reset_index()
    )
    # First, check that we anticipated all esm instructions.
    for q_id, expected_content in question_ids_content.items():
        # All questions ("instructions") occurring under this ID in the data.
        actual_questions = df_esm_unique_questions.loc[
            df_esm_unique_questions["question_id"] == q_id,
            "esm_instructions",
        ]
        # Mark which of them start with the content defined for this ID.
        questions_matches = actual_questions.str.startswith(expected_content)
        # The mask (not the raw strings) decides whether something is
        # unexpected; an ID absent from the data yields an empty mask,
        # for which all() is True.
        if not questions_matches.all():
            print("One of the questions that occur in the data was undefined.")
            print("This were the questions found in the data: ")
            raise KeyError(actual_questions[~questions_matches])
    # Next, replace question IDs by looking each instruction up by content.
    df_esm_fixed = df_esm_cleaned.copy()
    df_esm_fixed["question_id"] = df_esm_cleaned["esm_instructions"].apply(
        lambda x: next(
            (
                key
                for key, values in question_ids_content.items()
                if x.startswith(values)
            ),
            None,
        )
    )
    return df_esm_fixed
--- a/features/esm_COPE.py
+++ b/features/esm_COPE.py
@ -1,5 +1,3 @@
 import pandas as pd
 COPE_ORIGINAL_MAX = 4
 COPE_ORIGINAL_MIN = 1
@ -125,65 +123,3 @@ DICT_COPE_QUESTION_IDS = {
        "Razburil sem se in razmišljal samo o tem",
    ),
 }
 def reassign_question_ids(df_cope_cleaned: pd.DataFrame) -> pd.DataFrame:
    """
    Fix question IDs to match their actual content.

    Unfortunately, when altering the protocol to adapt to COVID pandemic,
    we did not retain original question IDs.
    This means that for participants before 2021, they are different
    from for the rest of them.
    This function searches for question IDs by matching their strings
    against DICT_COPE_QUESTION_IDS.

    Parameters
    ----------
    df_cope_cleaned: pd.DataFrame
        A cleaned up dataframe, which must also include esm_user_answer_numeric.
        The columns read by this function are question_id and esm_instructions.

    Returns
    -------
    df_cope_fixed: pd.DataFrame
        A copy of the dataframe with question_id replaced by the first
        key of DICT_COPE_QUESTION_IDS whose content prefixes the row's
        esm_instructions, or None when no entry matches.
        The input dataframe is not modified.

    Raises
    ------
    KeyError
        If, for a question ID listed in DICT_COPE_QUESTION_IDS, the data
        contain instructions that do not start with any of the expected
        content. The offending instructions are carried by the exception.
    """
    # Tabulate all distinct instructions seen under each question ID.
    # rename() without arguments clears the name of the counts Series,
    # presumably so that reset_index() cannot clash with the
    # esm_instructions index level.
    df_esm_cope_unique_questions = (
        df_cope_cleaned.groupby("question_id")
        .esm_instructions.value_counts()
        .rename()
        .reset_index()
    )
    # First, check that we anticipated all esm instructions.
    for q_id, expected_content in DICT_COPE_QUESTION_IDS.items():
        # All questions ("instructions") occurring under this ID in the data.
        actual_questions = df_esm_cope_unique_questions.loc[
            df_esm_cope_unique_questions["question_id"] == q_id,
            "esm_instructions",
        ]
        # Mark which of them start with the content defined for this ID.
        questions_matches = actual_questions.str.startswith(expected_content)
        # The mask (not the raw strings) decides whether something is
        # unexpected; an ID absent from the data yields an empty mask,
        # for which all() is True.
        if not questions_matches.all():
            print("One of the questions that occur in the data was undefined.")
            print("This were the questions found in the data: ")
            raise KeyError(actual_questions[~questions_matches])
    # Next, replace question IDs by looking each instruction up by content.
    df_cope_fixed = df_cope_cleaned.copy()
    df_cope_fixed["question_id"] = df_cope_cleaned["esm_instructions"].apply(
        lambda x: next(
            (
                key
                for key, values in DICT_COPE_QUESTION_IDS.items()
                if x.startswith(values)
            ),
            None,
        )
    )
    return df_cope_fixed
--- a/features/esm_SAM.py
+++ b/features/esm_SAM.py
@ -444,65 +444,3 @@ def extract_event_duration(df_esm_sam_clean: pd.DataFrame) -> pd.DataFrame:
 # TODO: How many questions about the stressfulness of the period were asked
 #  and how does this relate to events?
 def reassign_question_ids(df_sam_cleaned: pd.DataFrame) -> pd.DataFrame:
    """
    Fix question IDs to match their actual content.

    Unfortunately, when altering the protocol to adapt to COVID pandemic,
    we did not retain original question IDs.
    This means that for participants before 2021, they are different
    from for the rest of them.
    This function searches for question IDs by matching their strings
    against DICT_SAM_QUESTION_IDS.

    Parameters
    ----------
    df_sam_cleaned: pd.DataFrame
        A cleaned up dataframe, which must also include esm_user_answer_numeric.
        The columns read by this function are question_id and esm_instructions.

    Returns
    -------
    df_sam_fixed: pd.DataFrame
        A copy of the dataframe with question_id replaced by the first
        key of DICT_SAM_QUESTION_IDS whose content prefixes the row's
        esm_instructions, or None when no entry matches.
        The input dataframe is not modified.

    Raises
    ------
    KeyError
        If, for a question ID listed in DICT_SAM_QUESTION_IDS, the data
        contain instructions that do not start with any of the expected
        content. The offending instructions are carried by the exception.
    """
    # Tabulate all distinct instructions seen under each question ID.
    # rename() without arguments clears the name of the counts Series,
    # presumably so that reset_index() cannot clash with the
    # esm_instructions index level.
    df_esm_sam_unique_questions = (
        df_sam_cleaned.groupby("question_id")
        .esm_instructions.value_counts()
        .rename()
        .reset_index()
    )
    # First, check that we anticipated all esm instructions.
    for q_id, expected_content in DICT_SAM_QUESTION_IDS.items():
        # All questions ("instructions") occurring under this ID in the data.
        actual_questions = df_esm_sam_unique_questions.loc[
            df_esm_sam_unique_questions["question_id"] == q_id,
            "esm_instructions",
        ]
        # Mark which of them start with the content defined for this ID.
        questions_matches = actual_questions.str.startswith(expected_content)
        # The mask (not the raw strings) decides whether something is
        # unexpected; an ID absent from the data yields an empty mask,
        # for which all() is True.
        if not questions_matches.all():
            print("One of the questions that occur in the data was undefined.")
            print("This were the questions found in the data: ")
            raise KeyError(actual_questions[~questions_matches])
    # Next, replace question IDs by looking each instruction up by content.
    df_sam_fixed = df_sam_cleaned.copy()
    df_sam_fixed["question_id"] = df_sam_cleaned["esm_instructions"].apply(
        lambda x: next(
            (
                key
                for key, values in DICT_SAM_QUESTION_IDS.items()
                if x.startswith(values)
            ),
            None,
        )
    )
    return df_sam_fixed